You are on page 1 of 29

{

"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"df=pd.read_table('positive comments.txt',delimiter='\\n',header=None)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load the ThinkingPinoy text file as whitespace-separated columns with no header row.\n",
"df_tp = pd.read_table(\n",
"    r'../alexis_topic_modeling/input/ThinkingPinoy.txt',\n",
"    sep=r'\\s+',  # same splitting as the deprecated delim_whitespace=True\n",
"    header=None,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Negative 35438\n",
"Neutral 16438\n",
"Positive 11522\n",
"Negative\\n 13\n",
"Nuetral 6\n",
"negative 4\n",
"neutral 3\n",
"NegativeNegative 2\n",
"Nutral 1\n",
" Neutral 1\n",
"p 1\n",
"NEutral 1\n",
"Negativ 1\n",
"Postive 1\n",
"n 1\n",
"positive 1\n",
"Netral 1\n",
"Negative 1\n",
"Name: Sentiment, dtype: int64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Sentiment'][df['comment'].isin(df_tp[0].values)].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Rebalance the comments that also appear in df_tp[0]: keep every Positive one, but only\n",
"# the first 11000 Neutral and the first 7500 Negative. Comments not in df_tp are all kept;\n",
"# df_tp comments carrying any other label are dropped.\n",
"in_tp = df['comment'].isin(df_tp[0].values)  # computed once instead of four times\n",
"df = pd.concat([\n",
"    df[~in_tp],\n",
"    df[in_tp & (df['Sentiment'] == 'Positive')],\n",
"    df[in_tp & (df['Sentiment'] == 'Neutral')].head(11000),\n",
"    df[in_tp & (df['Sentiment'] == 'Negative')].head(7500),\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import glob\n",
"import os\n",
"import re\n",
"from sklearn.model_selection import train_test_split\n",
"import csv\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Concatenate every CSV in the input folder into a single frame.\n",
"path = r'Nov 2/input'\n",
"# glob returns files in arbitrary, OS-dependent order; sort so the row order is reproducible.\n",
"all_files = sorted(glob.glob(os.path.join(path, '*.csv')))\n",
"df_from_each_file = (pd.read_csv(f, encoding=\"unicode_escape\") for f in all_files)\n",
"df = pd.concat(df_from_each_file, ignore_index=True)\n",
"# df.to_csv(r'compiled.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep only rows that have both a label and a comment.\n",
"df = df.dropna(subset=['Sentiment', 'comment']).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Sentiment 108844\n",
"comment 108844\n",
"sentiment 107903\n",
"post 108305\n",
"parent comment 23364\n",
"dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.count()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Compiled once at cell level instead of on every removeEmoji call.\n",
"EMOJI_PATTERN = re.compile(\n",
"    \"[\"\n",
"    u\"\\U0001F600-\\U0001F64F\"  # emoticons\n",
"    u\"\\U0001F300-\\U0001F5FF\"  # symbols & pictographs\n",
"    u\"\\U0001F680-\\U0001F6FF\"  # transport & map symbols\n",
"    u\"\\U0001F1E0-\\U0001F1FF\"  # flags (iOS)\n",
"    \"]+\",\n",
"    flags=re.UNICODE,\n",
")\n",
"PRINTABLE_CHARS = frozenset(string.printable)\n",
"PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation))\n",
"\n",
"\n",
"def removeEmoji(text):\n",
"    \"\"\"Return `text` with every character in the emoji ranges above removed.\"\"\"\n",
"    return EMOJI_PATTERN.sub('', text)\n",
"\n",
"\n",
"def removePunctuations(text):\n",
"    \"\"\"Drop characters outside string.printable, then delete all ASCII punctuation.\"\"\"\n",
"    text = ''.join(ch for ch in text if ch in PRINTABLE_CHARS)\n",
"    return text.translate(PUNCTUATION_TABLE)\n",
"\n",
"\n",
"def cleanComment(text):\n",
"    \"\"\"Strip emoji and punctuation, lower-case, and turn newlines into spaces.\"\"\"\n",
"    return removePunctuations(removeEmoji(text)).lower().replace('\\n', ' ')\n",
"\n",
"\n",
"# Series.map replaces the former iterrows()/df.set_value() loop: set_value was\n",
"# removed in pandas 1.0, and map cleans the whole column in one pass.\n",
"df['comment'] = df['comment'].map(cleanComment)\n",
"\n",
"# Comments that are empty after cleaning carry no signal; drop them.\n",
"df = df[df['comment'] != \"\"].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Title-case the labels so that e.g. 'negative' and 'Negative' count as one class.\n",
"# Assign the whole column: the old df['Sentiment'][mask] = ... chained assignment\n",
"# may write to a temporary copy instead of df. Missing labels stay missing.\n",
"df['Sentiment'] = df['Sentiment'].str.title()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sentiment</th>\n",
" <th>comment</th>\n",
" <th>sentiment</th>\n",
" <th>post</th>\n",
" <th>parent comment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Neutral</td>\n",
" <td>paano ang ios hindi pwede</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>Neutral</td>\n",
" <td>you</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82</th>\n",
" <td>Neutral</td>\n",
" <td>how po</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>Neutral</td>\n",
" <td>thank you</td>\n",
" <td>positive</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>Neutral</td>\n",
" <td>how po</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>Neutral</td>\n",
" <td>how sirmadam</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>Neutral</td>\n",
" <td>how pls tnx</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>128</th>\n",
" <td>Neutral</td>\n",
" <td>how po</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>137</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>148</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>153</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>155</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>157</th>\n",
" <td>Neutral</td>\n",
" <td>how po</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>168</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>178</th>\n",
" <td>Neutral</td>\n",
" <td>how</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>187</th>\n",
" <td>Neutral</td>\n",
" <td>paano po</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>189</th>\n",
" <td>Neutral</td>\n",
" <td>how po</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>190</th>\n",
" <td>Neutral</td>\n",
" <td>how po</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>215</th>\n",
" <td>Neutral</td>\n",
" <td>how to use this application</td>\n",
" <td>positive</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>227</th>\n",
" <td>Neutral</td>\n",
" <td>how po</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>231</th>\n",
" <td>Neutral</td>\n",
" <td>paano po ba</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>232</th>\n",
" <td>Neutral</td>\n",
" <td>ainah khoilarahh tinuod ni dam</td>\n",
" <td>positive</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>Neutral</td>\n",
" <td>kwt qwn ah nodle bgy ko</td>\n",
" <td>negative</td>\n",
" <td>Kung nag-join ka dati sa survey about the app ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106305</th>\n",
" <td>Neutral</td>\n",
" <td>toinks</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106320</th>\n",
" <td>Neutral</td>\n",
" <td>panu na si sara d pwede senator na lang</td>\n",
" <td>positive</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106362</th>\n",
" <td>Neutral</td>\n",
" <td>adjes bry smiley kristelle lyra gillen miguel ...</td>\n",
" <td>positive</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106364</th>\n",
" <td>Neutral</td>\n",
" <td>iam aeb</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106365</th>\n",
" <td>Neutral</td>\n",
" <td>attilla abella vicencio</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106366</th>\n",
" <td>Neutral</td>\n",
" <td>mharcoy adezas chard fronda</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106367</th>\n",
" <td>Neutral</td>\n",
" <td>rivan srifa</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106368</th>\n",
" <td>Neutral</td>\n",
" <td>dang dang ricca joice dalit adrada</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106369</th>\n",
" <td>Neutral</td>\n",
" <td>maribeth ropal</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106370</th>\n",
" <td>Neutral</td>\n",
" <td>vinnie serna</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106371</th>\n",
" <td>Neutral</td>\n",
" <td>ramil reyes</td>\n",
" <td>positive</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106372</th>\n",
" <td>Neutral</td>\n",
" <td>ito yung full interview nyan eh httpswwwyoutu...</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106373</th>\n",
" <td>Neutral</td>\n",
" <td>his interview with general bato httpswwwyoutu...</td>\n",
" <td>positive</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106378</th>\n",
" <td>Neutral</td>\n",
" <td>lol</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>Hoy! Thinking Pinoy!\\nPakirespeto lang si Kuya...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106381</th>\n",
" <td>Neutral</td>\n",
" <td>ang punto ko jan mga kaibigan ang kagalanggala...</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>Hoy! Thinking Pinoy!\\nPakirespeto lang si Kuya...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106443</th>\n",
" <td>Neutral</td>\n",
" <td>edison john barreto thanks</td>\n",
" <td>positive</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>Hoy! Thinking Pinoy!\\nPakirespeto lang si Kuya...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106462</th>\n",
" <td>Neutral</td>\n",
" <td>ano ba namang masama sa post na ito aysos</td>\n",
" <td>negative</td>\n",
" <td>Dahil binigyan niya tayo ng comedic gold this ...</td>\n",
" <td>Hoy! Thinking Pinoy!\\nPakirespeto lang si Kuya...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106544</th>\n",
" <td>Neutral</td>\n",
" <td>field trip picnic</td>\n",
" <td>negative</td>\n",
" <td>Sige nga, Senator Risa Hontiveros, SPELL \"INVA...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106766</th>\n",
" <td>Neutral</td>\n",
" <td>c risa ba yan</td>\n",
" <td>negative</td>\n",
" <td>Tancha ko e baka umeksena ng \"Bullying!\" ang i...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107262</th>\n",
" <td>Neutral</td>\n",
" <td>bullets coming from all directions it must be ...</td>\n",
" <td>positive</td>\n",
" <td>A DIFFERENT KIND OF BAYOT\\n(via SibEx) \\n\\n“Ju...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107469</th>\n",
" <td>Neutral</td>\n",
" <td>before we forget hows the mayor</td>\n",
" <td>positive</td>\n",
" <td>A DIFFERENT KIND OF BAYOT\\n(via SibEx) \\n\\n“Ju...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107530</th>\n",
" <td>Neutral</td>\n",
" <td>cd no ko</td>\n",
" <td>negative</td>\n",
" <td>A DIFFERENT KIND OF BAYOT\\n(via SibEx) \\n\\n“Ju...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107551</th>\n",
" <td>Neutral</td>\n",
" <td>broken arrow</td>\n",
" <td>negative</td>\n",
" <td>A DIFFERENT KIND OF BAYOT\\n(via SibEx) \\n\\n“Ju...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107721</th>\n",
" <td>Neutral</td>\n",
" <td>kawikaan 18 6 ang labi ng mangmang bunga ay al...</td>\n",
" <td>negative</td>\n",
" <td>ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108077</th>\n",
" <td>Neutral</td>\n",
" <td>ang nag iinterview sakanya na si kuya daniel r...</td>\n",
" <td>positive</td>\n",
" <td>ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108221</th>\n",
" <td>Neutral</td>\n",
" <td>may link po ba to gusto ko ishare ung actual i...</td>\n",
" <td>negative</td>\n",
" <td>ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108224</th>\n",
" <td>Neutral</td>\n",
" <td>jhen hahahaha</td>\n",
" <td>negative</td>\n",
" <td>ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108389</th>\n",
" <td>Neutral</td>\n",
" <td>bwahahahshaha</td>\n",
" <td>negative</td>\n",
" <td>ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108395</th>\n",
" <td>Neutral</td>\n",
" <td>httpsenwikipediaorgwikiinvasion ito ang ibig s...</td>\n",
" <td>positive</td>\n",
" <td>ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108423</th>\n",
" <td>Neutral</td>\n",
" <td>can somebody explain what invasion is</td>\n",
" <td>negative</td>\n",
" <td>ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>26689 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Sentiment comment
sentiment \\\n",
"1 Neutral paano ang ios hindi pwede
negative \n",
"30 Neutral you
negative \n",
"82 Neutral how po
negative \n",
"91 Neutral thank you
positive \n",
"103 Neutral how
negative \n",
"105 Neutral how po
negative \n",
"106 Neutral how
negative \n",
"108 Neutral how
negative \n",
"116 Neutral how
negative \n",
"117 Neutral how sirmadam
negative \n",
"119 Neutral how pls tnx
negative \n",
"128 Neutral how po
negative \n",
"137 Neutral how
negative \n",
"140 Neutral how
negative \n",
"142 Neutral how
negative \n",
"148 Neutral how
negative \n",
"153 Neutral how
negative \n",
"155 Neutral how
negative \n",
"157 Neutral how po
negative \n",
"160 Neutral how
negative \n",
"168 Neutral how
negative \n",
"178 Neutral how
negative \n",
"187 Neutral paano po
negative \n",
"189 Neutral how po
negative \n",
"190 Neutral how po
negative \n",
"215 Neutral how to use this application
positive \n",
"227 Neutral how po
negative \n",
"231 Neutral paano po ba
negative \n",
"232 Neutral ainah khoilarahh tinuod ni dam
positive \n",
"234 Neutral kwt qwn ah nodle bgy ko
negative \n",

"... ... ... ...


\n",
"106305 Neutral toinks
negative \n",
"106320 Neutral panu na si sara d pwede senator na lang
positive \n",
"106362 Neutral adjes bry smiley kristelle lyra gillen miguel ...
positive \n",
"106364 Neutral iam aeb
negative \n",
"106365 Neutral attilla abella vicencio
negative \n",
"106366 Neutral mharcoy adezas chard fronda
negative \n",
"106367 Neutral rivan srifa
negative \n",
"106368 Neutral dang dang ricca joice dalit adrada
negative \n",
"106369 Neutral maribeth ropal
negative \n",
"106370 Neutral vinnie serna
negative \n",
"106371 Neutral ramil reyes
positive \n",
"106372 Neutral ito yung full interview nyan eh httpswwwyoutu...
negative \n",
"106373 Neutral his interview with general bato httpswwwyoutu...
positive \n",
"106378 Neutral lol
negative \n",
"106381 Neutral ang punto ko jan mga kaibigan ang kagalanggala...
negative \n",
"106443 Neutral edison john barreto thanks
positive \n",
"106462 Neutral ano ba namang masama sa post na ito aysos
negative \n",
"106544 Neutral field trip picnic
negative \n",
"106766 Neutral c risa ba yan
negative \n",
"107262 Neutral bullets coming from all directions it must be ...
positive \n",
"107469 Neutral before we forget hows the mayor
positive \n",
"107530 Neutral cd no ko
negative \n",
"107551 Neutral broken arrow
negative \n",
"107721 Neutral kawikaan 18 6 ang labi ng mangmang bunga ay al...
negative \n",
"108077 Neutral ang nag iinterview sakanya na si kuya daniel r...
positive \n",
"108221 Neutral may link po ba to gusto ko ishare ung actual i...
negative \n",
"108224 Neutral jhen hahahaha
negative \n",
"108389 Neutral bwahahahshaha
negative \n",
"108395 Neutral httpsenwikipediaorgwikiinvasion ito ang ibig s...
positive \n",
"108423 Neutral can somebody explain what invasion is
negative \n",
"\n",
" post \\\n",
"1 Kung nag-join ka dati sa survey about the app ... \n",
"30 Kung nag-join ka dati sa survey about the app ... \n",
"82 Kung nag-join ka dati sa survey about the app ... \n",
"91 Kung nag-join ka dati sa survey about the app ... \n",
"103 Kung nag-join ka dati sa survey about the app ... \n",
"105 Kung nag-join ka dati sa survey about the app ... \n",
"106 Kung nag-join ka dati sa survey about the app ... \n",
"108 Kung nag-join ka dati sa survey about the app ... \n",
"116 Kung nag-join ka dati sa survey about the app ... \n",
"117 Kung nag-join ka dati sa survey about the app ... \n",
"119 Kung nag-join ka dati sa survey about the app ... \n",
"128 Kung nag-join ka dati sa survey about the app ... \n",
"137 Kung nag-join ka dati sa survey about the app ... \n",
"140 Kung nag-join ka dati sa survey about the app ... \n",
"142 Kung nag-join ka dati sa survey about the app ... \n",
"148 Kung nag-join ka dati sa survey about the app ... \n",
"153 Kung nag-join ka dati sa survey about the app ... \n",
"155 Kung nag-join ka dati sa survey about the app ... \n",
"157 Kung nag-join ka dati sa survey about the app ... \n",
"160 Kung nag-join ka dati sa survey about the app ... \n",
"168 Kung nag-join ka dati sa survey about the app ... \n",
"178 Kung nag-join ka dati sa survey about the app ... \n",
"187 Kung nag-join ka dati sa survey about the app ... \n",
"189 Kung nag-join ka dati sa survey about the app ... \n",
"190 Kung nag-join ka dati sa survey about the app ... \n",
"215 Kung nag-join ka dati sa survey about the app ... \n",
"227 Kung nag-join ka dati sa survey about the app ... \n",
"231 Kung nag-join ka dati sa survey about the app ... \n",
"232 Kung nag-join ka dati sa survey about the app ... \n",
"234 Kung nag-join ka dati sa survey about the app ... \n",
"... ... \n",
"106305 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106320 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106362 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106364 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106365 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106366 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106367 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106368 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106369 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106370 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106371 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106372 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106373 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106378 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106381 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106443 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106462 Dahil binigyan niya tayo ng comedic gold this ... \n",
"106544 Sige nga, Senator Risa Hontiveros, SPELL \"INVA... \n",
"106766 Tancha ko e baka umeksena ng \"Bullying!\" ang i... \n",
"107262 A DIFFERENT KIND OF BAYOT\\n(via SibEx) \\n\\n“Ju... \n",
"107469 A DIFFERENT KIND OF BAYOT\\n(via SibEx) \\n\\n“Ju... \n",
"107530 A DIFFERENT KIND OF BAYOT\\n(via SibEx) \\n\\n“Ju... \n",
"107551 A DIFFERENT KIND OF BAYOT\\n(via SibEx) \\n\\n“Ju... \n",
"107721 ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ... \n",
"108077 ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ... \n",
"108221 ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ... \n",
"108224 ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ... \n",
"108389 ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ... \n",
"108395 ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ... \n",
"108423 ABUTAN NG BONAMINE SI RISA! \\n\\nRazon: Ano po ... \n",
"\n",
" parent comment \n",
"1 NaN \n",
"30 NaN \n",
"82 NaN \n",
"91 NaN \n",
"103 NaN \n",
"105 NaN \n",
"106 NaN \n",
"108 NaN \n",
"116 NaN \n",
"117 NaN \n",
"119 NaN \n",
"128 NaN \n",
"137 NaN \n",
"140 NaN \n",
"142 NaN \n",
"148 NaN \n",
"153 NaN \n",
"155 NaN \n",
"157 NaN \n",
"160 NaN \n",
"168 NaN \n",
"178 NaN \n",
"187 NaN \n",
"189 NaN \n",
"190 NaN \n",
"215 NaN \n",
"227 NaN \n",
"231 NaN \n",
"232 NaN \n",
"234 NaN \n",
"... ... \n",
"106305 NaN \n",
"106320 NaN \n",
"106362 NaN \n",
"106364 NaN \n",
"106365 NaN \n",
"106366 NaN \n",
"106367 NaN \n",
"106368 NaN \n",
"106369 NaN \n",
"106370 NaN \n",
"106371 NaN \n",
"106372 NaN \n",
"106373 NaN \n",
"106378 Hoy! Thinking Pinoy!\\nPakirespeto lang si Kuya... \n",
"106381 Hoy! Thinking Pinoy!\\nPakirespeto lang si Kuya... \n",
"106443 Hoy! Thinking Pinoy!\\nPakirespeto lang si Kuya... \n",
"106462 Hoy! Thinking Pinoy!\\nPakirespeto lang si Kuya... \n",
"106544 NaN \n",
"106766 NaN \n",
"107262 NaN \n",
"107469 NaN \n",
"107530 NaN \n",
"107551 NaN \n",
"107721 NaN \n",
"108077 NaN \n",
"108221 NaN \n",
"108224 NaN \n",
"108389 NaN \n",
"108395 NaN \n",
"108423 NaN \n",
"\n",
"[26689 rows x 5 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[(df['Sentiment']==\"Neutral\") & (df['comment'].isnull()==False)]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df=df.drop(['sentiment','post','parent comment'],axis=1)\n",
"# df=df.fillna('')\n",
"df = df.rename(columns={'Sentiment': 'labels', 'comment': 'text'})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Negative 57298\n",
"Neutral 26689\n",
"Positive 24424\n",
"Negative\\n 15\n",
" 1\n",
"Negativ 1\n",
"N 1\n",
" Neutral 1\n",
"Negative 1\n",
"Name: labels, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['labels'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Expand the abbreviated label \"P\". Target only the 'labels' column:\n",
"# df[mask] = \"Positive\" would also overwrite the 'text' of those rows.\n",
"df.loc[df['labels'] == \"P\", 'labels'] = \"Positive\""
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Discard every row whose label is not one of the three canonical classes.\n",
"df = df[df['labels'].isin([\"Positive\", \"Negative\", \"Neutral\"])]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>labels</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [labels, text]\n",
"Index: []"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['text']==\"\"]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# split() with no argument ignores repeated, leading and trailing whitespace, so the blanks\n",
"# left behind by punctuation and newline removal are not counted as words.\n",
"df['word_count'] = df['text'].map(lambda x: len(x.split()))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df=df[['labels','text']][df['word_count']>1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remove random rows to balance the classes"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"remove_n = 4000\n",
"# Seeded generator so the same Neutral rows are dropped on every run.\n",
"rng = np.random.RandomState(42)\n",
"drop_indices = rng.choice(df[df['labels'] == \"Neutral\"].index, remove_n, replace=False)\n",
"df_subset = df.drop(drop_indices).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"remove_n = 10000\n",
"# Seeded generator so the same Negative rows are dropped on every run.\n",
"rng = np.random.RandomState(42)\n",
"drop_indices = rng.choice(df_subset[df_subset['labels'] == \"Negative\"].index, remove_n, replace=False)\n",
"df_subset = df_subset.drop(drop_indices).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Positive 20736\n",
"Neutral 20173\n",
"Negative 20139\n",
"Name: labels, dtype: int64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_subset['labels'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split by Train and Test set"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Hold out 20% of the balanced data for testing.\n",
"# A fixed random_state keeps the train/test partition identical across runs.\n",
"train, test = train_test_split(df_subset, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Positive 16576\n",
"Neutral 16135\n",
"Negative 16127\n",
"Name: labels, dtype: int64"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['labels'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Positive 4160\n",
"Neutral 4038\n",
"Negative 4012\n",
"Name: labels, dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test['labels'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Positive 16576\n",
"Neutral 16135\n",
"Negative 16127\n",
"Name: labels, dtype: int64"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['labels'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train.to_csv(r'Nov 2/dropdata-
1word/sentinel_input/train.csv',index=False,quoting=csv.QUOTE_NONNUMERIC)\n",
"test.to_csv(r'Nov 2/dropdata-
1word/sentinel_input/test.csv',quoting=csv.QUOTE_NONNUMERIC,index=False)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_train=pd.read_csv(r'Nov 2/dropdata/sentinel_input/train.csv')\n",
"df_test=pd.read_csv(r'Nov 2/dropdata/sentinel_input/test.csv')\n",
"\n",
"df_train.to_csv(r'Nov 2/dropdata/sentinel_input/train.csv',index=False)\n",
"df_test.to_csv(r'Nov 2/dropdata/sentinel_input/test.csv',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ni si pres duterte nga ehhh walang inangkin na mga proyekto basta para sa
bayan walang personalan'"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train[df_train['text'].isnull()].iloc[0]['labels']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Export whole"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.to_csv('Aug 24/complete.csv',index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# STOP"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Forward slashes resolve on every OS (Windows included) and match the other paths\n",
"# in this notebook; the backslash form only worked on Windows.\n",
"df_test = pd.read_csv('Aug 30/complete/sentinel_input/test.csv')\n",
"df_train = pd.read_csv('Aug 30/complete/sentinel_input/train.csv')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train\n",
"\n",
"Neutral 19853\n",
"Negative 13342\n",
"Positive 8576\n",
"Name: labels, dtype: int64\n",
"\n",
"test\n",
"\n",
"Neutral 4875\n",
"Negative 3275\n",
"Positive 2293\n",
"Name: labels, dtype: int64\n"
]
}
],
"source": [
"# Class distribution of the reloaded splits. print() calls replace the Python 2 print\n",
"# statements, which are a SyntaxError on the Python 3 kernel the rest of the notebook needs.\n",
"print('train\\n')\n",
"print(df_train['labels'].value_counts())\n",
"print('')\n",
"print('test\\n')\n",
"print(df_test['labels'].value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# Alexis Topic Modeling"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df=pd.read_csv(r'20180306/SW HongKong.csv')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PEBA, Inc. 387283\n",
"ThinkingPinoy 231018\n",
"Pambansang Kolokoy 47874\n",
"Proud to be an OFW 41278\n",
"Global OFW Voices 22566\n",
"Promo Hunters Around the World 9975\n",
"OFW Global Community 4248\n",
"OFW Family Club 881\n",
"Ofw World Family 847\n",
"Ang Kaagapay ng Bawat OFW 477\n",
"Name: page, dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['page'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df=df[df['comment'].isnull()==False].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id 6132\n",
"user_id 4118\n",
"user_name 4118\n",
"date 6132\n",
"post_id 6132\n",
"comment 6132\n",
"page_id 6132\n",
"score 6132\n",
"sentiment 6132\n",
"comment_count 6132\n",
"like_count 6132\n",
"love_count 6132\n",
"haha_count 6132\n",
"sad_count 6132\n",
"angry_count 6132\n",
"thankful_count 6132\n",
"wow_count 6132\n",
"share_count 6132\n",
"parent_comment_id 6132\n",
"index_id 6132\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.count()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([161008173910148])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['page'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PEBA, Inc. 373107\n",
"ThinkingPinoy 221601\n",
"Pambansang Kolokoy 43889\n",
"Proud to be an OFW 40167\n",
"Global OFW Voices 21774\n",
"Promo Hunters Around the World 9509\n",
"OFW Global Community 3960\n",
"OFW Family Club 864\n",
"Ofw World Family 718\n",
"Ang Kaagapay ng Bawat OFW 466\n",
"Name: page, dtype: int64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['page'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import arrow\n",
"def get_month(date):\n",
" return arrow.get(date,'YYYY-MM-DD').format('MM')\n",
"df['month']=df['date'].apply(get_month)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['12', '11', '10', '09', '01', '03', '02'], dtype=object)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['month'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12\n",
"1822\n",
"11\n",
"460\n",
"10\n",
"780\n",
"09\n",
"1031\n",
"01\n",
"1008\n",
"03\n",
"271\n",
"02\n",
"760\n"
]
}
],
"source": [
"for month in df['month'].unique():\n",
" print(month)\n",
" df_temp=df[['comment']][df['month']==month]\n",
" print(len(df_temp))\n",
"
df_temp.to_csv(r'../alexis_topic_modeling/input/per_month/per_month_'+month+'.txt',
header=None, index=None, sep=' ', mode='a')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"page='OFW Global Community'\n",
"df_temp=df[['comment']][df['page']==page]\n",
"df_temp.to_csv(r'../alexis_topic_modeling/input/'+page+'_2.txt', header=None,
index=None, sep=' ', mode='a')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

You might also like