Professional Documents
Culture Documents
Project Part(b)
Notebook Link:
https://www.kaggle.com/quratulain22066/deep-learning-project
Dataset Link:
https://www.kaggle.com/quratulain22066/paktweets
Installing Wordcloud and other required libraries
In [1]:
import warnings
warnings.filterwarnings('ignore')
import re
import io
import os
import string
from os import path
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
import numpy as np
Reading Dataset
In [2]:
Content City
count 207961 112527
unique 163982 2318
top RT @ImranKhanPTI : I instructed my team to bri... Pakistan
freq 652 18308
In [3]:
In [4]:
# Remove exact duplicate rows (e.g. identical retweets), then summarize the
# cleaned DataFrame.
deduplicated = df.drop_duplicates()
df = deduplicated
print(df.describe())
Content City
count 168725 99737
unique 163982 2318
top RT @ImranKhanPTI : I instructed my team to bri... Islamabad, Pakistan
freq 108 17471
In [5]:
In [7]:
# NOTE(review): the body of this cell — the loop that fills `comment_words`
# from the tweets and the WordCloud construction/rendering — appears to have
# been lost during text extraction; only the setup and plt.show() survive.
comment_words = ''
# STOPWORDS and plt are presumably imported in a cell not visible here — verify.
stopwords = set(STOPWORDS)
plt.show()
In [8]:
!pip install emojis
import io
import os
import string
from os import path
from wordcloud import WordCloud
from collections import Counter
import emojis
# Renders a word cloud made of emojis using a dedicated emoji font (Symbola).
# NOTE(review): this class was damaged during text extraction — indentation is
# gone and the `def` headers for the coloring function and the generate/plot
# method were lost, leaving orphaned statements after initialize_wordcloud.
class EmojiCloud:
def __init__(self, font_path='Symbola.otf'):
self.font_path = font_path
self.word_cloud = self.initialize_wordcloud()
# Per-emoji relative frequency; presumably filled in by the missing
# generate() method before the color function reads it — verify.
self.emoji_probability = None
def initialize_wordcloud(self):
# collocations=False so each emoji is counted on its own rather than as
# bigrams; fixed random_state keeps the layout reproducible.
return WordCloud(font_path=self.font_path,
width=2000,
height=1000,
background_color='white',
random_state=42,
collocations=False)
# NOTE(review): the lines below belong to a color callback (likely
# color_func(self, word, ...)) whose header was lost — more frequent emojis
# (probability >= 0.10) get a fixed lower opacity value.
current_emoji_probability = self.emoji_probability[word]
if current_emoji_probability >= 0.10:
opacity = 50
else:
opacity = 75 - current_emoji_probability/0.2 * 5
return f"hsl({hue_saturation},{opacity}%)"
# NOTE(review): these plotting lines belong to a generate()/plot method whose
# header was also lost; `wc` is presumably self.word_cloud — verify against
# the original notebook.
plt.figure(figsize=(20,10))
plt.imshow(wc.recolor(color_func=self.color_func, random_state=42))
plt.axis("off")
In [10]:
# Build an emoji cloud from the raw tweet-text dump.
# Fix: the original called io.open(...).read() without ever closing the file;
# a context manager guarantees the handle is released.
with io.open('../input/text-dataset/text_dataset.txt') as text_file:
    text = text_file.read()
emoji_cloud = EmojiCloud(font_path='../input/symbola-font/Symbola.otf')
emoji_cloud.generate(text)
Create a plot comparing positive and negative word counts
https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk
Due to limited time, this is done for only 5000 rows, since processing takes a lot of time
In [11]:
# Tweet tokenizer configured to strip @handles and reduce elongated tokens,
# plus NLTK's VADER sentiment analyzer.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
sid = SentimentIntensityAnalyzer()
# Buckets for words classified as positive / neutral / negative.
pos_word_list, neu_word_list, neg_word_list = [], [], []
In [12]:
# Bar chart comparing how many positive vs negative words were collected:
# green bar for positive, red bar for negative.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
labels = ['Positive', 'Negative']
counts = [len(pos_word_list), len(neg_word_list)]
bars = ax.bar(labels, counts)
for bar, color in zip(bars, ('g', 'r')):
    bar.set_color(color)
plt.show()
https://www.geeksforgeeks.org/generating-word-cloud-python/
https://www.geeksforgeeks.org/generating-word-cloud-python/
In [13]:
# Clean city names for the word cloud in two passes:
#   1. replace non-word characters with spaces,
#   2. strip non-ASCII characters.
# Bug fix: the original second line re-read df['City'], silently discarding
# the result of the first replacement; chain off df['City2'] so both steps
# apply. Raw strings avoid invalid-escape-sequence warnings in the patterns.
df['City2'] = df['City'].str.replace(r'\W', ' ')
df['City2'] = df['City2'].str.replace(r'[^\x00-\x7f]', '')
In [14]:
# NOTE(review): the word-cloud construction code for this cell (the loop
# filling `comment_words` and the WordCloud rendering) seems to have been
# dropped during extraction; only the setup and plt.show() remain.
comment_words = ''
# STOPWORDS and plt presumably come from imports not visible here — verify.
stopwords = set(STOPWORDS)
plt.show()
Last Part: Identifying Abuse Word Frequencies
https://www.geeksforgeeks.org/censor-bad-words-in-python-using-better-profanity/
https://www.codeproject.com/Questions/1157796/Coding-challenge-bad-word-filter
In [15]:
In [16]:
In [18]:
# Harvest strongly negative words (VADER compound score <= -0.65) from the
# first 10,000 tweets, then build a word->count dictionary and sort it by
# frequency.
wordlist = [
    token
    for _, row in df.head(10000).iterrows()
    for token in tknzr.tokenize(str(row['Content2']))
    if sid.polarity_scores(token)['compound'] <= -0.65
]
dictionary = wordListToFreqDict(wordlist)
sorteddict = sortFreqDict(dictionary)
(73, 'terrorism')
(37, 'terrorist')
(21, 'tragedy')
(18, 'killed')
(17, 'killing')
(9, 'murder')
(8, 'hell')
(7, 'rape')
(7, 'killings')
(7, 'evil')
(6, 'kill')
(6, 'Terrorism')
(4, 'suicide')
(4, 'Cancer')
(3, 'murderer')
(3, 'Terrorist')
(3, 'Killing')
(2, 'rapist')
(2, 'raped')
(2, 'fucked')
(1, 'suicidal')
(1, 'slavery')
(1, 'raping')
(1, 'rapes')
(1, 'murdered')
(1, 'horrific')
(1, 'catastrophe')
(1, 'cancer')
(1, 'Tragedy')
(1, 'Raped')
(1, 'Rape')
(1, 'Murderer')
(1, 'Murder')