You are on page 1of 9

Deep Learning

Project Part(b)

Quratulain Islam (SAP: 22066)

Notebook Link:
https://www.kaggle.com/quratulain22
066/deep-learning-project

Dataset Link:
https://www.kaggle.com/quratulain22
066/paktweets
Installing Wordcloud and other required libraries
In [1]:

import warnings
warnings.filterwarnings('ignore')

!pip install wordcloud

from wordcloud import WordCloud, STOPWORDS


import matplotlib.pyplot as plt
import pandas as pd

import re
import io
import os
import string
from os import path
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
import numpy as np

Requirement already satisfied: wordcloud in /opt/conda/lib/python3.7/site-packages (1.8.1


)
Requirement already satisfied: pillow in /opt/conda/lib/python3.7/site-packages (from wor
dcloud) (8.2.0)
Requirement already satisfied: numpy>=1.6.1 in /opt/conda/lib/python3.7/site-packages (fr
om wordcloud) (1.19.5)
Requirement already satisfied: matplotlib in /opt/conda/lib/python3.7/site-packages (from
wordcloud) (3.4.2)
Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.7/site-pack
ages (from matplotlib->wordcloud) (2.8.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.7/site-package
s (from matplotlib->wordcloud) (1.3.1)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.7/site-packages (fr
om matplotlib->wordcloud) (0.10.0)
Requirement already satisfied: pyparsing>=2.2.1 in /opt/conda/lib/python3.7/site-packages
(from matplotlib->wordcloud) (2.4.7)
Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from cycler
>=0.10->matplotlib->wordcloud) (1.15.0)
WARNING: Running pip as root will break packages and permissions. You should install pack
ages reliably by using venv: https://pip.pypa.io/warnings/venv

Reading Dataset
In [2]:

df = pd.read_csv("../input/paktweets/merge.csv", encoding ="latin-1")


print(df.describe(),'\n\n')
print(df.head)

Content City
count 207961 112527
unique 163982 2318
top RT @ImranKhanPTI : I instructed my team to bri... Pakistan
freq 652 18308

<bound method NDFrame.head of Content


City
0 @TheRealPCB Flying jattt Pakistan's best field... NaN
1 Sunlo yar hum mazlomo ki Pakistan me. Paida ho... NaN
2 @MaryamNSharif You spotted yourself on right p... NaN
3 Ã​¾Ã​§Ã​©Ã​³Ã​ªÃ​§Ã​â​ Ã​©Ã​§ Ã​¹Ã​Â... NaN
4 @sidhant @AKAbdulMomen Bangladesh is only neig... NaN
4 @sidhant @AKAbdulMomen Bangladesh is only neig... NaN
... ... ...
212490 @CHABDULWAHAB19 @ImranKhanPTI Chal bey chootye... NaN
212491 #ImranKhanStudentsKeSunlo @ImranKhanPTI NaN
212492 #ImranKhanStudentsKiSunLo @ImranKhanPTI plz sir NaN
212493 #ImranKhanStudentsKiSunLo cancel exams @Shafqa... NaN
212494 #ImranKhanStudentsKiSunLo @ImranKhanPTI df NaN

[212495 rows x 2 columns]>

Pre-Processing Part (a)


43,979 Rows removed

In [3]:

print('Dataset Size before :',len(df.Content))

Dataset Size before : 212495

In [4]:

df = df.drop_duplicates()
print(df.describe())

Content City
count 168725 99737
unique 163982 2318
top RT @ImranKhanPTI : I instructed my team to bri... Islamabad, Pakistan
freq 108 17471

In [5]:

print('Dataset Size after removing duplicates:',len(df.Content))

Dataset Size after removing duplicates: 169077

Pre-Processing Part (b)


In [6]:

df['Content2'] = df['Content'].str.replace('\W', ' ')


df['Content2'] = df['Content'].str.replace(r'[^\x00-\x7f]', '')

Create a word cloud for the most words used


https://www.geeksforgeeks.org/generating-word-cloud-python/

In [7]:

comment_words = ''
stopwords = set(STOPWORDS)

# iterate through the csv file


for val in df['Content2']:

# typecaste each val to string


val = str(val)

# split the value


tokens = val.split()

# Converts each token into lowercase


for i in range(len(tokens)):
tokens[i] = tokens[i].lower()
comment_words += " ".join(tokens)+" "

wordcloud = WordCloud(width = 800, height = 800,


background_color ='white',
stopwords = stopwords,
min_font_size = 10).generate(comment_words)

# plot the WordCloud image


plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

Create an emoji cloud for most emoji


https://stackoverflow.com/questions/66473771/wordcloud-for-only-emojis

In [8]:
!pip install emojis
import io
import os
import string
from os import path
from wordcloud import WordCloud
from collections import Counter
import emojis

class EmojiCloud:
def __init__(self, font_path='Symbola.otf'):
self.font_path = font_path
self.word_cloud = self.initialize_wordcloud()
self.emoji_probability = None

def initialize_wordcloud(self):
return WordCloud(font_path=self.font_path,
width=2000,
height=1000,
background_color='white',
random_state=42,
collocations=False)

def color_func(self, word, font_size, position, orientation, random_state=None,


**kwargs):
hue_saturation = '42, 88%'

current_emoji_probability = self.emoji_probability[word]
if current_emoji_probability >= 0.10:
opacity = 50
else:
opacity = 75 - current_emoji_probability/0.2 * 5
return f"hsl({hue_saturation},{opacity}%)"

def generate(self, text):


emoji_frequencies = Counter(emojis.iter(text))
total_count = sum(emoji_frequencies.values())

self.emoji_probability = {emoji: count/total_count for emoji, count in emoji_fre


quencies.items()}
wc = self.word_cloud.generate_from_frequencies(emoji_frequencies)

plt.figure(figsize=(20,10))
plt.imshow(wc.recolor(color_func=self.color_func, random_state=42))
plt.axis("off")

Requirement already satisfied: emojis in /opt/conda/lib/python3.7/site-packages (0.6.0)


WARNING: Running pip as root will break packages and permissions. You should install pack
ages reliably by using venv: https://pip.pypa.io/warnings/venv

In [10]:
text = io.open('../input/text-dataset/text_dataset.txt').read()
emoji_cloud = EmojiCloud(font_path='../input/symbola-font/Symbola.otf')
emoji_cloud.generate(text)
Create a plot between positive and negative word counts
https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk

Due to limited time doing this for only 5000 rows it take alot of time

In [11]:
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

sid = SentimentIntensityAnalyzer()
pos_word_list=[]
neu_word_list=[]
neg_word_list=[]

for index, row in df.head(5000).iterrows():


tokenize_string = tknzr.tokenize(str(row['Content']))
for word in tokenize_string:
if (sid.polarity_scores(word)['compound']) >= 0.5:
pos_word_list.append(word)
elif (sid.polarity_scores(word)['compound']) <= -0.5:
neg_word_list.append(word)

print('Count of Poisitve Words:', len(pos_word_list))


print('Count of Negative Words:', len(neg_word_list))

Count of Poisitve Words: 898


Count of Negative Words: 673

Ration of Positive & Negative Words


https://www.tutorialspoint.com/matplotlib/matplotlib_bar_plot.htm

In [12]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
langs = ['Positive', 'Negative']
students = [len(pos_word_list), len(neg_word_list)]
barlist = ax.bar(langs,students)
barlist[0].set_color('g')
barlist[1].set_color('r')
plt.show()

Identify the city that uses twitter most


Representing this by wordcloud representation

https://www.geeksforgeeks.org/generating-word-cloud-python/
https://www.geeksforgeeks.org/generating-word-cloud-python/

In [13]:
df['City2'] = df['City'].str.replace('\W', ' ')
df['City2'] = df['City'].str.replace(r'[^\x00-\x7f]', '')

In [14]:

comment_words = ''
stopwords = set(STOPWORDS)

# iterate through the csv file


for val in df['City2']:

# typecaste each val to string


val = str(val)

# split the value


tokens = val.split()

# Converts each token into lowercase


for i in range(len(tokens)):
tokens[i] = tokens[i].lower()

if (len(tokens)>0 and tokens[0]!='nan'):


comment_words += " ".join(tokens)+" "

wordcloud = WordCloud(width = 800, height = 800,


background_color ='white',
stopwords = stopwords,
min_font_size = 10).generate(comment_words)

# plot the WordCloud image


plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()
Last Part: Identifying Abuse Word Frequencies
https://www.geeksforgeeks.org/censor-bad-words-in-python-using-better-profanity/

https://www.codeproject.com/Questions/1157796/Coding-challenge-bad-word-filter

In [15]:

# Gddiven a list of words, return a dictionary of


# word-frequency pairs.
def wordListToFreqDict(wordlist):
wordfreq = [wordlist.count(p) for p in wordlist]
return dict(list(zip(wordlist,wordfreq)))

In [16]:

# Sort a dictionary of word-frequency pairs in


# order of descending frequency.
def sortFreqDict(freqdict):
aux = [(freqdict[key], key) for key in freqdict]
aux.sort()
aux.reverse()
return aux

Due to limited time doing this for only 10000 rows

In [18]:
wordlist = []
for index, row in df.head(10000).iterrows():
tokenize_string = tknzr.tokenize(str(row['Content2']))
for word in tokenize_string:
if (sid.polarity_scores(word)['compound']) <= -0.65:
wordlist.append(word)

dictionary = wordListToFreqDict(wordlist)
sorteddict = sortFreqDict(dictionary)

for s in sorteddict: print(str(s))

(73, 'terrorism')
(37, 'terrorist')
(21, 'tragedy')
(18, 'killed')
(17, 'killing')
(9, 'murder')
(8, 'hell')
(7, 'rape')
(7, 'killings')
(7, 'evil')
(6, 'kill')
(6, 'Terrorism')
(4, 'suicide')
(4, 'Cancer')
(3, 'murderer')
(3, 'Terrorist')
(3, 'Killing')
(2, 'rapist')
(2, 'raped')
(2, 'fucked')
(1, 'suicidal')
(1, 'slavery')
(1, 'raping')
(1, 'rapes')
(1, 'murdered')
(1, 'horrific')
(1, 'catastrophe')
(1, 'cancer')
(1, 'Tragedy')
(1, 'Raped')
(1, 'Rape')
(1, 'Murderer')
(1, 'Murder')

You might also like