
Assessment - 1

CSE 3024: Web Mining Slot: L51 + L52


Fundamentals of NLP and Crawling
REG NO : 18BCE0457

NAME: K MARY NIKITHA


CODE:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia
result = wikipedia.search("Web Mining")
page = wikipedia.page(result[0])
content = page.content

# Write the page content to a file, then read it back and tokenize on whitespace
f1 = open("f1.txt", "w")
f1.write(content)
f1.close()

f1 = open("f1.txt", "r")
d1 = f1.read()
w1 = d1.split()

# Remove stop words and count the frequency of each remaining term
stop_words = nltk.corpus.stopwords.words('english')
stop1 = [w for w in w1 if w not in stop_words]
count1 = dict()
for x in w1:
    if x not in stop_words:
        count1[x] = count1.get(x, 0) + 1
print(count1)

print("\nThe Total number of words are:",len(stop1))


print("\nLength of stop list before spl characters", len(stop_words))
stop_words.append('!')
stop_words.append('@')
stop_words.append('#')
stop_words.append('$')
stop_words.append('%')
stop_words.append('^')
stop_words.append('&')
stop_words.append('*')
stop_words.append('(')
stop_words.append(')')
stop_words.append('_')
stop_words.append('-')
stop_words.append('=')
stop_words.append('+')
print("Length of stop list after spl characters", len(stop_words))
stop1=[w for w in w1 if not w in stop_words]
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

tokens = stop1
stemmedlist = []
lemmalist = []
# Stem and lemmatize every token so the two normalizations can be compared
for token in tokens:
    stemmedlist.append(stemmer.stem(token))
    lemmalist.append(lemmatizer.lemmatize(token))

import pandas as pd
data = {'Original Term': tokens, 'Stemmed Term': stemmedlist, 'Lemmatized Term': lemmalist}
df = pd.DataFrame(data)
print(df)
uniquestem = []
uniquelemma = []
for x in stemmedlist:
    if x not in uniquestem:
        uniquestem.append(x)
for x in lemmalist:
    if x not in uniquelemma:
        uniquelemma.append(x)

print("\nTotal number of unique stemmed words",len(uniquestem))

print("Total number of unique lemmatized words",len(uniquelemma))

# Restore the original stop word list before POS tagging
for c in special_chars:
    stop_words.remove(c)

# Split the document into sentences, then tokenize each sentence on whitespace
sen_list = []
sentences = nltk.tokenize.sent_tokenize(d1)
count = 0
for x in sentences:
    sen_list.append(x.split())
    count = count + 1
print(count)
# For every sentence, collect its stop words and POS-tag them
for x in range(len(sentences)):
    sen_stop_words = []
    for y in sen_list[x]:
        if y in stop_words:
            sen_stop_words.append(y)
    tagged = nltk.pos_tag(sen_stop_words)
    df1 = pd.DataFrame(tagged, columns=['Stop-Word', 'pos-tag'])
    print(sentences[x])
    print(df1)

OUTPUT:

1) Term frequency of each term after stop word removal
2) Total number of terms after stop word removal, and the new stop word list including special characters
3) Stemming and lemmatization (see the sketch after this list)
4) POS tags, sentence-wise
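
To make the stemming vs. lemmatization contrast in item 3 concrete, here is a minimal standalone sketch (the sample words are my own choice, not taken from the Wikipedia page) showing where the Snowball stemmer and the WordNet lemmatizer disagree:

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

# Sample words chosen to show typical differences between the two normalizations
samples = ['mining', 'studies', 'crawled', 'communities', 'better']
for w in samples:
    # The stemmer chops suffixes heuristically; the lemmatizer maps to a dictionary form
    print(w, '->', stemmer.stem(w), '|', lemmatizer.lemmatize(w))
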
CODE:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 2 13:32:52 2020

@author: nikitha
"""

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia

result1 = wikipedia.search("Web Mining")


page1= wikipedia.page(result1[0])
content1 = page1.content

result2 = wikipedia.search("Data Mining")


page2= wikipedia.page(result2[0])
content2 = page2.content

b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if w not in stop_words]

# Write the stop-word-filtered terms of document 1 to a file
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()

file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

# Extend the stop word list with special characters before filtering document 2
stop_words.extend(['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '-', '=', '+'])

b4stop2 = content2.split()
words2 = [w for w in b4stop2 if w not in stop_words]

doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()

import pandas as pd

file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
w3 = w1 + w2
# Boolean (presence/absence) representation over the combined vocabulary w3
values1 = []
values2 = []
for x in range(len(w3)):
    values1.append(1 if w3[x] in w1 else 0)
    values2.append(1 if w3[x] in w2 else 0)
booldata = {'Term': w3, 'Document1': values1, 'Document2': values2}
dfbool = pd.DataFrame(booldata)
print(dfbool)

# Term frequency counts (w1 and w2 are already stop-word filtered)
count1 = dict()
for x in w1:
    count1[x] = count1.get(x, 0) + 1
count2 = dict()
for x in w2:
    count2[x] = count2.get(x, 0) + 1

# Bag of Words representation: frequency of each vocabulary term in each document
bag1 = []
bag2 = []
print(len(w3))
for x in range(len(w3)):
    if w3[x] in w1:
        bag1.append(count1[w3[x]])
    else:
        bag1.append(0)
    if w3[x] in w2:
        bag2.append(count2[w3[x]])
    else:
        bag2.append(0)

bagdata={'Term': w3,'Document1': bag1,'Document2': bag2}


dfbag=pd.DataFrame(bagdata)
print(dfbag)

# Complete representation: positions (indices) of each term within each document
com1 = []
com2 = []
for a in range(len(w3)):
    if w3[a] in w1:
        com1.append([i for i, x in enumerate(w1) if x == w3[a]])
    else:
        com1.append('NA')

for a in range(len(w3)):
    if w3[a] in w2:
        com2.append([i for i, x in enumerate(w2) if x == w3[a]])
    else:
        com2.append('NA')

comdata = {'Term': w3, 'Document1': com1, 'Document2': com2}
dfcom = pd.DataFrame(comdata)
print(dfcom)
search = ('The term data mining is a misnomer, because the goal is the extraction of '
          'patterns and knowledge from large amounts of data, not the extraction mining '
          'of data itself')
search_w = search.split()

print(search)

# Score each document by the summed frequency of the query terms it contains
fre1 = 0
fre2 = 0
for x in search_w:
    if x in w1:
        fre1 = fre1 + count1[x]
    if x in w2:
        fre2 = fre2 + count2[x]

if fre1 > fre2:
    print('Document 1 is more similar to search query')
else:
    print('Document 2 is more similar to search query')

OUTPUT:

1) Boolean representation
2) Bag of Words representation
3) Complete representation
4) Search query (a cosine-similarity sketch follows this list)
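
The search-query step above ranks the two documents by summing the raw frequencies of the query terms. As an aside, a common alternative is cosine similarity between term-frequency vectors; the sketch below is self-contained (the toy documents and query are made up for illustration, not taken from the Wikipedia pages):

import math
from collections import Counter

def cosine_similarity(query_tokens, doc_tokens):
    # Build term-frequency vectors over the union of terms and compare their angle
    q = Counter(query_tokens)
    d = Counter(doc_tokens)
    terms = set(q) | set(d)
    dot = sum(q[t] * d[t] for t in terms)
    norm_q = math.sqrt(sum(v * v for v in q.values()))
    norm_d = math.sqrt(sum(v * v for v in d.values()))
    return dot / (norm_q * norm_d) if norm_q and norm_d else 0.0

doc1_tokens = "web mining extracts patterns from web data".split()
doc2_tokens = "data mining extracts knowledge from large amounts of data".split()
query_tokens = "extraction of patterns and knowledge from data".split()

print('Document 1 similarity:', cosine_similarity(query_tokens, doc1_tokens))
print('Document 2 similarity:', cosine_similarity(query_tokens, doc2_tokens))
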

CODE:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 2 11:16:20 2020

@author: nikitha
"""

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia

result1 = wikipedia.search("Web Mining")


page1= wikipedia.page(result1[0])
content1 = page1.content

result2 = wikipedia.search("Data Mining")


page2= wikipedia.page(result2[0])
content2 = page2.content

b4stop1= content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1=[w for w in b4stop1 if not w in stop_words]

doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()

file1=open("doc1.doc","r")
d1=file1.read()
w1=d1.split()

b4stop2= content2.split()
stop_words = nltk.corpus.stopwords.words('english')
words2=[w for w in b4stop2 if not w in stop_words]

doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
import pandas as pd

file2=open("doc2.doc","r")
d2=file2.read()
w2=d2.split()

# Map row index to word for document 1
rownum1 = []
rowword1 = []
for i in range(len(w1)):
    rownum1.append(i)
    rowword1.append(w1[i])
dict1 = {}
for i in range(len(w1)):
    dict1[rownum1[i]] = rowword1[i]

df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['Words'])

from wordcloud import WordCloud, STOPWORDS


import matplotlib.pyplot as plt
# Join the words of document 1 into one string for the word cloud
text = " ".join(df1.Words)
wordcloud = WordCloud(
    width=3000,
    height=2000,
    background_color='black',
    stopwords=STOPWORDS).generate(text)
fig = plt.figure(
    figsize=(40, 30),
    facecolor='k',
    edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

# Map row index to word for document 2
rownum2 = []
rowword2 = []
for i in range(len(w2)):
    rownum2.append(i)
    rowword2.append(w2[i])
dict2 = {}
for i in range(len(w2)):
    dict2[rownum2[i]] = rowword2[i]

df2 = pd.DataFrame.from_dict(dict2, orient='index', columns=['Words'])

from wordcloud import WordCloud, STOPWORDS


import matplotlib.pyplot as plt
# Join the words of document 2 into one string for the word cloud
text = " ".join(df2.Words)
wordcloud = WordCloud(
    width=3000,
    height=2000,
    background_color='black',
    stopwords=STOPWORDS).generate(text)
fig = plt.figure(
    figsize=(40, 30),
    facecolor='k',
    edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

OUTPUT:
CODE:
print('Original sentence is')
sen1=('The quick brown fox jumps over the lazy dog')
sen1_w=sen1.split()
print(sen1)

from nltk.corpus import wordnet


syn1a = []
for x in wordnet.synsets("quick"):
    for l in x.lemmas():
        syn1a.append(l.name())
print('\nParaphrased sentences are:')

sen1_w[1]=syn1a[2]

print(' '.join(sen1_w))

sen1_w[1]=syn1a[5]

print(' '.join(sen1_w))

sen1_w[1]=syn1a[6]

print(' '.join(sen1_w))

print('\nOriginal sentence is')


sen1=('Obama and Putin met the previous week')
sen1_w=sen1.split()
print(sen1)

syn1a = []
for x in wordnet.synsets("previous"):
    for l in x.lemmas():
        syn1a.append(l.name())

print('\nParaphrased sentences are:')

sen1_w[5]=syn1a[2]

print(' '.join(sen1_w))

syn1a = []
for x in wordnet.synsets("week"):
    for l in x.lemmas():
        syn1a.append(l.name())

sen1_w[6]=syn1a[2]

print(' '.join(sen1_w))
sen1_w[6]=syn1a[5]

print(' '.join(sen1_w))

print('\nOriginal sentence is')


sen1=('At least 12 people were killed in the battle last week')
sen1_w=sen1.split()
print(sen1)

syn1a = []
for x in wordnet.synsets("battle"):
    for l in x.lemmas():
        syn1a.append(l.name())

print('\nParaphrased sentences are:')

sen1_w[8]=syn1a[2]

print(' '.join(sen1_w))

sen1_w[8]=syn1a[10]

print(' '.join(sen1_w))

sen1_w[8]=syn1a[6]

print(' '.join(sen1_w))

print('\nOriginal sentence is')


sen1=('I will go home and come back tomorrow')
sen1_w=sen1.split()
print(sen1)

syn1a = []
for x in wordnet.synsets("come"):
    for l in x.lemmas():
        syn1a.append(l.name())

print('\nParaphrased sentences are:')

sen1_w[5]=syn1a[9]

print(' '.join(sen1_w))

sen1_w[5]=syn1a[8]

print(' '.join(sen1_w))

syn1a = []
for x in wordnet.synsets("home"):
    for l in x.lemmas():
        syn1a.append(l.name())

sen1_w[3] = syn1a[2]

print(' '.join(sen1_w))
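
The synonym indices used above were picked by hand after inspecting WordNet's output. As a small aid (the helper name list_synonyms is my own, not part of the original code), the sketch below prints every lemma WordNet returns for a word, which is how such an index can be chosen:

from nltk.corpus import wordnet

def list_synonyms(word):
    # Enumerate every lemma of every synset so a replacement index can be chosen by eye
    names = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            names.append(lemma.name())
    for i, name in enumerate(names):
        print(i, name)
    return names

list_synonyms("battle")
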
OUTPUT:
