
Assessment - 1

CSE 3024: Web Mining Slot: L51 + L52


Fundamentals of NLP and Crawling
REG NO : 18BCE0457

NAME: K MARY NIKITHA


CODE:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia
result = wikipedia.search("Web Mining")
page = wikipedia.page(result[0])
content = page.content

# Write the page content to a file, then read it back and tokenize on whitespace
f1 = open("f1.txt", "w")
f1.write(content)
f1.close()

f1 = open("f1.txt", "r")
d1 = f1.read()
w1 = d1.split()

# Remove stop words and count the frequency of each remaining term
stop_words = nltk.corpus.stopwords.words('english')
stop1 = [w for w in w1 if w not in stop_words]
count1 = dict()
for x in w1:
    if x not in stop_words:
        count1[x] = count1.get(x, 0) + 1
print(count1)

print("\nThe Total number of words are:",len(stop1))


print("\nLength of stop list before spl characters", len(stop_words))
stop_words.append('!')
stop_words.append('@')
stop_words.append('#')
stop_words.append('$')
stop_words.append('%')
stop_words.append('^')
stop_words.append('&')
stop_words.append('*')
stop_words.append('(')
stop_words.append(')')
stop_words.append('_')
stop_words.append('-')
stop_words.append('=')
stop_words.append('+')
print("Length of stop list after spl characters", len(stop_words))
stop1=[w for w in w1 if not w in stop_words]
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

tokens = stop1
stemmedlist = []
lemmalist = []
# Stem and lemmatize every token so the two normalizations can be compared
for token in tokens:
    stemmedlist.append(stemmer.stem(token))
    lemmalist.append(lemmatizer.lemmatize(token))

import pandas as pd
data = {'Original Term': tokens, 'Stemmed Term': stemmedlist, 'Lemmatized Term': lemmalist}
df = pd.DataFrame(data)
print(df)
uniquestem = []
uniquelemma = []
for x in stemmedlist:
    if x not in uniquestem:
        uniquestem.append(x)
for x in lemmalist:
    if x not in uniquelemma:
        uniquelemma.append(x)

print("\nTotal number of unique stemmed words",len(uniquestem))

print("Total number of unique lemmatized words",len(uniquelemma))

# Restore the original stop word list before POS tagging
for c in special_chars:
    stop_words.remove(c)

# Split the document into sentences, then tokenize each sentence on whitespace
sen_list = []
sentences = nltk.tokenize.sent_tokenize(d1)
count = 0
for x in sentences:
    sen_list.append(x.split())
    count = count + 1
print(count)
# For every sentence, collect its stop words and POS-tag them
for x in range(len(sentences)):
    sen_stop_words = []
    for y in sen_list[x]:
        if y in stop_words:
            sen_stop_words.append(y)
    tagged = nltk.pos_tag(sen_stop_words)
    df1 = pd.DataFrame(tagged, columns=['Stop-Word', 'pos-tag'])
    print(sentences[x])
    print(df1)

OUTPUT:

1) Term frequency of each term after stop word removal
2) Total number of terms after stop word removal, and the new stop word list including special characters
3) Stemming and lemmatization (see the sketch after this list)
4) POS tags, sentence-wise
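
To make the stemming vs. lemmatization contrast in item 3 concrete, here is a minimal standalone sketch (the sample words are my own choice, not taken from the Wikipedia page) showing where the Snowball stemmer and the WordNet lemmatizer disagree:

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

# Sample words chosen to show typical differences between the two normalizations
samples = ['mining', 'studies', 'crawled', 'communities', 'better']
for w in samples:
    # The stemmer chops suffixes heuristically; the lemmatizer maps to a dictionary form
    print(w, '->', stemmer.stem(w), '|', lemmatizer.lemmatize(w))
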
CODE:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 2 13:32:52 2020

@author: nikitha
"""

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia

result1 = wikipedia.search("Web Mining")


page1= wikipedia.page(result1[0])
content1 = page1.content

result2 = wikipedia.search("Data Mining")


page2= wikipedia.page(result2[0])
content2 = page2.content

b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if w not in stop_words]

# Write the stop-word-filtered terms of document 1 to a file
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()

file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

# Extend the stop word list with special characters before filtering document 2
stop_words.extend(['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '-', '=', '+'])

b4stop2 = content2.split()
words2 = [w for w in b4stop2 if w not in stop_words]

doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()

import pandas as pd

file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
w3 = w1 + w2
# Boolean (presence/absence) representation over the combined vocabulary w3
values1 = []
values2 = []
for x in range(len(w3)):
    values1.append(1 if w3[x] in w1 else 0)
    values2.append(1 if w3[x] in w2 else 0)
booldata = {'Term': w3, 'Document1': values1, 'Document2': values2}
dfbool = pd.DataFrame(booldata)
print(dfbool)

# Term frequency counts (w1 and w2 are already stop-word filtered)
count1 = dict()
for x in w1:
    count1[x] = count1.get(x, 0) + 1
count2 = dict()
for x in w2:
    count2[x] = count2.get(x, 0) + 1

# Bag of Words representation: frequency of each vocabulary term in each document
bag1 = []
bag2 = []
print(len(w3))
for x in range(len(w3)):
    if w3[x] in w1:
        bag1.append(count1[w3[x]])
    else:
        bag1.append(0)
    if w3[x] in w2:
        bag2.append(count2[w3[x]])
    else:
        bag2.append(0)

bagdata={'Term': w3,'Document1': bag1,'Document2': bag2}


dfbag=pd.DataFrame(bagdata)
print(dfbag)

# Complete representation: positions (indices) of each term within each document
com1 = []
com2 = []
for a in range(len(w3)):
    if w3[a] in w1:
        com1.append([i for i, x in enumerate(w1) if x == w3[a]])
    else:
        com1.append('NA')

for a in range(len(w3)):
    if w3[a] in w2:
        com2.append([i for i, x in enumerate(w2) if x == w3[a]])
    else:
        com2.append('NA')

comdata = {'Term': w3, 'Document1': com1, 'Document2': com2}
dfcom = pd.DataFrame(comdata)
print(dfcom)
search = ('The term data mining is a misnomer, because the goal is the extraction of '
          'patterns and knowledge from large amounts of data, not the extraction mining '
          'of data itself')
search_w = search.split()

print(search)

# Score each document by the summed frequency of the query terms it contains
fre1 = 0
fre2 = 0
for x in search_w:
    if x in w1:
        fre1 = fre1 + count1[x]
    if x in w2:
        fre2 = fre2 + count2[x]

if fre1 > fre2:
    print('Document 1 is more similar to search query')
else:
    print('Document 2 is more similar to search query')

OUTPUT:

1) Boolean representation
2) Bag of Words representation
3) Complete representation
4) Search query (a cosine-similarity sketch follows this list)
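
The search-query step above ranks the two documents by summing the raw frequencies of the query terms. As an aside, a common alternative is cosine similarity between term-frequency vectors; the sketch below is self-contained (the toy documents and query are made up for illustration, not taken from the Wikipedia pages):

import math
from collections import Counter

def cosine_similarity(query_tokens, doc_tokens):
    # Build term-frequency vectors over the union of terms and compare their angle
    q = Counter(query_tokens)
    d = Counter(doc_tokens)
    terms = set(q) | set(d)
    dot = sum(q[t] * d[t] for t in terms)
    norm_q = math.sqrt(sum(v * v for v in q.values()))
    norm_d = math.sqrt(sum(v * v for v in d.values()))
    return dot / (norm_q * norm_d) if norm_q and norm_d else 0.0

doc1_tokens = "web mining extracts patterns from web data".split()
doc2_tokens = "data mining extracts knowledge from large amounts of data".split()
query_tokens = "extraction of patterns and knowledge from data".split()

print('Document 1 similarity:', cosine_similarity(query_tokens, doc1_tokens))
print('Document 2 similarity:', cosine_similarity(query_tokens, doc2_tokens))
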

CODE:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 2 11:16:20 2020

@author: nikitha
"""

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia

result1 = wikipedia.search("Web Mining")


page1= wikipedia.page(result1[0])
content1 = page1.content

result2 = wikipedia.search("Data Mining")


page2= wikipedia.page(result2[0])
content2 = page2.content

b4stop1= content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1=[w for w in b4stop1 if not w in stop_words]

doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()

file1=open("doc1.doc","r")
d1=file1.read()
w1=d1.split()

b4stop2= content2.split()
stop_words = nltk.corpus.stopwords.words('english')
words2=[w for w in b4stop2 if not w in stop_words]

doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
import pandas as pd

file2=open("doc2.doc","r")
d2=file2.read()
w2=d2.split()

# Map row index to word for document 1
rownum1 = []
rowword1 = []
for i in range(len(w1)):
    rownum1.append(i)
    rowword1.append(w1[i])
dict1 = {}
for i in range(len(w1)):
    dict1[rownum1[i]] = rowword1[i]

df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['Words'])

from wordcloud import WordCloud, STOPWORDS


import matplotlib.pyplot as plt
# Join the words of document 1 into one string for the word cloud
text = " ".join(df1.Words)
wordcloud = WordCloud(
    width=3000,
    height=2000,
    background_color='black',
    stopwords=STOPWORDS).generate(text)
fig = plt.figure(
    figsize=(40, 30),
    facecolor='k',
    edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

# Map row index to word for document 2
rownum2 = []
rowword2 = []
for i in range(len(w2)):
    rownum2.append(i)
    rowword2.append(w2[i])
dict2 = {}
for i in range(len(w2)):
    dict2[rownum2[i]] = rowword2[i]

df2 = pd.DataFrame.from_dict(dict2, orient='index', columns=['Words'])

from wordcloud import WordCloud, STOPWORDS


import matplotlib.pyplot as plt
# Join the words of document 2 into one string for the word cloud
text = " ".join(df2.Words)
wordcloud = WordCloud(
    width=3000,
    height=2000,
    background_color='black',
    stopwords=STOPWORDS).generate(text)
fig = plt.figure(
    figsize=(40, 30),
    facecolor='k',
    edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

OUTPUT:
CODE:
print('Original sentence is')
sen1=('The quick brown fox jumps over the lazy dog')
sen1_w=sen1.split()
print(sen1)

from nltk.corpus import wordnet


syn1a = []
for x in wordnet.synsets("quick"):
    for l in x.lemmas():
        syn1a.append(l.name())
print('\nParaphrased sentences are:')

sen1_w[1]=syn1a[2]

print(' '.join(sen1_w))

sen1_w[1]=syn1a[5]

print(' '.join(sen1_w))

sen1_w[1]=syn1a[6]

print(' '.join(sen1_w))

print('\nOriginal sentence is')


sen1=('Obama and Putin met the previous week')
sen1_w=sen1.split()
print(sen1)

syn1a = []
for x in wordnet.synsets("previous"):
    for l in x.lemmas():
        syn1a.append(l.name())

print('\nParaphrased sentences are:')

sen1_w[5]=syn1a[2]

print(' '.join(sen1_w))

syn1a = []
for x in wordnet.synsets("week"):
    for l in x.lemmas():
        syn1a.append(l.name())

sen1_w[6]=syn1a[2]

print(' '.join(sen1_w))
sen1_w[6]=syn1a[5]

print(' '.join(sen1_w))

print('\nOriginal sentence is')


sen1=('At least 12 people were killed in the battle last week')
sen1_w=sen1.split()
print(sen1)

syn1a = []
for x in wordnet.synsets("battle"):
    for l in x.lemmas():
        syn1a.append(l.name())

print('\nParaphrased sentences are:')

sen1_w[8]=syn1a[2]

print(' '.join(sen1_w))

sen1_w[8]=syn1a[10]

print(' '.join(sen1_w))

sen1_w[8]=syn1a[6]

print(' '.join(sen1_w))

print('\nOriginal sentence is')


sen1=('I will go home and come back tomorrow')
sen1_w=sen1.split()
print(sen1)

syn1a = []
for x in wordnet.synsets("come"):
    for l in x.lemmas():
        syn1a.append(l.name())

print('\nParaphrased sentences are:')

sen1_w[5]=syn1a[9]

print(' '.join(sen1_w))

sen1_w[5]=syn1a[8]

print(' '.join(sen1_w))

syn1a = []
for x in wordnet.synsets("home"):
    for l in x.lemmas():
        syn1a.append(l.name())

sen1_w[3] = syn1a[2]

print(' '.join(sen1_w))
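
The synonym indices used above were picked by hand after inspecting WordNet's output. As a small aid (the helper name list_synonyms is my own, not part of the original code), the sketch below prints every lemma WordNet returns for a word, which is how such an index can be chosen:

from nltk.corpus import wordnet

def list_synonyms(word):
    # Enumerate every lemma of every synset so a replacement index can be chosen by eye
    names = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            names.append(lemma.name())
    for i, name in enumerate(names):
        print(i, name)
    return names

list_synonyms("battle")
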
OUTPUT:
