Assessment - 1: CSE 3024: Web Mining, Slot: L51 + L52
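CODE:
The excerpt below uses content, nltk and pd without defining them; a minimal
sketch of the assumed setup (the page title is a placeholder, and the NLTK
resources need a one-time download):
import nltk
import pandas as pd
import wikipedia
# nltk.download('stopwords'); nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
content = wikipedia.page("Web mining").content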
f1 = open("f1.txt", "w")
for x in content:
    f1.write(x)
f1.close()
f1 = open("f1.txt", "r")
d1 = f1.read()
w1 = d1.split()
stop_words = nltk.corpus.stopwords.words('english')
stop1 = [w for w in w1 if w not in stop_words]
# Term frequency of each term after stop word removal
count1 = dict()
for x in w1:
    if x not in stop_words:
        count1[x] = count1.get(x, 0) + 1
print(count1)
# The special characters below are not in NLTK's English stop word list by
# default (remove() would raise ValueError), so they are assumed to have been
# appended in an earlier step and are removed only if present.
for ch in ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '-', '=', '+']:
    if ch in stop_words:
        stop_words.remove(ch)
sen_list = []
sentences = nltk.tokenize.sent_tokenize(d1)
sen_stop_words = []
count = 0
for x in sentences:
    y = x.split()
    sen_list.append(y)
    count = count + 1
print(count)
# POS-tag the stop words of each sentence (97 sentences in the source run)
for x in range(len(sen_list)):
    sen_stop_words = []
    for y in sen_list[x]:
        if y in stop_words:
            sen_stop_words.append(y)
    tagged = nltk.pos_tag(sen_stop_words)
    df1 = pd.DataFrame(tagged, columns=['Stop-Word', 'pos-tag'])
    print(sentences[x])
    print(df1)
OUTPUT:
1) Term frequency of each term after stop word removal.
2) The total number of terms after stop word removal, and the new stop word list including special characters.
CODE:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 2 13:32:52 2020
@author: nikitha
"""
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia
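# Assumed setup (not shown in the excerpt): content1 and content2 hold the
# raw text of the two Wikipedia pages being compared; the page titles below
# are placeholders.
content1 = wikipedia.page("Data mining").content
content2 = wikipedia.page("Web mining").content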
b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if w not in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()
# Extend the stop word list with special characters
for ch in ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '-', '=', '+']:
    stop_words.append(ch)
b4stop2 = content2.split()
# Reloading stop_words here (as the original did) would discard the special
# characters just appended, so the augmented list is reused as-is.
words2 = [w for w in b4stop2 if w not in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
import pandas as pd
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
# Combined term list of both documents (4086 terms in the source run)
w3 = w1 + w2
# Boolean representation: 1 if a term occurs in a document, else 0
values1 = []
values2 = []
for x in range(len(w3)):
    if w3[x] in w1:
        val1 = 1
    else:
        val1 = 0
    values1.append(val1)
    if w3[x] in w2:
        val2 = 1
    else:
        val2 = 0
    values2.append(val2)
booldata={'Term': w3,'Document1': values1,'Document2': values2}
dfbool = pd.DataFrame(booldata)
print(dfbool)
count1 = dict()
for x in w1:
    if x not in stop_words:
        count1[x] = count1.get(x, 0) + 1
count2 = dict()
for x in w2:
    if x not in stop_words:
        count2[x] = count2.get(x, 0) + 1
# Bag of Words representation: term frequency per document
bag1 = []
bag2 = []
print(len(w3))
for x in range(len(w3)):
    if w3[x] in w1:
        bag1.append(count1[w3[x]])
    else:
        bag1.append(0)
for x in range(len(w3)):
    if w3[x] in w2:
        bag2.append(count2[w3[x]])
    else:
        bag2.append(0)
# Complete representation: positions of each term within each document
com1 = []
com2 = []
for a in range(len(w3)):
    if w3[a] in w1:
        indices = [i for i, x in enumerate(w1) if x == w3[a]]
        com1.append(indices)
    else:
        com1.append('NA')
for a in range(len(w3)):
    if w3[a] in w2:
        # Bug fix: the original enumerated w1 here, so Document 2's
        # positions were taken from Document 1
        indices = [i for i, x in enumerate(w2) if x == w3[a]]
        com2.append(indices)
    else:
        com2.append('NA')
comdata={'Term': w3,'Document1': com1,'Document2': com2}
dfcom=pd.DataFrame(comdata)
print(dfcom)
search = ('The term data mining is a misnomer, because the goal is the '
          'extraction of patterns and knowledge from large amounts of data, '
          'not the extraction mining of data itself')
search_w = search.split()
print(search)
# Score each document by the total frequency of the query terms it contains
fre1 = 0
fre2 = 0
for x in search_w:
    if x in w1:
        fre1 = fre1 + count1[x]
    if x in w2:
        fre2 = fre2 + count2[x]
if fre1 > fre2:
    print('Document 1 is more similar to search query')
else:
    print('Document 2 is more similar to search query')
OUTPUT:
1) Boolean representation
2) Bag of Words representation
3) Complete representation
4) Search Query
CODE:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 2 11:16:20 2020
@author: nikitha
"""
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia
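# Assumed setup (not shown in the excerpt), as in the previous script:
# content1 and content2 hold the raw text of two Wikipedia pages; the page
# titles below are placeholders.
content1 = wikipedia.page("Data mining").content
content2 = wikipedia.page("Web mining").content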
b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if w not in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()
b4stop2 = content2.split()
stop_words = nltk.corpus.stopwords.words('english')
words2 = [w for w in b4stop2 if w not in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
import pandas as pd
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
# Row-number/word table for each document (1743 terms in doc1 and 2343 in
# doc2 in the source run)
rownum1 = []
rowword1 = []
for i in range(len(w1)):
    rownum1.append(i)
    rowword1.append(w1[i])
dict1 = {}
for i in range(len(w1)):
    dict1[rownum1[i]] = rowword1[i]
df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['Words'])
rownum2 = []
rowword2 = []
for i in range(len(w2)):
    rownum2.append(i)
    rowword2.append(w2[i])
dict2 = {}
# Bug fix: the original iterated range(1743) here, truncating doc2's table
for i in range(len(w2)):
    dict2[rownum2[i]] = rowword2[i]
df2 = pd.DataFrame.from_dict(dict2, orient='index', columns=['Words'])
OUTPUT:
CODE:
print('Original sentence is')
sen1=('The quick brown fox jumps over the lazy dog')
sen1_w=sen1.split()
print(sen1)
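# Assumed setup (not shown in the excerpt): syn1a is used below before it is
# built, so it presumably held the WordNet synonyms of the word being
# replaced ("quick" at position 1); wordnet also needs importing and a
# one-time nltk.download('wordnet').
from nltk.corpus import wordnet
syn1a = []
for x in wordnet.synsets("quick"):
    for l in x.lemmas():
        syn1a.append(l.name())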
sen1_w[1]=syn1a[2]
print(' '.join(sen1_w))
sen1_w[1]=syn1a[5]
print(' '.join(sen1_w))
sen1_w[1]=syn1a[6]
print(' '.join(sen1_w))
syn1a = []
for x in wordnet.synsets("previous"):
    for l in x.lemmas():
        syn1a.append(l.name())
sen1_w[5]=syn1a[2]
print(' '.join(sen1_w))
syn1a = []
for x in wordnet.synsets("week"):
    for l in x.lemmas():
        syn1a.append(l.name())
sen1_w[6]=syn1a[2]
print(' '.join(sen1_w))
sen1_w[6]=syn1a[5]
print(' '.join(sen1_w))
syn1a = []
for x in wordnet.synsets("battle"):
    for l in x.lemmas():
        syn1a.append(l.name())
sen1_w[8]=syn1a[2]
print(' '.join(sen1_w))
sen1_w[8]=syn1a[10]
print(' '.join(sen1_w))
sen1_w[8]=syn1a[6]
print(' '.join(sen1_w))
syn1a = []
for x in wordnet.synsets("come"):
    for l in x.lemmas():
        syn1a.append(l.name())
sen1_w[5]=syn1a[9]
print(' '.join(sen1_w))
sen1_w[5]=syn1a[8]
print(' '.join(sen1_w))
syn1a = []
for x in wordnet.synsets("home"):
    for l in x.lemmas():
        syn1a.append(l.name())
sen1_w[3]=syn1a[2]
print(' '.join(sen1_w))
OUTPUT: