K MARY NIKITHA
b) Develop the crawler program to handle the various challenges (such as parsing, stemming,
lemmatization, link extraction, canonicalization, spider traps, etc.) faced by the crawler
during implementation.
c) Based on the contents retrieved, prepare one inverted index file (with proper representation).
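For example, spider traps and non-canonical URLs can be guarded against with a visited set, a depth cap, and urljoin. A minimal sketch (the depth limit and helper names here are illustrative, not part of the submitted program):

import urllib.parse

MAX_DEPTH = 3                      # illustrative cap against spider traps

def canonicalize(base, href):
    # Resolve relative links, drop fragments, normalize scheme/host case.
    absolute = urllib.parse.urljoin(base, href)
    parts = urllib.parse.urlsplit(absolute)
    return urllib.parse.urlunsplit((parts.scheme.lower(), parts.netloc.lower(),
                                    parts.path or '/', parts.query, ''))

def should_visit(url, visited, depth):
    # Refuse revisits and overly deep paths -- both classic trap symptoms.
    return url not in visited and depth <= MAX_DEPTH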
CODE:
import concurrent.futures
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

f = open('InvertedIndex.txt', 'w')
MAX_THREADS = 30
visited = set()            # guards against revisiting pages (spider traps)

def download_url(url):
    print(url)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'html.parser')   # parsing
    time.sleep(0.25)       # politeness delay between requests
    for link in soup.find_all('a'):                     # link extraction
        try:
            href = link.get("href")
            if href is None or href in visited:
                continue
            visited.add(href)
            if href.startswith("http"):
                f.write(href + '\n')
            else:
                # canonicalization: resolve relative links against the page URL
                f.write(urljoin(url, href) + '\n')
        except Exception:
            pass

def download_stories(story_urls):
    # Crawl pages concurrently, capping the pool at MAX_THREADS.
    threads = min(MAX_THREADS, len(story_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(download_url, story_urls)

def main(story_urls):
    download_stories(story_urls)

urls = ["http://toscrape.com/"]
main(urls)
f.close()
# Build an inverted index over the crawled URLs saved above.
file = open('InvertedIndex.txt', 'r')
read = file.read()
file.seek(0)

line = 1
for word in read:
    if word == '\n':
        line += 1

array = []
for i in range(line):
    array.append(file.readline())

# Strip punctuation and lowercase the text before tokenizing.
punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
for ele in read:
    if ele in punc:
        read = read.replace(ele, " ")
read = read.lower()

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text_tokens = word_tokenize(read)
tokens_without_sw = [w for w in text_tokens if w not in stopwords.words('english')]

# Inverted index: term -> posting list of line numbers containing it.
index = {}
for item in tokens_without_sw:
    for i in range(line):
        check = array[i].lower()
        if item in check:
            if item not in index:
                index[item] = []
            index[item].append(i+1)

for x in index:
    print(x, ":", index[x])
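# Hypothetical sample of the printed representation (term -> posting list
# of line numbers in InvertedIndex.txt); actual terms depend on the crawl:
#   scrape : [1, 3]
#   books : [2]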
OUTPUT:
CODE:
import math

N = int(input())          # value to encode
M = int(input())          # Golomb parameter (divisor)
q = N // M
r = N % M
quo = '1' * q + '0'       # unary part: q ones terminated by a zero (matches the decoder in part b)
b = math.floor(math.log2(M))
k = 2 ** (b + 1) - M      # remainders below k take b bits, the rest b+1 bits
if r < k:
    rem = bin(r)[2:]
    l = len(rem)
    if l < b:
        rem = '0' * (b - l) + rem
else:
    rem = bin(r + k)[2:]
    l = len(rem)
    if l < b + 1:
        rem = '0' * (b + 1 - l) + rem
golomb_code = quo + rem
print(golomb_code)
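# Worked check with this convention (illustrative): N = 7, M = 3 gives
# q = 2, r = 1, b = 1, k = 1; since r >= k, rem = bin(1 + 1) = '10',
# so golomb_code = '110' + '10' = '11010'.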
OUTPUT:
b) Decode the Golomb encoded sequence 1111111110010001101 with b = 10.
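As a hand-check of the first codeword (using the unary-ones convention from part a): with b = 10, i = floor(log2 10) = 3 and d = 2^(3+1) - 10 = 6. The nine leading 1s give quotient q = 9; the 0 terminates the unary part, and the next three bits 010 encode remainder 2, which is below 6, so no extra bit is read. The first decoded value is therefore 9 * 10 + 2 = 92, and decoding continues from the following bit.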
CODE:
import math

def decode(x):
    # Interpret a list of bits (most significant first) as an integer.
    num = 0
    for i in range(len(x)):
        num += int(x[len(x) - 1 - i]) * (2 ** i)
    return num

x = str(input())
x = list(x)
print("Enter base")
b = int(input())
i = math.floor(math.log(b, 2))
d = 2 ** (i + 1) - b          # remainders below d were written with i bits

p2 = 0
while p2 < len(x):
    t = 0
    flag = 0
    r = []
    k = i
    q = 0
    rnum = 0
    for p in range(p2, len(x)):
        if flag == 0 and x[p] == '1':   # still inside the unary prefix
            t += 1
            continue
        if flag == 0 and x[p] == '0':   # zero terminates the unary prefix
            q = t
            flag = 1
            continue
        r.append(x[p])                  # collect remainder bits
        k -= 1
        if k == 0:
            rnum = decode(r)
            if rnum < d:                # i bits were enough
                p2 = p + 1
                break
        if k == -1:                     # needed the (i+1)-th bit
            rnum = decode(r)
            rnum = rnum - d
            p2 = p + 1
            break
    else:
        break                           # ran out of bits mid-codeword
    ans = q * b + rnum
    print(int(ans))
OUTPUT:
3. Write a python program to extract the contents (excluding any tags) from two websites:
https://en.wikipedia.org/wiki/Web_mining
https://en.wikipedia.org/wiki/Data_mining
Save the content in two separate files. Construct a trie based on the content retrieved, using a
HashMap / B-Tree / Dictionary. Write a program to show the implementation of Predictive Typing
and Auto-Correct using the trie prepared.
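For orientation, a minimal dictionary-based sketch of the trie idea (toy word list; names are illustrative, not taken from the programs below): each node is a plain dict mapping a character to its child, a '$' key marks word ends, and prediction walks the prefix and collects every terminal beneath it.

trie = {}
for word in ["data", "datum", "mining"]:
    node = trie
    for ch in word:
        node = node.setdefault(ch, {})
    node['$'] = True          # end-of-word marker

def complete(prefix):
    node = trie
    for ch in prefix:
        if ch not in node:
            return []
        node = node[ch]
    out = []
    def walk(n, path):
        if '$' in n:
            out.append(prefix + path)
        for c, child in n.items():
            if c != '$':
                walk(child, path + c)
    walk(node, '')
    return out

print(complete("dat"))        # -> ['data', 'datum']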
1) Predictive typing
CODE:
#!/usr/bin/env python3
"""
@author: nikitha
"""
import nltk
import wikipedia

# Fetch the two pages named in the problem statement.
result1 = wikipedia.search("Web mining")
page1 = wikipedia.page(result1[0])
content1 = page1.content
result2 = wikipedia.search("Data mining")
page2 = wikipedia.page(result2[0])
content2 = page2.content

# Stopword removal for page 1, saved to doc1.doc.
b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if w not in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

# Stopword removal for page 2, saved to doc2.doc.
b4stop2 = content2.split()
words2 = [w for w in b4stop2 if w not in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
w3 = w1 + w2          # combined vocabulary used to build the trie

class TrieNode():
    def __init__(self):
        self.children = {}
        self.last = False      # True if a word ends at this node

class Trie():
    def __init__(self):
        self.root = TrieNode()
        self.word_list = []

    def formTrie(self, keys):
        for key in keys:
            self.insert(key)

    def insert(self, key):
        node = self.root
        for a in list(key):
            if not node.children.get(a):
                node.children[a] = TrieNode()
            node = node.children[a]
        node.last = True

    def search(self, key):
        node = self.root
        found = True
        for a in list(key):
            if not node.children.get(a):
                found = False
                break
            node = node.children[a]
        return node and node.last and found

    def suggestionsRec(self, node, word):
        # Depth-first walk collecting every complete word below this node.
        if node.last:
            self.word_list.append(word)
        for a, n in node.children.items():
            self.suggestionsRec(n, word + a)

    def printAutoSuggestions(self, key):
        node = self.root
        not_found = False
        temp_word = ''
        for a in list(key):
            if not node.children.get(a):
                not_found = True
                break
            temp_word += a
            node = node.children[a]
        if not_found:
            return 0              # no word in the trie starts with this prefix
        elif node.last and not node.children:
            return -1             # the prefix itself is a word with no extensions
        self.suggestionsRec(node, temp_word)
        for s in self.word_list:
            print(s)
        return 1

keys = w3
key = input()
t = Trie()
t.formTrie(keys)
comp = t.printAutoSuggestions(key)
if comp == -1:
    print("No other strings found with this prefix")
elif comp == 0:
    print("No string found with this prefix")
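# Hypothetical run: for the prefix "min" the program would list completions
# such as "mining" that occur in the fetched page vocabulary.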
OUTPUT:
2) Auto-Correct
CODE:
#!/usr/bin/env python3
"""
@author: nikitha
"""
import nltk
import wikipedia

# Same preprocessing as in the predictive-typing program.
result1 = wikipedia.search("Web mining")
page1 = wikipedia.page(result1[0])
content1 = page1.content
result2 = wikipedia.search("Data mining")
page2 = wikipedia.page(result2[0])
content2 = page2.content

b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if w not in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

b4stop2 = content2.split()
words2 = [w for w in b4stop2 if w not in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
w3 = w1 + w2
import sys

class Node:
    '''
    One trie node: a character, a dict of children, and an end-of-word flag.
    '''
    def __init__(self, value):
        self.value = value
        self.children = dict()
        self.end = False

    def __getitem__(self, key):
        if key in self.children:
            return self.children[key]
        return None

    def __setitem__(self, key, value):
        self.children[key] = value

    def __contains__(self, key):
        return key in self.children

    def __str__(self):
        return str(self.value)

class Trie:
    def __init__(self):
        self.root = Node('')

    def add(self, word):
        '''
        Insert a word, creating child nodes as needed.
        '''
        word = word.strip()
        n = self.root
        for l in word:
            if l in n:
                nxt = n[l]
                n = nxt
            else:
                n[l] = Node(l)
                n = n[l]
        n.end = True

    def __contains__(self, word):
        '''
        Exact lookup: follow each character and check the end flag.
        '''
        n = self.root
        for l in word:
            if l not in n:
                return False
            n = n[l]
        if n.end == True:
            return True
        return False

class SpellCheck:
    def __init__(self):
        '''
        Load the system word list into a trie.
        '''
        self.words = Trie()
        with open('/usr/share/dict/words', 'r') as f:
            for word in f:
                self.words.add(word)

    def spellcheck(self, word):
        # Try the word as typed, then lowercased, then a fuzzy trie walk.
        if word in self.words:
            return word
        word = word.lower()
        if word in self.words:
            return word
        return self._check(word, self.words.root, '')

    def _check(self, word, node, path):
        # Recursive correction: at each position try the exact letter, the
        # lowercased letter, and finally a substituted vowel.
        vowels = 'aeiou'
        if node is None:
            return None
        if word == '':
            if node.end == True:
                return path
            if node.end == False:
                return None
        ltr = word[0]
        if ltr in node:
            result = self._check(word[1:], node[ltr], path + ltr)
            if result:
                return result
        ltr = ltr.lower()
        if ltr in node:
            result = self._check(word[1:], node[ltr], path + ltr)
            if result:
                return result
        if ltr in vowels:
            for v in vowels:
                if v != ltr and v in node:
                    result = self._check(word[1:], node[v], path + v)
                    if result:
                        return result
        return None

if __name__ == '__main__':
    t = Trie()
    with open('/usr/share/dict/words', 'r') as f:
        for word in f:
            word = word.strip()
            t.add(word)
            try:
                assert (word in t)    # sanity check: every added word is findable
            except AssertionError:
                sys.exit(1)
    if '--test' in sys.argv:          # reconstructed guard: run doctests and exit
        s = SpellCheck()
        import doctest
        doctest.testmod(extraglobs={'s': s})
        sys.exit(0)
    s = SpellCheck()
    while True:
        word = input()
        print(s.spellcheck(word))
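# Hypothetical run (assuming "mining" is in the word list): the input
# "mEnIng" is first lowercased to "mening", then the vowel-substitution
# step can map e -> i to return "mining".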
OUTPUT:
4. Write a python program to extract the contents (excluding any tags) from the following five
websites:
https://en.wikipedia.org/wiki/Web_mining
https://en.wikipedia.org/wiki/Data_mining
https://en.wikipedia.org/wiki/Artificial_intelligence
https://en.wikipedia.org/wiki/Machine_learning
https://en.wikipedia.org/wiki/Mining
Refine the contents by applying stopword removal and lemmatization. Save the refined,
tokenized content in five separate files. Then, using a vector space model, perform the
following operations for the query “Mining large volume of data”.
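Before the full listing, a compact sketch of the same query-to-document cosine scoring done directly with scikit-learn (toy document strings for illustration; TfidfVectorizer and cosine_similarity are standard sklearn components, not part of the submitted code):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy stand-ins for the five cleaned documents.
docs = ["web mining extracts knowledge from the web",
        "data mining finds patterns in large volumes of data"]
query = "Mining large volume of data"

vec = TfidfVectorizer(stop_words='english')
doc_matrix = vec.fit_transform(docs)        # one L2-normalized tf-idf row per document
query_vec = vec.transform([query])          # project the query into the same space
print(cosine_similarity(query_vec, doc_matrix))  # one cosine score per document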
CODE:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 25 17:11:56 2020
@author: nikitha
"""
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Fetch all five pages named in the problem statement.
result1 = wikipedia.search("Web mining")
page1 = wikipedia.page(result1[0])
content1 = page1.content
result2 = wikipedia.search("Data mining")
page2 = wikipedia.page(result2[0])
content2 = page2.content
result3 = wikipedia.search("Artificial intelligence")
page3 = wikipedia.page(result3[0])
content3 = page3.content
result4 = wikipedia.search("Machine learning")
page4 = wikipedia.page(result4[0])
content4 = page4.content
result5 = wikipedia.search("Mining")
page5 = wikipedia.page(result5[0])
content5 = page5.content
stop_words = nltk.corpus.stopwords.words('english')
# Treat common punctuation/symbols (and 'being') as stopwords too.
stop_words.extend(['@', ',', '#', '$', '%', '^', '&', '*', '(', ')',
                   '_', '-', '=', '.', 'being'])
# Stopword removal for each page; save the tokens to doc1.doc ... doc5.doc.
b4stop1 = content1.split()
words1 = [w for w in b4stop1 if w not in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()
b4stop2 = content2.split()
words2 = [w for w in b4stop2 if w not in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
b4stop3 = content3.split()
words3 = [w for w in b4stop3 if w not in stop_words]
doc3 = open("doc3.doc", "w")
for x in words3:
    doc3.write(x)
    doc3.write(" ")
doc3.close()
file3 = open("doc3.doc", "r")
d3 = file3.read()
w3 = d3.split()
b4stop4 = content4.split()
words4 = [w for w in b4stop4 if w not in stop_words]
doc4 = open("doc4.doc", "w")
for x in words4:
    doc4.write(x)
    doc4.write(" ")
doc4.close()
file4 = open("doc4.doc", "r")
d4 = file4.read()
w4 = d4.split()
b4stop5 = content5.split()
words5 = [w for w in b4stop5 if w not in stop_words]
doc5 = open("doc5.doc", "w")
for x in words5:
    doc5.write(x)
    doc5.write(" ")
doc5.close()
file5 = open("doc5.doc", "r")
d5 = file5.read()
w5 = d5.split()
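# The task also asks for lemmatization; a minimal extra pass over the token
# lists (a sketch -- assumes nltk.download('wordnet') has been run):
lemmatizer = WordNetLemmatizer()
w1 = [lemmatizer.lemmatize(t) for t in w1]
w2 = [lemmatizer.lemmatize(t) for t in w2]
w3 = [lemmatizer.lemmatize(t) for t in w3]
w4 = [lemmatizer.lemmatize(t) for t in w4]
w5 = [lemmatizer.lemmatize(t) for t in w5]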
words=w1+w2+w3+w4+w5
# Term frequency counts per document (stopwords already removed).
count1 = dict()
for x in w1:
    if x not in stop_words:
        count1[x] = count1.get(x, 0) + 1
count2 = dict()
for x in w2:
    if x not in stop_words:
        count2[x] = count2.get(x, 0) + 1
count3 = dict()
for x in w3:
    if x not in stop_words:
        count3[x] = count3.get(x, 0) + 1
count4 = dict()
for x in w4:
    if x not in stop_words:
        count4[x] = count4.get(x, 0) + 1
count5 = dict()
for x in w5:
    if x not in stop_words:
        count5[x] = count5.get(x, 0) + 1
# Bag-of-words vectors over the combined term list.
bag1 = []
bag2 = []
bag3 = []
bag4 = []
bag5 = []
for x in range(len(words)):
    if words[x] in w1:
        bag1.append(count1[words[x]])
    else:
        bag1.append(0)
for x in range(len(words)):
    if words[x] in w2:
        bag2.append(count2[words[x]])
    else:
        bag2.append(0)
for x in range(len(words)):
    if words[x] in w3:
        bag3.append(count3[words[x]])
    else:
        bag3.append(0)
for x in range(len(words)):
    if words[x] in w4:
        bag4.append(count4[words[x]])
    else:
        bag4.append(0)
for x in range(len(words)):
    if words[x] in w5:
        bag5.append(count5[words[x]])
    else:
        bag5.append(0)
import pandas as pd
bagdata = {'Term': words, 'Document1': bag1, 'Document2': bag2,
           'Document3': bag3, 'Document4': bag4, 'Document5': bag5}
dfbag=pd.DataFrame(bagdata)
print(dfbag)
# Term frequency: counts normalized by document length.
b1 = []
b2 = []
b3 = []
b4 = []
b5 = []
for x in range(len(words)):
    if words[x] in w1:
        b1.append(count1[words[x]] / len(w1))
    else:
        b1.append(0)
for x in range(len(words)):
    if words[x] in w2:
        b2.append(count2[words[x]] / len(w2))
    else:
        b2.append(0)
for x in range(len(words)):
    if words[x] in w3:
        b3.append(count3[words[x]] / len(w3))
    else:
        b3.append(0)
for x in range(len(words)):
    if words[x] in w4:
        b4.append(count4[words[x]] / len(w4))
    else:
        b4.append(0)
for x in range(len(words)):
    if words[x] in w5:
        b5.append(count5[words[x]] / len(w5))
    else:
        b5.append(0)
tfdata = {'Term': words, 'Document1': b1, 'Document2': b2,
          'Document3': b3, 'Document4': b4, 'Document5': b5}
dftf=pd.DataFrame(tfdata)
print(dftf)
s1 = " ".join(w1)
s2 = " ".join(w2)
s3 = " ".join(w3)
s4 = " ".join(w4)
s5 = " ".join(w5)
docs = [s1,s2,s3,s4,s5]
cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)
print(word_count_vector.shape)
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),
                      columns=["idf_weights"])
df_idf = df_idf.sort_values(by=['idf_weights'])   # common terms (low idf) first
print(df_idf)
count_vector = cv.transform(docs)
tf_idf_vector = tfidf_transformer.transform(count_vector)
print(tf_idf_vector.toarray())
# Query terms after stopword removal ("of" drops out) and lowercasing.
q = ['mining', 'large', 'volume', 'data']
qdict = dict()
y = math.sqrt(0.25 * 0.25 * 4)   # L2 norm of the raw query vector (weight 0.25 per term)
for x in q:
    qdict[x] = 0.25 / y
print('\nNormalized Query vector is:')
print(qdict)
# Materialize the tf-idf matrix once: toarray() returns a fresh copy on each
# call, so normalizing in place only works on a stored array.
arr = tf_idf_vector.toarray()
for d in range(5):
    y = 0
    for x in arr[d]:
        y = y + x * x
    sq = math.sqrt(y)   # TfidfTransformer already L2-normalizes, so sq is ~1
    for j in range(len(arr[d])):
        arr[d][j] = arr[d][j] / sq
print(arr)
score1=0
score2=0
score3=0
score4=0
score5=0
# Cosine score of the query against each document: dot product of the
# normalized query weights and the document's tf-idf weights. The column
# index of a term comes from the vectorizer vocabulary.
for x in q:
    if x in w1 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score1 = score1 + qdict[x] * arr[0][index]
for x in q:
    if x in w2 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score2 = score2 + qdict[x] * arr[1][index]
for x in q:
    if x in w3 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score3 = score3 + qdict[x] * arr[2][index]
for x in q:
    if x in w4 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score4 = score4 + qdict[x] * arr[3][index]
for x in q:
    if x in w5 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score5 = score5 + qdict[x] * arr[4][index]
tfidf_scores=[]
tfidf_scores.append(score1)
tfidf_scores.append(score2)
tfidf_scores.append(score3)
tfidf_scores.append(score4)
tfidf_scores.append(score5)
def myFunc(e):
    return e['score']
cosine = [
{'doc': 'Document 1', 'score': score1},
{'doc': 'Document 2', 'score': score2},
{'doc': 'Document 3', 'score': score3},
{'doc': 'Document 4', 'score': score4},
{'doc': 'Document 5', 'score': score5}
]
cosine.sort(reverse=True, key=myFunc)
max1=max(tfidf_scores)
if max1 == score1:
    print("\nDocument 1 matches the query best according to cosine similarity")
if max1 == score2:
    print("\nDocument 2 matches the query best according to cosine similarity")
if max1 == score3:
    print("\nDocument 3 matches the query best according to cosine similarity")
if max1 == score4:
    print("\nDocument 4 matches the query best according to cosine similarity")
if max1 == score5:
    print("\nDocument 5 matches the query best according to cosine similarity")
score1=0
score2=0
score3=0
score4=0
score5=0
# Euclidean distance between the query and each document, summed over the
# query terms present in that document.
for x in q:
    if x in w1 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score1 = score1 + ((qdict[x] - arr[0][index]) ** 2)
for x in q:
    if x in w2 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score2 = score2 + ((qdict[x] - arr[1][index]) ** 2)
for x in q:
    if x in w3 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score3 = score3 + ((qdict[x] - arr[2][index]) ** 2)
for x in q:
    if x in w4 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score4 = score4 + ((qdict[x] - arr[3][index]) ** 2)
for x in q:
    if x in w5 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score5 = score5 + ((qdict[x] - arr[4][index]) ** 2)
r1=math.sqrt(score1)
r2=math.sqrt(score2)
r3=math.sqrt(score3)
r4=math.sqrt(score4)
r5=math.sqrt(score5)
tfidf_scores=[]
tfidf_scores.append(r1)
tfidf_scores.append(r2)
tfidf_scores.append(r3)
tfidf_scores.append(r4)
tfidf_scores.append(r5)
def myFunc(e):
    return e['score']
euc = [
{'doc': 'Document 1', 'score': r1},
{'doc': 'Document 2', 'score': r2},
{'doc': 'Document 3', 'score': r3},
{'doc': 'Document 4', 'score': r4},
{'doc': 'Document 5', 'score': r5}
]
euc.sort(key=myFunc)          # ascending: smallest distance first
min1 = min(tfidf_scores)
if min1 == r1:
    print("\nDocument 1 matches the query best according to Euclidean distance")
if min1 == r2:
    print("\nDocument 2 matches the query best according to Euclidean distance")
if min1 == r3:
    print("\nDocument 3 matches the query best according to Euclidean distance")
if min1 == r4:
    print("\nDocument 4 matches the query best according to Euclidean distance")
if min1 == r5:
    print("\nDocument 5 matches the query best according to Euclidean distance")
# Pairwise document-to-document cosine similarity: dot products of the
# normalized tf-idf rows.
similarity = []
s12 = 0
for x in range(len(arr[0])):
    s12 = s12 + arr[0][x] * arr[1][x]
similarity.append(s12)
s13 = 0
for x in range(len(arr[0])):
    s13 = s13 + arr[0][x] * arr[2][x]
similarity.append(s13)
s14 = 0
for x in range(len(arr[0])):
    s14 = s14 + arr[0][x] * arr[3][x]
similarity.append(s14)
s15 = 0
for x in range(len(arr[0])):
    s15 = s15 + arr[0][x] * arr[4][x]
similarity.append(s15)
s23 = 0
for x in range(len(arr[0])):
    s23 = s23 + arr[1][x] * arr[2][x]
similarity.append(s23)
s24 = 0
for x in range(len(arr[0])):
    s24 = s24 + arr[1][x] * arr[3][x]
similarity.append(s24)
s25 = 0
for x in range(len(arr[0])):
    s25 = s25 + arr[1][x] * arr[4][x]
similarity.append(s25)
s34 = 0
for x in range(len(arr[0])):
    s34 = s34 + arr[2][x] * arr[3][x]
similarity.append(s34)
s35 = 0
for x in range(len(arr[0])):
    s35 = s35 + arr[2][x] * arr[4][x]
similarity.append(s35)
s45 = 0
for x in range(len(arr[0])):
    s45 = s45 + arr[3][x] * arr[4][x]
similarity.append(s45)
def myFunc(e):
    return e['score']
sim = [
{'doc': 'Document 1 is similar to Document 2', 'score': s12},
{'doc': 'Document 1 is similar to Document 3', 'score': s13},
{'doc': 'Document 1 is similar to Document 4', 'score': s14},
{'doc': 'Document 1 is similar to Document 5', 'score': s15},
{'doc': 'Document 2 is similar to Document 3', 'score': s23},
{'doc': 'Document 2 is similar to Document 4', 'score': s24},
{'doc': 'Document 2 is similar to Document 5', 'score': s25},
{'doc': 'Document 3 is similar to Document 4', 'score': s34},
{'doc': 'Document 3 is similar to Document 5', 'score': s35},
{'doc': 'Document 4 is similar to Document 5', 'score': s45},
]
sim.sort(reverse=True, key=myFunc)   # most similar pair first
for x in sim:
    print(x['doc'])
OUTPUT:
• Bag-of-Words (Document corpus)
• TF (Document corpus)
• Normalized (Query)
• Euclidean Distance