K MARY NIKITHA
b) Develop the crawler program to handle the various challenges (such as parsing, stemming,
lemmatization, link extraction, canonicalization, spider traps, etc.) faced by the crawler
during implementation.
c) Based on the contents retrieved, prepare one inverted index file (with proper representation).
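For example, spider traps and non-canonical URLs can be guarded against with a visited set, a depth cap, and urljoin. A minimal sketch (the depth limit and helper names here are illustrative, not part of the submitted program):

import urllib.parse

MAX_DEPTH = 3                      # illustrative cap against spider traps

def canonicalize(base, href):
    # Resolve relative links, drop fragments, normalize scheme/host case.
    absolute = urllib.parse.urljoin(base, href)
    parts = urllib.parse.urlsplit(absolute)
    return urllib.parse.urlunsplit((parts.scheme.lower(), parts.netloc.lower(),
                                    parts.path or '/', parts.query, ''))

def should_visit(url, visited, depth):
    # Refuse revisits and overly deep paths -- both classic trap symptoms.
    return url not in visited and depth <= MAX_DEPTH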
CODE:
import concurrent.futures
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

f = open('InvertedIndex.txt', 'w')
MAX_THREADS = 30
visited = set()            # guards against revisiting pages (spider traps)

def download_url(url):
    print(url)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'html.parser')   # parsing
    time.sleep(0.25)       # politeness delay between requests
    for link in soup.find_all('a'):                     # link extraction
        try:
            href = link.get("href")
            if href is None or href in visited:
                continue
            visited.add(href)
            if href.startswith("http"):
                f.write(href + '\n')
            else:
                # canonicalization: resolve relative links against the page URL
                f.write(urljoin(url, href) + '\n')
        except Exception:
            pass

def download_stories(story_urls):
    # Crawl pages concurrently, capping the pool at MAX_THREADS.
    threads = min(MAX_THREADS, len(story_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(download_url, story_urls)

def main(story_urls):
    download_stories(story_urls)

urls = ["http://toscrape.com/"]
main(urls)
f.close()
# Build an inverted index over the crawled URLs saved above.
file = open('InvertedIndex.txt', 'r')
read = file.read()
file.seek(0)

line = 1
for word in read:
    if word == '\n':
        line += 1

array = []
for i in range(line):
    array.append(file.readline())

# Strip punctuation and lowercase the text before tokenizing.
punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
for ele in read:
    if ele in punc:
        read = read.replace(ele, " ")
read = read.lower()

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text_tokens = word_tokenize(read)
tokens_without_sw = [w for w in text_tokens if w not in stopwords.words('english')]

# Inverted index: term -> posting list of line numbers containing it.
index = {}
for item in tokens_without_sw:
    for i in range(line):
        check = array[i].lower()
        if item in check:
            if item not in index:
                index[item] = []
            index[item].append(i+1)

for x in index:
    print(x, ":", index[x])
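# Hypothetical sample of the printed representation (term -> posting list
# of line numbers in InvertedIndex.txt); actual terms depend on the crawl:
#   scrape : [1, 3]
#   books : [2]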
OUTPUT:
CODE:
import math

N = int(input())          # value to encode
M = int(input())          # Golomb parameter (divisor)
q = N // M
r = N % M
quo = '1' * q + '0'       # unary part: q ones terminated by a zero (matches the decoder in part b)
b = math.floor(math.log2(M))
k = 2 ** (b + 1) - M      # remainders below k take b bits, the rest b+1 bits
if r < k:
    rem = bin(r)[2:]
    l = len(rem)
    if l < b:
        rem = '0' * (b - l) + rem
else:
    rem = bin(r + k)[2:]
    l = len(rem)
    if l < b + 1:
        rem = '0' * (b + 1 - l) + rem
golomb_code = quo + rem
print(golomb_code)
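# Worked check with this convention (illustrative): N = 7, M = 3 gives
# q = 2, r = 1, b = 1, k = 1; since r >= k, rem = bin(1 + 1) = '10',
# so golomb_code = '110' + '10' = '11010'.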
OUTPUT:
b) Decode the Golomb encoded sequence 1111111110010001101 with b = 10.
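As a hand-check of the first codeword (using the unary-ones convention from part a): with b = 10, i = floor(log2 10) = 3 and d = 2^(3+1) - 10 = 6. The nine leading 1s give quotient q = 9; the 0 terminates the unary part, and the next three bits 010 encode remainder 2, which is below 6, so no extra bit is read. The first decoded value is therefore 9 * 10 + 2 = 92, and decoding continues from the following bit.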
CODE:
import math

def decode(x):
    # Interpret a list of bits (most significant first) as an integer.
    num = 0
    for i in range(len(x)):
        num += int(x[len(x) - 1 - i]) * (2 ** i)
    return num

x = str(input())
x = list(x)
print("Enter base")
b = int(input())
i = math.floor(math.log(b, 2))
d = 2 ** (i + 1) - b          # remainders below d were written with i bits

p2 = 0
while p2 < len(x):
    t = 0
    flag = 0
    r = []
    k = i
    q = 0
    rnum = 0
    for p in range(p2, len(x)):
        if flag == 0 and x[p] == '1':   # still inside the unary prefix
            t += 1
            continue
        if flag == 0 and x[p] == '0':   # zero terminates the unary prefix
            q = t
            flag = 1
            continue
        r.append(x[p])                  # collect remainder bits
        k -= 1
        if k == 0:
            rnum = decode(r)
            if rnum < d:                # i bits were enough
                p2 = p + 1
                break
        if k == -1:                     # needed the (i+1)-th bit
            rnum = decode(r)
            rnum = rnum - d
            p2 = p + 1
            break
    else:
        break                           # ran out of bits mid-codeword
    ans = q * b + rnum
    print(int(ans))
OUTPUT:
3. Write a python program to extract the contents (excluding any tags) from two websites:
https://en.wikipedia.org/wiki/Web_mining
https://en.wikipedia.org/wiki/Data_mining
Save the content in two separate files. Construct a trie based on the content retrieved, using a
HashMap / B-Tree / Dictionary. Write a program to show the implementation of Predictive Typing
and Auto-Correct using the trie prepared.
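For orientation, a minimal dictionary-based sketch of the trie idea (toy word list; names are illustrative, not taken from the programs below): each node is a plain dict mapping a character to its child, a '$' key marks word ends, and prediction walks the prefix and collects every terminal beneath it.

trie = {}
for word in ["data", "datum", "mining"]:
    node = trie
    for ch in word:
        node = node.setdefault(ch, {})
    node['$'] = True          # end-of-word marker

def complete(prefix):
    node = trie
    for ch in prefix:
        if ch not in node:
            return []
        node = node[ch]
    out = []
    def walk(n, path):
        if '$' in n:
            out.append(prefix + path)
        for c, child in n.items():
            if c != '$':
                walk(child, path + c)
    walk(node, '')
    return out

print(complete("dat"))        # -> ['data', 'datum']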
1) Predictive typing
CODE:
#!/usr/bin/env python3
"""
@author: nikitha
"""
import nltk
import wikipedia

# Fetch the two pages named in the problem statement.
result1 = wikipedia.search("Web mining")
page1 = wikipedia.page(result1[0])
content1 = page1.content
result2 = wikipedia.search("Data mining")
page2 = wikipedia.page(result2[0])
content2 = page2.content

# Stopword removal for page 1, saved to doc1.doc.
b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if w not in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

# Stopword removal for page 2, saved to doc2.doc.
b4stop2 = content2.split()
words2 = [w for w in b4stop2 if w not in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
w3 = w1 + w2          # combined vocabulary used to build the trie

class TrieNode():
    def __init__(self):
        self.children = {}
        self.last = False      # True if a word ends at this node

class Trie():
    def __init__(self):
        self.root = TrieNode()
        self.word_list = []

    def formTrie(self, keys):
        for key in keys:
            self.insert(key)

    def insert(self, key):
        node = self.root
        for a in list(key):
            if not node.children.get(a):
                node.children[a] = TrieNode()
            node = node.children[a]
        node.last = True

    def search(self, key):
        node = self.root
        found = True
        for a in list(key):
            if not node.children.get(a):
                found = False
                break
            node = node.children[a]
        return node and node.last and found

    def suggestionsRec(self, node, word):
        # Depth-first walk collecting every complete word below this node.
        if node.last:
            self.word_list.append(word)
        for a, n in node.children.items():
            self.suggestionsRec(n, word + a)

    def printAutoSuggestions(self, key):
        node = self.root
        not_found = False
        temp_word = ''
        for a in list(key):
            if not node.children.get(a):
                not_found = True
                break
            temp_word += a
            node = node.children[a]
        if not_found:
            return 0              # no word in the trie starts with this prefix
        elif node.last and not node.children:
            return -1             # the prefix itself is a word with no extensions
        self.suggestionsRec(node, temp_word)
        for s in self.word_list:
            print(s)
        return 1

keys = w3
key = input()
t = Trie()
t.formTrie(keys)
comp = t.printAutoSuggestions(key)
if comp == -1:
    print("No other strings found with this prefix")
elif comp == 0:
    print("No string found with this prefix")
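# Hypothetical run: for the prefix "min" the program would list completions
# such as "mining" that occur in the fetched page vocabulary.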
OUTPUT:
2) Auto-Correct
CODE:
#!/usr/bin/env python3
"""
@author: nikitha
"""
import nltk
import wikipedia

# Same preprocessing as in the predictive-typing program.
result1 = wikipedia.search("Web mining")
page1 = wikipedia.page(result1[0])
content1 = page1.content
result2 = wikipedia.search("Data mining")
page2 = wikipedia.page(result2[0])
content2 = page2.content

b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if w not in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

b4stop2 = content2.split()
words2 = [w for w in b4stop2 if w not in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
w3 = w1 + w2
import sys

class Node:
    '''
    One trie node: a character, a dict of children, and an end-of-word flag.
    '''
    def __init__(self, value):
        self.value = value
        self.children = dict()
        self.end = False

    def __getitem__(self, key):
        if key in self.children:
            return self.children[key]
        return None

    def __setitem__(self, key, value):
        self.children[key] = value

    def __contains__(self, key):
        return key in self.children

    def __str__(self):
        return str(self.value)

class Trie:
    def __init__(self):
        self.root = Node('')

    def add(self, word):
        '''
        Insert a word, creating child nodes as needed.
        '''
        word = word.strip()
        n = self.root
        for l in word:
            if l in n:
                nxt = n[l]
                n = nxt
            else:
                n[l] = Node(l)
                n = n[l]
        n.end = True

    def __contains__(self, word):
        '''
        Exact lookup: follow each character and check the end flag.
        '''
        n = self.root
        for l in word:
            if l not in n:
                return False
            n = n[l]
        if n.end == True:
            return True
        return False

class SpellCheck:
    def __init__(self):
        '''
        Load the system word list into a trie.
        '''
        self.words = Trie()
        with open('/usr/share/dict/words', 'r') as f:
            for word in f:
                self.words.add(word)

    def spellcheck(self, word):
        # Try the word as typed, then lowercased, then a fuzzy trie walk.
        if word in self.words:
            return word
        word = word.lower()
        if word in self.words:
            return word
        return self._check(word, self.words.root, '')

    def _check(self, word, node, path):
        # Recursive correction: at each position try the exact letter, the
        # lowercased letter, and finally a substituted vowel.
        vowels = 'aeiou'
        if node is None:
            return None
        if word == '':
            if node.end == True:
                return path
            if node.end == False:
                return None
        ltr = word[0]
        if ltr in node:
            result = self._check(word[1:], node[ltr], path + ltr)
            if result:
                return result
        ltr = ltr.lower()
        if ltr in node:
            result = self._check(word[1:], node[ltr], path + ltr)
            if result:
                return result
        if ltr in vowels:
            for v in vowels:
                if v != ltr and v in node:
                    result = self._check(word[1:], node[v], path + v)
                    if result:
                        return result
        return None

if __name__ == '__main__':
    t = Trie()
    with open('/usr/share/dict/words', 'r') as f:
        for word in f:
            word = word.strip()
            t.add(word)
            try:
                assert (word in t)    # sanity check: every added word is findable
            except AssertionError:
                sys.exit(1)
    if '--test' in sys.argv:          # reconstructed guard: run doctests and exit
        s = SpellCheck()
        import doctest
        doctest.testmod(extraglobs={'s': s})
        sys.exit(0)
    s = SpellCheck()
    while True:
        word = input()
        print(s.spellcheck(word))
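# Hypothetical run (assuming "mining" is in the word list): the input
# "mEnIng" is first lowercased to "mening", then the vowel-substitution
# step can map e -> i to return "mining".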
OUTPUT:
4. Write a python program to extract the contents (excluding any tags) from the following five
websites:
https://en.wikipedia.org/wiki/Web_mining
https://en.wikipedia.org/wiki/Data_mining
https://en.wikipedia.org/wiki/Artificial_intelligence
https://en.wikipedia.org/wiki/Machine_learning
https://en.wikipedia.org/wiki/Mining
Refine the contents by applying stopword removal and lemmatization. Save the refined,
tokenized content in five separate files. Then, using a vector space model, perform the
following operations for the query “Mining large volume of data”.
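Before the full listing, a compact sketch of the same query-to-document cosine scoring done directly with scikit-learn (toy document strings for illustration; TfidfVectorizer and cosine_similarity are standard sklearn components, not part of the submitted code):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy stand-ins for the five cleaned documents.
docs = ["web mining extracts knowledge from the web",
        "data mining finds patterns in large volumes of data"]
query = "Mining large volume of data"

vec = TfidfVectorizer(stop_words='english')
doc_matrix = vec.fit_transform(docs)        # one L2-normalized tf-idf row per document
query_vec = vec.transform([query])          # project the query into the same space
print(cosine_similarity(query_vec, doc_matrix))  # one cosine score per document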
CODE:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 25 17:11:56 2020
@author: nikitha
"""
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Fetch all five pages named in the problem statement.
result1 = wikipedia.search("Web mining")
page1 = wikipedia.page(result1[0])
content1 = page1.content
result2 = wikipedia.search("Data mining")
page2 = wikipedia.page(result2[0])
content2 = page2.content
result3 = wikipedia.search("Artificial intelligence")
page3 = wikipedia.page(result3[0])
content3 = page3.content
result4 = wikipedia.search("Machine learning")
page4 = wikipedia.page(result4[0])
content4 = page4.content
result5 = wikipedia.search("Mining")
page5 = wikipedia.page(result5[0])
content5 = page5.content
stop_words = nltk.corpus.stopwords.words('english')
# Treat common punctuation/symbols (and 'being') as stopwords too.
stop_words.extend(['@', ',', '#', '$', '%', '^', '&', '*', '(', ')',
                   '_', '-', '=', '.', 'being'])
# Stopword removal for each page; save the tokens to doc1.doc ... doc5.doc.
b4stop1 = content1.split()
words1 = [w for w in b4stop1 if w not in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()
b4stop2 = content2.split()
words2 = [w for w in b4stop2 if w not in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()
b4stop3 = content3.split()
words3 = [w for w in b4stop3 if w not in stop_words]
doc3 = open("doc3.doc", "w")
for x in words3:
    doc3.write(x)
    doc3.write(" ")
doc3.close()
file3 = open("doc3.doc", "r")
d3 = file3.read()
w3 = d3.split()
b4stop4 = content4.split()
words4 = [w for w in b4stop4 if w not in stop_words]
doc4 = open("doc4.doc", "w")
for x in words4:
    doc4.write(x)
    doc4.write(" ")
doc4.close()
file4 = open("doc4.doc", "r")
d4 = file4.read()
w4 = d4.split()
b4stop5 = content5.split()
words5 = [w for w in b4stop5 if w not in stop_words]
doc5 = open("doc5.doc", "w")
for x in words5:
    doc5.write(x)
    doc5.write(" ")
doc5.close()
file5 = open("doc5.doc", "r")
d5 = file5.read()
w5 = d5.split()
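# The task also asks for lemmatization; a minimal extra pass over the token
# lists (a sketch -- assumes nltk.download('wordnet') has been run):
lemmatizer = WordNetLemmatizer()
w1 = [lemmatizer.lemmatize(t) for t in w1]
w2 = [lemmatizer.lemmatize(t) for t in w2]
w3 = [lemmatizer.lemmatize(t) for t in w3]
w4 = [lemmatizer.lemmatize(t) for t in w4]
w5 = [lemmatizer.lemmatize(t) for t in w5]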
words=w1+w2+w3+w4+w5
# Term frequency counts per document (stopwords already removed).
count1 = dict()
for x in w1:
    if x not in stop_words:
        count1[x] = count1.get(x, 0) + 1
count2 = dict()
for x in w2:
    if x not in stop_words:
        count2[x] = count2.get(x, 0) + 1
count3 = dict()
for x in w3:
    if x not in stop_words:
        count3[x] = count3.get(x, 0) + 1
count4 = dict()
for x in w4:
    if x not in stop_words:
        count4[x] = count4.get(x, 0) + 1
count5 = dict()
for x in w5:
    if x not in stop_words:
        count5[x] = count5.get(x, 0) + 1
# Bag-of-words vectors over the combined term list.
bag1 = []
bag2 = []
bag3 = []
bag4 = []
bag5 = []
for x in range(len(words)):
    if words[x] in w1:
        bag1.append(count1[words[x]])
    else:
        bag1.append(0)
for x in range(len(words)):
    if words[x] in w2:
        bag2.append(count2[words[x]])
    else:
        bag2.append(0)
for x in range(len(words)):
    if words[x] in w3:
        bag3.append(count3[words[x]])
    else:
        bag3.append(0)
for x in range(len(words)):
    if words[x] in w4:
        bag4.append(count4[words[x]])
    else:
        bag4.append(0)
for x in range(len(words)):
    if words[x] in w5:
        bag5.append(count5[words[x]])
    else:
        bag5.append(0)
import pandas as pd
bagdata = {'Term': words, 'Document1': bag1, 'Document2': bag2,
           'Document3': bag3, 'Document4': bag4, 'Document5': bag5}
dfbag=pd.DataFrame(bagdata)
print(dfbag)
# Term frequency: counts normalized by document length.
b1 = []
b2 = []
b3 = []
b4 = []
b5 = []
for x in range(len(words)):
    if words[x] in w1:
        b1.append(count1[words[x]] / len(w1))
    else:
        b1.append(0)
for x in range(len(words)):
    if words[x] in w2:
        b2.append(count2[words[x]] / len(w2))
    else:
        b2.append(0)
for x in range(len(words)):
    if words[x] in w3:
        b3.append(count3[words[x]] / len(w3))
    else:
        b3.append(0)
for x in range(len(words)):
    if words[x] in w4:
        b4.append(count4[words[x]] / len(w4))
    else:
        b4.append(0)
for x in range(len(words)):
    if words[x] in w5:
        b5.append(count5[words[x]] / len(w5))
    else:
        b5.append(0)
tfdata = {'Term': words, 'Document1': b1, 'Document2': b2,
          'Document3': b3, 'Document4': b4, 'Document5': b5}
dftf=pd.DataFrame(tfdata)
print(dftf)
s1 = " ".join(w1)
s2 = " ".join(w2)
s3 = " ".join(w3)
s4 = " ".join(w4)
s5 = " ".join(w5)
docs = [s1,s2,s3,s4,s5]
cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)
print(word_count_vector.shape)
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),
                      columns=["idf_weights"])
df_idf = df_idf.sort_values(by=['idf_weights'])   # common terms (low idf) first
print(df_idf)
count_vector = cv.transform(docs)
tf_idf_vector = tfidf_transformer.transform(count_vector)
print(tf_idf_vector.toarray())
# Query terms after stopword removal ("of" drops out) and lowercasing.
q = ['mining', 'large', 'volume', 'data']
qdict = dict()
y = math.sqrt(0.25 * 0.25 * 4)   # L2 norm of the raw query vector (weight 0.25 per term)
for x in q:
    qdict[x] = 0.25 / y
print('\nNormalized Query vector is:')
print(qdict)
# Materialize the tf-idf matrix once: toarray() returns a fresh copy on each
# call, so normalizing in place only works on a stored array.
arr = tf_idf_vector.toarray()
for d in range(5):
    y = 0
    for x in arr[d]:
        y = y + x * x
    sq = math.sqrt(y)   # TfidfTransformer already L2-normalizes, so sq is ~1
    for j in range(len(arr[d])):
        arr[d][j] = arr[d][j] / sq
print(arr)
score1=0
score2=0
score3=0
score4=0
score5=0
# Cosine score of the query against each document: dot product of the
# normalized query weights and the document's tf-idf weights. The column
# index of a term comes from the vectorizer vocabulary.
for x in q:
    if x in w1 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score1 = score1 + qdict[x] * arr[0][index]
for x in q:
    if x in w2 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score2 = score2 + qdict[x] * arr[1][index]
for x in q:
    if x in w3 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score3 = score3 + qdict[x] * arr[2][index]
for x in q:
    if x in w4 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score4 = score4 + qdict[x] * arr[3][index]
for x in q:
    if x in w5 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score5 = score5 + qdict[x] * arr[4][index]
tfidf_scores=[]
tfidf_scores.append(score1)
tfidf_scores.append(score2)
tfidf_scores.append(score3)
tfidf_scores.append(score4)
tfidf_scores.append(score5)
def myFunc(e):
    return e['score']
cosine = [
{'doc': 'Document 1', 'score': score1},
{'doc': 'Document 2', 'score': score2},
{'doc': 'Document 3', 'score': score3},
{'doc': 'Document 4', 'score': score4},
{'doc': 'Document 5', 'score': score5}
]
cosine.sort(reverse=True, key=myFunc)
max1=max(tfidf_scores)
if max1 == score1:
    print("\nDocument 1 matches the query best according to cosine similarity")
if max1 == score2:
    print("\nDocument 2 matches the query best according to cosine similarity")
if max1 == score3:
    print("\nDocument 3 matches the query best according to cosine similarity")
if max1 == score4:
    print("\nDocument 4 matches the query best according to cosine similarity")
if max1 == score5:
    print("\nDocument 5 matches the query best according to cosine similarity")
score1=0
score2=0
score3=0
score4=0
score5=0
# Euclidean distance between the query and each document, summed over the
# query terms present in that document.
for x in q:
    if x in w1 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score1 = score1 + ((qdict[x] - arr[0][index]) ** 2)
for x in q:
    if x in w2 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score2 = score2 + ((qdict[x] - arr[1][index]) ** 2)
for x in q:
    if x in w3 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score3 = score3 + ((qdict[x] - arr[2][index]) ** 2)
for x in q:
    if x in w4 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score4 = score4 + ((qdict[x] - arr[3][index]) ** 2)
for x in q:
    if x in w5 and x in cv.vocabulary_:
        index = cv.vocabulary_[x]
        score5 = score5 + ((qdict[x] - arr[4][index]) ** 2)
r1=math.sqrt(score1)
r2=math.sqrt(score2)
r3=math.sqrt(score3)
r4=math.sqrt(score4)
r5=math.sqrt(score5)
tfidf_scores=[]
tfidf_scores.append(r1)
tfidf_scores.append(r2)
tfidf_scores.append(r3)
tfidf_scores.append(r4)
tfidf_scores.append(r5)
def myFunc(e):
    return e['score']
euc = [
{'doc': 'Document 1', 'score': r1},
{'doc': 'Document 2', 'score': r2},
{'doc': 'Document 3', 'score': r3},
{'doc': 'Document 4', 'score': r4},
{'doc': 'Document 5', 'score': r5}
]
euc.sort(key=myFunc)          # ascending: smallest distance first
min1 = min(tfidf_scores)
if min1 == r1:
    print("\nDocument 1 matches the query best according to Euclidean distance")
if min1 == r2:
    print("\nDocument 2 matches the query best according to Euclidean distance")
if min1 == r3:
    print("\nDocument 3 matches the query best according to Euclidean distance")
if min1 == r4:
    print("\nDocument 4 matches the query best according to Euclidean distance")
if min1 == r5:
    print("\nDocument 5 matches the query best according to Euclidean distance")
# Pairwise document-to-document cosine similarity: dot products of the
# normalized tf-idf rows.
similarity = []
s12 = 0
for x in range(len(arr[0])):
    s12 = s12 + arr[0][x] * arr[1][x]
similarity.append(s12)
s13 = 0
for x in range(len(arr[0])):
    s13 = s13 + arr[0][x] * arr[2][x]
similarity.append(s13)
s14 = 0
for x in range(len(arr[0])):
    s14 = s14 + arr[0][x] * arr[3][x]
similarity.append(s14)
s15 = 0
for x in range(len(arr[0])):
    s15 = s15 + arr[0][x] * arr[4][x]
similarity.append(s15)
s23 = 0
for x in range(len(arr[0])):
    s23 = s23 + arr[1][x] * arr[2][x]
similarity.append(s23)
s24 = 0
for x in range(len(arr[0])):
    s24 = s24 + arr[1][x] * arr[3][x]
similarity.append(s24)
s25 = 0
for x in range(len(arr[0])):
    s25 = s25 + arr[1][x] * arr[4][x]
similarity.append(s25)
s34 = 0
for x in range(len(arr[0])):
    s34 = s34 + arr[2][x] * arr[3][x]
similarity.append(s34)
s35 = 0
for x in range(len(arr[0])):
    s35 = s35 + arr[2][x] * arr[4][x]
similarity.append(s35)
s45 = 0
for x in range(len(arr[0])):
    s45 = s45 + arr[3][x] * arr[4][x]
similarity.append(s45)
def myFunc(e):
    return e['score']
sim = [
{'doc': 'Document 1 is similar to Document 2', 'score': s12},
{'doc': 'Document 1 is similar to Document 3', 'score': s13},
{'doc': 'Document 1 is similar to Document 4', 'score': s14},
{'doc': 'Document 1 is similar to Document 5', 'score': s15},
{'doc': 'Document 2 is similar to Document 3', 'score': s23},
{'doc': 'Document 2 is similar to Document 4', 'score': s24},
{'doc': 'Document 2 is similar to Document 5', 'score': s25},
{'doc': 'Document 3 is similar to Document 4', 'score': s34},
{'doc': 'Document 3 is similar to Document 5', 'score': s35},
{'doc': 'Document 4 is similar to Document 5', 'score': s45},
]
sim.sort(reverse=True, key=myFunc)   # most similar pair first
for x in sim:
    print(x['doc'])
OUTPUT:
• Bag-of-Words (Document corpus)
• TF (Document corpus)
• Normalized (Query)
• Euclidean Distance