
Assessment - 2

- K MARY NIKITHA

CSE 3024: Web Mining Slot: L51 + L52

Crawler Implementation, Index Compression, TF-IDF

1. Write a Python program to


a) show the implementation of a concurrent depth-first crawler (No. of threads = 5 and depth = 5).

b) Develop the crawler program to handle the various challenges (such as parsing, stemming,
lemmatization, link extraction, canonicalization, and spider traps) faced by a crawler during
implementation.

c) Based on the contents retrieved, prepare one inverted index file (with proper representation).
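For part (c), each entry of the inverted index maps a term to the posting list of line numbers (one crawled URL per line of InvertedIndex.txt) in which it occurs, so a printed entry has the form shown below (illustrative values only):

books : [2, 7, 15]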

CODE:

import concurrent.futures
import time
import requests
from bs4 import BeautifulSoup

f = open('InvertedIndex.txt', 'w')

MAX_THREADS = 5          # the assignment specifies 5 crawler threads

def download_url(url):
    # save a copy of the fetched page to disk
    print(url)
    resp = requests.get(url)
    title = ''.join(x for x in url if x.isalpha()) + "html"
    with open(title, "wb") as fh:
        fh.write(resp.content)
    time.sleep(0.25)

def get_links_recursive(base, path, visited, max_depth=5, depth=0):
    # depth-first link extraction; the visited set and the depth cap
    # keep the crawler out of loops and spider traps
    if depth <= max_depth:
        try:
            soup = BeautifulSoup(requests.get(base + path).text, "html.parser")
            for link in soup.find_all("a"):
                href = link.get("href")
                if href and href not in visited:
                    visited.add(href)
                    f.write(href + "\n")          # one URL per line for the indexer below
                    # print(f"at depth {depth}: {href}")
                    if href.startswith("http"):
                        get_links_recursive(href, "", visited, max_depth, depth + 1)
                    else:
                        get_links_recursive(base, href, visited, max_depth, depth + 1)
        except Exception:
            # skip pages that fail to download or parse
            pass

def download_stories(story_urls):
    threads = min(MAX_THREADS, len(story_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        for url in story_urls:
            executor.submit(get_links_recursive, url, "", set([url]))

def main(story_urls):
    download_stories(story_urls)

urls = ["http://toscrape.com/"]
main(urls)
f.close()

file = open('InvertedIndex.txt', encoding='utf8')
read = file.read()
file.seek(0)

# count the number of lines (each line holds one crawled URL)
line = 1
for word in read:
    if word == '\n':
        line += 1
print("Number of lines in file is: ", line)

array = []
for i in range(line):
    array.append(file.readline())

# replace punctuation with spaces and lowercase everything before tokenizing
punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
for ele in read:
    if ele in punc:
        read = read.replace(ele, " ")
read = read.lower()

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text_tokens = word_tokenize(read)
tokens_without_sw = [
    word for word in text_tokens if not word in stopwords.words('english')]

# inverted index: term -> posting list of line numbers in which it occurs
index = {}
for i in range(line):
    check = array[i].lower()
    for item in set(tokens_without_sw):   # deduplicate so each line is recorded once per term
        if item in check:
            if item not in index:
                index[item] = []
            index[item].append(i + 1)

for x in index:
    print(x, ":", index[x])
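The crawler above relies on a shared visited set and a depth cap of 5 to stay out of spider traps, but it does not canonicalize URLs, so the same page reached through different spellings of its address can be fetched twice. A minimal canonicalization helper is sketched below (an illustrative addition built on Python's standard urllib.parse, not part of the submitted program); it could be applied to every href before the visited-set check.

from urllib.parse import urljoin, urldefrag, urlparse, urlunparse

def canonicalize(base, href):
    # resolve relative links against the current page and drop #fragments
    url, _ = urldefrag(urljoin(base, href))
    parts = urlparse(url)
    scheme = parts.scheme.lower()
    netloc = parts.netloc.lower()
    # drop explicit default ports (http:80, https:443)
    if (scheme, parts.port) in (("http", 80), ("https", 443)):
        netloc = parts.hostname
    path = parts.path or "/"          # treat an empty path as the site root
    return urlunparse((scheme, netloc, path, parts.params, parts.query, ""))

# e.g. canonicalize("http://toscrape.com", "index.html#top") and
# canonicalize("http://toscrape.com/", "HTTP://ToScrape.com:80/index.html")
# both give "http://toscrape.com/index.html"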

OUTPUT:

2. Write a Python program to show the implementation of the Golomb encoding-decoding technique

a) Encode x = 25 and x = 37 with b = 11 and b = 16
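For example, with x = 25 and b = 11: q = 25 // 11 = 2 and r = 25 mod 11 = 3, so the quotient is written as '001' (q zeros terminated by a 1, the convention used by the encoder below); since r = 3 is less than 2^4 - 11 = 5, the remainder is written in floor(log2 11) = 3 bits as '011', giving the codeword 001011. Following the same steps, x = 37 with b = 11 gives 0001100, while with b = 16 (a power of two, so every remainder takes exactly 4 bits) x = 25 gives 011001 and x = 37 gives 0010101.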

CODE:
import math
N = int(input())                      # value x to encode
M = int(input())                      # Golomb parameter b
q = N // M
r = N % M
quo = '0' * q + '1'                   # unary quotient: q zeros terminated by a 1
b = math.floor(math.log2(M))
k = 2 ** (b + 1) - M                  # threshold for truncated-binary remainders
if r < k:
    rem = bin(r)[2:]                  # short remainder: b bits
    l = len(rem)
    if l < b:
        rem = '0' * (b - l) + rem
else:
    rem = bin(r + k)[2:]              # long remainder: b + 1 bits
    l = len(rem)
    if l < b + 1:
        rem = '0' * (b + 1 - l) + rem
golomb_code = quo + rem
print(golomb_code)

OUTPUT:
b) Decode the Golomb encoded sequence 1111111110010001101 with b = 10.
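Decoding reverses the two fields produced in part (a): count the leading zeros up to the first 1 to recover the quotient q, then read floor(log2 b) = 3 remainder bits; if their value is below 2^4 - b = 6 that value is r, otherwise read one extra bit and subtract 6. Each decoded value is x = q*b + r, and the process repeats until the bit string is exhausted.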

CODE:

import math

def decode(x):
    # convert a list of bits (most significant first) to an integer
    num = 0
    for i in range(len(x)):
        num += int(x[len(x) - 1 - i]) * math.pow(2, i)
    return num

print("Enter Golomb encoded sequence")
x = list(str(input()))
print("Enter base")
b = int(input())
i = math.floor(math.log(b, 2))
d = math.pow(2, i + 1) - b            # threshold for truncated-binary remainders
p2 = 0
while p2 < len(x):
    t = 0
    flag = 0
    r = []
    k = i
    q = 0
    for p in range(p2, len(x)):
        if x[p] == '0' and flag == 0:     # unary part: count zeros
            t += 1
            continue
        if x[p] == '1' and flag == 0:     # a '1' terminates the unary quotient
            q = t
            flag = 1
            continue
        r.append(x[p])                    # remainder bits
        k -= 1
        if k == 0:
            rnum = decode(r)
            if rnum < d:                  # short remainder
                p2 = p + 1
                break
        if k == -1:                       # long remainder: one extra bit, subtract d
            rnum = decode(r)
            rnum = rnum - d
            p2 = p + 1
            break
    ans = q * b + rnum
    print(int(ans))

OUTPUT:

3. Write a Python program to extract the contents (excluding any tags) from two websites:

https://en.wikipedia.org/wiki/Web_mining
https://en.wikipedia.org/wiki/Data_mining

Save the content in two separate files. Construct a trie from the retrieved content using a
HashMap / B-Tree / Dictionary. Write a program to show the implementation of Predictive Typing
and Auto-Correct using the trie prepared.
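As a quick illustration of the dictionary-based trie the programs below build (a sketch only; the actual code wraps the same idea in TrieNode/Node objects with a children dictionary and an end-of-word flag), inserting 'data' and 'date' produces nested dictionaries that share the prefix 'dat':

trie = {'d': {'a': {'t': {'a': {'#end': True}, 'e': {'#end': True}}}}}   # '#end' marks a complete word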

1) Predictive typing

CODE:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 24 08:05:19 2020

@author: nikitha
"""
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia

result1 = wikipedia.search("Web Mining")
page1 = wikipedia.page(result1[0])
content1 = page1.content

result2 = wikipedia.search("Data Mining")
page2 = wikipedia.page(result2[0])
content2 = page2.content

b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if not w in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

b4stop2 = content2.split()
stop_words = nltk.corpus.stopwords.words('english')
words2 = [w for w in b4stop2 if not w in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()

import pandas as pd
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()

w3 = w1 + w2

class TrieNode():
    def __init__(self):
        self.children = {}      # dictionary (hash map) of child nodes, keyed by character
        self.last = False       # True if a word ends at this node

class Trie():
    def __init__(self):
        self.root = TrieNode()
        self.word_list = []

    def formTrie(self, keys):
        for key in keys:
            self.insert(key)

    def insert(self, key):
        node = self.root
        for a in list(key):
            if not node.children.get(a):
                node.children[a] = TrieNode()
            node = node.children[a]
        node.last = True

    def search(self, key):
        node = self.root
        found = True
        for a in list(key):
            if not node.children.get(a):
                found = False
                break
            node = node.children[a]
        return node and node.last and found

    def suggestionsRec(self, node, word):
        # collect every complete word stored in the subtree below node
        if node.last:
            self.word_list.append(word)
        for a, n in node.children.items():
            self.suggestionsRec(n, word + a)

    def printAutoSuggestions(self, key):
        # walk down the trie along the prefix, then list the whole subtree
        node = self.root
        not_found = False
        temp_word = ''
        for a in list(key):
            if not node.children.get(a):
                not_found = True
                break
            temp_word += a
            node = node.children[a]
        if not_found:
            return 0
        elif node.last and not node.children:
            return -1
        self.suggestionsRec(node, temp_word)
        for s in self.word_list:
            print(s)
        return 1

keys = w3
print("Enter the prefix")
key = input()
status = ["Not found", "Found"]
t = Trie()
t.formTrie(keys)
comp = t.printAutoSuggestions(key)
if comp == -1:
    print("No other strings found with this prefix\n")
elif comp == 0:
    print("No string found with this prefix\n")

OUTPUT:

2) Auto correct

CODE:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 24 17:01:13 2020

@author: nikitha
"""
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia

result1 = wikipedia.search("Web Mining")
page1 = wikipedia.page(result1[0])
content1 = page1.content

result2 = wikipedia.search("Data Mining")
page2 = wikipedia.page(result2[0])
content2 = page2.content

b4stop1 = content1.split()
stop_words = nltk.corpus.stopwords.words('english')
words1 = [w for w in b4stop1 if not w in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

b4stop2 = content2.split()
stop_words = nltk.corpus.stopwords.words('english')
words2 = [w for w in b4stop2 if not w in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()

import pandas as pd
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()

w3 = w1 + w2

import sys

class Node:
    '''
    Node is a store of a single character and a dict of child nodes.
    '''
    def __init__(self, value):
        self.value = value
        self.children = dict()
        self.end = False

    def __getitem__(self, key):
        if key in self.children:
            return self.children[key]
        return None

    def __setitem__(self, key, value):
        self.children[key] = value

    def __contains__(self, value):
        return value in self.children

    def __str__(self):
        return str(self.value)

class Trie:
    def __init__(self):
        self.root = Node('')

    def add(self, word):
        '''
        Traverse the trie and add new nodes as we go.
        '''
        word = word.strip()
        n = self.root
        for l in word:
            nxt = n[l]
            if nxt is not None:
                n = nxt
            else:
                n[l] = Node(l)
                n = n[l]
        n.end = True

    def __contains__(self, word):
        '''
        Traverse the trie to find a word.
        '''
        n = self.root
        for l in word:
            if l not in n:
                return False
            n = n[l]
        if n.end == True:
            return True
        return False

class SpellCheck:
    def __init__(self):
        '''
        Load in the dictionary.
        '''
        self.words = Trie()
        for word in w3:
            self.words.add(word)

    def spellcheck(self, word):
        if word in self.words:
            return word
        word = word.lower()
        if word in self.words:
            return word
        vowels = 'aeiou'

        def recurse(path, word, node):
            # depth-first search over the trie that tolerates wrong case,
            # doubled letters and vowel substitutions
            if node is None:
                return None
            if word == '':
                if node.end == True:
                    return path
                if node.end == False:
                    return None
            ltr = word[0]
            if ltr in node:
                result = recurse(path + ltr, word[1:], node[ltr])
                if result:
                    return result
            ltr = ltr.lower()
            if ltr in node:
                result = recurse(path + ltr, word[1:], node[ltr])
                if result:
                    return result
            if len(word) > 1 and ltr == word[1]:
                # skip a doubled letter
                result = recurse(path, word[1:], node)
                if result:
                    return result
            if ltr in vowels:
                # try the other vowels in this position
                for v in vowels:
                    if v != ltr:
                        result = recurse(path + v, word[1:], node[v])
                        if result:
                            return result
            return None

        result = recurse('', word, self.words.root)
        if result:
            return result
        return 'NO SUGGESTION'

if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] == '-t':
        # self-test: build a trie from the system word list and verify lookups
        t = Trie()
        with open('/usr/share/dict/words', 'r') as f:
            for word in f:
                word = word.strip()
                t.add(word)
                try:
                    assert (word in t)
                except AssertionError:
                    print(word, "not in trie")
                    sys.exit(1)
        s = SpellCheck()
        import doctest
        doctest.testmod(extraglobs={'s': s})
        sys.exit(0)
    s = SpellCheck()
    while True:
        word = input()
        print(s.spellcheck(word))
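As a usage example (the outcome depends on the words actually present in the two articles), s.spellcheck('DATTA') would first fail the exact and lower-cased lookups, then the recursive search would fold the case, skip the doubled 't', and return 'data', since that word occurs in the Data mining article; an input with no plausible match returns 'NO SUGGESTION'.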

OUTPUT:

4. Write a Python program to extract the contents (excluding any tags) from the following five websites:
https://en.wikipedia.org/wiki/Web_mining
https://en.wikipedia.org/wiki/Data_mining
https://en.wikipedia.org/wiki/Artificial_intelligence
https://en.wikipedia.org/wiki/Machine_learning
https://en.wikipedia.org/wiki/Mining

Refine the contents by applying stopword removal and lemmatization. Save the refined, tokenized
content in five separate files. Considering a vector space model, perform the following operations
for the query “Mining large volume of data”.
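For reference, the quantities computed below are: the TF table uses tf(t, d) = count(t, d) / |d|; scikit-learn's TfidfTransformer with smooth_idf=True uses idf(t) = ln((1 + N) / (1 + df(t))) + 1 with N = 5 documents, and produces tf-idf weights count(t, d) x idf(t) that are L2-normalized per document; cosine similarity is cos(q, d) = (q . d) / (|q| |d|); and Euclidean distance is the square root of the sum of (q_t - d_t)^2 over the terms.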

CODE:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 25 17:11:56 2020

@author: nikitha
"""

import math
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import wikipedia

result1 = wikipedia.search("Web Mining")


page1= wikipedia.page(result1[0])
content1 = page1.content

result2 = wikipedia.search("Data Mining")


page2= wikipedia.page(result2[0])
content2 = page2.content

result3 = wikipedia.search("Artificial Intelligence")


page3= wikipedia.page(result3[0])
content3 = page3.content

result4 = wikipedia.search("Machine Learning")


page4= wikipedia.page(result4[0])
content4 = page4.content

result5 = wikipedia.search("Mining")
page5= wikipedia.page(result5[0])
content5 = page5.content

stop_words = nltk.corpus.stopwords.words('english')
stop_words.append('@')
stop_words.append(',')
stop_words.append('#')
stop_words.append('$')
stop_words.append('%')
stop_words.append('^')
stop_words.append('&')
stop_words.append('*')
stop_words.append('(')
stop_words.append(')')
stop_words.append('_')
stop_words.append('-')
stop_words.append('=')
stop_words.append('.')
stop_words.append('being')

b4stop1 = content1.split()
words1 = [w for w in b4stop1 if not w in stop_words]
doc1 = open("doc1.doc", "w")
for x in words1:
    doc1.write(x)
    doc1.write(" ")
doc1.close()
file1 = open("doc1.doc", "r")
d1 = file1.read()
w1 = d1.split()

b4stop2 = content2.split()
words2 = [w for w in b4stop2 if not w in stop_words]
doc2 = open("doc2.doc", "w")
for x in words2:
    doc2.write(x)
    doc2.write(" ")
doc2.close()
file2 = open("doc2.doc", "r")
d2 = file2.read()
w2 = d2.split()

b4stop3 = content3.split()
words3 = [w for w in b4stop3 if not w in stop_words]
doc3 = open("doc3.doc", "w")
for x in words3:
    doc3.write(x)
    doc3.write(" ")
doc3.close()
file3 = open("doc3.doc", "r")
d3 = file3.read()
w3 = d3.split()

b4stop4 = content4.split()
words4 = [w for w in b4stop4 if not w in stop_words]
doc4 = open("doc4.doc", "w")
for x in words4:
    doc4.write(x)
    doc4.write(" ")
doc4.close()
file4 = open("doc4.doc", "r")
d4 = file4.read()
w4 = d4.split()

b4stop5 = content5.split()
words5 = [w for w in b4stop5 if not w in stop_words]
doc5 = open("doc5.doc", "w")
for x in words5:
    doc5.write(x)
    doc5.write(" ")
doc5.close()
file5 = open("doc5.doc", "r")
d5 = file5.read()
w5 = d5.split()

words=w1+w2+w3+w4+w5

count1 = dict()
for x in w1:
    if x not in stop_words:
        count1[x] = count1.get(x, 0) + 1

count2 = dict()
for x in w2:
    if x not in stop_words:
        count2[x] = count2.get(x, 0) + 1

count3 = dict()
for x in w3:
    if x not in stop_words:
        count3[x] = count3.get(x, 0) + 1

count4 = dict()
for x in w4:
    if x not in stop_words:
        count4[x] = count4.get(x, 0) + 1

count5 = dict()
for x in w5:
    if x not in stop_words:
        count5[x] = count5.get(x, 0) + 1

bag1=[]
bag2=[]
bag3=[]
bag4=[]
bag5=[]

# bag-of-words: raw count of every term of the combined vocabulary in each document
for x in range(len(words)):
    if words[x] in w1:
        bag1.append(count1[words[x]])
    else:
        bag1.append(0)

for x in range(len(words)):
    if words[x] in w2:
        bag2.append(count2[words[x]])
    else:
        bag2.append(0)

for x in range(len(words)):
    if words[x] in w3:
        bag3.append(count3[words[x]])
    else:
        bag3.append(0)

for x in range(len(words)):
    if words[x] in w4:
        bag4.append(count4[words[x]])
    else:
        bag4.append(0)

for x in range(len(words)):
    if words[x] in w5:
        bag5.append(count5[words[x]])
    else:
        bag5.append(0)

import pandas as pd
bagdata = {'Term': words, 'Document1': bag1, 'Document2': bag2, 'Document3': bag3,
           'Document4': bag4, 'Document5': bag5}
dfbag = pd.DataFrame(bagdata)
print(dfbag)

b1=[]
b2=[]
b3=[]
b4=[]
b5=[]

# term frequency: count divided by the total number of tokens in the document
for x in range(len(words)):
    if words[x] in w1:
        b1.append(count1[words[x]] / len(w1))
    else:
        b1.append(0)

for x in range(len(words)):
    if words[x] in w2:
        b2.append(count2[words[x]] / len(w2))
    else:
        b2.append(0)

for x in range(len(words)):
    if words[x] in w3:
        b3.append(count3[words[x]] / len(w3))
    else:
        b3.append(0)

for x in range(len(words)):
    if words[x] in w4:
        b4.append(count4[words[x]] / len(w4))
    else:
        b4.append(0)

for x in range(len(words)):
    if words[x] in w5:
        b5.append(count5[words[x]] / len(w5))
    else:
        b5.append(0)

tfdata = {'Term': words, 'Document1': b1, 'Document2': b2, 'Document3': b3,
          'Document4': b4, 'Document5': b5}
dftf = pd.DataFrame(tfdata)
print(dftf)

s1 = " ".join(w1)
s2 = " ".join(w2)
s3 = " ".join(w3)
s4 = " ".join(w4)
s5 = " ".join(w5)

from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.feature_extraction.text import CountVectorizer

docs = [s1,s2,s3,s4,s5]

cv=CountVectorizer()

word_count_vector=cv.fit_transform(docs)

word_count_vector.shape
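# word_count_vector is a 5 x V sparse matrix of raw term counts
# (one row per document, V = size of the vocabulary learned by CountVectorizer)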

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(), columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'])
print(df_idf)

count_vector = cv.transform(docs)
tf_idf_vector = tfidf_transformer.transform(count_vector)
tfidf = tf_idf_vector.toarray()   # dense copy, so the normalization below is actually kept
print(tfidf)

query = "Mining large volume of data"


qw=query.split()
q=[w for w in qw if not w in stop_words]

qdict=dict()

y=math.sqrt(0.25*0.25*4)
for x in q:
    qdict[x] = 0.25 / y
print('\n Normalized Query vector is:')
print(qdict)
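# With the stop word 'of' removed, the query keeps four terms (Mining, large,
# volume, data); each raw weight is 0.25 and the normalizing factor is
# sqrt(4 * 0.25**2) = 0.5, so every query term ends up with weight 0.5.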

norm1=0
norm2=0
norm3=0
norm4=0
norm5=0
y1=0
y2=0
y3=0
y4=0
y5=0
# L2-normalize each document's tf-idf row (TfidfTransformer already applies
# L2 normalization by default, so these factors come out close to 1)
for x in tfidf[0]:
    y1 = y1 + x * x
sq1 = math.sqrt(y1)
for x in range(len(tfidf[0])):
    tfidf[0][x] = tfidf[0][x] / sq1

for x in tfidf[1]:
    y2 = y2 + x * x
sq2 = math.sqrt(y2)
for x in range(len(tfidf[1])):
    tfidf[1][x] = tfidf[1][x] / sq2

for x in tfidf[2]:
    y3 = y3 + x * x
sq3 = math.sqrt(y3)
for x in range(len(tfidf[2])):
    tfidf[2][x] = tfidf[2][x] / sq3

for x in tfidf[3]:
    y4 = y4 + x * x
sq4 = math.sqrt(y4)
for x in range(len(tfidf[3])):
    tfidf[3][x] = tfidf[3][x] / sq4

for x in tfidf[4]:
    y5 = y5 + x * x
sq5 = math.sqrt(y5)
for x in range(len(tfidf[4])):
    tfidf[4][x] = tfidf[4][x] / sq5

print('\nNormalizing factors for the 5 documents are', sq1, sq2, sq3, sq4, sq5)

print('Normalized TF-IDF is')

print(tfidf)

score1=0
score2=0
score3=0
score4=0
score5=0

# cosine score: dot product of the normalized query vector with each document row;
# cv.vocabulary_ gives the column of a term in the tf-idf matrix
for x in q:
    if x in w1:
        index = cv.vocabulary_[x.lower()]
        score1 = score1 + qdict[x] * tfidf[0][index]

for x in q:
    if x in w2:
        index = cv.vocabulary_[x.lower()]
        score2 = score2 + qdict[x] * tfidf[1][index]

for x in q:
    if x in w3:
        index = cv.vocabulary_[x.lower()]
        score3 = score3 + qdict[x] * tfidf[2][index]

for x in q:
    if x in w4:
        index = cv.vocabulary_[x.lower()]
        score4 = score4 + qdict[x] * tfidf[3][index]

for x in q:
    if x in w5:
        index = cv.vocabulary_[x.lower()]
        score5 = score5 + qdict[x] * tfidf[4][index]
tfidf_scores=[]

tfidf_scores.append(score1)
tfidf_scores.append(score2)
tfidf_scores.append(score3)
tfidf_scores.append(score4)
tfidf_scores.append(score5)

def myFunc(e):
    return e['score']

cosine = [
{'doc': 'Document 1', 'score': score1},
{'doc': 'Document 2', 'score': score2},
{'doc': 'Document 3', 'score': score3},
{'doc': 'Document 4', 'score': score4},
{'doc': 'Document 5', 'score': score5}
]

cosine.sort(reverse=True, key=myFunc)

print('\nDocument ranking according to cosine similarity')


for x in cosine:
    print(x['doc'])

max1 = max(tfidf_scores)
if max1 == score1:
    print("\nDocument 1 matches the query best according to cosine similarity")
if max1 == score2:
    print("\nDocument 2 matches the query best according to cosine similarity")
if max1 == score3:
    print("\nDocument 3 matches the query best according to cosine similarity")
if max1 == score4:
    print("\nDocument 4 matches the query best according to cosine similarity")
if max1 == score5:
    print("\nDocument 5 matches the query best according to cosine similarity")

score1=0
score2=0
score3=0
score4=0
score5=0

# Euclidean distance, computed over the query terms only (as in the cosine case)
for x in q:
    if x in w1:
        index = cv.vocabulary_[x.lower()]
        score1 = score1 + ((qdict[x] - tfidf[0][index]) ** 2)

for x in q:
    if x in w2:
        index = cv.vocabulary_[x.lower()]
        score2 = score2 + ((qdict[x] - tfidf[1][index]) ** 2)

for x in q:
    if x in w3:
        index = cv.vocabulary_[x.lower()]
        score3 = score3 + ((qdict[x] - tfidf[2][index]) ** 2)

for x in q:
    if x in w4:
        index = cv.vocabulary_[x.lower()]
        score4 = score4 + ((qdict[x] - tfidf[3][index]) ** 2)

for x in q:
    if x in w5:
        index = cv.vocabulary_[x.lower()]
        score5 = score5 + ((qdict[x] - tfidf[4][index]) ** 2)

r1=math.sqrt(score1)
r2=math.sqrt(score2)
r3=math.sqrt(score3)
r4=math.sqrt(score4)
r5=math.sqrt(score5)

tfidf_scores=[]
tfidf_scores.append(r1)
tfidf_scores.append(r2)
tfidf_scores.append(r3)
tfidf_scores.append(r4)
tfidf_scores.append(r5)

def myFunc(e):
    return e['score']

euc = [
{'doc': 'Document 1', 'score': r1},
{'doc': 'Document 2', 'score': r2},
{'doc': 'Document 3', 'score': r3},
{'doc': 'Document 4', 'score': r4},
{'doc': 'Document 5', 'score': r5}
]

euc.sort(key=myFunc)

print('\nDocument ranking according to Euclidean similarity')


for x in euc:
    print(x['doc'])

max1 = min(tfidf_scores)   # smallest distance = closest match
if max1 == r1:
    print("\nDocument 1 matches the query best according to Euclidean distance")
if max1 == r2:
    print("\nDocument 2 matches the query best according to Euclidean distance")
if max1 == r3:
    print("\nDocument 3 matches the query best according to Euclidean distance")
if max1 == r4:
    print("\nDocument 4 matches the query best according to Euclidean distance")
if max1 == r5:
    print("\nDocument 5 matches the query best according to Euclidean distance")

similarity=[]
s12=0
# pairwise cosine similarity between the (already normalized) document vectors
for x in range(len(tfidf[0])):
    s12 = s12 + tfidf[0][x] * tfidf[1][x]

similarity.append(s12)

s13 = 0
for x in range(len(tfidf[0])):
    s13 = s13 + tfidf[0][x] * tfidf[2][x]

similarity.append(s13)

s14 = 0
for x in range(len(tfidf[0])):
    s14 = s14 + tfidf[0][x] * tfidf[3][x]

similarity.append(s14)

s15 = 0
for x in range(len(tfidf[0])):
    s15 = s15 + tfidf[0][x] * tfidf[4][x]

similarity.append(s15)

s23 = 0
for x in range(len(tfidf[0])):
    s23 = s23 + tfidf[1][x] * tfidf[2][x]

similarity.append(s23)

s24 = 0
for x in range(len(tfidf[0])):
    s24 = s24 + tfidf[1][x] * tfidf[3][x]

similarity.append(s24)

s25 = 0
for x in range(len(tfidf[0])):
    s25 = s25 + tfidf[1][x] * tfidf[4][x]
similarity.append(s25)

s34 = 0
for x in range(len(tfidf[0])):
    s34 = s34 + tfidf[2][x] * tfidf[3][x]

similarity.append(s34)

s35 = 0
for x in range(len(tfidf[0])):
    s35 = s35 + tfidf[2][x] * tfidf[4][x]

similarity.append(s35)

s45 = 0
for x in range(len(tfidf[0])):
    s45 = s45 + tfidf[3][x] * tfidf[4][x]

similarity.append(s45)

def myFunc(e):
    return e['score']

sim = [
{'doc': 'Document 1 is similar to Document 2', 'score': s12},
{'doc': 'Document 1 is similar to Document 3', 'score': s13},
{'doc': 'Document 1 is similar to Document 4', 'score': s14},
{'doc': 'Document 1 is similar to Document 5', 'score': s15},
{'doc': 'Document 2 is similar to Document 3', 'score': s23},
{'doc': 'Document 2 is similar to Document 4', 'score': s24},
{'doc': 'Document 2 is similar to Document 5', 'score': s25},
{'doc': 'Document 3 is similar to Document 4', 'score': s34},
{'doc': 'Document 3 is similar to Document 5', 'score': s35},
{'doc': 'Document 4 is similar to Document 5', 'score': s45},
]

sim.sort(key=myFunc)

for x in sim:
    print(x['doc'])

OUTPUT:
• Bag-of-Words (Document corpus)
• TF (Document corpus)

• IDF (Document corpus)

• TF-IDF (Document corpus)

• Normalized (Query)

• Normalized - TF-IDF (Document corpus)


• Cosine Similarity

• Euclidean Distance

• Document Ranking (Display Order)

• Document Similarity (Among Documents)
