Professional Documents
Culture Documents
19BCE2483
Anubhav Bhandary
prob.1.
Vectorize the contents of the web pages and create a data frame for at least 5 web pages.
———————
Code:
import re
from collections import Counter

import pandas as pd

# Pages to vectorize; each URL becomes one row of the term-count frame.
urls = [
    "https://vit.ac.in/academics/home",
    "https://vit.ac.in/admissions/international/overview",
    "https://vit.ac.in/internationalrelations/SAP",
    "https://vit.ac.in/placements/internship",
    "https://vit.ac.in/academics/library",
]

# Any run of characters other than ASCII letters/digits collapses to one space.
_NON_ALNUM = re.compile(r"[^a-zA-Z0-9]+")


def fetch_page_text(url):
    """Download *url* and return the text content of its HTML.

    The body is decoded as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``. The HTTP response is closed before returning.
    """
    # Imported here so the pure helpers below stay usable without bs4.
    from urllib import request

    from bs4 import BeautifulSoup

    with request.urlopen(url) as response:
        html = response.read().decode("utf8")
    return BeautifulSoup(html, "html.parser").get_text()


def clean_tokens(tokens, stop_words=()):
    """Normalize raw tokens into countable terms.

    Each token has its non-alphanumeric runs replaced by a single space and
    is then stripped. Tokens that end up empty (pure punctuation) are
    dropped, as are tokens whose lowercase form is in *stop_words*.

    Args:
        tokens: iterable of token strings.
        stop_words: container of lowercase words to exclude; empty by
            default, which disables stop-word filtering.

    Returns:
        List of cleaned terms in their original order.
    """
    cleaned = []
    for token in tokens:
        term = _NON_ALNUM.sub(" ", token).strip()
        if term and term.lower() not in stop_words:
            cleaned.append(term)
    return cleaned


def build_count_frame(documents):
    """Build a document-term count matrix.

    Args:
        documents: sequence of term lists, one list per document.

    Returns:
        DataFrame with one row per document (in input order) and one column
        per distinct term (sorted, so the layout is reproducible); each cell
        is the number of times the term occurs in that document.
    """
    # Count every document once instead of rescanning it for each term.
    counts = [Counter(terms) for terms in documents]
    vocabulary = sorted(set().union(*counts))
    return pd.DataFrame(
        {term: [count[term] for count in counts] for term in vocabulary}
    )


def vectorize_pages(page_urls=None, out_path="fileA.csv"):
    """Fetch pages, count their terms, and save the counts as CSV.

    Prints the character length of each page's extracted text. The CSV is
    written with pandas' default layout, i.e. the row index is the first
    column.

    Args:
        page_urls: URLs to process; defaults to the module-level ``urls``.
        out_path: destination CSV path.

    Returns:
        The document-term count DataFrame that was written.
    """
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    if page_urls is None:
        page_urls = urls

    sw = set(stopwords.words("english"))
    documents = []
    for url in page_urls:
        raw = fetch_page_text(url)
        print(len(raw))
        documents.append(clean_tokens(word_tokenize(raw), sw))

    df = build_count_frame(documents)
    df.to_csv(out_path)
    return df


if __name__ == "__main__":
    vectorize_pages()
Prob.2.
import math
from itertools import combinations

import numpy as np
import pandas as pd


def log_scale_counts(counts):
    """Return ``log(1 + count)`` for every cell of *counts* as floats.

    The frame is converted to float first so the scaled values are not
    written into integer columns.
    """
    return np.log1p(counts.astype(float))


def pairwise_cosine(vectors):
    """Compute cosine similarity for every unordered pair of rows.

    Args:
        vectors: 2-D array-like, one row per document.

    Returns:
        List of ``(i, j, cosine)`` tuples for all ``i < j``, in row order.
        A pair involving an all-zero row scores 0.0 rather than NaN.
    """
    matrix = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(matrix, axis=1)
    scored = []
    for i, j in combinations(range(len(matrix)), 2):
        denominator = norms[i] * norms[j]
        # A zero norm would make this 0/0; treat it as "no similarity".
        cosine = float(matrix[i] @ matrix[j] / denominator) if denominator else 0.0
        scored.append((i, j, cosine))
    return scored


def most_similar_pair(scored_pairs):
    """Pick the pair with the highest strictly positive cosine.

    Returns:
        ``(i, j, cosine rounded to 5 places)`` for the first pair reaching
        the maximum, or ``None`` when no pair scores above zero.
    """
    best = None
    max_score = 0
    for i, j, cosine_value in scored_pairs:
        if cosine_value > max_score:
            best = (i, j, round(cosine_value, 5))
            max_score = cosine_value
    return best


def report_document_similarity(csv_path="fileA.csv"):
    """Load term counts, log-scale them, and print pairwise cosines.

    Prints the head of the scaled frame, then one ``i j cosine`` line per
    document pair.

    Args:
        csv_path: CSV of term counts whose first column is the row index
            (the layout ``DataFrame.to_csv`` produces by default).

    Returns:
        The most similar pair as ``(i, j, rounded cosine)``, or ``None``
        if no pair has positive similarity.
    """
    # index_col=0 keeps the saved row index from being read as a term column.
    df = log_scale_counts(pd.read_csv(csv_path, index_col=0))
    print(df.head())

    scored = pairwise_cosine(df.to_numpy())
    for i, j, cosine_value in scored:
        print(i, j, round(cosine_value, 5))
    return most_similar_pair(scored)


if __name__ == "__main__":
    similar_d = report_document_similarity()
Output: