You are on page 1 of 4

Web Mining

19BCE2483

Anubhav Bhandary

prob.1.

Vectorize the contents of the web pages and create a data frame for at least 5 web pages.

———————

Code:

import re

import pandas as pd

from bs4 import BeautifulSoup

from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords

from urllib import request

# Problem 1: fetch several web pages, tokenize and clean their text, and
# build a term-frequency DataFrame (one row per page, one column per word).

# English stopwords to drop from every page's token stream.
sw = set(stopwords.words('english'))

urls = ["https://vit.ac.in/academics/home",
        "https://vit.ac.in/admissions/international/overview",
        "https://vit.ac.in/internationalrelations/SAP",
        "https://vit.ac.in/placements/internship",
        "https://vit.ac.in/academics/library"]

# One cleaned token list per page.  (Renamed from `list`, which shadowed
# the builtin and would have broken any later call such as list(...).)
page_tokens = []
for url in urls:
    html = request.urlopen(url).read().decode('utf8')
    raw = BeautifulSoup(html, 'html.parser').get_text()
    print(len(raw))  # rough size of the extracted text, for inspection
    cleaned = []
    for token in word_tokenize(raw):
        # Collapse non-alphanumeric runs to spaces, then trim.
        word = re.sub(r"[^a-zA-Z0-9]+", " ", token).strip()
        # Keep only meaningful words: longer than 2 chars and not a stopword.
        if len(word) > 2 and word not in sw:
            cleaned.append(word)
    page_tokens.append(cleaned)

# Vocabulary = union of cleaned tokens across all pages.
vocab = set()
for tokens in page_tokens:
    vocab |= set(tokens)

# Term-frequency table: d[word][i] = count of `word` on page i.
# Iterating page_tokens (instead of the original hard-coded range(5))
# keeps the script correct if the url list changes length.
d = {word: [tokens.count(word) for tokens in page_tokens] for word in vocab}

df = pd.DataFrame(d)
# Fixed: the original line used a garbled curly quote (to_csv(‘fileA.csv')),
# which is a syntax error.
df.to_csv('fileA.csv')

Output: (The screenshot of the spreadsheet is not complete.)

Prob.2.

import pandas as pd
import math
import numpy as np


def log_scale(df):
    """Dampen raw term frequencies with log(1 + tf), cell by cell, in place.

    Returns the same DataFrame for convenience.
    """
    # len(df) replaces the original hard-coded 5 rows.
    for i in range(len(df)):
        for j in range(len(df.columns)):
            df.iloc[i, j] = math.log(1 + df.iloc[i, j])
    return df


def cosine(a, b):
    """Cosine similarity of two equal-length numeric vectors."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def most_similar_pair(vectors, verbose=False):
    """Return (i, j, score) for the most similar pair of row vectors.

    score is rounded to 5 decimals, matching the original output format.
    Returns None when no pair has positive similarity (the original code
    raised NameError on `similar_d` in that case).
    If verbose, prints every pair's score as the original script did.
    """
    best = None
    best_score = 0
    for i in range(len(vectors)):
        for j in range(i + 1, len(vectors)):
            score = cosine(vectors[i], vectors[j])
            if score > best_score:
                best = (i, j, round(score, 5))
                best_score = score
            if verbose:
                print(i, j, round(score, 5))
    return best


if __name__ == "__main__":
    # index_col=0 keeps the CSV's row-index column out of the term vectors.
    # The original read it back as an extra "Unnamed: 0" feature, which
    # silently skewed every cosine similarity.
    df = pd.read_csv("fileA.csv", index_col=0)
    log_scale(df)
    print(df.head())
    # One log-scaled term vector per document row.
    vectors = [list(df.iloc[i]) for i in range(len(df))]
    print("The most similar document is : ", most_similar_pair(vectors, verbose=True))

Output:

You might also like