
CSE3024 – Web Mining

WINTER SEMESTER 2019-20

Slot – L13 + 14

E-RECORD
Assessment No.: 01

Submitted By

Deep Agrawal
Reg. No.: 18BCE0518
B.Tech. (Branch) – II Year
SCOPE

VELLORE INSTITUTE OF TECHNOLOGY
VELLORE – 632 014
TAMIL NADU
INDIA
Q1. Write a program to extract the source content (excluding any tags) from the website (https://en.wikipedia.org/wiki/Web_mining). Display the number of terms and the term frequency of each term present in them after applying stop word removal. Also, apply stemming and lemmatization to the same document and display the number of terms along with their corresponding stemmed as well as lemmatized words present in them. Count the total number of stemmed and lemmatized words.

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as soup

# fetch the Wikipedia page and parse its HTML
url = 'https://en.wikipedia.org/wiki/Web_mining'
urlOpen = urllib.request.urlopen(url)
urlHTML = urlOpen.read()
urlSoup = soup(urlHTML, 'html.parser')

# keep only the text of the <p> tags, i.e. the content without any markup
pageText = ''
for _ in urlSoup.findAll('p'):
    pageText = pageText + _.text
print(pageText)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# build the stopword set and treat the two punctuation marks as stopwords too
# (the 'stopwords' and 'punkt' corpora must already be downloaded via nltk.download)
sw = set(stopwords.words('english'))
sw.add('.')
sw.add(',')

# tokenize the lower-cased page text and drop the stopwords
pageText = pageText.lower()
words = word_tokenize(pageText)
terms = [w for w in words if w not in sw]
print('Number of terms in pageText is : ' + str(len(terms)))
Number of terms in pageText is : 1014
terms1 = list(set(terms))
for _ in range(len(terms1)):
    print("Frequency of", terms1[_], "is :", terms.count(terms1[_]))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
# WordNetLemmatizer is assumed for the lemmatization step (requires nltk.download('wordnet'))
lm = WordNetLemmatizer()
stemTerms = []
lemTerms = []
for w in terms1:
    print(w, ":", ps.stem(w), ":", lm.lemmatize(w))
    stemTerms.append(ps.stem(w))
    lemTerms.append(lm.lemmatize(w))

print("Number of words after stemming :", len(set(stemTerms)))
print("Number of words after lemmatization :", len(set(lemTerms)))


Q2. Add one new word to the NLTK stopword list and filter the content extracted from the website given in Q. No. 1 in order to display the number of terms present in them after excluding the newly added stopwords, along with their term frequency count. Display the POS tag for all the stopwords which are removed from the content.

# add a few more symbols to the stopword list and re-filter the tokens
sw.add('[')
sw.add(']')
sw.add('(')
sw.add(')')
terms = [w for w in words if w not in sw]
print('Number of terms in pageText is : ' + str(len(terms)))

terms1 = list(set(terms))

for _ in range(len(terms1)):
    print("Frequency of", terms1[_], "is :", terms.count(terms1[_]))

# POS-tag every word that was removed as a stopword, sentence by sentence
page_sens = pageText.split(".")

for _ in range(len(page_sens)):
    page_words = word_tokenize(page_sens[_])
    page_sw = [w for w in page_words if w not in terms]
    for w in page_sw:
        print(nltk.pos_tag(nltk.word_tokenize(w)))
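The same POS output can be produced with one nltk.pos_tag call per sentence rather than one call per removed word; a minimal sketch, assuming page_sens and sw from above and filtering directly against the stopword set:

# tag all removed stopwords of each sentence in a single pos_tag call
for sen in page_sens:
    removed = [w for w in word_tokenize(sen) if w in sw]
    if removed:
        print(nltk.pos_tag(removed))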
Q3. Write a program to extract the contents (excluding any tags) from two websites (https://en.wikipedia.org/wiki/Web_mining & https://en.wikipedia.org/wiki/Data_mining) and save the content in two separate .doc files. Remove stopwords from the content and represent the documents using Boolean, Bag-of-words and Complete representation. Process a search query and compare the contents of both pages with the processed query, then display the similarity result based on the highest matching count (bag-of-words).

url1 = 'https://en.wikipedia.org/wiki/Web_mining'
urlOpen1 = urllib.request.urlopen(url1)
urlHTML1 = urlOpen1.read()
url2 = 'https://en.wikipedia.org/wiki/Data_mining'
urlOpen2 = urllib.request.urlopen(url2)
urlHTML2 = urlOpen2.read()
urlSoup1 = soup(urlHTML1, 'html.parser')
urlSoup2 = soup(urlHTML2, 'html.parser')

pageText1 = ''
for _ in urlSoup1.findAll('p'):
    pageText1 = pageText1 + _.text
pageText2 = ''
for _ in urlSoup2.findAll('p'):
    pageText2 = pageText2 + _.text

# 'w+' creates the .doc files if they do not exist, then allows reading them back
with open('doc1.doc', 'w+') as doc1, open('doc2.doc', 'w+') as doc2:
    doc1.write(pageText1)
    doc2.write(pageText2)
    doc1.seek(0)
    doc2.seek(0)
    d1 = doc1.read()
    d2 = doc2.read()

d1 = d1.lower()
d2 = d2.lower()
w1 = word_tokenize(d1)
w2 = word_tokenize(d2)
t1 = [w for w in w1 if w not in sw]
t2 = [w for w in w2 if w not in sw]

# vocabulary of all distinct terms across both documents
all_t = t1 + t2
all_t = list(set(all_t))

# Boolean representation: 1 if the term occurs in the document, 0 otherwise
ed1 = [0]*len(all_t)
ed2 = [0]*len(all_t)
for _ in range(len(all_t)):
    if all_t[_] in t1:
        ed1[_] = 1
    if all_t[_] in t2:
        ed2[_] = 1

table1 = [None]*len(all_t)
for _ in range(len(all_t)):
    table1[_] = [all_t[_], ed1[_], ed2[_]]

from tabulate import tabulate
print("Boolean representation :\n")
print(tabulate(table1, headers=['Term', 'Doc1', 'Doc2']))

# Bag-of-words representation: raw count of each term in each document
cd1 = [0]*len(all_t)
cd2 = [0]*len(all_t)
for i in range(len(all_t)):
    for j in range(len(t1)):
        if all_t[i] == t1[j]:
            cd1[i] = cd1[i] + 1
for i in range(len(all_t)):
    for j in range(len(t2)):
        if all_t[i] == t2[j]:
            cd2[i] = cd2[i] + 1

table2 = [None]*len(all_t)
for _ in range(len(all_t)):
    table2[_] = [all_t[_], cd1[_], cd2[_]]

print("Bag-of-words representation :\n")
print(tabulate(table2, headers=['Term', 'Doc1', 'Doc2']))

# Complete representation: list of positions at which each term occurs in each document
pd1 = [[] for x in range(len(all_t))]
pd2 = [[] for x in range(len(all_t))]

for i in range(len(all_t)):
    for j in range(len(w1)):
        if w1[j] == all_t[i]:
            pd1[i].append(j)

for i in range(len(all_t)):
    for j in range(len(w2)):
        if w2[j] == all_t[i]:
            pd2[i].append(j)

table3 = [None]*len(all_t)
for _ in range(len(all_t)):
    table3[_] = [all_t[_], pd1[_], pd2[_]]

print("Complete representation :\n")
print(tabulate(table3, headers=['Term', 'Doc1', 'Doc2']))
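The positional lists can also be built in a single scan per document with a defaultdict; a minimal sketch, assuming w1, w2, all_t and tabulate from above (positions and table3_alt are illustrative names):

from collections import defaultdict

def positions(tokens):
    # map each token to the list of indices at which it occurs
    pos = defaultdict(list)
    for idx, tok in enumerate(tokens):
        pos[tok].append(idx)
    return pos

pos1, pos2 = positions(w1), positions(w2)
table3_alt = [[t, pos1.get(t, []), pos2.get(t, [])] for t in all_t]
print(tabulate(table3_alt, headers=['Term', 'Doc1', 'Doc2']))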

def termFreq(term, doc):
    # count how many times the (lower-cased) term occurs in the document
    tf = doc.lower().split()
    tfInDoc = tf.count(term.lower())
    return tfInDoc

q = str(input("Enter the search query : ")).lower()
d1Freq = 0
d2Freq = 0
for _ in q.split():
    print("Term Freq of", _, "in doc1 is :", termFreq(_, d1))
    d1Freq += termFreq(_, d1)
    print("Term Freq of", _, "in doc2 is :", termFreq(_, d2))
    d2Freq += termFreq(_, d2)

# display the page with the higher total matching count for the query
if d1Freq >= d2Freq:
    print("Page1: \n" + d1)
else:
    print("Page2: \n" + d2)
Q4. Write a program to show the implementation of sentence paraphrasing through
synonyms (retaining semantic meaning) for the following four sentences. Display at
least three other paraphrased sentences for each sentence mentioned below.
a. The quick brown fox jumps over the lazy dog

b. Obama and Putin met the previous week

c. At least 12 people were killed in the battle last week

d. I will go home and come back tomorrow.

from nltk.corpus import wordnet
from random import randint
import nltk.data

sen1 = "The quick brown fox jumps over the lazy dog"
sen2 = "Obama and Putin met the previous week"
sen3 = "At least 12 people were killed in the battle last week"
sen4 = "At least 12 people were killed in the battle last week"
output = [""]*3

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenized = tokenizer.tokenize(sen2)
words = word_tokenize(sen2)
tagged = nltk.pos_tag(words)
for _ in range(3):
    for i in range(0, len(words)):
        replacements = []
        for syn in wordnet.synsets(words[i]):
            # never replace proper nouns or determiners
            if tagged[i][1] == 'NNP' or tagged[i][1] == 'DT':
                break
            # keep only synonyms whose WordNet POS matches the tagged POS
            word_type = tagged[i][1][0].lower()
            if syn.name().find("." + word_type + ".") != -1:
                r = syn.name()[0:syn.name().find(".")]
                replacements.append(r)
        if len(replacements) > 0:
            # pick one synonym at random so each pass yields a different paraphrase
            replacement = replacements[randint(0, len(replacements) - 1)]
            output[_] = output[_] + " " + replacement
        else:
            output[_] = output[_] + " " + words[i]
    print(output[_])
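Since the question asks for paraphrases of all four sentences, the same replacement loop can be wrapped into a function and applied to each of them; a minimal sketch, assuming the imports above (paraphrase is an illustrative helper name):

def paraphrase(sentence, n=3):
    # produce n paraphrases by swapping non-proper-noun words for WordNet synonyms
    words = word_tokenize(sentence)
    tagged = nltk.pos_tag(words)
    results = []
    for _ in range(n):
        out = []
        for word, tag in tagged:
            choices = []
            if tag not in ('NNP', 'DT'):
                for syn in wordnet.synsets(word):
                    name = syn.name().split('.')[0]
                    # keep synonyms whose WordNet POS matches the tag and that differ from the word
                    if syn.name().find('.' + tag[0].lower() + '.') != -1 and name != word:
                        choices.append(name)
            out.append(choices[randint(0, len(choices) - 1)] if choices else word)
        results.append(' '.join(out))
    return results

for s in (sen1, sen2, sen3, sen4):
    print(s)
    for p in paraphrase(s):
        print(" ->", p)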
