
CSE3024 – Web Mining

WINTER SEMESTER 2019-20

Slot – L13 + 14

E-RECORD
Assessment No.: 01

Submitted By

Deep Agrawal
Reg. No.: 18BCE0518
B.Tech. (Branch) – II Year
SCOPE

VELLORE INSTITUTE OF TECHNOLOGY
VELLORE – 632 014
TAMIL NADU
INDIA
Q1. Write a program to extract the source content (excluding any tags) from the website (https://en.wikipedia.org/wiki/Web_mining). Display the number of terms and the term frequency of each term present in them after applying stop word removal. Also, apply stemming and lemmatization to the same document and display the number of terms along with their corresponding stemmed as well as lemmatized words present in them. Count the total number of stemmed and lemmatized words.

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as soup

# fetch the Wikipedia page and parse its HTML
url = 'https://en.wikipedia.org/wiki/Web_mining'
urlOpen = urllib.request.urlopen(url)
urlHTML = urlOpen.read()
urlSoup = soup(urlHTML, 'html.parser')

# keep only the text of the <p> tags, i.e. the content without any markup
pageText = ''
for _ in urlSoup.findAll('p'):
    pageText = pageText + _.text
print(pageText)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# build the stopword set and treat the two punctuation marks as stopwords too
# (the 'stopwords' and 'punkt' corpora must already be downloaded via nltk.download)
sw = set(stopwords.words('english'))
sw.add('.')
sw.add(',')

# tokenize the lower-cased page text and drop the stopwords
pageText = pageText.lower()
words = word_tokenize(pageText)
terms = [w for w in words if w not in sw]
print('Number of terms in pageText is : ' + str(len(terms)))
Number of terms in pageText is : 1014
terms1 = list(set(terms))
for _ in range(len(terms1)):
    print("Frequency of", terms1[_], "is :", terms.count(terms1[_]))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
# WordNetLemmatizer is assumed for the lemmatization step (requires nltk.download('wordnet'))
lm = WordNetLemmatizer()
stemTerms = []
lemTerms = []
for w in terms1:
    print(w, ":", ps.stem(w), ":", lm.lemmatize(w))
    stemTerms.append(ps.stem(w))
    lemTerms.append(lm.lemmatize(w))

print("Number of words after stemming :", len(set(stemTerms)))
print("Number of words after lemmatization :", len(set(lemTerms)))


Q2. Add one new word to the NLTK stopword list and filter the content extracted from the website given in Q. No. 1 in order to display the number of terms present in them after excluding the newly added stopwords, along with their term frequency count. Display the POS tag for all the stopwords which are removed from the content.

# add a few more symbols to the stopword list and re-filter the tokens
sw.add('[')
sw.add(']')
sw.add('(')
sw.add(')')
terms = [w for w in words if w not in sw]
print('Number of terms in pageText is : ' + str(len(terms)))

terms1 = list(set(terms))

for _ in range(len(terms1)):
    print("Frequency of", terms1[_], "is :", terms.count(terms1[_]))

# POS-tag every word that was removed as a stopword, sentence by sentence
page_sens = pageText.split(".")

for _ in range(len(page_sens)):
    page_words = word_tokenize(page_sens[_])
    page_sw = [w for w in page_words if w not in terms]
    for w in page_sw:
        print(nltk.pos_tag(nltk.word_tokenize(w)))
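The same POS output can be produced with one nltk.pos_tag call per sentence rather than one call per removed word; a minimal sketch, assuming page_sens and sw from above and filtering directly against the stopword set:

# tag all removed stopwords of each sentence in a single pos_tag call
for sen in page_sens:
    removed = [w for w in word_tokenize(sen) if w in sw]
    if removed:
        print(nltk.pos_tag(removed))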
Q3. Write a program to extract the contents (excluding any tags) from two websites (https://en.wikipedia.org/wiki/Web_mining & https://en.wikipedia.org/wiki/Data_mining) and save the content in two separate .doc files. Remove stopwords from the content and represent the documents using Boolean, Bag-of-words and Complete representation. Process a search query and compare the contents of both pages with the processed query, then display the similarity result based on the highest matching count (bag-of-words).

url1 = 'https://en.wikipedia.org/wiki/Web_mining'
urlOpen1 = urllib.request.urlopen(url1)
urlHTML1 = urlOpen1.read()
url2 = 'https://en.wikipedia.org/wiki/Data_mining'
urlOpen2 = urllib.request.urlopen(url2)
urlHTML2 = urlOpen2.read()
urlSoup1 = soup(urlHTML1, 'html.parser')
urlSoup2 = soup(urlHTML2, 'html.parser')

pageText1 = ''
for _ in urlSoup1.findAll('p'):
    pageText1 = pageText1 + _.text
pageText2 = ''
for _ in urlSoup2.findAll('p'):
    pageText2 = pageText2 + _.text

# 'w+' creates the .doc files if they do not exist, then allows reading them back
with open('doc1.doc', 'w+') as doc1, open('doc2.doc', 'w+') as doc2:
    doc1.write(pageText1)
    doc2.write(pageText2)
    doc1.seek(0)
    doc2.seek(0)
    d1 = doc1.read()
    d2 = doc2.read()

d1 = d1.lower()
d2 = d2.lower()
w1 = word_tokenize(d1)
w2 = word_tokenize(d2)
t1 = [w for w in w1 if w not in sw]
t2 = [w for w in w2 if w not in sw]

# vocabulary of all distinct terms across both documents
all_t = t1 + t2
all_t = list(set(all_t))

# Boolean representation: 1 if the term occurs in the document, 0 otherwise
ed1 = [0]*len(all_t)
ed2 = [0]*len(all_t)
for _ in range(len(all_t)):
    if all_t[_] in t1:
        ed1[_] = 1
    if all_t[_] in t2:
        ed2[_] = 1

table1 = [None]*len(all_t)
for _ in range(len(all_t)):
    table1[_] = [all_t[_], ed1[_], ed2[_]]

from tabulate import tabulate
print("Boolean representation :\n")
print(tabulate(table1, headers=['Term', 'Doc1', 'Doc2']))

# Bag-of-words representation: raw count of each term in each document
cd1 = [0]*len(all_t)
cd2 = [0]*len(all_t)
for i in range(len(all_t)):
    for j in range(len(t1)):
        if all_t[i] == t1[j]:
            cd1[i] = cd1[i] + 1
for i in range(len(all_t)):
    for j in range(len(t2)):
        if all_t[i] == t2[j]:
            cd2[i] = cd2[i] + 1

table2 = [None]*len(all_t)
for _ in range(len(all_t)):
    table2[_] = [all_t[_], cd1[_], cd2[_]]

print("Bag-of-words representation :\n")
print(tabulate(table2, headers=['Term', 'Doc1', 'Doc2']))

# Complete representation: list of positions at which each term occurs in each document
pd1 = [[] for x in range(len(all_t))]
pd2 = [[] for x in range(len(all_t))]

for i in range(len(all_t)):
    for j in range(len(w1)):
        if w1[j] == all_t[i]:
            pd1[i].append(j)

for i in range(len(all_t)):
    for j in range(len(w2)):
        if w2[j] == all_t[i]:
            pd2[i].append(j)

table3 = [None]*len(all_t)
for _ in range(len(all_t)):
    table3[_] = [all_t[_], pd1[_], pd2[_]]

print("Complete representation :\n")
print(tabulate(table3, headers=['Term', 'Doc1', 'Doc2']))
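The positional lists can also be built in a single scan per document with a defaultdict; a minimal sketch, assuming w1, w2, all_t and tabulate from above (positions and table3_alt are illustrative names):

from collections import defaultdict

def positions(tokens):
    # map each token to the list of indices at which it occurs
    pos = defaultdict(list)
    for idx, tok in enumerate(tokens):
        pos[tok].append(idx)
    return pos

pos1, pos2 = positions(w1), positions(w2)
table3_alt = [[t, pos1.get(t, []), pos2.get(t, [])] for t in all_t]
print(tabulate(table3_alt, headers=['Term', 'Doc1', 'Doc2']))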

def termFreq(term, doc):
    # count how many times the (lower-cased) term occurs in the document
    tf = doc.lower().split()
    tfInDoc = tf.count(term.lower())
    return tfInDoc

q = str(input("Enter the search query : ")).lower()
d1Freq = 0
d2Freq = 0
for _ in q.split():
    print("Term Freq of", _, "in doc1 is :", termFreq(_, d1))
    d1Freq += termFreq(_, d1)
    print("Term Freq of", _, "in doc2 is :", termFreq(_, d2))
    d2Freq += termFreq(_, d2)

# display the page with the higher total matching count for the query
if d1Freq >= d2Freq:
    print("Page1: \n" + d1)
else:
    print("Page2: \n" + d2)
Q4. Write a program to show the implementation of sentence paraphrasing through
synonyms (retaining semantic meaning) for the following four sentences. Display at
least three other paraphrased sentences for each sentence mentioned below.
a. The quick brown fox jumps over the lazy dog

b. Obama and Putin met the previous week

c. At least 12 people were killed in the battle last week

d. I will go home and come back tomorrow.

from nltk.corpus import wordnet
from random import randint
import nltk.data

sen1 = "The quick brown fox jumps over the lazy dog"
sen2 = "Obama and Putin met the previous week"
sen3 = "At least 12 people were killed in the battle last week"
sen4 = "At least 12 people were killed in the battle last week"
output = [""]*3

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenized = tokenizer.tokenize(sen2)
words = word_tokenize(sen2)
tagged = nltk.pos_tag(words)
for _ in range(3):
    for i in range(0, len(words)):
        replacements = []
        for syn in wordnet.synsets(words[i]):
            # never replace proper nouns or determiners
            if tagged[i][1] == 'NNP' or tagged[i][1] == 'DT':
                break
            # keep only synonyms whose WordNet POS matches the tagged POS
            word_type = tagged[i][1][0].lower()
            if syn.name().find("." + word_type + ".") != -1:
                r = syn.name()[0:syn.name().find(".")]
                replacements.append(r)
        if len(replacements) > 0:
            # pick one synonym at random so each pass yields a different paraphrase
            replacement = replacements[randint(0, len(replacements) - 1)]
            output[_] = output[_] + " " + replacement
        else:
            output[_] = output[_] + " " + words[i]
    print(output[_])
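Since the question asks for paraphrases of all four sentences, the same replacement loop can be wrapped into a function and applied to each of them; a minimal sketch, assuming the imports above (paraphrase is an illustrative helper name):

def paraphrase(sentence, n=3):
    # produce n paraphrases by swapping non-proper-noun words for WordNet synonyms
    words = word_tokenize(sentence)
    tagged = nltk.pos_tag(words)
    results = []
    for _ in range(n):
        out = []
        for word, tag in tagged:
            choices = []
            if tag not in ('NNP', 'DT'):
                for syn in wordnet.synsets(word):
                    name = syn.name().split('.')[0]
                    # keep synonyms whose WordNet POS matches the tag and that differ from the word
                    if syn.name().find('.' + tag[0].lower() + '.') != -1 and name != word:
                        choices.append(name)
            out.append(choices[randint(0, len(choices) - 1)] if choices else word)
        results.append(' '.join(out))
    return results

for s in (sen1, sen2, sen3, sen4):
    print(s)
    for p in paraphrase(s):
        print(" ->", p)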
