You are on page 1 of 13

CSE 3024: Web Mining

Lab Assessment - 3

Web Structure Mining, Supervised Learning

K MARY NIKITHA 18BCE0457

Web Mining Lab Ass 3 Page 1 of 13


1. Write a Python program to calculate the degree prestige, proximity prestige and rank prestige
using the graph dataset available at the following link: http://snap.stanford.edu/data/wiki-Vote.txt.gz

Code

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 21 10:16:57 2020

@author: nikitha
"""

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

# --- Edge-list preprocessing -------------------------------------------------
# data.txt is expected to hold one "<from-node><TAB><to-node>" pair per line.
# The intermediate files data1.doc / data2.doc / data3.doc are still produced,
# but every handle is managed by a `with` block so that the written data is
# flushed and closed BEFORE the file is re-opened for reading.

# Step 1: copy data.txt to data1.doc with tabs turned into single spaces.
with open("data.txt", "r") as raw_edges, open("data1.doc", "w") as spaced_edges:
    for line in raw_edges:
        spaced_edges.write(line.replace('\t', ' '))

# Step 2: split each edge into its source (data2.doc) and target (data3.doc).
with open("data1.doc", "r") as spaced_edges, \
        open("data2.doc", "w") as from_file, \
        open("data3.doc", "w") as to_file:
    for line in spaced_edges:
        fields = line.split()
        # Skip blank/short lines and "#"-prefixed header comments (SNAP
        # downloads usually start with such lines); they are not edges.
        if len(fields) < 2 or fields[0].startswith('#'):
            continue
        from_file.write(fields[0] + "\n")
        to_file.write(fields[1] + "\n")

# Step 3: read the two columns back; w1[k] -> w2[k] is the k-th edge.
with open("data2.doc", "r") as fromnodes:
    w1 = fromnodes.read().split()
with open("data3.doc", "r") as tonodes:
    w2 = tonodes.read().split()

import networkx as nx
import matplotlib.pyplot as plt

# NOTE: DiGraph is always directed; the `Directed=True` keyword is merely
# stored as a graph attribute and is kept here for backward compatibility.
d = nx.DiGraph(Directed=True)

Web Mining Lab Ass 3 Page 2 of 13


# Add every (from, to) pair as a directed edge.  zip() walks the two node
# lists in lockstep, so no edge count has to be hard-coded and a shorter or
# longer input file can neither raise IndexError nor be silently truncated.
d.add_edges_from(zip(w1, w2))

n_nodes = d.number_of_nodes()

# Degree prestige of a node = in-degree / (n - 1), i.e. the fraction of the
# other nodes that point directly at it.  A graph with fewer than two nodes
# has no "other" nodes, so the score is defined as 0.0 instead of dividing
# by zero.
max_in_degree = n_nodes - 1
degree_prestige = {
    node: (in_degree / max_in_degree if max_in_degree > 0 else 0.0)
    for node, in_degree in d.in_degree()
}

print("DEGREE PRESTIGE :\n")

for node, score in degree_prestige.items():
    print(node, " : ", score)

# Proximity prestige of node i:
#
#     PP(i) = (|I_i| / (n - 1)) / (sum_{j in I_i} dist(j, i) / |I_i|)
#
# where I_i (the influence domain) is the set of nodes that can reach i.
# The numerator is the fraction of the graph that can reach i, the
# denominator is the average shortest-path distance from those nodes to i.
# Nodes nobody can reach get a score of 0.
#
# One breadth-first search per node on the *reversed* graph yields the
# distance from every node that can reach `dest`, replacing the per-pair
# has_path()/shortest_path_length() calls.
reversed_view = d.reverse(copy=False)
total_nodes = d.number_of_nodes()

proximity_prestige = []
for dest in d.nodes:
    dist_to_dest = nx.single_source_shortest_path_length(reversed_view, dest)
    influence_size = len(dist_to_dest) - 1  # the BFS result includes dest itself
    if influence_size == 0 or total_nodes <= 1:
        score = 0
    else:
        average_distance = sum(dist_to_dest.values()) / influence_size
        score = (influence_size / (total_nodes - 1)) / average_distance
    proximity_prestige.append([dest, score])

# Alias kept so any later code that still reads `distance` keeps working.
distance = proximity_prestige

print("\nPROXIMITY PRESTIGE :\n")
for node, score in proximity_prestige:
    print(str(node) + " : " + str(score))

# Every node is assigned an assumed prominence drawn uniformly from {1, 2, 3}.
prominance = np.random.randint(1, 4, size=n_nodes)

print("\nASSUME PROMINANCE :\n")
print(prominance)

# Rank prestige of a node = sum of the prominence of the nodes that link to
# it.  Positions follow the iteration order of d.nodes, which is also the
# order used for `prominance`.
#
# Only existing in-edges are visited (via d.predecessors), so no dense
# n x n adjacency matrix and no probe of every node pair is needed; the
# resulting vector is the same.
node_position = {node: position for position, node in enumerate(d.nodes)}

rank_prestige = np.zeros([n_nodes], dtype=int)
for node, position in node_position.items():
    rank_prestige[position] = sum(
        prominance[node_position[voter]] for voter in d.predecessors(node)
    )
Web Mining Lab Ass 3 Page 3 of 13
# Report the rank prestige vector; entry k belongs to the k-th node in the
# iteration order of d.nodes.
print("\nRANK PRESTIGE :\n")
print(rank_prestige)

Output:

Fig 1: Degree Prestige

Fig 2 : Proximity prestige

Web Mining Lab Ass 3 Page 4 of 13


Fig 3: Rank prestige

Web Mining Lab Ass 3 Page 5 of 13


Web Mining Lab Ass 3 Page 6 of 13
2. Write a Python program to show the implementation of the HITS algorithm for the following graph and
display the authority as well as hub score for all the nodes (stopping criterion: ε = 0.04 for both hub
and authority).

Code:

import networkx as nx
import matplotlib.pyplot as plt

# Build the directed graph given in the exercise.
# NOTE: DiGraph is always directed; `Directed=True` is only stored as a
# graph attribute and is kept for backward compatibility.
G = nx.DiGraph(Directed=True)
G.add_edges_from([
    ('Wiki', 'Bing'), ('Wiki', 'Google'),
    ('Rediff', 'Bing'),
    ('Bing', 'Google'),
    ('Altavi', 'Bing'), ('Altavi', 'Google'),
    ('Yahoo', 'Bing'), ('Yahoo', 'Altavi'),
    ('Google', 'Wiki'), ('Google', 'Bing'), ('Google', 'Rediff'),
    ('Google', 'Altavi'), ('Google', 'Yahoo'),
])

# HITS with the stopping tolerance required by the exercise (epsilon = 0.04).
# nx.hits returns (hubs, authorities), each a dict keyed by node.
hubs, authorities = nx.hits(G, max_iter=50, normalized=True, tol=0.04)

plt.figure(figsize=(10, 10))
nx.draw_networkx(G, with_labels=True, node_size=5000)

print("\n\nHub Scores:- ")
for key, val in hubs.items():
    print(key, "\t=>", val)

print("\n\nAuthority Scores:- ")
for key, val in authorities.items():
    print(key, "\t=>", val)

# Display the figure; without this a non-interactive run never shows it.
plt.show()

Web Mining Lab Ass 3 Page 7 of 13


Output:

Fig 1: Graph plot

Fig 2: Hub and authority Scores

Web Mining Lab Ass 3 Page 8 of 13


3. Write a Python program to show the implementation of the Decision Tree and Naïve Bayes
techniques using the below-mentioned dataset.
Handle missing values, if any.
Use the 5-fold cross-validation technique.
Prepare the confusion matrix and find the precision, recall, F-measure and prediction
accuracy.
Prepare the ROC curve and compute the AUC based on the results obtained. Compare the results obtained using these
two techniques in order to assess their performance on the considered dataset.

Code:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 24 16:15:03 2020

@author: nikitha
"""

import pandas as pd
import numpy as np

# Load the dataset and give the 15 attributes + class label short names.
df = pd.read_excel(r'/Users/nikitha/Desktop/xm/cca.xls')

df.columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
              'A11', 'A12', 'A13', 'A14', 'A15', 'CLASS']

# Missing values are marked with '?'.  Rows lacking a categorical attribute
# are discarded.  The rows are located with a mask rather than by hard-coded
# positions, so the script keeps working if the file changes.
# NOTE(review): the original looked at A1, A5 and A6 only; A4 and A7 are
# included because they are label-encoded later and a leftover '?' would
# survive as a string -- confirm this matches the intended row set.
categorical_with_gaps = ['A1', 'A4', 'A5', 'A6', 'A7']
has_gap = df[categorical_with_gaps].eq('?').any(axis=1)
df = df.loc[~has_gap].reset_index(drop=True)

# Numeric attributes: turn '?' (or any other non-numeric cell) into NaN and
# impute the column mean computed from the data itself, instead of pasting
# hard-coded string constants into the column.
for column in ('A3', 'A8', 'A11', 'A15'):
    numeric = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric.fillna(numeric.mean())

# A2 and A14 are numeric attributes in which '?' marks a missing value.
# Coerce to numbers ('?' becomes NaN) and fill with the column mean.  Using
# NaN instead of a 0 sentinel keeps genuine zeros intact, and the mean is
# computed from the data rather than hard-coded.
for column in ('A2', 'A14'):
    numeric = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric.fillna(numeric.mean())

# Label-encode the categorical attributes and the class label.  The position
# of a label in its list is its integer code (same codes as before; A4 now
# gets integers like every other column instead of the strings '0'/'1'/'2').
category_labels = {
    'A1': ['a', 'b'],
    'A4': ['u', 'y', 'l'],
    'A5': ['g', 'p', 'gg'],
    'A6': ['w', 'q', 'm', 'r', 'cc', 'k', 'c', 'd', 'x', 'i', 'e', 'aa', 'ff',
           'j'],
    'A7': ['v', 'h', 'bb', 'ff', 'j', 'z', 'o', 'dd', 'n'],
    'A9': ['t', 'f'],
    'A10': ['t', 'f'],
    'A12': ['t', 'f'],
    'A13': ['g', 's', 'p'],
    'CLASS': ['+', '-'],
}
for column, labels in category_labels.items():
    codes = {label: code for code, label in enumerate(labels)}
    # replace() leaves unlisted values untouched, as the original did.
    df[column] = df[column].replace(codes)

from sklearn.model_selection import train_test_split

# Hold out a test set (default 75/25 split, fixed seed for repeatability).
train, test = train_test_split(df, random_state=42)

# Use every attribute except the label.  Selecting by name avoids the
# off-by-one of `columns[0:14]`, which silently dropped feature A15.
feature_columns = [column for column in df.columns if column != 'CLASS']
X_train = train[feature_columns]
y_train = train['CLASS']
X_test = test[feature_columns]
y_test = test['CLASS']

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, then apply it to both sets.
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
Web Mining Lab Ass 3 Page 10 of 13
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of Naive Bayes on the training data.
accuracy = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print(accuracy)
print(accuracy.mean())

from sklearn import tree

# 5-fold cross-validated accuracy of the decision tree on the training data.
cls = tree.DecisionTreeClassifier()
accuracy1 = cross_val_score(estimator=cls, X=X_train, y=y_train, cv=5)
print(accuracy1)
print(accuracy1.mean())

import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# --- Naive Bayes on the held-out test set ---
model = clf.fit(X_train, y_train)
predicted = model.predict(X_test)

print("PREDICTION ACCURACY:\n", np.mean(predicted == y_test))
print("\nConfusion Matrix: \n", confusion_matrix(y_test, predicted))
print("\nMETRICS\n", classification_report(y_test, predicted))

# --- Decision tree on the held-out test set ---
model = cls.fit(X_train, y_train)
predicted1 = model.predict(X_test)

print("PREDICTION ACCURACY:\n", np.mean(predicted1 == y_test))
print("\nConfusion Matrix: \n", confusion_matrix(y_test, predicted1))
print("\nMETRICS\n", classification_report(y_test, predicted1))

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# roc_curve expects a continuous score for the positive class (label 1).
# Feeding it the hard 0/1 predictions collapses the curve to a single
# operating point, so the predicted probability of class 1 is used instead.
nb_scores = clf.predict_proba(X_test)[:, 1]
dt_scores = cls.predict_proba(X_test)[:, 1]

NB_fpr, NB_tpr, threshold = roc_curve(y_test, nb_scores)
auc_NB = auc(NB_fpr, NB_tpr)

DT_fpr, DT_tpr, threshold = roc_curve(y_test, dt_scores)
auc_DT = auc(DT_fpr, DT_tpr)

plt.figure(figsize=(5, 5), dpi=100)
plt.plot(NB_fpr, NB_tpr, linestyle='-', label='Naive_Bayes (auc = %0.3f)' % auc_NB)
plt.plot(DT_fpr, DT_tpr, marker='.', label='Decision_tree (auc = %0.3f)' % auc_DT)

plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')
Web Mining Lab Ass 3 Page 11 of 13
plt.legend()
plt.show()

# Test-set accuracy of each model, computed once and reused below.
nb_accuracy = np.mean(predicted == y_test)
dt_accuracy = np.mean(predicted1 == y_test)

# Summary of both models (each figure is printed exactly once).
print("PREDICTION ACCURACY OF NAIVE BAYES:\n", nb_accuracy)
print("AREA UNDER THE CURVE FOR NB:\n ", auc_NB)
print("PREDICTION ACCURACY OF DECISION TREE:\n", dt_accuracy)
print("AREA UNDER THE CURVE FOR DECISION TREE:\n ", auc_DT)

# Verdict by test-set accuracy; a tie is reported as a tie rather than as a
# win for Naive Bayes.
if nb_accuracy > dt_accuracy:
    print("NAIVE BAYES IS BETTER THAN DECISION TREE.")
elif nb_accuracy < dt_accuracy:
    print("DECISION TREE IS BETTER THAN NAIVE BAYES.")
else:
    print("NAIVE BAYES AND DECISION TREE HAVE THE SAME ACCURACY.")

Output:

Web Mining Lab Ass 3 Page 12 of 13


Web Mining Lab Ass 3 Page 13 of 13

You might also like