CSE 3024: Web Mining Lab Assessment - 3 Decision Tree vs Naive Bayes Performance

CSE 3024: Web Mining
Lab Assessment - 3
Web Structure Mining, Supervised Learning
K MARY NIKITHA 18BCE0457
Web Mining Lab Ass 3 Page 1 of 13

1. Write a Python program to calculate the degree prestige, proximity prestige and rank prestige
of the graph dataset available at the following link: http://snap.stanford.edu/data/wiki-Vote.txt.gz
Code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 21 10:16:57 2020
@author: nikitha
"""
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
# ---------------------------------------------------------------------------
# Degree, proximity and rank prestige for a directed edge list in data.txt.
#
# Pipeline:
#   data.txt  --tabs->spaces-->  data1.doc  --split columns-->  data2.doc (from)
#                                                               data3.doc (to)
# The two column files are read back and zipped into a networkx DiGraph.
# ---------------------------------------------------------------------------

# Step 1: normalise the separator (tab -> single space).
with open("data.txt", "r") as file1, open("data1.doc", "w") as file2:
    for line in file1:
        file2.write(line.replace('\t', ' '))

# Step 2: write the source column to data2.doc and the target column to
# data3.doc.  The original listing used file4/file5 without ever opening them
# (NameError); they are opened here on the paths that are read back below, and
# the `with` block guarantees they are flushed and closed before re-reading.
with open("data1.doc", "r") as file3, \
        open("data2.doc", "w") as file4, \
        open("data3.doc", "w") as file5:
    for line in file3:
        # Skip blank lines and '#' header lines so they do not become nodes
        # (assumes '#' marks comment lines in the edge list -- TODO confirm
        # against the actual data.txt).
        if not line.strip() or line.startswith("#"):
            continue
        new_line1 = line.split(" ")
        file4.write(new_line1[0])
        file4.write("\n")
        file5.write(new_line1[1])
        file5.write("\n")

# Step 3: read both columns back as parallel lists of node labels.
with open("data2.doc", "r") as fromnodes:
    fromn = fromnodes.read()
w1 = fromn.split()
with open("data3.doc", "r") as tonodes:
    ton = tonodes.read()
w2 = ton.split()

import networkx as nx
import matplotlib.pyplot as plt

# DiGraph is already directed; the keyword is only stored as a graph attribute.
d = nx.DiGraph(Directed=True)

# Add every (from, to) pair that was read instead of a hard-coded count of
# 102747, so the script works for edge lists of any length.
d.add_edges_from(zip(w1, w2))

# ---- Degree prestige: in-degree divided by (number of nodes - 1) ----------
n_nodes = d.number_of_nodes()
degree_prestige = {v: d.in_degree(v) / (n_nodes - 1) for v in d.nodes()}

print("DEGREE PRESTIGE :\n")
for node, score in degree_prestige.items():
    print(node, " : ", score)

# ---- Proximity prestige ----------------------------------------------------
# For each node, the value stored is the mean shortest-path distance from all
# OTHER nodes that can reach it (0 when no other node reaches it).
# NOTE(review): the textbook proximity prestige additionally divides the
# fraction of reaching nodes by this mean distance -- confirm which definition
# the assignment expects.
distance = []
for dest in d.nodes:
    # One reverse search per target replaces a has_path + shortest_path_length
    # call for every (src, dest) pair.  The returned mapping contains every
    # node that can reach `dest`, including `dest` itself at distance 0.
    lengths_to_dest = nx.shortest_path_length(d, target=dest)
    temp_dis = sum(lengths_to_dest.values())
    n = len(lengths_to_dest)
    if temp_dis == 0:
        distance.append([dest, 0])
    else:
        distance.append([dest, temp_dis / (n - 1)])

print("\nPROXIMITY PRESTIGE :\n")
for node, value in distance:
    print(str(node) + " : " + str(value))

# ---- Rank prestige ---------------------------------------------------------
# Each node is given a random prominence in {1, 2, 3}; a node's rank prestige
# is the sum of the prominences of the nodes that link to it.
prominance = np.random.randint(1, 4, size=n_nodes)

print("\nASSUME PROMINANCE :\n")
print(prominance)

# path_matrix[i][j] == 1  <=>  there is an edge from node j to node i.
# Filling it from the edge list avoids probing has_edge for every node pair.
node_index = {node: idx for idx, node in enumerate(d.nodes)}
path_matrix = np.zeros([n_nodes, n_nodes], dtype=int)
for src, dest in d.edges():
    path_matrix[node_index[dest]][node_index[src]] = 1

rank_prestige = path_matrix.dot(prominance)

print("\nRANK PRESTIGE :\n")
print(rank_prestige)
Output:
Fig 1: Degree Prestige
Fig 2 : Proximity prestige

Fig 3: Rank prestige

2. Write a Python program to show the implementation of the HITS algorithm for the following graph and
display the authority as well as the hub score for all the nodes (stopping criterion: ε = 0.04 for both hub
and authority).
Code:
# HITS hub/authority scores for a small hand-built web graph.
# The scraped listing contained this program twice, with the two copies fused
# on one line and the second copy's loop bodies missing; this is the single
# complete version.
import networkx as nx
import matplotlib.pyplot as plt  # needed for plt.figure / plt.show below

# DiGraph is already directed; the keyword is only stored as a graph attribute.
G = nx.DiGraph(Directed=True)
G.add_edges_from([
    ('Wiki', 'Bing'), ('Wiki', 'Google'),
    ('Rediff', 'Bing'),
    ('Bing', 'Google'),
    ('Altavi', 'Bing'), ('Altavi', 'Google'),
    ('Yahoo', 'Bing'), ('Yahoo', 'Altavi'),
    ('Google', 'Wiki'), ('Google', 'Bing'), ('Google', 'Rediff'),
    ('Google', 'Altavi'), ('Google', 'Yahoo'),
])

# tol=0.04 is the stopping tolerance requested by the assignment.
hubs, authorities = nx.hits(G, max_iter=50, normalized=True, tol=0.04)

plt.figure(figsize=(10, 10))
nx.draw_networkx(G, with_labels=True, node_size=5000)

print("\n\nHub Scores:- ")
for key, val in hubs.items():
    print(key, "\t=>", val)

print("\n\nAuthority Scores:- ")
for key, val in authorities.items():
    print(key, "\t=>", val)

# Display the graph plot last so the scores are printed before the window
# blocks when run as a plain script.
plt.show()

Output:
Fig 1: Graph plot
Fig 2: Hub and authority Scores

3. Write a Python program to show the implementation of the Decision Tree and Naïve Bayes
techniques using the dataset mentioned below.
Handle missing values, if any.
Use the 5-fold cross-validation technique.
Prepare the confusion matrix and find the precision, recall, F-measure and prediction
accuracy.
Prepare the ROC curve and compute the AUC based on the results obtained. Compare the results obtained using these
two techniques in order to assess their performance on the considered dataset.
Code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 24 16:15:03 2020
@author: nikitha
"""
# ---------------------------------------------------------------------------
# Gaussian Naive Bayes vs Decision Tree on the spreadsheet cca.xls:
# clean/encode the data, 5-fold cross-validate both models on the training
# split, then report accuracy, confusion matrix, precision/recall/F1 and
# ROC/AUC on the held-out test split.
# ---------------------------------------------------------------------------
import matplotlib.pyplot as plt  # was used below but never imported
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler


def _impute_numeric(column, fill_value):
    """Return *column* as numbers with '?' placeholders replaced by *fill_value*.

    '?' is mapped to NaN before conversion so that only genuinely missing
    entries are imputed; legitimate zeros are left untouched.
    """
    return pd.to_numeric(column.replace('?', np.nan)).fillna(fill_value)


df = pd.read_excel(r'/Users/nikitha/Desktop/xm/cca.xls')
df.columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
              'A11', 'A12', 'A13', 'A14', 'A15', 'CLASS']

# ---- Missing values ('?') --------------------------------------------------
# Bare expressions such as the next line are interactive inspections; they
# have no effect when the file is run as a script.
df[df['A1'] == '?']
# Hard-coded row positions: presumably the rows found by the inspection above.
# NOTE(review): these positions are tied to this exact spreadsheet.
df.drop(df.index[[248, 327, 346, 374, 453, 479, 489, 520, 598, 601, 641, 673]],
        inplace=True)
df = df.reset_index(drop=True)

# numeric_only=True: the frame still holds string columns at this point, and
# current pandas raises instead of silently skipping them.
df.mean(axis=0, skipna=True, numeric_only=True)

df['A3'] = df['A3'].replace(['?'], '4.79')
df['A8'] = df['A8'].replace(['?'], '2.42')
df['A11'] = df['A11'].replace(['?'], '2.43')
df['A15'] = df['A15'].replace(['?'], '1031')

df[df['A5'] == '?']
df.drop(df.index[[206, 269, 328, 451, 584, 612]], inplace=True)
df.drop(df.index[[523, 527]], inplace=True)
df[df['A6'] == '?']

# The original replaced '?' with '0', converted to numeric and then tried to
# replace the *string* '0' with '1063' -- which never matches on a numeric
# column, so missing A14 values silently stayed 0.  Impute them directly.
df['A14'] = _impute_numeric(df['A14'], 1063)

# ---- Encode categorical columns as integer codes ---------------------------
df['A1'] = df['A1'].replace(['a', 'b'], [0, 1])
df['A4'] = df['A4'].replace(['u', 'y', 'l'], ['0', '1', '2'])
df.A5.unique()
df['A5'] = df['A5'].replace(['g', 'p', 'gg'], [0, 1, 2])
df.A6.unique()
df['A6'] = df['A6'].replace(
    ['w', 'q', 'm', 'r', 'cc', 'k', 'c', 'd', 'x', 'i', 'e', 'aa', 'ff', 'j'],
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
df.A7.unique()
df['A7'] = df['A7'].replace(['v', 'h', 'bb', 'ff', 'j', 'z', 'o', 'dd', 'n'],
                            [0, 1, 2, 3, 4, 5, 6, 7, 8])
df['A9'] = df['A9'].replace(['t', 'f'], [0, 1])
df.A13.unique()
df['A13'] = df['A13'].replace(['g', 's', 'p'], [0, 1, 2])
df['CLASS'] = df['CLASS'].replace(['+', '-'], [0, 1])

# Missing A2 values are imputed with 30.
df['A2'] = _impute_numeric(df['A2'], 30)

# ---- Train / test split ----------------------------------------------------
train, test = train_test_split(df, random_state=42)

# Use every column except the label.  The original sliced columns[0:14],
# which silently dropped A15 (the 15th feature).
feature_columns = [name for name in df.columns if name != 'CLASS']
X_train = train[feature_columns]
y_train = train['CLASS']
X_test = test[feature_columns]
y_test = test['CLASS']

# Fit the scaler on the training split only, then apply it to both splits.
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

# ---- 5-fold cross-validation on the training split -------------------------
clf = GaussianNB()
accuracy = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print(accuracy)
print(accuracy.mean())

cls = tree.DecisionTreeClassifier()
accuracy1 = cross_val_score(estimator=cls, X=X_train, y=y_train, cv=5)
print(accuracy1)
print(accuracy1.mean())

# ---- Naive Bayes: fit and evaluate on the test split -----------------------
model = clf.fit(X_train, y_train)
predicted = model.predict(X_test)
nb_accuracy = np.mean(predicted == y_test)
print("PREDICTION ACCURACY:\n", nb_accuracy)
print("\nConfusion Matrix: \n", confusion_matrix(y_test, predicted))
print("\nMETRICS\n", classification_report(y_test, predicted))

# ---- Decision Tree: fit and evaluate on the test split ---------------------
model = cls.fit(X_train, y_train)
predicted1 = model.predict(X_test)
dt_accuracy = np.mean(predicted1 == y_test)
print("PREDICTION ACCURACY:\n", dt_accuracy)
print("\nConfusion Matrix: \n", confusion_matrix(y_test, predicted1))
print("\nMETRICS\n", classification_report(y_test, predicted1))

# ---- ROC / AUC -------------------------------------------------------------
# ROC curves are built from the predicted probability of class 1 rather than
# from hard 0/1 predictions, which would collapse each curve to one point.
nb_scores = clf.predict_proba(X_test)[:, 1]
dt_scores = cls.predict_proba(X_test)[:, 1]

NB_fpr, NB_tpr, threshold = roc_curve(y_test, nb_scores)
auc_NB = auc(NB_fpr, NB_tpr)
DT_fpr, DT_tpr, threshold = roc_curve(y_test, dt_scores)
auc_DT = auc(DT_fpr, DT_tpr)

plt.figure(figsize=(5, 5), dpi=100)
plt.plot(NB_fpr, NB_tpr, linestyle='-', label='Naive_Bayes (auc = %0.3f)' % auc_NB)
plt.plot(DT_fpr, DT_tpr, marker='.', label='Decision_tree (auc = %0.3f)' % auc_DT)
plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')
plt.legend()
plt.show()
print("AREA UNDER THE CURVE FOR DECISION TREE:\n ", auc_DT)

# ---- Summary ---------------------------------------------------------------
print("PREDICTION ACCURACY OF NAIVE BAYES:\n", nb_accuracy)
print("AREA UNDER THE CURVE FOR NB:\n ", auc_NB)
print("PREDICTION ACCURACY OF DECISION TREE:\n", dt_accuracy)
print("AREA UNDER THE CURVE FOR DECISION TREE:\n ", auc_DT)
if nb_accuracy >= dt_accuracy:
    print("NAIVE BAYES IS BETTER THAN DECISION TREE.")
else:
    # The original closed this string with a typographic quote (”), which is
    # a syntax error.
    print("DECISION TREE IS BETTER THAN NAIVE BAYES.")
Output:


CSE 3024: Web Mining Lab Assessment - 3 Decision Tree vs Naive Bayes Performance

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

CSE 3024: Web Mining Lab Assessment - 3 Decision Tree vs Naive Bayes Performance

Uploaded by

Copyright:

Available Formats

CSE 3024: Web Mining

Web Structure Mining, Supervised Learning

K MARY NIKITHA 18BCE0457

Web Mining Lab Ass 3 Page 1 of 13

Web Mining Lab Ass 3 Page 2 of 13

degree_prestige = dict((v,len(d.in_edges(v))/(n_nodes-1)) for v in d.nodes())

prominance = np.random.randint(1, 4, size=n_nodes)

path_matrix = np.zeros([n_nodes, n_nodes], dtype = int)

Fig 1: Degree Prestige

Fig 2 : Proximity prestige

Web Mining Lab Ass 3 Page 4 of 13

Web Mining Lab Ass 3 Page 5 of 13

print("\n\nHub Scores:- ")

print("\n\nAuthority Scores:- ")

print("\n\nHub Scores:- ")

print("\n\nAuthority Scores:- ")

Web Mining Lab Ass 3 Page 7 of 13

Fig 1: Graph plot

Fig 2: Hub and authority Scores

Web Mining Lab Ass 3 Page 8 of 13

df.mean(axis = 0, skipna = True)

df['A5']=df['A5'].replace(['g', 'p', 'gg'],[0,1,2])

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import GaussianNB

from sklearn import tree

print("PREDICTION ACCURACY:\n",np.mean(predicted == y_test))

from sklearn.metrics import confusion_matrix

model = cls.fit(X_train, y_train)

print("PREDICTION ACCURACY:\n",np.mean(predicted1== y_test))

from sklearn.metrics import confusion_matrix

from sklearn.metrics import roc_curve, auc

DT_fpr, DT_tpr, threshold = roc_curve(y_test, predicted1)

plt.figure(figsize=(5, 5), dpi=100)

plt.plot(DT_fpr, DT_tpr, marker='.', label='Decision_tree (auc = %0.3f)' % auc_DT)

plt.xlabel('False Positive Rate -->')

print("AREA UNDER THE CURVE FOR DECISION TREE:\n ",auc_DT)

if np.mean(predicted == y_test)>=np.mean(predicted1== y_test):

Web Mining Lab Ass 3 Page 12 of 13

You might also like