
# Name    : Mudu Suman
# Roll No : 222CD017
# ML LAB 7

1. Implement the Naive Bayes Classifier algorithm without using inbuilt functions.
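The classifier below applies Bayes' rule under the naive conditional-independence assumption: for a query x = (x1, ..., xn), the posterior for class c is P(c|x) = P(c) * Π_i P(xi|c) / Π_i P(xi), and the class with the largest posterior is returned. The three helper methods _calc_class_prior, _calc_likelihoods, and _calc_predictor_prior estimate these three factors from frequency counts in the training data.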


import numpy as np
import pandas as pd

dataset = {'Taste':       ['Salty','Spicy','Spicy','Spicy','Spicy','Sweet','Salty','Sweet','Spicy','Salty'],
           'Temperature': ['Hot','Hot','Hot','Cold','Hot','Cold','Cold','Hot','Cold','Hot'],
           'Texture':     ['Soft','Soft','Hard','Hard','Hard','Soft','Soft','Soft','Soft','Hard'],
           'Eat':         ['No','No','Yes','No','Yes','Yes','No','Yes','Yes','Yes']}

def accuracy_score(y_true, y_pred):

    """ Percentage of predictions that match the true labels """

    return round(float(sum(y_pred == y_true)) / float(len(y_true)) * 100, 2)

def pre_processing(df):

    """ Partitioning the data into features and target """

    X = df.drop([df.columns[-1]], axis=1)
    y = df[df.columns[-1]]

    return X, y

class NaiveBayes:

    def __init__(self):
        self.features = []
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}

        self.X_train = None
        self.y_train = None
        self.train_size = 0
        self.num_feats = 0
    def fit(self, X, y):

        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        # Initialise all probability tables with zero counts.
        for feature in self.features:

            self.likelihoods[feature] = {}
            self.pred_priors[feature] = {}

            for feat_val in np.unique(self.X_train[feature]):

                self.pred_priors[feature].update({feat_val: 0})

                for outcome in np.unique(self.y_train):

                    self.likelihoods[feature].update({feat_val + '_' + outcome: 0})
                    self.class_priors.update({outcome: 0})

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()

    def _calc_class_prior(self):

        """ P(c) - Prior Class Probability """

        for outcome in np.unique(self.y_train):

            outcome_count = sum(self.y_train == outcome)
            self.class_priors[outcome] = outcome_count / self.train_size

    def _calc_likelihoods(self):

        """ P(x|c) - Likelihood """

        for feature in self.features:

            for outcome in np.unique(self.y_train):

                outcome_count = sum(self.y_train == outcome)
                # Count feature values among the training rows of this class.
                feat_likelihood = self.X_train[feature][
                    self.y_train[self.y_train == outcome].index.values.tolist()
                ].value_counts().to_dict()

                for feat_val, count in feat_likelihood.items():

                    self.likelihoods[feature][feat_val + '_' + outcome] = count / outcome_count
    def _calc_predictor_prior(self):

        """ P(x) - Evidence """

        for feature in self.features:

            feat_vals = self.X_train[feature].value_counts().to_dict()

            for feat_val, count in feat_vals.items():

                self.pred_priors[feature][feat_val] = count / self.train_size

    def predict(self, X):

        """ Calculates Posterior probability P(c|x) """

        results = []
        X = np.array(X)

        for query in X:
            probs_outcome = {}
            for outcome in np.unique(self.y_train):
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1

                for feat, feat_val in zip(self.features, query):
                    likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
                    evidence *= self.pred_priors[feat][feat_val]

                posterior = (likelihood * prior) / evidence
                probs_outcome[outcome] = posterior

            result = max(probs_outcome, key=lambda x: probs_outcome[x])
            results.append(result)

        return np.array(results)

if __name__ == "__main__":
    # Weather dataset
    print("\ndataset:")

    df = pd.DataFrame(dataset)
    #print(df)

    # Split features and target
    X, y = pre_processing(df)

    nb_clf = NaiveBayes()
    nb_clf.fit(X, y)

    print("Train Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))

    # Query 1:
    query = np.array([['Salty', 'Hot', 'Soft']])
    print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))

    # Query 2:
    query = np.array([['Spicy', 'Hot', 'Soft']])
    print("Query 2:- {} ---> {}".format(query, nb_clf.predict(query)))

    # Query 3:
    query = np.array([['Salty', 'Hot', 'Hard']])
    print("Query 3:- {} ---> {}".format(query, nb_clf.predict(query)))

dataset:
Train Accuracy: 70.0
Query 1:- [['Salty' 'Hot' 'Soft']] ---> ['No']
Query 2:- [['Spicy' 'Hot' 'Soft']] ---> ['Yes']
Query 3:- [['Salty' 'Hot' 'Hard']] ---> ['Yes']
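As an optional sanity check (not part of the from-scratch requirement, and assuming scikit-learn is installed), the same data can be run through sklearn's CategoricalNB. Note that CategoricalNB applies Laplace smoothing by default, so its estimates need not reproduce the 70.0% train accuracy exactly:

# Hypothetical cross-check with scikit-learn (assumes it is installed).
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()
X_enc = enc.fit_transform(X).astype(int)   # encode string categories as integers
sk_clf = CategoricalNB()                   # uses Laplace smoothing (alpha=1) by default
sk_clf.fit(X_enc, y)
print("sklearn train accuracy:", sk_clf.score(X_enc, y))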

2. Implement a Decision Tree on the IRIS dataset using scikit-learn library functions. Implement methods to avoid over-fitting of the data. Below, the tree is first grown fully, then minimal cost-complexity pruning (the ccp_alpha parameter of DecisionTreeClassifier) is used to simplify it and control over-fitting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_train_predicted = clf.predict(X_train)
y_test_predicted = clf.predict(X_test)
accuracy_score(y_train, y_train_predicted)
accuracy_score(y_test, y_test_predicted)   # test accuracy, displayed below

0.9736842105263158
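Cost-complexity pruning, applied below, is one way to avoid over-fitting; pre-pruning by limiting tree growth up front is another. A minimal sketch, with illustrative (untuned) hyper-parameter values:

# Pre-pruning sketch: max_depth/min_samples_leaf values here are illustrative, not tuned.
pre_pruned = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf=5)
pre_pruned.fit(X_train, y_train)
accuracy_score(y_test, pre_pruned.predict(X_test))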

plt.figure(figsize=(16,8))
tree.plot_tree(clf)
plt.show()

path = clf.cost_complexity_pruning_path(X_train, y_train)
# path contains two arrays: ccp_alphas and impurities
ccp_alphas, impurities = path.ccp_alphas, path.impurities
print("ccp_alpha will give a list of values :", ccp_alphas)
print("***********************************************************")
print("Impurities in Decision Tree :", impurities)

ccp_alpha will give a list of values : [0. 0.00869963 0.01339286 0.03571429 0.26539835 0.33279549]
***********************************************************
Impurities in Decision Tree : [0. 0.01739927 0.03079212 0.06650641 0.33190476 0.66470026]
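Each value in ccp_alphas is an effective alpha at which some subtree gets pruned away, and the impurity of the pruned tree rises correspondingly. Fitting one tree per alpha, as below, traces how accuracy changes as the tree shrinks.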

clfs = []   # will store all the models here

for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("Number of nodes in the last tree is {} and its ccp_alpha is {}".format(
      clfs[-1].tree_.node_count, ccp_alphas[-1]))

Number of nodes in the last tree is 1 and its ccp_alpha is 0.332795493197279

train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
plt.show()
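From the accuracy-vs-alpha curves, a value around 0.02 prunes the tree while keeping test accuracy high, so the final model is refit with ccp_alpha=0.02: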

clf = DecisionTreeClassifier(random_state=0, ccp_alpha=0.02)
clf.fit(X_train, y_train)
plt.figure(figsize=(12, 8))
tree.plot_tree(clf, rounded=True, filled=True)
plt.show()
accuracy_score(y_test, clf.predict(X_test))

0.9736842105263158
