
Experiment No. 1
FIND-S Algorithm

Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a
given set of training data samples. Read the training data from a .CSV file.
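
The contents of pgm1.csv are not reproduced in this manual. A minimal stand-in, assuming Mitchell's EnjoySport data (the file name, column names and rows below are illustrative, chosen so that the Output printed at the end of this experiment is reproduced), can be generated with:

import pandas as pd

# Hypothetical pgm1.csv: EnjoySport-style training examples with a Yes/No target
sample = pd.DataFrame(
    [['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'Yes'],
     ['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Yes'],
     ['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', 'No'],
     ['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change', 'Yes']],
    columns=['sky', 'airtemp', 'humidity', 'wind', 'water', 'forecast', 'enjoysport'])
sample.to_csv('pgm1.csv', index=False)
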
import numpy as np
import pandas as pd

data = pd.read_csv('pgm1.csv')

def train(concepts, target):
    # Initialise the specific hypothesis with the first positive example
    for i, val in enumerate(target):
        if val == "Yes":
            specific_h = concepts[i].copy()
            break
    # Generalise the hypothesis just enough to cover every positive example
    for i, h in enumerate(concepts):
        if target[i] == "Yes":
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = "?"
    return specific_h

concepts = np.array(data.iloc[:,0:-1])
target = np.array(data.iloc[:,-1])
print(train(concepts,target))

Output:
['Sunny' 'Warm' '?' 'Strong' '?' '?']
Experiment No. 2
Candidate-Elimination Algorithm

For a given set of training data examples stored in a .CSV file, implement and demonstrate the
Candidate-Elimination algorithm to output a description of the set of all hypotheses consistent with
the training examples.
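
The training file (pgm2.csv in the listing below) is not shown either. One purely illustrative file that is consistent with the Final S and Final G printed at the end of this experiment is:

import pandas as pd

# Hypothetical pgm2.csv consistent with the output shown for this experiment
sample = pd.DataFrame(
    [['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Yes'],
     ['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Yes'],
     ['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', 'No'],
     ['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change', 'Yes']],
    columns=['sky', 'airtemp', 'humidity', 'wind', 'water', 'forecast', 'enjoysport'])
sample.to_csv('pgm2.csv', index=False)
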

import numpy as np
import pandas as pd
data = pd.DataFrame(data=pd.read_csv('pgm2.csv'))
concepts = np.array(data.iloc[:,0:-1])
target = np.array(data.iloc[:,-1])
def learn(concepts, target):
    # Start with the first training example as the specific boundary S
    specific_h = concepts[0].copy()
    # The general boundary G starts as the most general hypothesis
    general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
    for i, h in enumerate(concepts):
        if target[i] == "Yes":
            # Positive example: generalise S and relax G where they disagree
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        if target[i] == "No":
            # Negative example: specialise G against the current S
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
    # Drop rows of G that remained fully general
    indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
    for i in indices:
        general_h.remove(['?', '?', '?', '?', '?', '?'])
    return specific_h, general_h
s_final, g_final = learn(concepts, target)
print("Final S:", s_final, sep="\n")
print("Final G:", g_final, sep="\n")
data

Output:
Final S:
['Sunny' 'Warm' 'High' 'Strong' '?' '?']
Final G:
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?']]
Experiment No. 3
ID3 Algorithm
Write a program to demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate
data set for building the decision tree and apply this knowledge to classify a new sample.

import pandas as pd
import numpy as np
dataset = pd.read_csv('pgm3.csv', names=['outlook', 'temperature', 'humidity', 'wind', 'class'])

def entropy(target_col):
    # Shannon entropy of the target column
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts))
                      for i in range(len(elements))])
    return entropy

def InfoGain(data, split_attribute_name, target_name="class"):
    # Information gain = total entropy minus the weighted entropy of the
    # subsets produced by splitting on the attribute
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    Weighted_Entropy = np.sum(
        [(counts[i]/np.sum(counts)) *
         entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name])
         for i in range(len(vals))])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

def ID3(data, originaldata, features, target_attribute_name="class", parent_node_class=None):
    # Stopping cases: pure node, empty data, or no features left to split on
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[
            np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
    elif len(features) == 0:
        return parent_node_class
    else:
        # Majority class of the current node, used as a default for children
        parent_node_class = np.unique(data[target_attribute_name])[
            np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        # Choose the attribute with the highest information gain
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        # Grow a subtree for every value of the chosen attribute
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = ID3(sub_data, dataset, features, target_attribute_name, parent_node_class)
            tree[best_feature][value] = subtree
        return tree

tree = ID3(dataset,dataset,dataset.columns[:-1])
print(' \nDisplay Tree\n',tree)

Output:
Display Tree
{'outlook': {0: {'humidity': {0.0: 0.0, 1.0: 1.0}}, 1: 1.0, 2: {'wind': {0.0: 1.0,
1.0: 0.0}}}}
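
The experiment statement also asks to classify a new sample, but the listing above only builds the tree. A small helper (not part of the original program; a sketch written against the dictionary format shown in the output, with the sample's attribute values assumed to use the same label encoding) could walk the tree:

def classify(sample, tree):
    # Recursively follow the branch matching the sample's value for the tested attribute
    if not isinstance(tree, dict):
        return tree                      # reached a leaf: return the class label
    attribute = next(iter(tree))         # attribute tested at this node
    branch = tree[attribute].get(sample[attribute])
    if branch is None:
        return None                      # attribute value not seen during training
    return classify(sample, branch)

# Example query using the encoding visible in the output above
print(classify({'outlook': 0, 'temperature': 1, 'humidity': 1, 'wind': 0}, tree))
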
Experiment No. 4
Backpropagation Algorithm
Build an Artificial Neural Network by implementing the Backpropagation algorithm and test the same
using appropriate data sets.

import numpy as np

# Two input features and one target value per sample
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X/np.amax(X, axis=0)   # normalise inputs column-wise

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def derivatives_sigmoid(x):
    # derivative of the sigmoid expressed in terms of its output
    return x * (1 - x)

epoch = 7000
lr = 0.1
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1

# Random initial weights and biases
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward propagation
    hinp = np.dot(X, wh) + bh
    hlayer_act = sigmoid(hinp)
    outinp = np.dot(hlayer_act, wout) + bout
    output = sigmoid(outinp)

    # Backpropagation
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad
    wout += hlayer_act.T.dot(d_output) * lr
    wh += X.T.dot(d_hiddenlayer) * lr   # update the hidden-layer weights as well

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)

Output:
Input:
[[0.66666667 1.        ]
 [0.33333333 0.55555556]
 [1.         0.66666667]]
Actual Output:
[[92.]
 [86.]
 [89.]]
Predicted Output:
[[0.99999813]
 [0.9999937 ]
 [0.99999853]]
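
Because the targets y are left in their original 0-100 range while the network output passes through a sigmoid (bounded by 1), the predictions saturate near 1, as seen above. A common variant, not part of the original listing, scales the targets before training and rescales the predictions afterwards:

import numpy as np

y = np.array(([92], [86], [89]), dtype=float)
y = y / 100    # targets now lie in (0, 1), matching the sigmoid's output range
print(y)       # after training with these targets, multiply predictions by 100
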
Experiment No. 5
Naive Bayesian Classifier
Write a program to implement the Naive Bayesian classifier for a sample training data set stored as a .CSV
file. Compute the accuracy of the classifier, considering a few test data sets.

import csv
import random
import math

def safe_div(x, y):
    # Avoid ZeroDivisionError when a count or variance is zero
    if y == 0:
        return 0
    return x / y

def loadCsv(filename):
    # Read the CSV file and convert every value to float
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def splitDataset(dataset, splitRatio):
    # Randomly split the data into a training set and a test set
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def separateByClass(dataset):
    # Group the training vectors by their class label (last column)
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if vector[-1] not in separated:
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return safe_div(sum(numbers), float(len(numbers)))

def stdev(numbers):
    avg = mean(numbers)
    variance = safe_div(sum([pow(x - avg, 2) for x in numbers]), float(len(numbers) - 1))
    return math.sqrt(variance)

def summarize(dataset):
    # Mean and standard deviation of every attribute; the class column is dropped
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    # Gaussian probability density of x for the given mean and standard deviation
    exponent = math.exp(-safe_div(math.pow(x - mean, 2), (2 * math.pow(stdev, 2))))
    final = safe_div(1, (math.sqrt(2 * math.pi) * stdev)) * exponent
    return final

def calculateClassProbabilities(summaries, inputVector):
    # Multiply the per-attribute densities for every class
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    # Pick the class with the highest probability
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    accuracy = safe_div(correct, float(len(testSet))) * 100.0
    return accuracy

def main():
    filename = 'pgm5.csv'
    splitRatio = 0.9
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    summaries = summarizeByClass(trainingSet)
    predictions = getPredictions(summaries, testSet)
    actual = []
    for i in range(len(testSet)):
        vector = testSet[i]
        actual.append(vector[-1])
    print('Actual values: {0}'.format(actual))
    print('Predictions: {0}'.format(predictions))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

main()

Output:
Actual values: [5.0, 5.0]
Predictions: [5.0, 5.0]
Accuracy: 100.0%
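
As a quick cross-check of the hand-written classifier, the same split can be fed to scikit-learn's GaussianNB, which uses the same Gaussian likelihood model. This sketch assumes the loadCsv and splitDataset helpers defined above and the same pgm5.csv file:

import numpy as np
from sklearn.naive_bayes import GaussianNB

dataset = loadCsv('pgm5.csv')
trainingSet, testSet = splitDataset(dataset, 0.9)
train_arr, test_arr = np.array(trainingSet), np.array(testSet)

gnb = GaussianNB()
gnb.fit(train_arr[:, :-1], train_arr[:, -1])       # last column is the class label
print("sklearn accuracy:", gnb.score(test_arr[:, :-1], test_arr[:, -1]) * 100, "%")
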
Experiment No. 6
Naive Bayesian Classifier for Document Classification
Assuming a set of documents that need to be classified, use the Naive Bayesian Classifier model to perform
this task. Built-in Java classes/API can be used to write the program. Calculate the accuracy, precision, and
recall for your data set.

from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Restrict the 20 Newsgroups data to four categories
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)
print("\n".join(train.data[0].split("\n")))

# Bag-of-words counts followed by TF-IDF weighting
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(train.data)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
mod = MultinomialNB()
mod.fit(X_train_tfidf, train.target)

# Transform the test documents with the already-fitted vectorizer and transformer
X_test_tf = count_vect.transform(test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = mod.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(test.target, predicted))
print(classification_report(test.target, predicted, target_names=test.target_names))
print("confusion matrix is \n", metrics.confusion_matrix(test.target, predicted))

Output:
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format. We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance. Michael.


--
Michael Collier (Programmer) The Computer Unit,
Email: M.P.Collier@uk.ac.city The City University,
Tel: 071 477-8000 x3769 London,
Fax: 071 477-8565 EC1V 0HB.

Accuracy: 0.8348868175765646
                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

             micro avg       0.83      0.83      0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502

confusion matrix is
[[192 2 6 119]
[ 2 347 4 36]
[ 2 11 322 61]
[ 2 2 1 393]]
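
The vectoriser, TF-IDF transformer and classifier can also be chained with a scikit-learn Pipeline, which keeps the test-set transformation consistent automatically. A compact equivalent of the program above (same data, same model; shown only as an alternative sketch):

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)

# One object that vectorises, reweights and classifies in sequence
text_clf = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
text_clf.fit(train.data, train.target)
print("Accuracy:", accuracy_score(test.target, text_clf.predict(test.data)))
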
Experiment No. 7
Bayesian Network
Write a program to construct a Bayesian network considering medical data. Use this model to demonstrate
the diagnosis of heart patients using standard Heart Disease Data Set. You can use Java/Python ML library
classes/API.

import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

data = pd.read_csv("pgm7.csv")
heart_disease = pd.DataFrame(data)

# Structure of the network as (parent, child) edges
model = BayesianModel([
    ('age', 'Lifestyle'),
    ('Gender', 'Lifestyle'),
    ('Family', 'heartdisease'),
    ('Lifestyle', 'diet'),
    ('diet', 'cholestrol'),
    ('cholestrol', 'heartdisease')
])

# Learn the conditional probability tables from the data
model.fit(heart_disease, estimator=MaximumLikelihoodEstimator)

HeartDisease_infer = VariableElimination(model)

# Query P(heartdisease | evidence) with evidence entered by the user
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={
    'age': int(input('Enter age {SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3, Teen:4} :')),
    'Gender': int(input('enter Gender {Male:0, Female:1} :')),
    'Family': int(input('enter Family history {yes:1, No:0} :')),
    'diet': int(input('enter diet {High:0, Medium:1} :')),
    'Lifestyle': int(input('enter Lifestyle {Athlete:0, Active:1, Moderate:2, Sedentary:3} :')),
    'cholestrol': int(input('enter cholestrol {High:0, BorderLine:1, Normal:2} :'))
})

print(q['heartdisease'])

Output:
Enter age {SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3, Teen:4} :2
enter Gender {Male:0, Female:1} :0
enter Family history {yes:1, No:0} :0
enter diet {High:0, Medium:1} :1
enter Lifestyle {Athlete:0, Active:1, Moderate:2, Sedentary:3} :3
enter cholestrol {High:0, BorderLine:1, Normal:2} :1
+----------------+---------------------+
| heartdisease | phi(heartdisease) |
+================+=====================+
| heartdisease_0 | 0.0000 |
+----------------+---------------------+
| heartdisease_1 | 1.0000 |
+----------------+---------------------+
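
Note that the query and printing steps are version-dependent: in recent pgmpy releases, BayesianModel has been renamed BayesianNetwork and query() returns a DiscreteFactor directly, so the final lines would look roughly like the following sketch (an assumption about the installed version, not part of the original program):

# For newer pgmpy versions; the rest of the program is assumed unchanged
q = HeartDisease_infer.query(variables=['heartdisease'],
                             evidence={'age': 2, 'Gender': 0, 'Family': 0,
                                       'diet': 1, 'Lifestyle': 3, 'cholestrol': 1})
print(q)   # the returned factor prints its own probability table
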
Experiment No. 8
EM Algorithm and k-Means Algorithm
Apply the EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set for clustering
using the k-Means algorithm. Compare the results of these two algorithms and comment on the quality of
clustering. You can add Java/Python ML library classes/API in the program.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

X = pd.read_csv("pgm8.csv")
x1 = X['Distance_Feature'].values
x2 = X['Speeding_Feature'].values
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)

# Plot the raw data
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Dataset')
plt.scatter(x1, x2)
plt.show()

# EM clustering with a Gaussian mixture model
gmm = GaussianMixture(n_components=3)
gmm.fit(X)
em_predictions = gmm.predict(X)
print("\nEM predictions")
print(em_predictions)
print("mean:\n", gmm.means_)
print('\n')
print("Covariances\n", gmm.covariances_)
print(X)
plt.title('Expectation Maximization')
plt.scatter(X[:, 0], X[:, 1], c=em_predictions, s=50)
plt.show()

# k-Means clustering on the same data
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
print(kmeans.cluster_centers_)
print(kmeans.labels_)
plt.title('KMEANS')
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], color='black')
plt.show()

Output:
EM predictions
[1 1 1 1 1 2 1 2 2 1 0 0 1 1 0 0 0 1 1 1 1 0 1]
mean:
[[51.49401384 38.73668604]
[55.82978127 23.99404542]
[42.4275654 18.53365945]]

Covariances
[[[ 20.64249138 3.27817009]
[ 3.27817009 21.07999619]]
[[ 47.03399108 10.16152267]
[ 10.16152267 14.81719721]]

[[ 51.65448747 -61.81920385]
[-61.81920385 101.39620353]]]
[[71.24 28. ]
[52.53 25. ]
[64.54 27. ]
[55.69 22. ]
[54.58 25. ]
[41.91 10. ]
[58.64 20. ]
[52.02 8. ]
[31.25 34. ]
[44.31 19. ]
[49.35 40. ]
[58.07 45. ]
[44.22 22. ]
[55.73 19. ]
[46.63 43. ]
[52.97 32. ]
[46.25 35. ]
[51.55 27. ]
[57.05 26. ]
[58.45 30. ]
[43.42 23. ]
[55.68 37. ]
[55.15 18. ]]

[[47.87166667 39. ]
[57.34333333 24.91666667]
[45.176 16.4 ]]
[1 1 1 1 1 2 1 2 0 2 0 0 2 1 0 1 0 1 1 1 2 0 1]
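
The statement asks for a comparison of the two clusterings. One simple quantitative comparison (an addition, not part of the original program) is the silhouette score, where values closer to 1 indicate tighter, better-separated clusters; this assumes X, em_predictions and kmeans from the listing above:

from sklearn.metrics import silhouette_score

# Higher silhouette score = better-separated clusters
print("EM (GMM) silhouette :", silhouette_score(X, em_predictions))
print("k-Means silhouette  :", silhouette_score(X, kmeans.labels_))
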
Experiment No. 9
k-Nearest Neighbour Algorithm
Write a program to implement k-Nearest Neighbour algorithm to classify the iris data set. Print both correct
and wrong predictions. Java/Python ML library classes can be used for this problem.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
print("Feature Names:", iris.feature_names, "Iris Data:", iris.data,
      "Target Names:", iris.target_names, "Target:", iris.target)

# Hold out 25% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=.25)
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
print(" Accuracy=", clf.score(X_test, y_test))
print("Predicted Data")
prediction = clf.predict(X_test)
print(prediction)
print("Test data :")
print(y_test)
# Non-zero entries in diff mark the misclassified samples
diff = prediction - y_test
print("Result is ")
print(diff)
print('Total no of samples misclassified =', sum(abs(diff)))

Output:
Feature Names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal
width (cm)'] Iris Data: [[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5.1 3.8 1.5 0.3]
[5.4 3.4 1.7 0.2]
[5.1 3.7 1.5 0.4]
[4.6 3.6 1. 0.2]
[5.1 3.3 1.7 0.5]
[4.8 3.4 1.9 0.2]
[5. 3. 1.6 0.2]
[5. 3.4 1.6 0.4]
[5.2 3.5 1.5 0.2]
[5.2 3.4 1.4 0.2]
[6.5 3. 5.2 2. ]
[6.2 3.4 5.4 2.3]
[5.9 3. 5.1 1.8]] Target Names: ['setosa' 'versicolor' 'virginica'] Target: [0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2]
Accuracy= 0.9473684210526315
Predicted Data
[2 0 1 1 1 2 0 0 1 2 0 1 0 1 1 0 0 0 1 1 0 2 2 2 2 0 0 2 0 2 2 1 2 2 0 2 0
0]
Test data :
[2 0 1 1 1 2 0 0 1 2 0 1 0 2 2 0 0 0 1 1 0 2 2 2 2 0 0 2 0 2 2 1 2 2 0 2 0
0]
Result is
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Total no of samples misclassified = 2
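
The statement asks to print both correct and wrong predictions explicitly; a small loop over the test set (a sketch using the variables defined in the listing above) makes them easier to read than the diff vector:

# Label each test sample as Correct or Wrong
for sample, pred, actual in zip(X_test, prediction, y_test):
    status = "Correct" if pred == actual else "Wrong"
    print(status, "- sample:", sample,
          "predicted:", iris.target_names[pred],
          "actual:", iris.target_names[actual])
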
Experiment No. 10
Locally Weighted Regression Algorithm
Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select
appropriate data set for your experiment and draw graphs.

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def kernel(point, xmat, k):
    # Diagonal weight matrix: points close to the query point get weight near 1
    m, n = np.shape(xmat)
    weights = np.mat(np.eye(m))
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np.exp(diff * diff.T / (-2.0 * k**2))
    return weights

def localWeight(point, xmat, ymat, k):
    # Weighted least-squares solution for the given query point
    wei = kernel(point, xmat, k)
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    m, n = np.shape(xmat)
    ypred = np.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred

data = pd.read_csv('lr.csv')
bill = np.array(data.colA)
tip = np.array(data.colB)
mbill = np.mat(bill)
mtip = np.mat(tip)

# Add a column of ones so the model has an intercept term
m = np.shape(mbill)[1]
one = np.mat(np.ones(m))
X = np.hstack((one.T, mbill.T))

# Bandwidth k = 0.2 controls how local the fit is
ypred = localWeightRegression(X, mtip, 0.2)
SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(bill, tip, color='green')
ax.plot(xsort[:, 1], ypred[SortIndex], color='red', linewidth=5)
plt.xlabel('colA')
plt.ylabel('colB')
plt.show()

Output:
[Plot: scatter of the colA/colB data points in green with the fitted locally weighted regression curve drawn in red]
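
The smoothness of the fitted curve is controlled entirely by the kernel bandwidth k. A quick way to see this (a sketch that reuses localWeightRegression, X, mtip, bill and tip from the program above; the k values chosen are arbitrary) is to overlay fits for several bandwidths:

import matplotlib.pyplot as plt

SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]
plt.scatter(bill, tip, color='green')
for k in [0.1, 0.5, 2.0]:          # small k = wiggly local fit, large k = smoother fit
    ypred_k = localWeightRegression(X, mtip, k)
    plt.plot(xsort[:, 1], ypred_k[SortIndex], label='k = %.1f' % k)
plt.xlabel('colA')
plt.ylabel('colB')
plt.legend()
plt.show()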