Professional Documents
Culture Documents
Department
of
Computer Science & Engineering
Machine Learning
Laboratory Record
Name: _______________
USN: ________________
B M S EVENING
COLLEGE OF ENGINEERING
LABORATORY CERTIFICATE
Date: ____________
Particulars of the Experiments Performed
CONTENTS
Expt Date Experiment Marks Page
No. Obtained No.
01 Implement and demonstrate the FIND-S 3-4
algorithm for finding the most
specific hypothesis based on a given
set of training data samples. Read the
training data from a .CSV file.
02 For a given set of training data 5-7
examples stored in a .CSV file,
implement and demonstrate the
Candidate-Elimination algorithm to
output a description of the set of all
hypotheses consistent with the training
examples.
03 Write a program to demonstrate the 8-10
working of the decision tree based ID3
algorithm. Use an appropriate data set
for building the decision tree and apply
this knowledge to classify a new
sample.
04 Build an Artificial Neural Network by 11-12
implementing the Backpropagation
algorithm and test the same using
appropriate data sets.
05 Write a program to implement the 13-15
naïve Bayesian classifier for a sample
training data set stored as a .CSV file.
Compute the accuracy of the classifier,
considering few test data sets.
06 Assuming a set of documents that need 16-17
to be classified, use the naïve Bayesian
Classifier model to perform this task.
Built-in Java classes/API can be used
to write the program. Calculate the
accuracy, precision, and recall for your
data set.
07 Write a program to construct a 18-21
Bayesian network considering medical
data. Use this model to demonstrate the
diagnosis of heart patients using
standard Heart Disease Data Set. You
can use Java/Python ML library
classes/API.
08 Apply EM algorithm to cluster a set of 22-25
data stored in a .CSV file. Use the same
data set for clustering using k-Means
algorithm. Compare the results of these
two algorithms and comment on the
quality of clustering. You can add
Java/Python ML library classes/API in
the program.
09 Write a program to implement k-Nearest 26-27
Neighbour algorithm to
classify the iris data set. Print both
correct and wrong predictions.
Java/Python ML library classes can be
used for this problem.
10 Implement the non-parametric Locally 28-29
Weighted Regression algorithm in
order to fit data points. Select
appropriate data set for your
experiment and draw graphs.
MACHINE LEARNING LABORATORY (17CSL76)
1. Implement and demonstrate the FIND-S algorithm for finding the most
specific hypothesis based on a given set of training data samples. Read the
training data from a .CSV file.
import csv
def loadCsv(filename):
    """Read a CSV file and return its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row; each entry is a list of the
        row's fields as strings (no type conversion is performed).
    """
    # The context manager guarantees the file handle is closed; newline=""
    # is the mode the csv module expects for its input files.
    with open(filename, "rt", newline="") as handle:
        return list(csv.reader(handle))
# FIND-S driver: start from the most specific hypothesis and generalise it
# just enough to cover every positive ('Yes') training example.
attributes = ['Sky', 'Temp', 'Humidity', 'Wind', 'Water', 'Forecast']
print(attributes)
n = len(attributes)

dataset = loadCsv("pgm1.csv")
print(dataset)

# '0' marks an attribute constraint that has not been set yet.
h = ['0'] * n
print("Intial hypothesis")
print(h)

print("The hypothesis are")
for i, example in enumerate(dataset):
    target = example[-1]  # the last column holds the class label
    if target == 'Yes':
        for j in range(n):
            if h[j] == '0':
                # First positive example: adopt its value as the constraint.
                h[j] = example[j]
            if h[j] != example[j]:
                # Conflicting value: relax the constraint to "any value".
                h[j] = '?'
    # Trace the hypothesis after each training example (1-based numbering).
    print(i + 1, '=', h)

print("Final hypothesis")
print(h)
SAMPLE OUTPUT
2. For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of the
set of all hypotheses consistent with the training examples.
import csv
def get_domains(examples):
    """Collect the distinct values observed in each column of *examples*.

    Args:
        examples: Sequence of rows; every row is a sequence of hashable,
            mutually comparable values. The number of columns is taken
            from the first row.

    Returns:
        A list with one entry per column, each a sorted list of the
        distinct values seen in that column. An empty *examples* yields
        an empty list.
    """
    if len(examples) == 0:
        # Nothing to inspect; avoids indexing examples[0] on empty input.
        return []
    d = [set() for _ in examples[0]]
    for x in examples:
        for i, xi in enumerate(x):
            d[i].add(xi)
    return [sorted(values) for values in d]
S.remove(s)
Splus = min_generalizations(s, x)
## keep only generalizations that have a counterpart in G
S.update([h for h in Splus if any([more_general(g,h) for g in G])])
## remove hypotheses less specific than any other in S
S.difference_update([h for h in S if any([more_general(h, h1) for h1 in S if h != h1])])
return S
return G
def candidate_elimination(examples):
    """Run Candidate-Elimination over *examples*, printing S and G each step.

    Args:
        examples: Sequence of rows whose last element is the class label.
            A label equal to 'Y' marks a positive example; any other
            label is treated as negative.

    Returns:
        None. The boundary sets are only printed, not returned.
    """
    # Attribute domains, excluding the trailing class-label column.
    domains = get_domains(examples)[:-1]
    n = len(domains)

    # G starts with the single most general hypothesis, S with the single
    # most specific one.
    G = {("?",) * n}
    S = {("0",) * n}

    print("Maximally specific hypotheses - S ")
    print("Maximally general hypotheses - G ")
    print("\nS[0]:", str(S), "\nG[0]:", str(G))

    for i, xcx in enumerate(examples, start=1):
        x, cx = xcx[:-1], xcx[-1]  # split into attribute values and label
        if cx == 'Y':  # x is a positive example
            G = {g for g in G if fulfills(x, g)}
            S = generalize_S(x, G, S)
        else:  # x is a negative example
            S = {s for s in S if not fulfills(x, s)}
            G = specialize_G(x, domains, G, S)
        print("\nS[{0}]:".format(i), S)
        print("G[{0}]:".format(i), G)
    return
# Run Candidate-Elimination on the training data. `examples` is expected to be
# defined earlier in the program; its definition is not visible in this excerpt.
candidate_elimination(examples)
SAMPLE OUTPUT
3. Write a program to demonstrate the working of the decision tree based ID3
algorithm. Use an appropriate data set for building the decision tree and apply
this knowledge to classify a new sample.
import math
import csv
def load_csv(filename):
    """Read a CSV file whose first row is a header.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(dataset, headers)`` tuple: *dataset* is the list of data rows
        (lists of strings) and *headers* is the first row of the file.

    Raises:
        IndexError: If the file contains no rows at all.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(filename, "r", newline="") as handle:
        dataset = list(csv.reader(handle))
    headers = dataset.pop(0)
    return dataset, headers
class Node:
    """A node of the decision tree."""

    def __init__(self, attribute):
        """Create a node that tests *attribute*, with no children and no answer."""
        # Name of the attribute this node splits on.
        self.attribute = attribute
        # Child branches; starts empty.
        self.children = []
        # Class label for the node; the empty string means no label is set.
        self.answer = ""
def entropy(S):
    """Return the Shannon entropy (base 2) of the class labels in *S*.

    Args:
        S: Sequence of hashable class labels. Any number of distinct
            labels is supported, not only two.

    Returns:
        0 when *S* is empty or contains a single distinct label; otherwise
        the entropy ``-sum(p * log2(p))`` over the label frequencies.
    """
    total = len(S)
    if total == 0:
        # No samples carry no information.
        return 0

    # Tally how often each label occurs.
    counts = {}
    for label in S:
        counts[label] = counts.get(label, 0) + 1

    if len(counts) == 1:
        # A pure set has zero entropy.
        return 0

    return -sum(
        (count / total) * math.log2(count / total)
        for count in counts.values()
    )
def classify(node, x_test, features):
    """Print the label the tree rooted at *node* predicts for *x_test*.

    Args:
        node: Tree node with ``answer``, ``attribute`` and ``children``
            attributes; ``children`` is iterated as ``(value, subtree)``
            pairs.
        x_test: Sequence of attribute values for the instance to classify.
        features: Sequence of attribute names, used to find the position
            of ``node.attribute`` within *x_test*.

    Returns:
        None. The predicted label is printed. Nothing is printed when no
        branch matches the instance's value.
    """
    if node.answer != "":
        # A non-empty answer ends the descent: report it.
        print(node.answer)
        return

    pos = features.index(node.attribute)
    for value, subtree in node.children:
        # Follow every branch whose value matches the instance.
        if x_test[pos] == value:
            classify(subtree, x_test, features)
# Main program: build the ID3 tree from the training file, print it, then
# classify every instance of the test file.
dataset, features = load_csv("pgm3a.csv")  # Read Tennis data
node = build_tree(dataset, features)  # Build decision tree

print("The decision tree for the dataset using ID3 algorithm is ")
print_tree(node, 0)

# The test file is loaded with its own header row, which replaces `features`.
testdata, features = load_csv("pgm3b.csv")
for xtest in testdata:
    print("The test instance : ", xtest)
    # end="" keeps the label printed by classify() on the same line.
    print("The predicted label : ", end="")
    classify(node, xtest, features)
SAMPLE OUTPUT
import numpy as np
# Three training samples with two input features each, and one target per sample.
X = np.array([[2, 9], [1, 5], [3, 6]], dtype=float)
y = np.array([[92], [86], [89]], dtype=float)

# Scale every input column by its own maximum, and the targets by 100.
X = X / X.max(axis=0)
y = y / 100
def dersig(x):
    """Return ``x * (1 - x)``.

    This is the derivative of the logistic sigmoid expressed in terms of
    the sigmoid's output, so *x* should already be a sigmoid activation.
    Works element-wise on NumPy arrays as well as on scalars.
    """
    return x * (1 - x)
# Hyper-parameters.
e = 7000   # number of training iterations
lr = 0.1   # learning rate
iln = 2    # neurons in the input layer
hln = 3    # neurons in the hidden layer
oln = 1    # neurons in the output layer

# Randomly initialised weights and biases for the hidden and output layers.
wh = np.random.uniform(size=(iln, hln))
bh = np.random.uniform(size=(1, hln))
wout = np.random.uniform(size=(hln, oln))
bout = np.random.uniform(size=(1, oln))

for i in range(e):
    # Forward pass (sigmoid is defined elsewhere in the program).
    h1 = np.dot(X, wh)
    h = h1 + bh
    hla = sigmoid(h)
    oi1 = np.dot(hla, wout)
    oi = oi1 + bout
    op = sigmoid(oi)

    # Backward pass: output error and per-layer deltas.
    EO = y - op
    og = dersig(op)
    dop = EO * og
    EH = dop.dot(wout.T)  # uses wout before it is updated below
    hg = dersig(hla)
    dhl = EH * hg

    # Parameter updates. The biases are updated together with the weights;
    # without this they would keep their random initial values forever.
    wout += hla.T.dot(dop) * lr
    bout += np.sum(dop, axis=0, keepdims=True) * lr
    wh += X.T.dot(dhl) * lr
    bh += np.sum(dhl, axis=0, keepdims=True) * lr

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", op)
SAMPLE OUTPUT
import csv
import random
import math
def loadCsv(filename):
    """Read a purely numeric CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of floats.

    Raises:
        ValueError: If any field cannot be converted to ``float``.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(filename, "r", newline="") as handle:
        # Convert every field from string to number for later processing.
        return [[float(x) for x in row] for row in csv.reader(handle)]
def splitDataset(dataset, splitRatio):
    """Randomly split *dataset* into a training part and a remainder.

    Args:
        dataset: Sequence of rows; it is not modified.
        splitRatio: Fraction of the rows to place in the training set
            (for example 0.67).

    Returns:
        ``[trainSet, rest]``: *trainSet* holds ``int(len(dataset) *
        splitRatio)`` randomly chosen rows, *rest* holds the others in
        their original relative order.

    Raises:
        ValueError: If the requested training size exceeds the number of
            rows (i.e. *splitRatio* is greater than 1).
    """
    trainSize = int(len(dataset) * splitRatio)
    if trainSize > len(dataset):
        # Fail up front with a clear message instead of an "empty range"
        # error from randrange once the pool has been exhausted.
        raise ValueError("splitRatio must not exceed 1")

    remaining = list(dataset)  # work on a copy so the caller's data is untouched
    trainSet = []
    while len(trainSet) < trainSize:
        # Move one randomly chosen row from the pool into the training set.
        index = random.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return [trainSet, remaining]
def separateByClass(dataset):
    """Group the rows of *dataset* by their class value.

    Args:
        dataset: Iterable of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the list of rows (the original
        row objects, in input order) that carry it.
    """
    separated = {}
    for vector in dataset:
        # Create the bucket on first sight of a class, then add the row.
        separated.setdefault(vector[-1], []).append(vector)
    return separated
def mean(numbers):
    """Return the arithmetic mean of *numbers* as a float.

    Raises:
        ZeroDivisionError: If *numbers* is empty.
    """
    return sum(numbers) / float(len(numbers))
def stdev(numbers):
    """Return the sample standard deviation of *numbers*.

    The variance is divided by ``len(numbers) - 1``.

    Raises:
        ZeroDivisionError: If *numbers* has fewer than two elements.
    """
    avg = mean(numbers)
    squared_deviations = [pow(x - avg, 2) for x in numbers]
    variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(variance)
def summarize(dataset):
    """Return ``(mean, stdev)`` for every attribute column of *dataset*.

    Args:
        dataset: Sequence of equally long rows whose last element is the
            class value.

    Returns:
        A list with one ``(mean, stdev)`` tuple per column, excluding the
        last (class) column. Empty input yields an empty list.
    """
    # Transpose rows into columns and drop the trailing class column up
    # front, so no statistics are computed only to be thrown away.
    attribute_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column)) for column in attribute_columns]
def summarizeByClass(dataset):
separated = separateByClass(dataset);
summaries = {}
for classValue, instances in separated.items():
SAMPLE OUTPUT
6. Assuming a set of documents that need to be classified, use the naïve Bayesian
Classifier model to perform this task. Built-in Java classes/API can be used to
write the program. Calculate the accuracy, precision, and recall for your data
set.
import pandas as pd
# Load the labelled messages; the file has no header row, so name the columns.
msg = pd.read_csv('pgm6.csv', names=['message', 'label'])
print('Total instances in the dataset:', msg.shape[0])

# Encode the textual labels as numbers: 'pos' -> 1, 'neg' -> 0.
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
Y = msg.labelnum

print('\nThe message and its label of first 5 instances are listed below')
X5, Y5 = X[0:5], msg.label[0:5]
for x, y in zip(X5, Y5):
    print(x, ',', y)
SAMPLE OUTPUT
Initial Setup
import numpy as np
import pandas as pd
import csv
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination
# Load the heart-disease data and mark missing values ('?') as NaN.
heartDisease = pd.read_csv('heart.csv')
heartDisease = heartDisease.replace('?', np.nan)

# Network structure as (parent, child) edges. The assignment and the call are
# kept in one statement; a bare "model=" on its own line is a syntax error.
model = BayesianModel([
    ('age', 'heartdisease'),
    ('sex', 'heartdisease'),
    ('exang', 'heartdisease'),
    ('cp', 'heartdisease'),
    ('heartdisease', 'restecg'),
    ('heartdisease', 'chol'),
])

print('\nLearning CPD using Maximum likelihood estimators')
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)
SAMPLE OUTPUT
8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same
data set for clustering using k-Means algorithm. Compare the results of these
two algorithms and comment on the quality of clustering. You can add
Java/Python ML library classes/API in the program.
def rename(s):
    """Relabel the entries of *s* in place by order of first appearance.

    The k-th distinct value encountered in *s* is replaced by ``l1[k]``,
    where ``l1`` is a module-level sequence defined elsewhere in the
    program.

    Args:
        s: Mutable, indexable sequence of labels. It is modified in place.

    Returns:
        The same object *s*, after relabelling.
    """
    # Distinct labels in the order they first occur.
    first_seen = []
    for label in s:
        if label not in first_seen:
            first_seen.append(label)

    # Map every entry to the replacement at its label's first-seen rank.
    for idx in range(len(s)):
        s[idx] = l1[first_seen.index(s[idx])]
    return s
# Load the iris data set and wrap features/targets in labelled DataFrames.
# NOTE(review): `datasets`, `pd`, `KMeans`, `plt`, `np` and `sm` are imported
# outside this excerpt - presumably scikit-learn, pandas, matplotlib, NumPy and
# sklearn.metrics; confirm against the full program.
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']
# Cluster the four features into three groups with k-Means.
model = KMeans(n_clusters=3)
model.fit(X)
plt.figure(figsize=(14,7))
# One colour per class / cluster index.
colormap = np.array(['red', 'lime', 'black'])
# Left panel: petal length vs. width coloured by the true class.
plt.subplot(1, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Classification')
# Right panel: the same points coloured by the k-Means cluster label.
plt.subplot(1, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
# rename() relabels its argument in place, so model.labels_ itself is
# modified here; the scatter above was drawn with the original labels.
km = rename(model.labels_)
print("\nWhat KMeans thought: \n", km)
print("Accuracy of KMeans is ",sm.accuracy_score(y, km))
print("Confusion Matrix for KMeans is \n",sm.confusion_matrix(y, km))
# NOTE(review): `gmm` and `xs` are created outside this excerpt - presumably a
# fitted Gaussian mixture model and the feature data it was fitted on; confirm.
y_cluster_gmm = gmm.predict(xs)
# This selects subplot position 1 again, the slot used for the real classes.
plt.subplot(1, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_cluster_gmm], s=40)
plt.title('GMM Classification')
plt.show()
# Relabel the EM clusters (in place) before comparing with the true targets.
em = rename(y_cluster_gmm)
print("\nWhat EM thought: \n", em)
print("Accuracy of EM is ",sm.accuracy_score(y, em))
print("Confusion Matrix for EM is \n", sm.confusion_matrix(y, em))
SAMPLE OUTPUT
# Load the iris data and list the class names with their numeric labels.
iris = datasets.load_iris()
print("Iris Data set loaded...")
for i in range(len(iris.target_names)):
    print("Label", i, "-", str(iris.target_names[i]))

# Classify with a 1-nearest-neighbour model. x_train, y_train and x_test are
# expected to be prepared elsewhere in the program.
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
SAMPLE OUTPUT
def localWeight(point, xmat, ymat, k):
    """Solve the weighted least-squares problem local to *point*.

    Args:
        point: Query row for which the local model is fitted.
        xmat: Design matrix of all training inputs.
        ymat: Row matrix of training targets (it is transposed before use).
        k: Bandwidth parameter forwarded to ``kernel``.

    Returns:
        The coefficient matrix ``(X^T W X)^-1 (X^T W y)`` where ``W`` is
        the weight matrix returned by ``kernel(point, xmat, k)``.
    """
    wei = kernel(point, xmat, k)
    # Use the xmat argument, not a module-level X, so the function works for
    # whatever design matrix the caller passes in.
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W
def localWeightRegression(xmat, ymat, k):
    """Predict every training point with its own locally weighted model.

    Args:
        xmat: Design matrix with one row per sample.
        ymat: Row matrix of training targets.
        k: Bandwidth parameter forwarded to ``localWeight``.

    Returns:
        A 1-D NumPy array with one prediction per row of *xmat*.
    """
    m = np.shape(xmat)[0]
    ypred = np.zeros(m)
    for i in range(m):
        # The product is a 1x1 matrix; .item() extracts the scalar explicitly,
        # because assigning a size-1 array to an element is deprecated in NumPy.
        ypred[i] = (xmat[i] * localWeight(xmat[i], xmat, ymat, k)).item()
    return ypred
def graphPlot(X, ypred):
    """Plot the raw data as points and the fitted curve as a line.

    Args:
        X: Design matrix; column 1 is used as the x-axis value.
            NOTE(review): the indexing below appears to rely on ``X``
            being an ``np.matrix`` - confirm before passing an ndarray.
        ypred: Predictions, one per row of *X*.

    The scatter points come from the module-level ``bill`` and ``tip``
    arrays, which must exist before this function is called.
    """
    # Row order that sorts the samples by their column-1 value, so the
    # fitted line is drawn from left to right.
    sortindex = X[:, 1].argsort(0)
    xsort = X[sortindex][:, 0]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(bill, tip, color='green')
    ax.plot(xsort[:, 1], ypred[sortindex], color='red', linewidth=5)
    plt.xlabel('Total bill')
    plt.ylabel('Tip')
    plt.show()
# Load the data points.
data = pd.read_csv('pgm10.csv')
bill = np.array(data.total_bill)  # only the bill amount and the tip are used
tip = np.array(data.tip)

# np.asmatrix turns the 1-D arrays into 1 x m matrices (np.mat, its old alias,
# no longer exists in NumPy 2.0).
mbill = np.asmatrix(bill)
mtip = np.asmatrix(tip)

m = np.shape(mbill)[1]
one = np.asmatrix(np.ones(m))
# Design matrix: a column of ones followed by the bill column (m rows, 2 cols).
X = np.hstack((one.T, mbill.T))

# Increase k to get smoother curves.
ypred = localWeightRegression(X, mtip, 3)
graphPlot(X, ypred)
SAMPLE OUTPUT