
Machine Learning Lab Record Book

Gulshan Kumar
22PGMCA15
MCA 2nd

LINEAR REGRESSION
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

df=pd.read_csv("/content/drive/MyDrive/ML_DATASET/Copy of diabetes.csv)
df.head()
df.shape
df.isnull().sum()
x = df.drop(columns = 'BMI',axis = 1)
print(x)
x.shape
y=df['BMI']

scaler=StandardScaler()
standard_data=scaler.fit_transform(x)

print(standard_data)
x=standard_data
print(x)

#splitting the data into training and testing

x_train,x_tst,y_train,y_tst = train_test_split(x,y,test_size=0.2,random_state=2)

print(x.shape,x_tst.shape,x_train.shape)
class Linear_Regression():

    # initiating the parameters (learning rate & no. of iterations)
    def __init__(self, learning_rate, no_of_iterations):
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations

    def fit(self, X, Y):
        # number of training examples & number of features
        self.m, self.n = X.shape  # number of rows & columns

        # initiating the weights and bias
        self.w = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y

        # implementing Gradient Descent
        for i in range(self.no_of_iterations):
            self.update_weights()

    def update_weights(self):
        Y_prediction = self.predict(self.X)

        # calculate gradients
        dw = - (2 * (self.X.T).dot(self.Y - Y_prediction)) / self.m
        db = - 2 * np.sum(self.Y - Y_prediction) / self.m

        # updating the weights and bias
        self.w = self.w - self.learning_rate * dw
        self.b = self.b - self.learning_rate * db

    def predict(self, X):
        return X.dot(self.w) + self.b
model = Linear_Regression(learning_rate = 0.02, no_of_iterations=1000)
model.fit(x_train, y_train)
print('weights = ', model.w)
print('bias = ', model.b)
test_data_prediction = model.predict(x_tst)
print(test_data_prediction)
plt.scatter(y_tst, test_data_prediction, color = 'red')
plt.xlabel('Actual BMI')
plt.ylabel('Predicted BMI')
plt.title('Diabetes: Actual vs Predicted BMI')
plt.show()
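A short hedged addition (not part of the original record): computing R² lets the hand-written gradient-descent model be checked against sklearn's closed-form LinearRegression, reusing the split above.

# Hedged sketch: comparing the custom model against sklearn's built-in solver.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

print('custom model R2 :', r2_score(y_tst, test_data_prediction))

sk_model = LinearRegression()
sk_model.fit(x_train, y_train)
print('sklearn model R2:', r2_score(y_tst, sk_model.predict(x_tst)))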

Label Encoder
import pandas as pd
import numpy as np
dfp=pd.read_csv('/content/drive/MyDrive/ML_DATASET/Copy of Placement_Dataset.csv')
dfp.shape
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# df.head()

cols = ['workex', 'status', 'gender']


label_encoder=LabelEncoder()
dfp[cols] =dfp[cols].apply(label_encoder.fit_transform)

dfp.head()
# print(dfp.iloc[1,:])
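One caveat with the apply() call above: the single label_encoder is refit on every column, so only the last column's mapping survives for inverse_transform. A hedged alternative sketch keeps one fitted encoder per column:

# Hedged sketch: one LabelEncoder per column so each mapping can be inverted later.
encoders = {}
for c in cols:
    encoders[c] = LabelEncoder()
    dfp[c] = encoders[c].fit_transform(dfp[c])

# e.g. encoders['gender'].inverse_transform(dfp['gender']) recovers the original labels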

Logistic Regression
import pandas as pd
import numpy as np
df=pd.read_csv('/content/drive/MyDrive/ML_DATASET/Copy of breast_cancer
_data.csv')
df.shape
df.isnull().sum()
df['diagnosis'].value_counts()
df.groupby('diagnosis').mean()
from sklearn.preprocessing import LabelEncoder
l=LabelEncoder()
label=l.fit_transform(df.diagnosis)
df["outcome"]=label
df.head()
df=df.drop(columns="diagnosis")
x=df.drop(columns='outcome')
y=df['outcome']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=10)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
model.predict(X_test)
model.score(X_test, y_test)
# Output: 0.34502923976608185
model.coef_
model.intercept_
# Output: array([-7.40629666e-17])
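A hedged addition: accuracy alone (and the low score recorded above) says nothing about which class is being misclassified; a confusion matrix and classification report break the errors down per class, reusing the names from the split above.

# Hedged sketch: class-level evaluation of the fitted logistic model.
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))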

PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

# creating dataframe
df = pd.DataFrame(cancer['data'], columns = cancer['feature_names'])

# checking head of dataframe


df.head()
# Importing standardscalar module
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()

# fitting
scalar.fit(df)
scaled_data = scalar.transform(df)

# Importing PCA
from sklearn.decomposition import PCA

# Let's say, components = 2


pca = PCA(n_components = 2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

x_pca.shape
# giving a larger plot
plt.figure(figsize =(8, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c = cancer['target'], cmap ='plasma')

# labeling x and y axes


plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
pca.components_
df_comp = pd.DataFrame(pca.components_, columns = cancer['feature_names'])

plt.figure(figsize =(14, 6))

# plotting heatmap
sns.heatmap(df_comp)
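A short hedged addition: checking how much variance the two components retain helps justify reducing 30 features to the 2-D scatter above.

# Hedged sketch: variance retained by the two principal components.
print(pca.explained_variance_ratio_)        # per-component share
print(pca.explained_variance_ratio_.sum())  # total variance kept by 2 components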

Nominal to numeric label encoding


# df here is the hometown dataset loaded in the next section
dummies=pd.get_dummies(df.town)
dummies
merged=pd.concat([df,dummies],axis='columns')
merged
final=merged.drop(['town'],axis='columns')
final
final=final.drop(['west windsor'],axis='columns')
y=final.price

OneHot Encoding
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("/content/drive/MyDrive/Copy of hometown.csv")
df
dummies = pd.get_dummies(df.town)
dummies
merged = pd.concat([df,dummies],axis='columns')
merged
final = merged.drop(['town'], axis='columns')
final
final = final.drop(['west windsor'], axis='columns')
final
X = final.drop('price', axis='columns')
X
y = final.price
df
X.head()
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
standard_data=scaler.fit_transform(X)
print(standard_data)
S=standard_data
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X,y)
model.predict(X)
model.score(X,y)
final
model.predict([[3400,0,0]])
model.predict([[2600,0,0]])
model.predict([[3600,0,1]])
model.predict([[2800,0,1]])

Using sklearn OneHotEncoder
The first step is to use a label encoder to convert town names into numbers.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dfle = df
dfle.town = le.fit_transform(dfle.town)
dfle
X = dfle[['town','area']].values
X
y = dfle.price
y
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
standard_data=scaler.fit_transform(X)
print(standard_data)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder = 'passthrough')
X = ct.fit_transform(X)
X
X = X[:,1:]
X
model.fit(X,y)
final
model.predict([[0,1,3400]])
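A hedged alternative to the manual slicing above (X = X[:,1:]): OneHotEncoder(drop='first') handles the dummy-variable trap itself, and a Pipeline bundles the encoding with the regression. This is a sketch, not part of the original record.

# Hedged sketch: ColumnTransformer + Pipeline doing the same job end to end.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

pipe = Pipeline([
    ('encode', ColumnTransformer([('town', OneHotEncoder(drop='first'), [0])],
                                 remainder='passthrough')),
    ('reg', LinearRegression()),
])
pipe.fit(dfle[['town','area']].values, y)
print(pipe.predict([[0, 3400]]))  # encoded town 0, area 3400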

SVM
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv("/content/drive/MyDrive/Copy of iris_data.csv")
df.head()
df.shape
df.info()
df.isnull().sum()
df.columns
from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()

l=LabelEncoder()
label=l.fit_transform(df.Species)
df["outcome"]=label
df.head()
df[df.outcome==2].head()
df[df.outcome==1].head()
df[df.outcome==0].head()

from sklearn.model_selection import train_test_split


x=df.drop(['Species','Id','outcome'],axis='columns')
y=df.outcome

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)
print(x.shape,x_test.shape,x_train.shape)
x_train.shape
from sklearn.svm import SVC
model=SVC()
model.fit(x_train,y_train)
help(SVC)
y_pred=model.predict(x_test)
y_pred
model.score(x_test,y_test)
model.score(x_train,y_train)
model_C=SVC(C=1)
model_C.fit(x_train,y_train)
model_C.score(x_test,y_test)
model_g=SVC(gamma=10)
model_g.fit(x_train,y_train)
model_g.score(x_test,y_test)
model_linear_kernel=SVC(kernel='linear')
model_linear_kernel.fit(x_train,y_train)
model_linear_kernel.score(x_test,y_test)
model.predict([[6.3,3.3,6.0,2.5]])
model.predict([[4.8,3.0,1.5,0.3]])
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test, y_pred)
cm
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(model, x_test, y_test, display_labels=l.classes_)
confusion = metrics.confusion_matrix(y_test, y_pred)
confusion.ravel()
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy
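A hedged addition: instead of trying C, gamma, and kernel one at a time as above, GridSearchCV can search the combinations with cross-validation. The parameter values here are illustrative, not recorded choices.

# Hedged sketch: cross-validated search over the SVC settings tried manually above.
from sklearn.model_selection import GridSearchCV

params = {'C': [0.1, 1, 10], 'gamma': ['scale', 1, 10], 'kernel': ['rbf', 'linear']}
grid = GridSearchCV(SVC(), params, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_, grid.best_score_)
print(grid.score(x_test, y_test))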

KNN
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
df=pd.read_csv("/content/drive/MyDrive/Copy of iris_data.csv")
df.head()
df.info()
df.isnull().sum()
df.columns
l=LabelEncoder()
label=l.fit_transform(df.Species)
df["outcome"]=label
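The record stops before the classifier itself, so the following is a minimal hedged sketch completing the KNN section. It assumes the same feature/target setup and split convention as the SVM experiment above; n_neighbors=5 is sklearn's default, not a recorded choice.

# Hedged sketch: fitting and scoring a K-Nearest Neighbours classifier.
x = df.drop(['Species', 'Id', 'outcome'], axis='columns')
y = df.outcome
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
print(knn.score(x_test, y_test))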

Decision Tree
import pandas as pd
from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split  # Import train_test_split function
from sklearn import metrics  # Import scikit-learn metrics module for accuracy calculation
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix
df = pd.read_csv('/content/drive/MyDrive/Copy of diabetes.csv')
print(df.head())

X = df.drop('Outcome',axis = 1)
y = df.Outcome
print(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
clf = DecisionTreeClassifier(criterion="entropy", max_depth=2)
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
plot_tree(clf)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
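A hedged addition: the fitted tree exposes feature_importances_, which shows which columns the depth-2 entropy tree actually split on.

# Hedged sketch: inspecting the features used by the depth-2 tree.
importances = pd.Series(clf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))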

Random Forest
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
df=pd.read_csv("heart.csv")
x = df.drop(columns="target",axis=1)
y = df.target
df.isnull().sum()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=2)
clf= RandomForestClassifier(n_estimators= 10, criterion="entropy")
clf=clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
cm=confusion_matrix(y_test,y_pred)
cm
print("Accuracy:",metrics.accuracy_score(y_test,y_pred))
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf, x_test, y_test, display_labels=['Negative', 'Positive'])
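A hedged addition: the forest above is fixed at 10 trees; a quick loop (illustrative values, not recorded results) shows how test accuracy responds to the ensemble size.

# Hedged sketch: effect of ensemble size on test accuracy.
for n in [10, 50, 100, 200]:
    rf = RandomForestClassifier(n_estimators=n, criterion="entropy", random_state=2)
    rf.fit(x_train, y_train)
    print(n, rf.score(x_test, y_test))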

Clustering
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
df=pd.read_csv('/content/drive/MyDrive/Copy of iris_data.csv')
df.head()
from sklearn.preprocessing import LabelEncoder
l=LabelEncoder()
label=l.fit_transform(df.Species)
df["outcome"]=label
x=df.drop(["Id","Species","outcome"],axis=1)
y=df.outcome
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
x_train.shape,x_test.shape
wcss=[]
for i in range(1,11):
    kmeans=KMeans(n_clusters=i)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

sns.set()
plt.plot(range(1,11),wcss)
plt.title('The elbow point graph')
plt.xlabel('no. of clusters')
plt.ylabel('WCSS')
plt.show()
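A hedged sketch completing the exercise: the elbow for iris typically suggests k = 3 (an assumption here, not a recorded result), so fit KMeans at that k and plot the clusters on the first two features.

# Hedged sketch: final model at the assumed elbow point k=3.
kmeans = KMeans(n_clusters=3, random_state=2)
pred = kmeans.fit_predict(x)
plt.scatter(x.iloc[:, 0], x.iloc[:, 1], c=pred)
plt.xlabel(x.columns[0])
plt.ylabel(x.columns[1])
plt.title('KMeans clusters (k=3)')
plt.show()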

Hierarchical Clustering (Agglomerative, DBSCAN)

Agglomerative
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

data = pd.read_csv("https://raw.githubusercontent.com/amankharwal/Website-data/master/customers.csv")
print(data.head())
data["Income"] = data[["Annual Income (k$)"]]
data["Spending"] = data[["Spending Score (1-100)"]]
data = data[["Income", "Spending"]]
print(data.head())
data.shape
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

dendrogram = shc.dendrogram(shc.linkage(data, method="complete"))

model = AgglomerativeClustering(n_clusters=3, linkage="complete")
pred = model.fit_predict(data)
sns.set_style('whitegrid')
plt.figure(figsize=(8, 8))
plt.scatter(data["Income"], data["Spending"], c=pred, cmap='rainbow', alpha=0.9)
plt.show()
DBSCAN
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import rand_score
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.50, random_state=0)
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print(labels)
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
print(colors)
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels == k)

    # core points of cluster k
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

    # border (non-core) points of cluster k
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.title('number of clusters: %d' % n_clusters_)


plt.show()
sc = metrics.silhouette_score(X, labels)
print("Silhouette Coefficient:%0.2f"%sc)
ari = metrics.adjusted_rand_score(y_true, labels)
print("Adjusted Rand Index: %0.2f"%ari)

Gradient Boosting Regression and AdaBoost Regression
import pandas as pd
import numpy as np
df1=pd.read_csv('/content/drive/MyDrive/ML_DATASET/Copy of iris_data.csv')
df1.shape
df1.isnull().sum()
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

df1['Species']= label_encoder.fit_transform(df1['Species'])
df1['Species'].unique()
df1['Species']
x=df1.drop(columns=['Species','Id'])
print(x)
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
standard_data=scaler.fit_transform(x)
print(standard_data)
X=standard_data
print(X)
y=df1['Species']
from sklearn.model_selection import train_test_split
#splitting the data into training and testing
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)
print(X.shape,x_test.shape,x_train.shape)
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

Model1 = AdaBoostRegressor(n_estimators=100,learning_rate=0.1,random_state=2)
Model2=GradientBoostingRegressor(n_estimators=50,learning_rate=0.1,random_state=2)
#fit adaboost regressor to training data
Model1.fit(x_train,y_train)
Model2.fit(x_train,y_train)
y_pred1=Model1.predict(x_test)
print(y_pred1)
y_pred2=Model2.predict(x_test)
print(y_pred2)
print(y_test)
score=Model1.score(x_train,y_train)
score
score1=Model2.score(x_train,y_train)
score1
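A hedged addition: the two scores above are on the training data; scoring the held-out split gives a fairer comparison of the two regressors.

# Hedged sketch: test-set R2 for both boosted regressors.
print('AdaBoost test R2:', Model1.score(x_test, y_test))
print('GradientBoosting test R2:', Model2.score(x_test, y_test))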

AdaBoost Classifier and Gradient Boosting Classifier
import pandas as pd
import numpy as np

from google.colab import drive


drive.mount('/content/drive')
df=pd.read_csv('/content/drive/MyDrive/ML_DATASET/Copy of iris_data.csv')
df.head()
df.shape
df.isnull().sum()
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

df['Species']= label_encoder.fit_transform(df['Species'])
df['Species'].unique()
df['Species']
df.head()
x=df.drop(columns=['Species','Id'])
print(x)
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
standard_data=scaler.fit_transform(x)
print(standard_data)
X=standard_data
print(X)
y=df['Species']
y
y.head()
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
#splitting the data into training and testing
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)
print(X.shape,x_test.shape,x_train.shape)
model1 = AdaBoostClassifier(n_estimators=100,learning_rate=0.1,random_state=2)
model2=GradientBoostingClassifier(n_estimators=50,learning_rate=0.1,random_state=2)
#fit both classifiers to training data
model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
y_pred1=model1.predict(x_test)
print(y_pred1)
y_pred2=model2.predict(x_test)
print(y_pred2)
print(y_test)
y_test
#confusion matrix

from sklearn import metrics

cm = metrics.confusion_matrix(y_test, y_pred1)
cm
cm1 = metrics.confusion_matrix(y_test, y_pred2)
cm1
score=model1.score(x_test,y_test)
score
score1=model2.score(x_test,y_test)
score1
