Data Mining Lab Practical File

Submitted to: Dr. Neeraj Gupta

Submitted by: Gaurav Tripathi


1901840003

B.Sc (H) Data Science (Semester - IV)

29 Jun 2021

1. Linear Regression

Code for Salary:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

linearreg=LinearRegression()

x=np.array([1.1,1.3,1.5,2,2.2,2.9,3,3.2,3.2,3.7,3.9,4,4,4.1,4.5,4.9,5.1,5.3,
            5.9,6,6.8,7.1,7.9,8.2,8.7,9,9.5,9.6,10.3,10.5,11.2,11.5,12.3,12.9,13.5])
y=np.array([39343,46205,37731,43525,39891,56642,60150,54445,64445,57189,63218,
            55794,56957,57081,61111,67938,66029,83088,81363,93940,91738,98273,
            101302,113812,109431,105582,116969,112635,122391,121872,127345,
            126756,128765,135675,139465])

x=x.reshape(-1,1)

linearreg.fit(x,y)

y_pred=linearreg.predict(x)

plt.scatter(x,y)
plt.plot(x,y_pred,color='red')
plt.show()
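
The fitted line can also be inspected numerically. As a small sketch (this output is not part of the original run), the slope, intercept, and R-squared of the salary model can be printed from the fitted estimator:

# Inspect the fitted salary model (sketch; continues the cell above)
print("Slope:", linearreg.coef_[0])        # salary increase per year of experience
print("Intercept:", linearreg.intercept_)  # predicted salary at zero experience
print("R^2:", linearreg.score(x, y))       # fraction of variance explained by the line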

Code for Insurance:

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

linearreg=LinearRegression()

x=np.array([108,19,13,124,40,57,23,14,45,10,5,48,11,23,7,2,24,6,3,23,6,9,
            9,3,29,7,4,20,7,4,0,25,6,5,22,11,61,12,4,16,13,60,41,37,55,41,
            11,27,8,3,17,13,13,15,8,29,30,24,9,31,14,53,26])
y=np.array([392.5,46.2,15.7,422.2,119.4,170.9,56.9,77.5,214,65.3,20.9,248.1,
            23.5,39.6,48.8,6.6,134.9,50.9,4.4,113,14.8,48.7,52.1,13.2,103.9,
            77.5,11.8,98.1,27.9,38.1,0,69.2,14.6,40.3,161.5,57.2,217.6,58.1,
            12.6,59.6,89.9,202.4,181.3,152.8,162.8,73.4,21.3,92.6,76.1,39.9,
            142.1,93,31.9,32.1,55.6,133.3,194.5,137.9,87.4,209.8,95.5,244.6,
            187.5])

x=x.reshape(-1,1)

linearreg.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

y_pred=linearreg.predict(x)

plt.scatter(x,y)
plt.plot(x,y_pred,color='red')
plt.show()
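
As a usage sketch, the fitted insurance model can predict the expected total payment for a new number of claims (the value 50 below is illustrative, not from the dataset):

# Predict total payment for a hypothetical 50 claims (sketch)
new_x = np.array([[50]])         # one sample, one feature, as fit() expects
print(linearreg.predict(new_x))  # expected payment under the fitted line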

2. Multiple Linear Regression

Code for mlr:

import pandas as pd
import matplotlib.pyplot as plt
data=pd.read_csv("mlr.csv")
data.head()

      X1     X2     X3     X4     X5     X6
0  0.283  0.144  0.049  0.012  0.013  0.086
1  0.276  0.125  0.039  0.013  0.002  0.062
2  0.281  0.141  0.045  0.021  0.013  0.074
3  0.328  0.189  0.043  0.001  0.030  0.032
4  0.290  0.161  0.044  0.011  0.070  0.076

from sklearn import linear_model


import numpy as np
data.columns = ['x1','x2','x3','x4','x5','x6']

X = data.iloc[:,data.columns != "x1"]
Y = data.iloc[:, 0]
X
      x2     x3     x4     x5     x6
0  0.144  0.049  0.012  0.013  0.086
1  0.125  0.039  0.013  0.002  0.062
2  0.141  0.045  0.021  0.013  0.074
3  0.189  0.043  0.001  0.030  0.032
4  0.161  0.044  0.011  0.070  0.076
5  0.186  0.047  0.018  0.050  0.007
6  0.106  0.036  0.008  0.012  0.095
7  0.117  0.030  0.006  0.003  0.145
8  0.174  0.050  0.008  0.061  0.112

Y.head()
0 0.283
1 0.276
2 0.281
3 0.328
4 0.290
Name: x1, dtype: float64

plt.scatter(X.x2,X.x3,marker="*",color="orange")

plt.scatter(X.x2,X.x4,marker="*",color="green")

plt.scatter(X.x2,X.x5,marker="*",color="red")

plt.scatter(X.x2,X.x6,marker="*",color="orange")

plt.scatter(X.x3,X.x4,marker="*",color="green")

plt.scatter(X.x3,X.x5,marker="*",color="red")

from sklearn.model_selection import train_test_split


from sklearn import linear_model
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

y_pred = model.predict(X_test)

y_pred
array([0.23386571, 0.31718764, 0.33311851, 0.29940205, 0.27633325,
0.24228869, 0.23924197, 0.29401786, 0.29011389])

from sklearn import metrics


print("mean square error ",metrics.mean_squared_error(Y_test, y_pred))
mean square error 0.0001948634357163518
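
The mean squared error is hard to judge on its own scale; a hedged sketch of additional diagnostics (not part of the original output), assuming the model and splits defined above:

# Further diagnostics for the multiple regression model (sketch)
print("R^2 on test:", metrics.r2_score(Y_test, y_pred))  # closer to 1 is better
print("Coefficients:", model.coef_)                      # one weight per predictor x2..x6
print("Intercept:", model.intercept_)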

Code for Insurance:

import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['insurance-1.csv']))
ivar = df[['age' , 'sex' , 'bmi' , 'children' , 'smoker' , 'region' , 'charges']]
fig = plt.figure()
ax = fig.add_subplot(111)
csx = ax.matshow(np.linalg.inv(ivar.corr()) , cmap = 'Blues')
fig.colorbar(csx)
ax.set_xticklabels(['']+['age' , 'sex' , 'bmi' , 'children' , 'smoker' , 'region' , 'charges'])
ax.set_yticklabels(['']+['age' , 'sex' , 'bmi' , 'children' , 'smoker' , 'region' , 'charges'])
df.head()

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
df.columns = ['age','sex','bmi','children','smoker','region','charges']

X = df.iloc[:,df.columns != "age"]
Y = df.iloc[:, 0]
X

         sex     bmi  children smoker     region      charges
0     female  27.900         0    yes  southwest  16884.92400
1       male  33.770         1     no  southeast   1725.55230
2       male  33.000         3     no  southeast   4449.46200
3       male  22.705         0     no  northwest  21984.47061
4       male  28.880         0     no  northwest   3866.85520
...      ...     ...       ...    ...        ...          ...
1333    male  30.970         3     no  northwest  10600.54830
1334  female  31.920         0     no  northeast   2205.98080
1335  female  36.850         0     no  southeast   1629.83350
1336  female  25.800         0     no  southwest   2007.94500
1337  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 6 columns]



Y.head()

0    19
1    18
2    28
3    33
4    32
Name: age, dtype: int64

plt.scatter(X.sex,X.bmi,marker="*",color="green")

plt.scatter(X.sex,X.children,marker="*",color="orange")

plt.scatter(X.sex,X.charges,marker="*",color="red")

plt.scatter(X.bmi,X.charges,marker="*",color="red")

plt.scatter(X.children,X.charges,marker="*",color="green")

plt.scatter(X.smoker,X.charges,marker="*",color="orange")

3. Logistic Regression

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

age=np.array([22,25,47,52,46,56,55,60,62,61,18,28,27,29,49,55,25,58,19,18,
              21,26,40,45,50,54,23])
bought_insurance=np.array([0,0,1,0,1,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1,1,
                           1,1,0])
print(age)
print(bought_insurance)
[22 25 47 52 46 56 55 60 62 61 18 28 27 29 49 55 25 58 19 18 21 26 40 45
50 54 23]
[0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0]

plt.scatter(age,bought_insurance,c=bought_insurance,cmap='rainbow')
plt.title("Scatter Plot Of Logistic Regression")
plt.show()

from sklearn.model_selection import train_test_split

age_train,age_test,bought_insurance_train,bought_insurance_test=train_test_split(
    age,bought_insurance,random_state=1)

print(age_train.shape,age_test.shape,bought_insurance_train.shape,bought_insurance_test.shape)
(20,) (7,) (20,) (7,)

log_reg=LogisticRegression()
log_reg.fit(age_train.reshape(-1,1),bought_insurance_train)

from sklearn.metrics import mean_squared_error

mean_squared_error(age,bought_insurance)
1760.2592592592594
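
Mean squared error between age and the labels is not a classification metric; with the classifier fitted on the training split, a minimal evaluation sketch (not part of the original output) would be:

# Score the fitted logistic model on the held-out ages (sketch)
pred = log_reg.predict(age_test.reshape(-1, 1))
print("Predicted:", pred)
print("Actual:   ", bought_insurance_test)
print("Accuracy:", log_reg.score(age_test.reshape(-1, 1), bought_insurance_test))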

4. K-Means Algorithm

import pandas as pd
import io
from google.colab import files

uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['cars.csv']))
df.head()
    mpg  cylinders  cubicinches   hp  weightlbs  time-to-60  year    brand
0  14.0          8          350  165       4209          12  1972      US.
1  31.9          4           89   71       1925          14  1980  Europe.
2  17.0          8          302  140       3449          11  1971      US.
3  15.0          8          400  150       3761          10  1971      US.
4  30.5          4           98   63       2051          17  1978      US.

X = df.iloc[:,:-1].values
X = pd.DataFrame(X)
X.columns = ['mpg','cylinders','cubicinches','hp','weightlbs','time-to-60','year']
X.head()
    mpg  cylinders  cubicinches   hp  weightlbs  time-to-60  year
0  14.0          8          350  165       4209          12  1972
1  31.9          4           89   71       1925          14  1980
2  17.0          8          302  140       3449          11  1971
3  15.0          8          400  150       3761          10  1971
4  30.5          4           98   63       2051          17  1978

X.describe()

          mpg  cylinders  cubicinches   hp  weightlbs  time-to-60  year
count   261.0        261          261  261        261         261   261
unique  103.0          5           75   85        240          17    13
top      14.0          4           97  150       2130          16  1974
freq     16.0        125           16   16          3          45    35

X['mpg'] = pd.to_numeric(X['mpg'], errors='coerce').astype('float64')
X['cylinders'] = pd.to_numeric(X['cylinders'], errors='coerce').astype('float64')
X['cubicinches'] = pd.to_numeric(X['cubicinches'], errors='coerce').astype('float64')
X['hp'] = pd.to_numeric(X['hp'], errors='coerce').astype('float64')
X['weightlbs'] = pd.to_numeric(X['weightlbs'], errors='coerce').astype('float64')
X['time-to-60'] = pd.to_numeric(X['time-to-60'], errors='coerce').astype('float64')
X['year'] = pd.to_numeric(X['year'], errors='coerce').astype('float64')

import numpy as np
X = X.replace([np.inf, -np.inf], np.nan)
print(X.isna().sum())
mean1=X['cubicinches'].mean()
X['cubicinches']=X['cubicinches'].fillna(mean1)
mean2=X['weightlbs'].mean()
X['weightlbs']=X['weightlbs'].fillna(mean2)
print(mean1,mean2)
print(X.isna().sum())

mpg 0
cylinders 0
cubicinches 2
hp 0
weightlbs 3
time-to-60 0

year 0
dtype: int64
200.9189189189189 3009.8333333333335
mpg 0
cylinders 0
cubicinches 0
hp 0
weightlbs 0
time-to-60 0
year 0
dtype: int64

import matplotlib.pyplot as plt



# Using the elbow method to find the optimal number of clusters


from sklearn.cluster import KMeans
wcss = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11),wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Applying k-means to the cars dataset


kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X)

X = X.to_numpy()  # convert to a NumPy array so the positional indexing below works

# Visualising the clusters


plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='red', label='US')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='blue', label='Japan')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s=100, c='green', label='Europe')
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of car brands')
plt.legend()
plt.show()
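
The choice of k = 3 can be sanity-checked with a silhouette score; a minimal sketch (not in the original file), assuming X and y_kmeans from the cells above:

from sklearn.metrics import silhouette_score

# Silhouette ranges from -1 to 1; higher means tighter, better-separated clusters
print("Silhouette for k=3:", silhouette_score(X, y_kmeans))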

5. Decision Tree Algorithm:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
# for train test splitting
from sklearn.model_selection import train_test_split
# for decision tree object
from sklearn.tree import DecisionTreeClassifier
# for checking testing results
from sklearn.metrics import classification_report, confusion_matrix
# for visualizing tree
from sklearn.tree import plot_tree

from google.colab import files


uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['Iris.csv']))

df.head()

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0            5.1           3.5            1.4           0.2  Iris-setosa
1            4.9           3.0            1.4           0.2  Iris-setosa
2            4.7           3.2            1.3           0.2  Iris-setosa
3            4.6           3.1            1.5           0.2  Iris-setosa
4            5.0           3.6            1.4           0.2  Iris-setosa

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 150 non-null int64
1 SepalLengthCm 150 non-null float64
2 SepalWidthCm 150 non-null float64
3 PetalLengthCm 150 non-null float64
4 PetalWidthCm 150 non-null float64
5 Species 150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB

df.shape
(150, 6)

df.isnull().any()
Id False
SepalLengthCm False
SepalWidthCm False
PetalLengthCm False
PetalWidthCm False
Species False
dtype: bool

df['species'] = np.random.choice(2,150)
sns.pairplot(df, hue="species", height=2.5)

sns.heatmap(df.corr())

target = df['species']
df1 = df.copy()
df1 = df1.drop('species', axis =1)

X = df1

target
0 0
1 1
2 1
3 1
4 0
..
145 1
146 0
147 1
148 1
149 1
Name: species, Length: 150, dtype: int64

le = LabelEncoder()
target = le.fit_transform(target)
target
array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])

y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training split input- ", X_train.shape)
print("Testing split input- ", X_test.shape)
Training split input- (120, 6)
Testing split input- (30, 6)

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
print('Decision Tree Classifier Created')

y_pred = dtree.predict(X_test)
print("Classification report - \n", classification_report(y_test,y_pred))
Classification report -
              precision    recall  f1-score   support
0 1.00 1.00 1.00 10
1 1.00 1.00 1.00 9
2 1.00 1.00 1.00 11

accuracy 1.00 30
macro avg 1.00 1.00 1.00 30
weighted avg 1.00 1.00 1.00 30

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,5))
sns.heatmap(data=cm, linewidths=.5, annot=True, square=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(dtree.score(X_test, y_test))
plt.title(all_sample_title, size = 15)

dec_tree = plot_tree(decision_tree=dtree, feature_names=df1.columns,
                     class_names=["setosa", "versicolor", "virginica"],
                     filled=True, precision=4, rounded=True)
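
The same tree can also be dumped as plain text, which is often easier to read in a report than the rendered figure; a sketch (not in the original file) using scikit-learn's export_text on the dtree fitted above:

from sklearn.tree import export_text

# One line per split/leaf of the fitted tree
print(export_text(dtree, feature_names=list(df1.columns)))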

6. Neural Networks:

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

observations=1000
xs=np.random.uniform(-10,10,(observations,1))
zs=np.random.uniform(-10,10,(observations,1))
generated_inputs=np.column_stack((xs,zs))
noise=np.random.uniform(-10,10,(observations,1))
generated_target=2*xs-3*zs+5+noise

np.savez('TF_intro',input=generated_inputs,targets=generated_target)
training_data=np.load('TF_intro.npz')

input_size=2
output_size=1
models = tf.keras.Sequential([
tf.keras.layers.Dense(output_size)
])
custom_optimizer=tf.keras.optimizers.SGD(learning_rate=0.02)
models.compile(optimizer=custom_optimizer,loss='mean_squared_error')
models.fit(training_data['input'],training_data['targets'],epochs=100,verbose=1)
Epoch 1/100
32/32 [==============================] - 1s 1ms/step - loss: 60.4431
Epoch 2/100
32/32 [==============================] - 0s 1ms/step - loss: 40.1109
Epoch 3/100
32/32 [==============================] - 0s 1ms/step - loss: 39.9997
Epoch 4/100
32/32 [==============================] - 0s 1ms/step - loss: 34.7306
Epoch 5/100
32/32 [==============================] - 0s 1ms/step - loss: 36.7232

models.layers[0].get_weights()
[array([[ 2.0985565],
[-2.907345 ]], dtype=float32), array([5.32353], dtype=float32)]

weights=models.layers[0].get_weights()[0]
bias=models.layers[0].get_weights()[1]

out=training_data['targets'].round(1)
from sklearn.metrics import mean_squared_error
mean_squared_error(generated_target, out, squared=False)
0.02858235386343541

plt.scatter(np.squeeze(models.predict_on_batch(training_data['input'])),np.squeeze(training_data['targets']),c='#88c999')
plt.xlabel('Input')
plt.ylabel('Predicted Output')
plt.show()
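
Since the targets were generated as 2*xs - 3*zs + 5 plus noise, the learned parameters can be checked directly against the generating function; a short sketch (not in the original output):

# Compare learned parameters with the true generating function 2*x - 3*z + 5
print("Learned weights:", weights.flatten())  # should be close to [2, -3]
print("Learned bias:", bias)                  # should be close to [5]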

7. Naive Bayesian Classification:

import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
%matplotlib inline

def cross_validate(estimator, train, validation):
    X_train = train[0]
    Y_train = train[1]
    X_val = validation[0]
    Y_val = validation[1]
    train_predictions = estimator.predict(X_train)
    train_accuracy = accuracy_score(train_predictions, Y_train)
    train_recall = recall_score(train_predictions, Y_train)
    train_precision = precision_score(train_predictions, Y_train)

    val_predictions = estimator.predict(X_val)
    val_accuracy = accuracy_score(val_predictions, Y_val)
    val_recall = recall_score(val_predictions, Y_val)
    val_precision = precision_score(val_predictions, Y_val)

    print('Model metrics')
    print('Accuracy Train: %.2f, Validation: %.2f' % (train_accuracy, val_accuracy))
    print('Recall Train: %.2f, Validation: %.2f' % (train_recall, val_recall))
    print('Precision Train: %.2f, Validation: %.2f' % (train_precision, val_precision))

from google.colab import files


uploaded = files.upload()
test.csv(application/vnd.ms-excel) - 28629 bytes, last modified: 6/29/2021 - 100% done
train.csv(application/vnd.ms-excel) - 61194 bytes, last modified: 6/29/2021 - 100% done
Saving test.csv to test.csv

Saving train.csv to train.csv

import io
train_data = pd.read_csv(io.StringIO(uploaded['train.csv'].decode('utf-8')))

import io
test_data = pd.read_csv(io.StringIO(uploaded['test.csv'].decode('utf-8')))

train_data['train'] = 1
test_data['train'] = 0
data = train_data.append(test_data, sort=False)

test_ids = test_data['PassengerId'].values

data.head()

   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  train
0            1       0.0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S      1
1            2       1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C      1
2            3       1.0       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S      1
3            4       1.0       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S      1
4            5       0.0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S      1

data.describe()

       PassengerId    Survived       Pclass          Age        SibSp        Parch         Fare        train
count  1309.000000  891.000000  1309.000000  1046.000000  1309.000000  1309.000000  1308.000000  1309.000000
mean    655.000000    0.383838     2.294882    29.881138     0.498854     0.385027    33.295479     0.680672
std     378.020061    0.486592     0.837836    14.413493     1.041658     0.865560    51.758668     0.466394
min       1.000000    0.000000     1.000000     0.170000     0.000000     0.000000     0.000000     0.000000
25%     328.000000    0.000000     2.000000    21.000000     0.000000     0.000000     7.895800     0.000000
50%     655.000000    0.000000     3.000000    28.000000     0.000000     0.000000    14.454200     1.000000
75%     982.000000    1.000000     3.000000    39.000000     1.000000     0.000000    31.275000     1.000000
max    1309.000000    1.000000     3.000000    80.000000     8.000000     9.000000   512.329200     1.000000

features = ['Age', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp']


target = 'Survived'

data = data[features + [target] + ['train']]

data['Sex'] = data['Sex'].replace(["female", "male"], [0, 1])


data['Embarked'] = data['Embarked'].replace(['S', 'C', 'Q'], [1, 2, 3])
data['Age'] = pd.qcut(data['Age'], 10, labels=False)

train = data.query('train == 1')


test = data.query('train == 0')
train.dropna(axis=0, inplace=True)
labels = train[target].values

train.head()

   Age  Embarked     Fare  Parch  Pclass  Sex  SibSp  Survived  train
0  2.0       1.0   7.2500      0       3    1      1       0.0      1
1  7.0       2.0  71.2833      0       1    0      1       1.0      1
2  4.0       1.0   7.9250      0       3    0      0       1.0      1
3  6.0       1.0  53.1000      0       1    0      1       1.0      1
4  6.0       1.0   8.0500      0       3    1      0       0.0      1

columns = train[features + [target]].columns.tolist()
nColumns = len(columns)
result = pd.DataFrame(np.zeros((nColumns, nColumns)), columns=columns)

for col_a in range(nColumns):
    for col_b in range(nColumns):
        result.iloc[[col_a], [col_b]] = pearsonr(train.loc[:, columns[col_a]],
                                                 train.loc[:, columns[col_b]])[0]

fig, ax = plt.subplots(figsize=(10,10))
ax = sns.heatmap(result, yticklabels=columns, vmin=-1, vmax=1, annot=True, fmt='.2f', linewidths=.2)
ax.set_title('PCC - Pearson correlation coefficient')
plt.show()

continuous_numeric_features = ['Age', 'Fare', 'Parch', 'SibSp']

for feature in continuous_numeric_features:
    sns.distplot(train[feature])
    plt.show()

train.drop(['train', target, 'Pclass'], axis=1, inplace=True)


test.drop(['train', target, 'Pclass'], axis=1, inplace=True)

X_train, X_val, Y_train, Y_val = train_test_split(train, labels, test_size=0.2, random_state=1)

X_train.head()

     Age  Embarked     Fare  Parch  Sex  SibSp
830  1.0       2.0  14.4542      0    0      1
566  1.0       1.0   7.8958      0    1      0
149  7.0       1.0  13.0000      0    1      0
106  2.0       1.0   7.6500      0    0      0
290  4.0       1.0  78.8500      0    0      0

X_train1, X_train2, Y_train1, Y_train2 = train_test_split(X_train, Y_train, test_size=0.3, random_state=12)

classifier = GaussianNB()

classifier.fit(X_train2, Y_train2)
GaussianNB(priors=None, var_smoothing=1e-09)

print('Metrics with only 30% of train data')


cross_validate(classifier, (X_train, Y_train), (X_val, Y_val))
Metrics with only 30% of train data
Model metrics
Accuracy Train: 0.79, Validation: 0.76
Recall Train: 0.75, Validation: 0.71
Precision Train: 0.71, Validation: 0.71

classifier.partial_fit(X_train1, Y_train1)
GaussianNB(priors=None, var_smoothing=1e-09)

print('Metrics with the remaining 70% of train data')


cross_validate(classifier, (X_train, Y_train), (X_val, Y_val))
Metrics with the remaining 70% of train data
Model metrics
Accuracy Train: 0.80, Validation: 0.76
Recall Train: 0.78, Validation: 0.70
Precision Train: 0.69, Validation: 0.69

print('Probability of each class')


print('Survive = 0: %.2f' % classifier.class_prior_[0])
print('Survive = 1: %.2f' % classifier.class_prior_[1])
Probability of each class
Survive = 0: 0.60
Survive = 1: 0.40

print('Mean of each feature per class')


print(' Age Embarked Fare Parch Sex SibSp')
print('Survive = 0: %s' % classifier.theta_[0])
print('Survive = 1: %s' % classifier.theta_[1])
Mean of each feature per class
Age Embarked Fare Parch Sex SibSp
Survive = 0: [ 4.5339233 1.23893805 22.97357316 0.37463127 0.86135693 0.5280236 ]
Survive = 1: [ 4.27391304 1.33478261 53.52155957 0.52608696 0.33043478 0.49130435]

print('Variance of each feature per class')


print('Survive = 0: %s' % classifier.sigma_[0])
print('Survive = 1: %s' % classifier.sigma_[1])
Variance of each feature per class
Survive = 0: [8.32554825e+00 2.88044224e-01 8.60096730e+02 8.36055467e-01
1.19424042e-01 1.11647419e+00]
Survive = 1: [8.45106148e+00 2.74880003e-01 5.28959488e+03 6.75409304e-01
 2.21250514e-01 4.41231610e-01]
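
These per-class means (theta_) and variances (sigma_) are exactly what GaussianNB plugs into the Gaussian density when it scores a sample; a hedged sketch (not in the original file) reproducing the joint log-probability for the first validation row by hand and comparing it with sklearn's normalized output:

# Recompute GaussianNB's class scores manually for one sample (sketch)
x0 = X_val.iloc[0].values
log_prior = np.log(classifier.class_prior_)
# Sum of per-feature Gaussian log-densities, one entry per class
log_likelihood = -0.5 * np.sum(
    np.log(2 * np.pi * classifier.sigma_)
    + (x0 - classifier.theta_) ** 2 / classifier.sigma_,
    axis=1)
print(log_prior + log_likelihood)                     # unnormalized joint log-probability
print(classifier.predict_log_proba(X_val.iloc[[0]]))  # sklearn's normalized version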

test.fillna(test.mean(), inplace=True)
test_predictions = classifier.predict(test)
submission = pd.DataFrame({'PassengerId': test_ids})
submission['Survived'] = test_predictions.astype('int')
submission.to_csv('submission.csv', index=False)
submission.head(10)

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0