Data Mining Lab Practical File

Submitted to: Dr. Neeraj Gupta

Submitted by: Gaurav Tripathi


1901840003

B.Sc (H) Data Science (Semester - IV)

29 Jun 2021

1. Linear Regression

Code for Salary:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

linearreg=LinearRegression()

x=np.array([1.1,1.3,1.5,2,2.2,2.9,3,3.2,3.2,3.7,3.9,4,4,4.1,4.5,4.9,5.1,5.3,
            5.9,6,6.8,7.1,7.9,8.2,8.7,9,9.5,9.6,10.3,10.5,11.2,11.5,12.3,12.9,13.5])
y=np.array([39343,46205,37731,43525,39891,56642,60150,54445,64445,57189,63218,
            55794,56957,57081,61111,67938,66029,83088,81363,93940,91738,98273,
            101302,113812,109431,105582,116969,112635,122391,121872,127345,
            126756,128765,135675,139465])

x=x.reshape(-1,1)

linearreg.fit(x,y)

y_pred=linearreg.predict(x)

plt.scatter(x,y)
plt.plot(x,y_pred,color='red')
plt.show()
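
The fitted line can also be inspected numerically. As a small sketch (this output is not part of the original run), the slope, intercept, and R-squared of the salary model can be printed from the fitted estimator:

# Inspect the fitted salary model (sketch; continues the cell above)
print("Slope:", linearreg.coef_[0])        # salary increase per year of experience
print("Intercept:", linearreg.intercept_)  # predicted salary at zero experience
print("R^2:", linearreg.score(x, y))       # fraction of variance explained by the line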

Code for Insurance:

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

linearreg=LinearRegression()

x=np.array([108,19,13,124,40,57,23,14,45,10,5,48,11,23,7,2,24,6,3,23,6,9,
            9,3,29,7,4,20,7,4,0,25,6,5,22,11,61,12,4,16,13,60,41,37,55,41,
            11,27,8,3,17,13,13,15,8,29,30,24,9,31,14,53,26])
y=np.array([392.5,46.2,15.7,422.2,119.4,170.9,56.9,77.5,214,65.3,20.9,248.1,
            23.5,39.6,48.8,6.6,134.9,50.9,4.4,113,14.8,48.7,52.1,13.2,103.9,
            77.5,11.8,98.1,27.9,38.1,0,69.2,14.6,40.3,161.5,57.2,217.6,58.1,
            12.6,59.6,89.9,202.4,181.3,152.8,162.8,73.4,21.3,92.6,76.1,39.9,
            142.1,93,31.9,32.1,55.6,133.3,194.5,137.9,87.4,209.8,95.5,244.6,
            187.5])

x=x.reshape(-1,1)

linearreg.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

y_pred=linearreg.predict(x)

plt.scatter(x,y)
plt.plot(x,y_pred,color='red')
plt.show()
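
As a usage sketch, the fitted insurance model can predict the expected total payment for a new number of claims (the value 50 below is illustrative, not from the dataset):

# Predict total payment for a hypothetical 50 claims (sketch)
new_x = np.array([[50]])         # one sample, one feature, as fit() expects
print(linearreg.predict(new_x))  # expected payment under the fitted line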

2. Multiple Linear Regression

Code for mlr:

import pandas as pd
import matplotlib.pyplot as plt
data=pd.read_csv("mlr.csv")
data.head()

      X1     X2     X3     X4     X5     X6
0  0.283  0.144  0.049  0.012  0.013  0.086
1  0.276  0.125  0.039  0.013  0.002  0.062
2  0.281  0.141  0.045  0.021  0.013  0.074
3  0.328  0.189  0.043  0.001  0.030  0.032
4  0.290  0.161  0.044  0.011  0.070  0.076

from sklearn import linear_model


import numpy as np
data.columns = ['x1','x2','x3','x4','x5','x6']

X = data.iloc[:,data.columns != "x1"]
Y = data.iloc[:, 0]
X
      x2     x3     x4     x5     x6
0  0.144  0.049  0.012  0.013  0.086
1  0.125  0.039  0.013  0.002  0.062
2  0.141  0.045  0.021  0.013  0.074
3  0.189  0.043  0.001  0.030  0.032
4  0.161  0.044  0.011  0.070  0.076
5  0.186  0.047  0.018  0.050  0.007
6  0.106  0.036  0.008  0.012  0.095
7  0.117  0.030  0.006  0.003  0.145
8  0.174  0.050  0.008  0.061  0.112

Y.head()
0 0.283
1 0.276
2 0.281
3 0.328
4 0.290
Name: x1, dtype: float64

plt.scatter(X.x2,X.x3,marker="*",color="orange")

plt.scatter(X.x2,X.x4,marker="*",color="green")

plt.scatter(X.x2,X.x5,marker="*",color="red")

plt.scatter(X.x2,X.x6,marker="*",color="orange")

plt.scatter(X.x3,X.x4,marker="*",color="green")

plt.scatter(X.x3,X.x5,marker="*",color="red")

from sklearn.model_selection import train_test_split


from sklearn import linear_model
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

y_pred = model.predict(X_test)

y_pred
array([0.23386571, 0.31718764, 0.33311851, 0.29940205, 0.27633325,
0.24228869, 0.23924197, 0.29401786, 0.29011389])

from sklearn import metrics


print("mean square error ",metrics.mean_squared_error(Y_test, y_pred))
mean square error 0.0001948634357163518
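
The mean squared error is hard to judge on its own scale; a hedged sketch of additional diagnostics (not part of the original output), assuming the model and splits defined above:

# Further diagnostics for the multiple regression model (sketch)
print("R^2 on test:", metrics.r2_score(Y_test, y_pred))  # closer to 1 is better
print("Coefficients:", model.coef_)                      # one weight per predictor x2..x6
print("Intercept:", model.intercept_)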

Code for Insurance:

import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['insurance-1.csv']))
ivar = df[['age' , 'sex' , 'bmi' , 'children' , 'smoker' , 'region' , 'charges']]
fig = plt.figure()
ax = fig.add_subplot(111)
csx = ax.matshow(np.linalg.inv(ivar.corr()) , cmap = 'Blues')
fig.colorbar(csx)
ax.set_xticklabels(['']+['age' , 'sex' , 'bmi' , 'children' , 'smoker' , 'region' , 'charges'])
ax.set_yticklabels(['']+['age' , 'sex' , 'bmi' , 'children' , 'smoker' , 'region' , 'charges'])
df.head()

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
df.columns = ['age','sex','bmi','children','smoker','region','charges']

X = df.iloc[:,df.columns != "age"]
Y = df.iloc[:, 0]
X

         sex     bmi  children smoker     region      charges
0     female  27.900         0    yes  southwest  16884.92400
1       male  33.770         1     no  southeast   1725.55230
2       male  33.000         3     no  southeast   4449.46200
3       male  22.705         0     no  northwest  21984.47061
4       male  28.880         0     no  northwest   3866.85520
...      ...     ...       ...    ...        ...          ...
1333    male  30.970         3     no  northwest  10600.54830
1334  female  31.920         0     no  northeast   2205.98080
1335  female  36.850         0     no  southeast   1629.83350
1336  female  25.800         0     no  southwest   2007.94500
1337  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 6 columns]



Y.head()

0    19
1    18
2    28
3    33
4    32
Name: age, dtype: int64

plt.scatter(X.sex,X.bmi,marker="*",color="green")

plt.scatter(X.sex,X.children,marker="*",color="orange")

plt.scatter(X.sex,X.charges,marker="*",color="red")

plt.scatter(X.bmi,X.charges,marker="*",color="red")

plt.scatter(X.children,X.charges,marker="*",color="green")

plt.scatter(X.smoker,X.charges,marker="*",color="orange")

3. Logistic Regression

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

age=np.array([22,25,47,52,46,56,55,60,62,61,18,28,27,29,49,55,25,58,19,18,
              21,26,40,45,50,54,23])
bought_insurance=np.array([0,0,1,0,1,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1,1,
                           1,1,0])
print(age)
print(bought_insurance)
[22 25 47 52 46 56 55 60 62 61 18 28 27 29 49 55 25 58 19 18 21 26 40 45
50 54 23]
[0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0]

plt.scatter(age,bought_insurance,c=bought_insurance,cmap='rainbow')
plt.title("Scatter Plot Of Logistic Regression")
plt.show()

from sklearn.model_selection import train_test_split

age_train,age_test,bought_insurance_train,bought_insurance_test=train_test_split(
    age,bought_insurance,random_state=1)

print(age_train.shape,age_test.shape,bought_insurance_train.shape,bought_insurance_test.shape)
(20,) (7,) (20,) (7,)

log_reg=LogisticRegression()
log_reg.fit(age_train.reshape(-1,1),bought_insurance_train)

from sklearn.metrics import mean_squared_error

mean_squared_error(age,bought_insurance)
1760.2592592592594
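
Mean squared error between age and the labels is not a classification metric; with the classifier fitted on the training split, a minimal evaluation sketch (not part of the original output) would be:

# Score the fitted logistic model on the held-out ages (sketch)
pred = log_reg.predict(age_test.reshape(-1, 1))
print("Predicted:", pred)
print("Actual:   ", bought_insurance_test)
print("Accuracy:", log_reg.score(age_test.reshape(-1, 1), bought_insurance_test))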

4. K-Means Algorithm

import pandas as pd
import io
from google.colab import files

uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['cars.csv']))
df.head()
    mpg  cylinders  cubicinches   hp  weightlbs  time-to-60  year    brand
0  14.0          8          350  165       4209          12  1972      US.
1  31.9          4           89   71       1925          14  1980  Europe.
2  17.0          8          302  140       3449          11  1971      US.
3  15.0          8          400  150       3761          10  1971      US.
4  30.5          4           98   63       2051          17  1978      US.

X = df.iloc[:,:-1].values
X = pd.DataFrame(X)
X.columns = ['mpg','cylinders','cubicinches','hp','weightlbs','time-to-60','year']
X.head()
    mpg  cylinders  cubicinches   hp  weightlbs  time-to-60  year
0  14.0          8          350  165       4209          12  1972
1  31.9          4           89   71       1925          14  1980
2  17.0          8          302  140       3449          11  1971
3  15.0          8          400  150       3761          10  1971
4  30.5          4           98   63       2051          17  1978

X.describe()

          mpg  cylinders  cubicinches   hp  weightlbs  time-to-60  year
count   261.0        261          261  261        261         261   261
unique  103.0          5           75   85        240          17    13
top      14.0          4           97  150       2130          16  1974
freq     16.0        125           16   16          3          45    35

X['mpg'] = pd.to_numeric(X['mpg'], errors='coerce').astype('float64')
X['cylinders'] = pd.to_numeric(X['cylinders'], errors='coerce').astype('float64')
X['cubicinches'] = pd.to_numeric(X['cubicinches'], errors='coerce').astype('float64')
X['hp'] = pd.to_numeric(X['hp'], errors='coerce').astype('float64')
X['weightlbs'] = pd.to_numeric(X['weightlbs'], errors='coerce').astype('float64')
X['time-to-60'] = pd.to_numeric(X['time-to-60'], errors='coerce').astype('float64')
X['year'] = pd.to_numeric(X['year'], errors='coerce').astype('float64')

import numpy as np
X = X.replace([np.inf, -np.inf], np.nan)
print(X.isna().sum())
mean1=X['cubicinches'].mean()
X['cubicinches']=X['cubicinches'].fillna(mean1)
mean2=X['weightlbs'].mean()
X['weightlbs']=X['weightlbs'].fillna(mean2)
print(mean1,mean2)
print(X.isna().sum())

mpg 0
cylinders 0
cubicinches 2
hp 0
weightlbs 3
time-to-60 0

year 0
dtype: int64
200.9189189189189 3009.8333333333335
mpg 0
cylinders 0
cubicinches 0
hp 0
weightlbs 0
time-to-60 0
year 0
dtype: int64

import matplotlib.pyplot as plt



# Using the elbow method to find the optimal number of clusters


from sklearn.cluster import KMeans
wcss = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11),wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Applying k-means to the cars dataset


kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X)

X = X.to_numpy()  # convert to a NumPy array so the positional indexing below works

# Visualising the clusters


plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='red', label='US')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='blue', label='Japan')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s=100, c='green', label='Europe')
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of car brands')
plt.legend()
plt.show()
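
The choice of k = 3 can be sanity-checked with a silhouette score; a minimal sketch (not in the original file), assuming X and y_kmeans from the cells above:

from sklearn.metrics import silhouette_score

# Silhouette ranges from -1 to 1; higher means tighter, better-separated clusters
print("Silhouette for k=3:", silhouette_score(X, y_kmeans))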

5. Decision Tree Algorithm:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
# for train test splitting
from sklearn.model_selection import train_test_split
# for decision tree object
from sklearn.tree import DecisionTreeClassifier
# for checking testing results
from sklearn.metrics import classification_report, confusion_matrix
# for visualizing tree
from sklearn.tree import plot_tree

from google.colab import files


uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['Iris.csv']))

df.head()

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0            5.1           3.5            1.4           0.2  Iris-setosa
1            4.9           3.0            1.4           0.2  Iris-setosa
2            4.7           3.2            1.3           0.2  Iris-setosa
3            4.6           3.1            1.5           0.2  Iris-setosa
4            5.0           3.6            1.4           0.2  Iris-setosa

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 150 non-null int64
1 SepalLengthCm 150 non-null float64
2 SepalWidthCm 150 non-null float64
3 PetalLengthCm 150 non-null float64
4 PetalWidthCm 150 non-null float64
5 Species 150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB

df.shape
(150, 6)

df.isnull().any()
Id False
SepalLengthCm False
SepalWidthCm False
PetalLengthCm False
PetalWidthCm False
Species False
dtype: bool

df['species'] = np.random.choice(2,150)
sns.pairplot(df, hue="species", height=2.5)

sns.heatmap(df.corr())

target = df['species']
df1 = df.copy()
df1 = df1.drop('species', axis =1)

X = df1

target
0 0
1 1
2 1
3 1
4 0
..
145 1
146 0
147 1
148 1
149 1
Name: species, Length: 150, dtype: int64

le = LabelEncoder()
target = le.fit_transform(target)
target
array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])

y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training split input- ", X_train.shape)
print("Testing split input- ", X_test.shape)
Training split input- (120, 6)
Testing split input- (30, 6)

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
print('Decision Tree Classifier Created')

y_pred = dtree.predict(X_test)
print("Classification report - \n", classification_report(y_test,y_pred))
Classification report -
              precision    recall  f1-score   support
0 1.00 1.00 1.00 10
1 1.00 1.00 1.00 9
2 1.00 1.00 1.00 11

accuracy 1.00 30
macro avg 1.00 1.00 1.00 30
weighted avg 1.00 1.00 1.00 30

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,5))
sns.heatmap(data=cm, linewidths=.5, annot=True, square=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(dtree.score(X_test, y_test))
plt.title(all_sample_title, size = 15)

dec_tree = plot_tree(decision_tree=dtree, feature_names=df1.columns,
                     class_names=["setosa", "versicolor", "virginica"],
                     filled=True, precision=4, rounded=True)
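
The same tree can also be dumped as plain text, which is often easier to read in a report than the rendered figure; a sketch (not in the original file) using scikit-learn's export_text on the dtree fitted above:

from sklearn.tree import export_text

# One line per split/leaf of the fitted tree
print(export_text(dtree, feature_names=list(df1.columns)))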

6. Neural Networks:

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

observations=1000
xs=np.random.uniform(-10,10,(observations,1))
zs=np.random.uniform(-10,10,(observations,1))
generated_inputs=np.column_stack((xs,zs))
noise=np.random.uniform(-10,10,(observations,1))
generated_target=2*xs-3*zs+5+noise

np.savez('TF_intro',input=generated_inputs,targets=generated_target)
training_data=np.load('TF_intro.npz')

input_size=2
output_size=1
models = tf.keras.Sequential([
tf.keras.layers.Dense(output_size)
])
custom_optimizer=tf.keras.optimizers.SGD(learning_rate=0.02)
models.compile(optimizer=custom_optimizer,loss='mean_squared_error')
models.fit(training_data['input'],training_data['targets'],epochs=100,verbose=1)
Epoch 1/100
32/32 [==============================] - 1s 1ms/step - loss: 60.4431
Epoch 2/100
32/32 [==============================] - 0s 1ms/step - loss: 40.1109
Epoch 3/100
32/32 [==============================] - 0s 1ms/step - loss: 39.9997
Epoch 4/100
32/32 [==============================] - 0s 1ms/step - loss: 34.7306
Epoch 5/100
32/32 [==============================] - 0s 1ms/step - loss: 36.7232

models.layers[0].get_weights()
[array([[ 2.0985565],
[-2.907345 ]], dtype=float32), array([5.32353], dtype=float32)]

weights=models.layers[0].get_weights()[0]
bias=models.layers[0].get_weights()[1]

out=training_data['targets'].round(1)
from sklearn.metrics import mean_squared_error
mean_squared_error(generated_target, out, squared=False)
0.02858235386343541

plt.scatter(np.squeeze(models.predict_on_batch(training_data['input'])),np.squeeze(training_data['targets']),c='#88c999')
plt.xlabel('Input')
plt.ylabel('Predicted Output')
plt.show()
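
Since the targets were generated as 2*xs - 3*zs + 5 plus noise, the learned parameters can be checked directly against the generating function; a short sketch (not in the original output):

# Compare learned parameters with the true generating function 2*x - 3*z + 5
print("Learned weights:", weights.flatten())  # should be close to [2, -3]
print("Learned bias:", bias)                  # should be close to [5]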

7. Naive Bayesian Classification:

import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
%matplotlib inline

def cross_validate(estimator, train, validation):
    X_train = train[0]
    Y_train = train[1]
    X_val = validation[0]
    Y_val = validation[1]
    train_predictions = estimator.predict(X_train)
    train_accuracy = accuracy_score(train_predictions, Y_train)
    train_recall = recall_score(train_predictions, Y_train)
    train_precision = precision_score(train_predictions, Y_train)

    val_predictions = estimator.predict(X_val)
    val_accuracy = accuracy_score(val_predictions, Y_val)
    val_recall = recall_score(val_predictions, Y_val)
    val_precision = precision_score(val_predictions, Y_val)

    print('Model metrics')
    print('Accuracy Train: %.2f, Validation: %.2f' % (train_accuracy, val_accuracy))
    print('Recall Train: %.2f, Validation: %.2f' % (train_recall, val_recall))
    print('Precision Train: %.2f, Validation: %.2f' % (train_precision, val_precision))

from google.colab import files


uploaded = files.upload()
test.csv(application/vnd.ms-excel) - 28629 bytes, last modified: 6/29/2021 - 100% done
train.csv(application/vnd.ms-excel) - 61194 bytes, last modified: 6/29/2021 - 100% done
Saving test.csv to test.csv

Saving train.csv to train.csv

import io
train_data = pd.read_csv(io.StringIO(uploaded['train.csv'].decode('utf-8')))

import io
test_data = pd.read_csv(io.StringIO(uploaded['test.csv'].decode('utf-8')))

train_data['train'] = 1
test_data['train'] = 0
data = train_data.append(test_data, sort=False)

test_ids = test_data['PassengerId'].values

data.head()

   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  train
0            1       0.0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S      1
1            2       1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C      1
2            3       1.0       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S      1
3            4       1.0       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S      1
4            5       0.0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S      1

data.describe()

       PassengerId    Survived       Pclass          Age        SibSp        Parch         Fare        train
count  1309.000000  891.000000  1309.000000  1046.000000  1309.000000  1309.000000  1308.000000  1309.000000
mean    655.000000    0.383838     2.294882    29.881138     0.498854     0.385027    33.295479     0.680672
std     378.020061    0.486592     0.837836    14.413493     1.041658     0.865560    51.758668     0.466394
min       1.000000    0.000000     1.000000     0.170000     0.000000     0.000000     0.000000     0.000000
25%     328.000000    0.000000     2.000000    21.000000     0.000000     0.000000     7.895800     0.000000
50%     655.000000    0.000000     3.000000    28.000000     0.000000     0.000000    14.454200     1.000000
75%     982.000000    1.000000     3.000000    39.000000     1.000000     0.000000    31.275000     1.000000
max    1309.000000    1.000000     3.000000    80.000000     8.000000     9.000000   512.329200     1.000000

features = ['Age', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp']


target = 'Survived'

data = data[features + [target] + ['train']]

data['Sex'] = data['Sex'].replace(["female", "male"], [0, 1])


data['Embarked'] = data['Embarked'].replace(['S', 'C', 'Q'], [1, 2, 3])
data['Age'] = pd.qcut(data['Age'], 10, labels=False)

train = data.query('train == 1')


test = data.query('train == 0')
train.dropna(axis=0, inplace=True)
labels = train[target].values

train.head()

   Age  Embarked     Fare  Parch  Pclass  Sex  SibSp  Survived  train
0  2.0       1.0   7.2500      0       3    1      1       0.0      1
1  7.0       2.0  71.2833      0       1    0      1       1.0      1
2  4.0       1.0   7.9250      0       3    0      0       1.0      1
3  6.0       1.0  53.1000      0       1    0      1       1.0      1
4  6.0       1.0   8.0500      0       3    1      0       0.0      1

columns = train[features + [target]].columns.tolist()
nColumns = len(columns)
result = pd.DataFrame(np.zeros((nColumns, nColumns)), columns=columns)

for col_a in range(nColumns):
    for col_b in range(nColumns):
        result.iloc[[col_a], [col_b]] = pearsonr(train.loc[:, columns[col_a]],
                                                 train.loc[:, columns[col_b]])[0]

fig, ax = plt.subplots(figsize=(10,10))
ax = sns.heatmap(result, yticklabels=columns, vmin=-1, vmax=1, annot=True, fmt='.2f', linewidths=.2)
ax.set_title('PCC - Pearson correlation coefficient')
plt.show()

continuous_numeric_features = ['Age', 'Fare', 'Parch', 'SibSp']

for feature in continuous_numeric_features:
    sns.distplot(train[feature])
    plt.show()

train.drop(['train', target, 'Pclass'], axis=1, inplace=True)


test.drop(['train', target, 'Pclass'], axis=1, inplace=True)

X_train, X_val, Y_train, Y_val = train_test_split(train, labels, test_size=0.2, random_state=1)

X_train.head()

     Age  Embarked     Fare  Parch  Sex  SibSp
830  1.0       2.0  14.4542      0    0      1
566  1.0       1.0   7.8958      0    1      0
149  7.0       1.0  13.0000      0    1      0
106  2.0       1.0   7.6500      0    0      0
290  4.0       1.0  78.8500      0    0      0

X_train1, X_train2, Y_train1, Y_train2 = train_test_split(X_train, Y_train, test_size=0.3, random_state=12)

classifier = GaussianNB()

classifier.fit(X_train2, Y_train2)
GaussianNB(priors=None, var_smoothing=1e-09)

print('Metrics with only 30% of train data')


cross_validate(classifier, (X_train, Y_train), (X_val, Y_val))
Metrics with only 30% of train data
Model metrics
Accuracy Train: 0.79, Validation: 0.76
Recall Train: 0.75, Validation: 0.71
Precision Train: 0.71, Validation: 0.71

classifier.partial_fit(X_train1, Y_train1)
GaussianNB(priors=None, var_smoothing=1e-09)

print('Metrics with the remaining 70% of train data')


cross_validate(classifier, (X_train, Y_train), (X_val, Y_val))
Metrics with the remaining 70% of train data
Model metrics
Accuracy Train: 0.80, Validation: 0.76
Recall Train: 0.78, Validation: 0.70
Precision Train: 0.69, Validation: 0.69

print('Probability of each class')


print('Survive = 0: %.2f' % classifier.class_prior_[0])
print('Survive = 1: %.2f' % classifier.class_prior_[1])
Probability of each class
Survive = 0: 0.60
Survive = 1: 0.40

print('Mean of each feature per class')


print(' Age Embarked Fare Parch Sex SibSp')
print('Survive = 0: %s' % classifier.theta_[0])
print('Survive = 1: %s' % classifier.theta_[1])
Mean of each feature per class
Age Embarked Fare Parch Sex SibSp
Survive = 0: [ 4.5339233 1.23893805 22.97357316 0.37463127 0.86135693 0.5280236 ]
Survive = 1: [ 4.27391304 1.33478261 53.52155957 0.52608696 0.33043478 0.49130435]

print('Variance of each feature per class')


print('Survive = 0: %s' % classifier.sigma_[0])
print('Survive = 1: %s' % classifier.sigma_[1])
Variance of each feature per class
Survive = 0: [8.32554825e+00 2.88044224e-01 8.60096730e+02 8.36055467e-01
1.19424042e-01 1.11647419e+00]
Survive = 1: [8.45106148e+00 2.74880003e-01 5.28959488e+03 6.75409304e-01
 2.21250514e-01 4.41231610e-01]
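
These per-class means (theta_) and variances (sigma_) are exactly what GaussianNB plugs into the Gaussian density when it scores a sample; a hedged sketch (not in the original file) reproducing the joint log-probability for the first validation row by hand and comparing it with sklearn's normalized output:

# Recompute GaussianNB's class scores manually for one sample (sketch)
x0 = X_val.iloc[0].values
log_prior = np.log(classifier.class_prior_)
# Sum of per-feature Gaussian log-densities, one entry per class
log_likelihood = -0.5 * np.sum(
    np.log(2 * np.pi * classifier.sigma_)
    + (x0 - classifier.theta_) ** 2 / classifier.sigma_,
    axis=1)
print(log_prior + log_likelihood)                     # unnormalized joint log-probability
print(classifier.predict_log_proba(X_val.iloc[[0]]))  # sklearn's normalized version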

test.fillna(test.mean(), inplace=True)
test_predictions = classifier.predict(test)
submission = pd.DataFrame({'PassengerId': test_ids})
submission['Survived'] = test_predictions.astype('int')
submission.to_csv('submission.csv', index=False)
submission.head(10)

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0