
komal_knn1_minMaxScalar

In [36]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

import seaborn as sns

sns.set(font_scale=1.5)
sns.set(style='white', color_codes=True)

In [2]: location = r"D:\komal\SIMPLILEARN\MY COURSES\IN PROGRESS\DATA SCIENCE WITH PYTHON\Live class downloads\Aug 11 Sat - Sep 15 Sat - Attending\datasets\iris.csv"

In [3]: # load the iris data set
df_iris = pd.read_csv(location)
df_iris.head()

Out[3]:
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [4]: # Check the available styles
plt.style.available

Out[4]: ['bmh',
'classic',
'dark_background',
'fast',
'fivethirtyeight',
'ggplot',
'grayscale',
'seaborn-bright',
'seaborn-colorblind',
'seaborn-dark-palette',
'seaborn-dark',
'seaborn-darkgrid',
'seaborn-deep',
'seaborn-muted',
'seaborn-notebook',
'seaborn-paper',
'seaborn-pastel',
'seaborn-poster',
'seaborn-talk',
'seaborn-ticks',
'seaborn-white',
'seaborn-whitegrid',
'seaborn',
'Solarize_Light2',
'tableau-colorblind10',
'_classic_test']

In [5]: plt.style.use('ggplot')

In [6]: # Means are in the same order of magnitude for all features, so scaling
# might not be beneficial here.
# If the means were of different orders of magnitude, scaling could
# significantly improve the accuracy of a distance-based classifier like KNN.

df_iris.describe()

Out[6]:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
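Why scaling matters for KNN: the classifier ranks neighbors by Euclidean distance, so a feature with a much larger numeric range dominates that distance. A minimal illustration (the two samples and the per-feature min/max below are hypothetical, not taken from the iris data):

# Two hypothetical samples where the second feature is on a much larger scale.
a = np.array([1.0, 500.0])
b = np.array([2.0, 510.0])
print(np.linalg.norm(a - b))   # ~10.05: the large-scale feature dominates

# After min-max scaling both features to [0, 1], they contribute comparably.
lo, hi = np.array([0.0, 400.0]), np.array([5.0, 600.0])  # hypothetical per-feature min/max
a_s, b_s = (a - lo) / (hi - lo), (b - lo) / (hi - lo)
print(np.linalg.norm(a_s - b_s))   # ~0.21: both features now matter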


In [7]: X = df_iris.drop('class', axis=1).values  # drop the target column; features as ndarray

y1 = df_iris['class'].values  # target as ndarray (not used below)
y = df_iris['class']          # target as Series (keeps the 'class' name for concat later)

In [8]: scaler = MinMaxScaler()
scaler

Out[8]: MinMaxScaler(copy=True, feature_range=(0, 1))


In [9]: X_scaled = scaler.fit_transform(X)

print('X_scaled type is', type(X_scaled))
X_scaled


X_scaled type is <class 'numpy.ndarray'>


Out[9]: array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
[0.16666667, 0.41666667, 0.06779661, 0.04166667],
[0.11111111, 0.5 , 0.05084746, 0.04166667],
[0.08333333, 0.45833333, 0.08474576, 0.04166667],
[0.19444444, 0.66666667, 0.06779661, 0.04166667],
[0.30555556, 0.79166667, 0.11864407, 0.125 ],
[0.08333333, 0.58333333, 0.06779661, 0.08333333],
[0.19444444, 0.58333333, 0.08474576, 0.04166667],
[0.02777778, 0.375 , 0.06779661, 0.04166667],
[0.16666667, 0.45833333, 0.08474576, 0. ],
[0.30555556, 0.70833333, 0.08474576, 0.04166667],
[0.13888889, 0.58333333, 0.10169492, 0.04166667],
[0.13888889, 0.41666667, 0.06779661, 0. ],
[0. , 0.41666667, 0.01694915, 0. ],
[0.41666667, 0.83333333, 0.03389831, 0.04166667],
[0.38888889, 1. , 0.08474576, 0.125 ],
[0.30555556, 0.79166667, 0.05084746, 0.125 ],
[0.22222222, 0.625 , 0.06779661, 0.08333333],
[0.38888889, 0.75 , 0.11864407, 0.08333333],
[0.22222222, 0.75 , 0.08474576, 0.08333333],
[0.30555556, 0.58333333, 0.11864407, 0.04166667],
[0.22222222, 0.70833333, 0.08474576, 0.125 ],
[0.08333333, 0.66666667, 0. , 0.04166667],
[0.22222222, 0.54166667, 0.11864407, 0.16666667],
[0.13888889, 0.58333333, 0.15254237, 0.04166667],
[0.19444444, 0.41666667, 0.10169492, 0.04166667],
[0.19444444, 0.58333333, 0.10169492, 0.125 ],
[0.25 , 0.625 , 0.08474576, 0.04166667],
[0.25 , 0.58333333, 0.06779661, 0.04166667],
[0.11111111, 0.5 , 0.10169492, 0.04166667],
[0.13888889, 0.45833333, 0.10169492, 0.04166667],
[0.30555556, 0.58333333, 0.08474576, 0.125 ],
[0.25 , 0.875 , 0.08474576, 0. ],
[0.33333333, 0.91666667, 0.06779661, 0.04166667],
[0.16666667, 0.45833333, 0.08474576, 0. ],
[0.19444444, 0.5 , 0.03389831, 0.04166667],
[0.33333333, 0.625 , 0.05084746, 0.04166667],
[0.16666667, 0.45833333, 0.08474576, 0. ],
[0.02777778, 0.41666667, 0.05084746, 0.04166667],
[0.22222222, 0.58333333, 0.08474576, 0.04166667],
[0.19444444, 0.625 , 0.05084746, 0.08333333],
[0.05555556, 0.125 , 0.05084746, 0.08333333],
[0.02777778, 0.5 , 0.05084746, 0.04166667],
[0.19444444, 0.625 , 0.10169492, 0.20833333],
[0.22222222, 0.75 , 0.15254237, 0.125 ],
[0.13888889, 0.41666667, 0.06779661, 0.08333333],
[0.22222222, 0.75 , 0.10169492, 0.04166667],
[0.08333333, 0.5 , 0.06779661, 0.04166667],
[0.27777778, 0.70833333, 0.08474576, 0.04166667],
[0.19444444, 0.54166667, 0.06779661, 0.04166667],
[0.75 , 0.5 , 0.62711864, 0.54166667],
[0.58333333, 0.5 , 0.59322034, 0.58333333],
[0.72222222, 0.45833333, 0.66101695, 0.58333333],
[0.33333333, 0.125 , 0.50847458, 0.5 ],
[0.61111111, 0.33333333, 0.61016949, 0.58333333],
[0.38888889, 0.33333333, 0.59322034, 0.5 ],
[0.55555556, 0.54166667, 0.62711864, 0.625 ],
[0.16666667, 0.16666667, 0.38983051, 0.375     ],
[0.63888889, 0.375 , 0.61016949, 0.5 ],
[0.25 , 0.29166667, 0.49152542, 0.54166667],
[0.19444444, 0. , 0.42372881, 0.375 ],
[0.44444444, 0.41666667, 0.54237288, 0.58333333],
[0.47222222, 0.08333333, 0.50847458, 0.375 ],
[0.5 , 0.375 , 0.62711864, 0.54166667],
[0.36111111, 0.375 , 0.44067797, 0.5 ],
[0.66666667, 0.45833333, 0.57627119, 0.54166667],
[0.36111111, 0.41666667, 0.59322034, 0.58333333],
[0.41666667, 0.29166667, 0.52542373, 0.375 ],
[0.52777778, 0.08333333, 0.59322034, 0.58333333],
[0.36111111, 0.20833333, 0.49152542, 0.41666667],
[0.44444444, 0.5 , 0.6440678 , 0.70833333],
[0.5 , 0.33333333, 0.50847458, 0.5 ],
[0.55555556, 0.20833333, 0.66101695, 0.58333333],
[0.5 , 0.33333333, 0.62711864, 0.45833333],
[0.58333333, 0.375 , 0.55932203, 0.5 ],
[0.63888889, 0.41666667, 0.57627119, 0.54166667],
[0.69444444, 0.33333333, 0.6440678 , 0.54166667],
[0.66666667, 0.41666667, 0.6779661 , 0.66666667],
[0.47222222, 0.375 , 0.59322034, 0.58333333],
[0.38888889, 0.25 , 0.42372881, 0.375 ],
[0.33333333, 0.16666667, 0.47457627, 0.41666667],
[0.33333333, 0.16666667, 0.45762712, 0.375 ],
[0.41666667, 0.29166667, 0.49152542, 0.45833333],
[0.47222222, 0.29166667, 0.69491525, 0.625 ],
[0.30555556, 0.41666667, 0.59322034, 0.58333333],
[0.47222222, 0.58333333, 0.59322034, 0.625 ],
[0.66666667, 0.45833333, 0.62711864, 0.58333333],
[0.55555556, 0.125 , 0.57627119, 0.5 ],
[0.36111111, 0.41666667, 0.52542373, 0.5 ],
[0.33333333, 0.20833333, 0.50847458, 0.5 ],
[0.33333333, 0.25 , 0.57627119, 0.45833333],
[0.5 , 0.41666667, 0.61016949, 0.54166667],
[0.41666667, 0.25 , 0.50847458, 0.45833333],
[0.19444444, 0.125 , 0.38983051, 0.375 ],
[0.36111111, 0.29166667, 0.54237288, 0.5 ],
[0.38888889, 0.41666667, 0.54237288, 0.45833333],
[0.38888889, 0.375 , 0.54237288, 0.5 ],
[0.52777778, 0.375 , 0.55932203, 0.5 ],
[0.22222222, 0.20833333, 0.33898305, 0.41666667],
[0.38888889, 0.33333333, 0.52542373, 0.5 ],
[0.55555556, 0.54166667, 0.84745763, 1. ],
[0.41666667, 0.29166667, 0.69491525, 0.75 ],
[0.77777778, 0.41666667, 0.83050847, 0.83333333],
[0.55555556, 0.375 , 0.77966102, 0.70833333],
[0.61111111, 0.41666667, 0.81355932, 0.875 ],
[0.91666667, 0.41666667, 0.94915254, 0.83333333],
[0.16666667, 0.20833333, 0.59322034, 0.66666667],
[0.83333333, 0.375 , 0.89830508, 0.70833333],
[0.66666667, 0.20833333, 0.81355932, 0.70833333],
[0.80555556, 0.66666667, 0.86440678, 1. ],
[0.61111111, 0.5 , 0.69491525, 0.79166667],
[0.58333333, 0.29166667, 0.72881356, 0.75 ],
[0.69444444, 0.41666667, 0.76271186, 0.83333333],
[0.38888889, 0.20833333, 0.6779661 , 0.79166667],

[0.41666667, 0.33333333, 0.69491525, 0.95833333],
[0.58333333, 0.5 , 0.72881356, 0.91666667],
[0.61111111, 0.41666667, 0.76271186, 0.70833333],
[0.94444444, 0.75 , 0.96610169, 0.875 ],
[0.94444444, 0.25 , 1. , 0.91666667],
[0.47222222, 0.08333333, 0.6779661 , 0.58333333],
[0.72222222, 0.5 , 0.79661017, 0.91666667],
[0.36111111, 0.33333333, 0.66101695, 0.79166667],
[0.94444444, 0.33333333, 0.96610169, 0.79166667],
[0.55555556, 0.29166667, 0.66101695, 0.70833333],
[0.66666667, 0.54166667, 0.79661017, 0.83333333],
[0.80555556, 0.5 , 0.84745763, 0.70833333],
[0.52777778, 0.33333333, 0.6440678 , 0.70833333],
[0.5 , 0.41666667, 0.66101695, 0.70833333],
[0.58333333, 0.33333333, 0.77966102, 0.83333333],
[0.80555556, 0.41666667, 0.81355932, 0.625 ],
[0.86111111, 0.33333333, 0.86440678, 0.75 ],
[1. , 0.75 , 0.91525424, 0.79166667],
[0.58333333, 0.33333333, 0.77966102, 0.875 ],
[0.55555556, 0.33333333, 0.69491525, 0.58333333],
[0.5 , 0.25 , 0.77966102, 0.54166667],
[0.94444444, 0.41666667, 0.86440678, 0.91666667],
[0.55555556, 0.58333333, 0.77966102, 0.95833333],
[0.58333333, 0.45833333, 0.76271186, 0.70833333],
[0.47222222, 0.41666667, 0.6440678 , 0.70833333],
[0.72222222, 0.45833333, 0.74576271, 0.83333333],
[0.66666667, 0.45833333, 0.77966102, 0.95833333],
[0.72222222, 0.45833333, 0.69491525, 0.91666667],
[0.41666667, 0.29166667, 0.69491525, 0.75 ],
[0.69444444, 0.5 , 0.83050847, 0.91666667],
[0.66666667, 0.54166667, 0.79661017, 1. ],
[0.66666667, 0.41666667, 0.71186441, 0.91666667],
[0.55555556, 0.20833333, 0.6779661 , 0.75 ],
[0.61111111, 0.41666667, 0.71186441, 0.79166667],
[0.52777778, 0.58333333, 0.74576271, 0.91666667],
[0.44444444, 0.41666667, 0.69491525, 0.70833333]])
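As a sanity check, MinMaxScaler applies x' = (x - min) / (max - min) column by column; the same result can be reproduced by hand (a minimal sketch, assuming X and X_scaled from the cells above):

# Manual column-wise min-max scaling; should match scaler.fit_transform(X).
X_manual = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
print(np.allclose(X_manual, X_scaled))   # True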

In [10]: # transform back to a DataFrame for easier exploration/plotting
X_scaled_df = pd.DataFrame(X_scaled, columns=['s_SepalLength','s_SepalWidth',
                                              's_PetalLength','s_PetalWidth'])
X_scaled_df.head()

Out[10]:
   s_SepalLength  s_SepalWidth  s_PetalLength  s_PetalWidth
0       0.222222      0.625000       0.067797      0.041667
1       0.166667      0.416667       0.067797      0.041667
2       0.111111      0.500000       0.050847      0.041667
3       0.083333      0.458333       0.084746      0.041667
4       0.194444      0.666667       0.067797      0.041667


In [11]: df_iris_scaled = pd.concat([X_scaled_df, y], axis=1)
df_iris_scaled.head()

Out[11]:
   s_SepalLength  s_SepalWidth  s_PetalLength  s_PetalWidth        class
0       0.222222      0.625000       0.067797      0.041667  Iris-setosa
1       0.166667      0.416667       0.067797      0.041667  Iris-setosa
2       0.111111      0.500000       0.050847      0.041667  Iris-setosa
3       0.083333      0.458333       0.084746      0.041667  Iris-setosa
4       0.194444      0.666667       0.067797      0.041667  Iris-setosa


In [12]: # Note that after scaling, the x-axis range is the same (0 to 1)
# on the subplots for all four features.
fig = plt.figure(figsize=(14,9))
fig.suptitle('Frequency Distribution of Features by Species ',fontsize=20)

ax1 = fig.add_subplot(221)
df_iris_scaled.groupby("class").s_PetalLength.plot(kind='hist',
alpha=0.8,
legend=True,
title='s_PetalLength')

ax2 = fig.add_subplot(222,sharey=ax1)
df_iris_scaled.groupby("class").s_PetalWidth.plot(kind='hist',
alpha=0.8,
legend=True,
title='s_PetalWidth')

ax3 = fig.add_subplot(223,sharey=ax1)
df_iris_scaled.groupby("class").s_SepalLength.plot(kind='hist',
alpha=0.8,
legend=True,
title='s_SepalLength')

ax4 = fig.add_subplot(224,sharey=ax1)
df_iris_scaled.groupby("class").s_SepalWidth.plot(kind='hist',
alpha=0.8,
legend=True,
title='s_SepalWidth');

[Figure: 'Frequency Distribution of Features by Species', a 2x2 grid of histograms of the four scaled features grouped by class]

In [13]: X_scaled_df.describe()

Out[13]:
       s_SepalLength  s_SepalWidth  s_PetalLength  s_PetalWidth
count     150.000000    150.000000     150.000000    150.000000
mean        0.428704      0.439167       0.467571      0.457778
std         0.230018      0.180664       0.299054      0.317984
min         0.000000      0.000000       0.000000      0.000000
25%         0.222222      0.333333       0.101695      0.083333
50%         0.416667      0.416667       0.567797      0.500000
75%         0.583333      0.541667       0.694915      0.708333
max         1.000000      1.000000       1.000000      1.000000

In [18]: # train and test split (on the unscaled X; In [6] noted scaling may not help here)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [19]: print("train sample size", X_train.shape, type(X_train))
print("test sample size", X_test.shape, type(X_test))

train sample size (105, 4) <class 'numpy.ndarray'>
test sample size (45, 4) <class 'numpy.ndarray'>
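A side note, not part of the original run: with only 45 test samples the class proportions can drift between the splits; passing stratify=y keeps each class's proportion the same in train and test (hypothetical variable names used here to avoid clobbering the ones above):

# Stratified variant of the split above.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3,
                                          random_state=0, stratify=y)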

In [23]: clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)

Out[23]: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
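n_neighbors=5 is just one choice of k; a quick way to compare others is a small loop scored on the test set (a minimal sketch reusing the variables above; proper tuning would cross-validate on the training set instead):

# Compare odd values of k by test-set accuracy.
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print(k, metrics.accuracy_score(y_test, knn.predict(X_test)))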

In [24]: y_pred = clf.predict(X_test)

In [28]: # Creates a confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)

In [29]: cm

Out[29]: array([[16,  0,  0],
       [ 0, 17,  1],
       [ 0,  0, 11]], dtype=int64)


In [32]: CT = pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
CT

Out[32]:
Predicted        Iris-setosa  Iris-versicolor  Iris-virginica  All
True
Iris-setosa               16                0               0   16
Iris-versicolor            0               17               1   18
Iris-virginica             0                0              11   11
All                       16               17              12   45

In [38]: from sklearn.metrics import accuracy_score

One insight from the matrix: every setosa (16/16) and virginica (11/11) sample was classified correctly, but one versicolor sample was misclassified as virginica (recall 17/18 ≈ 0.944). That stray prediction also lowers virginica's precision to 11/12 ≈ 0.917.
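These per-class figures can be read straight off the confusion matrix from In [28]: recall divides the diagonal by the row sums (actual counts), precision by the column sums (predicted counts). A minimal sketch:

# Rows of cm are true classes, columns are predicted classes.
recall = cm.diagonal() / cm.sum(axis=1)      # ≈ [1.0, 0.944, 1.0]
precision = cm.diagonal() / cm.sum(axis=0)   # ≈ [1.0, 1.0, 0.917]
print(recall, precision)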

In [39]: plt.figure(figsize=(6,4))
sns.heatmap(CT, annot=True)
plt.title('KNN classification model \nAccuracy:{0:.3f}'.format(accuracy_score(y_test, y_pred)))
plt.ylabel('True label')
plt.xlabel('Predicted label')

Out[39]: Text(0.5,16,'Predicted label')


In [42]: from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        16
Iris-versicolor       1.00      0.94      0.97        18
 Iris-virginica       0.92      1.00      0.96        11

    avg / total       0.98      0.98      0.98        45

In [43]: # Classification accuracy: overall, how often is the classifier correct?
print(metrics.accuracy_score(y_test, y_pred))

# Classification error: overall, how often is the classifier incorrect?
print(1 - metrics.accuracy_score(y_test, y_pred))

0.9777777777777777
0.022222222222222254
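Equivalently, accuracy is the diagonal of the confusion matrix summed and divided by the total count (using cm from In [28]):

print(cm.trace() / cm.sum())   # 44/45 ≈ 0.9778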

In [45]: # Sensitivity: when the actual value is +ve, how often is the prediction correct?
# Also known as "True Positive Rate" or "Recall"
# should be MAXIMIZED
# print(metrics.recall_score(y_test, y_pred, average=None))

# Specificity: when the actual value is -ve, how often is the prediction correct?
# Also known as "Selectivity" or "True Negative Rate"
# should be MAXIMIZED

# False Positive Rate: when the actual value is -ve, how often is the
# prediction incorrect?
# Equals 1 - Specificity

# Precision: when a +ve value is predicted, how often is the prediction correct?
# print(metrics.precision_score(y_test, y_pred, average=None))
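A runnable sketch of the commented-out calls above: average must be the Python None (not the string 'none') so one score per class is returned, and per-class specificity for the multiclass case can be derived from the confusion matrix cm:

# Per-class recall (sensitivity) and precision.
print(metrics.recall_score(y_test, y_pred, average=None))      # ≈ [1.0, 0.944, 1.0]
print(metrics.precision_score(y_test, y_pred, average=None))   # ≈ [1.0, 1.0, 0.917]

# Per-class specificity = TN / (TN + FP), derived from cm.
fp = cm.sum(axis=0) - cm.diagonal()    # false positives per class
tn = cm.sum() - cm.sum(axis=1) - fp    # true negatives per class
print(tn / (tn + fp))                  # ≈ [1.0, 1.0, 0.971]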
