KNN - Predict whether a person will have diabetes or not
In [4]:
# Import the important libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load the dataset
df = pd.read_csv('diabetes.csv')

df.head()
Out[2]:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  ...
0            6      148             72             35        0  33.6                     0.627  ...
1            1       85             66             29        0  26.6                     0.351  ...
2            8      183             64              0        0  23.3                     0.672  ...
3            1       89             66             23       94  28.1                     0.167  ...
4            0      137             40             35      168  43.1                     2.288  ...
In [3]:
# Information of the dataset
df.info()
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    int64
 2   BloodPressure             768 non-null    int64
 3   SkinThickness             768 non-null    int64
 4   Insulin                   768 non-null    int64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

In [4]:
# Description of the data
df.describe()
Out[4]:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin         BMI  ...
count   768.000000  768.000000     768.000000     768.000000  768.000000  768.000000  ...
mean      3.845052  120.894531      69.105469      20.536458   79.799479   31.992578  ...
std       3.369578   31.972618      19.355807      15.952218  115.244002    7.884160  ...
min       0.000000    0.000000       0.000000       0.000000    0.000000    0.000000  ...
25%       1.000000   99.000000      62.000000       0.000000    0.000000   27.300000  ...
50%       3.000000  117.000000      72.000000      23.000000   30.500000   32.000000  ...
75%       6.000000  140.250000      80.000000      32.000000  127.250000   36.600000  ...
max      17.000000  199.000000     122.000000      99.000000  846.000000   67.100000  ...
In [5]:
df['Insulin'].value_counts()
Out[5]:
0      374
105     11
130      9
140      9
120      8
      ...
255      1
52       1
112      1
Name: Insulin, Length: 186, dtype: int64
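The most common Insulin value is 0, which is not a plausible measurement; the same zero-coding affects several other columns. Before treating these zeros as missing, a quick count per column shows the scale of the problem (a minimal sketch; zero_cols is just a convenience name, not from the original notebook):

# Columns where a reading of 0 is physiologically impossible
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Count the zeros per column before they are replaced with NaN
(df[zero_cols] == 0).sum()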
In [6]:
# Replace zeros with NaN in columns where 0 is not a valid measurement
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.nan)

In [7]:
df.info()

RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   763 non-null    float64
 2   BloodPressure             733 non-null    float64
 3   SkinThickness             541 non-null    float64
 4   Insulin                   394 non-null    float64
 5   BMI                       757 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(6), int64(3)
memory usage: 54.1 KB

In [8]:
# Checking for null values
df.isnull().sum()

Out[8]:
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [9]:
# Missing value percentage
df.isnull().sum() / df.shape[0] * 100

Out[9]:
Pregnancies                  0.000000
Glucose                      0.651042
BloodPressure                4.557292
SkinThickness               29.557292
Insulin                     48.697917
BMI                          1.432292
DiabetesPedigreeFunction     0.000000
Age                          0.000000
Outcome                      0.000000
dtype: float64

Null value Treatment
In [10]:
fig, axes = plt.subplots(ncols=4, nrows=2, figsize=(50, 20))

sns.kdeplot(df['Pregnancies'], ax=axes[0,0])
sns.kdeplot(df['Glucose'], ax=axes[0,1])
sns.kdeplot(df['BloodPressure'], ax=axes[0,2])
sns.kdeplot(df['SkinThickness'], ax=axes[0,3])
sns.kdeplot(df['Insulin'], ax=axes[1,0])
sns.kdeplot(df['BMI'], ax=axes[1,1])
sns.kdeplot(df['DiabetesPedigreeFunction'], ax=axes[1,2])
sns.kdeplot(df['Age'], ax=axes[1,3])

Out[10]:
[Figure: KDE plots of each feature's distribution]
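The KDE plots guide the imputation strategy below: roughly symmetric distributions (Glucose, BloodPressure) can take the mean, while skewed ones (SkinThickness, Insulin, BMI) are safer with the median, which is not pulled by outliers. As a numeric cross-check, a minimal sketch using pandas' built-in skewness (not part of the original notebook):

# Skewness near 0 suggests mean imputation; strongly positive
# skewness suggests the median instead
df.skew(numeric_only=True)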
In [11]:
# Checking for null values
df.isnull().sum()
Out[11]:
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64
In [12]:
# Mean value imputation
df['Glucose'].fillna(df['Glucose'].mean(), inplace=True)
df['BloodPressure'].fillna(df['BloodPressure'].mean(), inplace=True)

In [13]:
# Checking for null values
df.isnull().sum()

Out[13]:
Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64
In [14]:
# Median value imputation
df['SkinThickness'].fillna(df['SkinThickness'].median(), inplace=True)
df['Insulin'].fillna(df['Insulin'].median(), inplace=True)
df['BMI'].fillna(df['BMI'].median(), inplace=True)
In [15]:
# Checking for null values
df.isnull().sum()

Out[15]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [16]:
# Checking the data information
df.info()
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(6), int64(3)
memory usage: 54.1 KB
In [17]:
df.describe()
Out[17]:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin         BMI  ...
count   768.000000  768.000000     768.000000     768.000000  768.000000  768.000000  ...
mean      3.845052  121.686763      72.405184      29.108073  140.671875   32.455208  ...
std       3.369578   30.435949      12.096346       8.791221   86.383060    6.875177  ...
min       0.000000   44.000000      24.000000       7.000000   14.000000   18.200000  ...
25%       1.000000   99.750000      64.000000      25.000000  121.500000   27.500000  ...
50%       3.000000  117.000000      72.202592      29.000000  125.000000   32.300000  ...
75%       6.000000  140.250000      80.000000      32.000000  127.250000   36.600000  ...
max      17.000000  199.000000     122.000000      99.000000  846.000000   67.100000  ...
In [18]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

In [19]:
scaler = StandardScaler()

In [20]:
scaler.fit(df.drop('Outcome', axis=1))

Out[20]:
StandardScaler()
In [21]:
scaled_features = scaler.transform(df.drop('Outcome', axis=1))
In [22]:
df_feat = pd.DataFrame(scaled_features, columns=df.columns[:-1])
df_feat.head()
Out[22]:
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  ...
0     0.639947  0.865108      -0.033518       0.670643 -0.181541  0.166619  ...
1    -0.844885 -1.206162      -0.529859      -0.012301 -0.181541 -0.852200  ...
2     1.233880  2.015813      -0.695306      -0.012301 -0.181541 -1.332500  ...
3    -0.844885 -1.074652      -0.529859      -0.695245 -0.540642 -0.633881  ...
4    -1.141852  0.503458      -2.680669       0.670643  0.316566  1.549303  ...
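Scaling matters here because KNN classifies by Euclidean distance: an unscaled Insulin column (values in the hundreds) would dominate BMI (values in the tens). A quick sanity check on the standardized frame (a sketch, assuming df_feat from above) confirms each column now has mean ≈ 0 and standard deviation ≈ 1:

# After StandardScaler every feature is centred at 0 with unit spread
df_feat.agg(['mean', 'std']).round(3)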
In [23]:
# Train Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(scaled_features, df['Outcome'],
                                                    test_size=0.30, random_state=101)
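With only 768 rows and roughly twice as many negatives as positives, the class mix can drift between train and test splits. If that is a concern, a stratified variant (a sketch; the extra stratify argument is the only change from the call above) keeps the Outcome proportions identical in both halves:

# Hypothetical variant: stratify on the target to preserve class balance
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, df['Outcome'],
    test_size=0.30, random_state=101, stratify=df['Outcome'])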
KNN Classification
In [24]:
from sklearn.neighbors import KNeighborsClassifier
In [25]:
# KNN with n_neighbors=1
knn = KNeighborsClassifier(n_neighbors=1)
In [26]:
knn.fit(X_train, y_train)

Out[26]:
KNeighborsClassifier(n_neighbors=1)

In [27]:
pred = knn.predict(X_test)
Predictions and Evaluations
Let's evaluate our KNN model!
In [28]:
from sklearn.metrics import classification_report, confusion_matrix
In [29]:
print(confusion_matrix(y_test, pred))

[[115  35]
 [ 29  52]]
In [30]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.80      0.77      0.78       150
           1       0.60      0.64      0.62        81

    accuracy                           0.72       231
   macro avg       0.70      0.70      0.70       231
weighted avg       0.73      0.72      0.73       231
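The report's 0.72 accuracy follows directly from the confusion matrix: 115 + 52 correct predictions out of 231 test rows. A one-line check (a sketch using the numbers printed above):

# accuracy = (true negatives + true positives) / total test samples
(115 + 52) / (115 + 35 + 29 + 52)   # ≈ 0.723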
Choosing a K Value
Let's go ahead and use the elbow method to pick a good K value.
In [31]:
from sklearn.model_selection import cross_val_score
In [33]:
accuracy_rate = []

for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=i)
    score = cross_val_score(knn, df_feat, df['Outcome'], cv=10)
    accuracy_rate.append(score.mean())

In [34]:
error_rate = []

for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=i)
    score = cross_val_score(knn, df_feat, df['Outcome'], cv=10)
    error_rate.append(1 - score.mean())
In [35]:
error_rate = []

for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
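Rather than reading the best K off the plot below, it can be taken straight from the cross-validation results; a minimal sketch, assuming the accuracy_rate list computed above:

# range(1, 40) starts at K=1, so the index is offset by one
best_k = int(np.argmax(accuracy_rate)) + 1
print('Best K by 10-fold CV accuracy:', best_k)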
In [48]:
plt.figure(figsize=(10,6))
plt.plot(range(1,40), error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
# plt.plot(range(1,40), accuracy_rate, color='blue', linestyle='dashed', marker='o',
#          markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

Out[48]:
Text(0, 0.5, 'Error Rate')

[Figure: Error Rate vs. K Value]
In [37]:
# FIRST A QUICK COMPARISON TO OUR ORIGINAL K=1
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train, y_train)
pred = knn.predict(X_test)

print('WITH K=1')
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))

WITH K=1

[[115  35]
 [ 29  52]]

              precision    recall  f1-score   support

           0       0.80      0.77      0.78       150
           1       0.60      0.64      0.62        81

    accuracy                           0.72       231
   macro avg       0.70      0.70      0.70       231
weighted avg       0.73      0.72      0.73       231

In [47]:
# NOW WITH K=20
knn = KNeighborsClassifier(n_neighbors=20)

knn.fit(X_train, y_train)
pred = knn.predict(X_test)

print('WITH K=20')
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))

WITH K=20

[[134  16]
 [ 31  50]]

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       150
           1       0.76      0.62      0.68        81

    accuracy                           0.80       231
   macro avg       0.78      0.76      0.77       231
weighted avg       0.79      0.80      0.79       231
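The manual elbow loop can also be written with scikit-learn's built-in search; a minimal sketch of the equivalent grid search (not in the original notebook), assuming the df_feat frame from above:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# 10-fold CV over the same K range as the elbow loop
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': range(1, 40)},
                    cv=10, scoring='accuracy')
grid.fit(df_feat, df['Outcome'])
print(grid.best_params_, round(grid.best_score_, 3))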
In [ ]: