KNN Classification

KNN - Predict whether a person will have diabetes or not

In [1]: # Import the important libraries
        import pandas as pd
        import numpy as np
        import seaborn as sns
        import matplotlib.pyplot as plt
        import warnings
        warnings.filterwarnings('ignore')

In [2]: # Load the dataset
        df = pd.read_csv('diabetes.csv')
        df.head()

Out[2]:    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  ...
        0            6      148             72             35        0  33.6  ...
        1            1       85             66             29        0  26.6  ...
        2            8      183             64              0        0  23.3  ...
        3            1       89             66             23       94  28.1  ...
        4            0      137             40             35      168  43.1  ...

In [3]: # Information of the dataset
        df.info()

        RangeIndex: 768 entries, 0 to 767
        Data columns (total 9 columns):
         #   Column                    Non-Null Count  Dtype
         0   Pregnancies               768 non-null    int64
         1   Glucose                   768 non-null    int64
         2   BloodPressure             768 non-null    int64
         3   SkinThickness             768 non-null    int64
         4   Insulin                   768 non-null    int64
         5   BMI                       768 non-null    float64
         6   DiabetesPedigreeFunction  768 non-null    float64
         7   Age                       768 non-null    int64
         8   Outcome                   768 non-null    int64
        dtypes: float64(2), int64(7)
        memory usage: 54.1 KB

In [4]: # Description of the data
        df.describe()

Out[4]:        Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin         BMI  ...
        count   768.000000  768.000000     768.000000     768.000000  768.000000  768.000000  ...
        mean      3.845052  120.894531      69.105469      20.536458   79.799479   31.992578  ...
        std       3.369578   31.972618      19.355807      15.952218  115.244002    7.884160  ...
        min       0.000000    0.000000       0.000000       0.000000    0.000000    0.000000  ...
        25%       1.000000   99.000000      62.000000       0.000000    0.000000   27.300000  ...
        50%       3.000000  117.000000      72.000000      23.000000   30.500000   32.000000  ...
        75%       6.000000  140.250000      80.000000      32.000000  127.250000   36.600000  ...
        max      17.000000  199.000000     122.000000      99.000000  846.000000   67.100000  ...

In [5]: df['Insulin'].value_counts()

Out[5]: 0    374
        ...
        Name: Insulin, Length: 186, dtype: int64

In [6]: # A value of 0 is not physiologically possible for these columns,
        # so treat zeros as missing values
        df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = \
            df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.nan)

In [7]: df.info()

        RangeIndex: 768 entries, 0 to 767
        Data columns (total 9 columns):
         #   Column                    Non-Null Count  Dtype
         0   Pregnancies               768 non-null    int64
         1   Glucose                   763 non-null    float64
         2   BloodPressure             733 non-null    float64
         3   SkinThickness             541 non-null    float64
         4   Insulin                   394 non-null    float64
         5   BMI                       757 non-null    float64
         6   DiabetesPedigreeFunction  768 non-null    float64
         7   Age                       768 non-null    int64
         8   Outcome                   768 non-null    int64
        dtypes: float64(6), int64(3)
        memory usage: 54.1 KB

In [8]: # Checking for null values
        df.isnull().sum()

Out[8]: Pregnancies                   0
        Glucose                       5
        BloodPressure                35
        SkinThickness               227
        Insulin                     374
        BMI                          11
        DiabetesPedigreeFunction      0
        Age                           0
        Outcome                       0
        dtype: int64

In [9]: # Missing value percentage
        df.isnull().sum() / df.shape[0] * 100

Out[9]: Pregnancies                  0.000000
        Glucose                      0.651042
        BloodPressure                4.557292
        SkinThickness               29.557292
        Insulin                     48.697917
        BMI                          1.432292
        DiabetesPedigreeFunction     0.000000
        Age                          0.000000
        Outcome                      0.000000
        dtype: float64

Null Value Treatment

In [10]: fig, axes = plt.subplots(ncols=4, nrows=2, figsize=(50, 20))

         sns.kdeplot(df['Pregnancies'], ax=axes[0, 0])
         sns.kdeplot(df['Glucose'], ax=axes[0, 1])
         sns.kdeplot(df['BloodPressure'], ax=axes[0, 2])
         sns.kdeplot(df['SkinThickness'], ax=axes[0, 3])
         sns.kdeplot(df['Insulin'], ax=axes[1, 0])
         sns.kdeplot(df['BMI'], ax=axes[1, 1])
         sns.kdeplot(df['DiabetesPedigreeFunction'], ax=axes[1, 2])
         sns.kdeplot(df['Age'], ax=axes[1, 3])

Out[10]: [Figure: 2x4 grid of KDE plots showing the distribution of each feature]
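The KDE plots guide the imputation choice below: the more symmetric-looking columns get mean imputation, while the visibly right-skewed ones get the median. As a minimal sketch (not a cell from the original notebook), pandas' skew() puts a number on what the plots show; values near 0 suggest symmetry, large positive values a long right tail:

         # Hypothetical helper cell: quantify the skewness seen in the KDE plots.
         # Glucose and BloodPressure come out comparatively symmetric (mean
         # imputation), while SkinThickness, Insulin and BMI are right-skewed
         # (median imputation).
         df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].skew()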
In [11]: # Checking for null values
         df.isnull().sum()

Out[11]: Pregnancies                   0
         Glucose                       5
         BloodPressure                35
         SkinThickness               227
         Insulin                     374
         BMI                          11
         DiabetesPedigreeFunction      0
         Age                           0
         Outcome                       0
         dtype: int64

In [12]: # Mean value imputation
         df['Glucose'].fillna(df['Glucose'].mean(), inplace=True)
         df['BloodPressure'].fillna(df['BloodPressure'].mean(), inplace=True)

In [13]: # Checking for null values
         df.isnull().sum()

Out[13]: Pregnancies                   0
         Glucose                       0
         BloodPressure                 0
         SkinThickness               227
         Insulin                     374
         BMI                          11
         DiabetesPedigreeFunction      0
         Age                           0
         Outcome                       0
         dtype: int64

In [14]: # Median value imputation
         df['SkinThickness'].fillna(df['SkinThickness'].median(), inplace=True)
         df['Insulin'].fillna(df['Insulin'].median(), inplace=True)
         df['BMI'].fillna(df['BMI'].median(), inplace=True)

In [15]: # Checking for null values
         df.isnull().sum()

Out[15]: Pregnancies                 0
         Glucose                     0
         BloodPressure               0
         SkinThickness               0
         Insulin                     0
         BMI                         0
         DiabetesPedigreeFunction    0
         Age                         0
         Outcome                     0
         dtype: int64

In [16]: # Checking the data information
         df.info()

         RangeIndex: 768 entries, 0 to 767
         Data columns (total 9 columns):
          #   Column                    Non-Null Count  Dtype
          0   Pregnancies               768 non-null    int64
          1   Glucose                   768 non-null    float64
          2   BloodPressure             768 non-null    float64
          3   SkinThickness             768 non-null    float64
          4   Insulin                   768 non-null    float64
          5   BMI                       768 non-null    float64
          6   DiabetesPedigreeFunction  768 non-null    float64
          7   Age                       768 non-null    int64
          8   Outcome                   768 non-null    int64
         dtypes: float64(6), int64(3)
         memory usage: 54.1 KB

In [17]: df.describe()

Out[17]:        Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin         BMI  ...
         count   768.000000  768.000000     768.000000     768.000000  768.000000  768.000000  ...
         mean      3.845052  121.686763      72.405184      29.108073  140.671875   32.455208  ...
         std       3.369578   30.435049      12.096346       8.791221   86.383080    6.875177  ...
         min       0.000000   44.000000      24.000000       7.000000   14.000000   18.200000  ...
         25%       1.000000   99.750000      64.000000      25.000000  121.500000   27.500000  ...
         50%       3.000000  117.000000      72.202582      29.000000  125.000000   32.300000  ...
         75%       6.000000  140.250000      80.000000      32.000000  127.250000   36.600000  ...
         max      17.000000  199.000000     122.000000      99.000000  846.000000   67.100000  ...

In [18]: # Feature Scaling
         from sklearn.preprocessing import StandardScaler

In [19]: scaler = StandardScaler()

In [20]: scaler.fit(df.drop('Outcome', axis=1))

Out[20]: StandardScaler()

In [21]: scaled_features = scaler.transform(df.drop('Outcome', axis=1))

In [22]: df_feat = pd.DataFrame(scaled_features, columns=df.columns[:-1])
         df_feat.head()

Out[22]:    Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  ...
         0     0.639947  0.865108      -0.033518       0.670643 -0.181541  0.166619  ...
         1    -0.844885 -1.206162      -0.529859      -0.012301 -0.181541 -0.852200  ...
         2     1.233880  2.015813      -0.695306      -0.012301 -0.181541 -1.332500  ...
         3    -0.844885 -1.074652      -0.529859      -0.695245 -0.540642 -0.633881  ...
         4    -1.141852  0.503458      -2.680669       0.670643  0.316566  1.549303  ...

In [23]: # Train Test Split
         from sklearn.model_selection import train_test_split

         X_train, X_test, y_train, y_test = train_test_split(scaled_features, df['Outcome'],
                                                             test_size=0.30, random_state=101)
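One optional refinement the original split does not use: the classes are imbalanced (500 non-diabetic vs 268 diabetic), so a stratified split keeps that ratio identical in the train and test sets. A hedged variant of the same call, for comparison only; the notebook continues with the unstratified split above:

         # Hypothetical alternative split: stratify=df['Outcome'] preserves the
         # ~65/35 class ratio in both sets, making metrics across K values
         # slightly more stable.
         X_train, X_test, y_train, y_test = train_test_split(
             scaled_features, df['Outcome'],
             test_size=0.30, random_state=101, stratify=df['Outcome'])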
KNN Classification

In [24]: from sklearn.neighbors import KNeighborsClassifier

In [25]: # KNN with n_neighbors=1
         knn = KNeighborsClassifier(n_neighbors=1)

In [26]: knn.fit(X_train, y_train)

Out[26]: KNeighborsClassifier(n_neighbors=1)

In [27]: pred = knn.predict(X_test)

Predictions and Evaluations

Let's evaluate our KNN model!

In [28]: from sklearn.metrics import classification_report, confusion_matrix

In [29]: print(confusion_matrix(y_test, pred))

         [[115  35]
          [ 29  52]]

In [30]: print(classification_report(y_test, pred))

                       precision    recall  f1-score   support

                    0       0.80      0.77      0.78       150
                    1       0.60      0.64      0.62        81

             accuracy                           0.72       231
            macro avg       0.70      0.70      0.70       231
         weighted avg       0.73      0.72      0.73       231

Choosing a K Value

Let's go ahead and use the elbow method to pick a good K value.

In [31]: from sklearn.model_selection import cross_val_score

In [32]: accuracy_rate = []

         for i in range(1, 40):
             knn = KNeighborsClassifier(n_neighbors=i)
             score = cross_val_score(knn, df_feat, df['Outcome'], cv=10)
             accuracy_rate.append(score.mean())

In [33]: error_rate = []

         for i in range(1, 40):
             knn = KNeighborsClassifier(n_neighbors=i)
             score = cross_val_score(knn, df_feat, df['Outcome'], cv=10)
             error_rate.append(1 - score.mean())

In [34]: # Error rate from test-set predictions instead of cross-validation
         error_rate = []

         for i in range(1, 40):
             knn = KNeighborsClassifier(n_neighbors=i)
             knn.fit(X_train, y_train)
             pred_i = knn.predict(X_test)
             error_rate.append(np.mean(pred_i != y_test))

In [35]: plt.figure(figsize=(10, 6))
         plt.plot(range(1, 40), error_rate, color='blue', linestyle='dashed', marker='o',
                  markerfacecolor='red', markersize=10)
         # plt.plot(range(1, 40), accuracy_rate, color='blue', linestyle='dashed', marker='o',
         #          markerfacecolor='red', markersize=10)
         plt.title('Error Rate vs. K Value')
         plt.xlabel('K')
         plt.ylabel('Error Rate')

Out[35]: Text(0, 0.5, 'Error Rate')

         [Figure: Error Rate vs. K Value]

In [36]: # FIRST A QUICK COMPARISON TO OUR ORIGINAL K=1
         knn = KNeighborsClassifier(n_neighbors=1)

         knn.fit(X_train, y_train)
         pred = knn.predict(X_test)

         print('WITH K=1')
         print('\n')
         print(confusion_matrix(y_test, pred))
         print('\n')
         print(classification_report(y_test, pred))

         WITH K=1

         [[115  35]
          [ 29  52]]

                       precision    recall  f1-score   support

                    0       0.80      0.77      0.78       150
                    1       0.60      0.64      0.62        81

             accuracy                           0.72       231
            macro avg       0.70      0.70      0.70       231
         weighted avg       0.73      0.72      0.73       231

In [37]: # NOW WITH K=20
         knn = KNeighborsClassifier(n_neighbors=20)

         knn.fit(X_train, y_train)
         pred = knn.predict(X_test)

         print('WITH K=20')
         print('\n')
         print(confusion_matrix(y_test, pred))
         print('\n')
         print(classification_report(y_test, pred))

         WITH K=20

         [[134  16]
          [ 31  50]]

                       precision    recall  f1-score   support

                    0       0.81      0.89      0.85       150
                    1       0.76      0.62      0.68        81

             accuracy                           0.80       231
            macro avg       0.78      0.76      0.77       231
         weighted avg       0.79      0.80      0.79       231
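The elbow plot above is read off by eye. As an alternative sketch (not part of the original notebook), scikit-learn's GridSearchCV automates the same search over n_neighbors using the same 10-fold cross-validation:

         # Hypothetical companion cell: grid-search K instead of eyeballing the elbow.
         from sklearn.model_selection import GridSearchCV

         grid = GridSearchCV(KNeighborsClassifier(),
                             {'n_neighbors': list(range(1, 40))}, cv=10)
         grid.fit(df_feat, df['Outcome'])

         print(grid.best_params_)  # K with the highest mean CV accuracy
         print(grid.best_score_)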
