You are on page 1of 8

6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.

ipynb - Colaboratory

from google.colab import drive


drive.mount('/content/drive')

Mounted at /content/drive

#Program for SVM(Support Vector Machine)


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn import svm

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv files/diabetes.csv')

df.head()

Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedig

0 6 148 72 35 0 33.6

1 1 85 66 29 0 26.6

2 8 183 64 0 0 23.3

3 1 89 66 23 94 28.1

4 0 137 40 35 168 43.1

df.describe()

Pregnancies Glucose BloodPressure SkinThickness Insulin BM

count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000

mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578

std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160

min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000

50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000

75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000

max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000

df.isnull().values.any()

False

zero_not_allowed = ["Glucose","BloodPressure","SkinThickness"]

for column in zero_not_allowed:


df[column] = df[column].replace(0, np.NaN)
mean = int(df[column].mean(skipna = True))
df[column] = df[column].replace(np.NaN, mean)

x = df.iloc[:, :-2]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.2)

clf = svm.SVC(kernel='rbf')
clf.fit(x_train,y_train)
https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 1/8
6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.ipynb - Colaboratory
y_pred = clf.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7922077922077922

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,y_pred)

array([[98, 9],
[23, 24]])

#Program for RF(Random Forest)


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sbs

sbs.set_theme()

# %matplotlib qt
%matplotlib inline

df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/csv files/diabetes.csv")


df.head()

Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedig

0 6 148 72 35 0 33.6

1 1 85 66 29 0 26.6

2 8 183 64 0 0 23.3

3 1 89 66 23 94 28.1

4 0 137 40 35 168 43.1

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

df.describe()

https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 2/8
6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.ipynb - Colaboratory

Pregnancies Glucose BloodPressure SkinThickness Insulin BM


df[['Glucose',
count 'BloodPressure',
768.000000 'SkinThickness',
768.000000 'Insulin',
768.000000 'BMI']]768.000000
768.000000 = df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].rep
768.000000

mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578


_ = df.hist(bins=50, figsize=(20, 15))
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160

min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000

50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000

75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000

max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000

Healthy = df[ df['Outcome'] == 0 ]


Diabetic = df[ df['Outcome'] == 1 ]

h_diab = pd.Series({'healthy':Healthy.shape[0],
'Diabetic':Diabetic.shape[0]})
h_diab.plot.bar(alpha=0.7)

https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 3/8
6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.ipynb - Colaboratory

<Axes: >

h_diab.plot.pie(startangle=90,
explode=[0, 0.1],
autopct='%1.1f%%',
colors=['C3', 'C4'])
plt.title('Relative % of females diabetic ')
plt.ylabel('')
_ = plt.axis('equal')

df.isnull().sum()

Pregnancies 0
Glucose 5
BloodPressure 35
SkinThickness 227
Insulin 374
BMI 11
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64

df.pivot_table(index=['Outcome'] )

Age BMI BloodPressure DiabetesPedigreeFunction Glucose

Outcome

0 31.190000 30.859674 70.877339 0.429734 110.643863

1 37.067164 35.406767 75.321429 0.550500 142.319549

def replace_null_values(df):
for col in df.columns:
df.loc[(df['Outcome']==0) & (df[col].isnull()), col] = df[df['Outcome'] == 0][col].median()
df.loc[(df['Outcome']==1) & (df[col].isnull()), col] = df[ df['Outcome'] == 1][col].median()
print(df.isnull().sum())

replace_null_values(df)

https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 4/8
6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.ipynb - Colaboratory

Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
BMI 0
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64

def create_new_bmi(df):
new_cat = "NEW_BMI_CAT"
df.loc[(df['BMI'] < 18.5), new_cat] = "underweight"
df.loc[(df['BMI'] > 18.5) & (df['BMI'] < 25), new_cat] = "normal"
df.loc[(df['BMI'] > 24) & (df['BMI'] < 30), new_cat] = "overweight"
df.loc[(df['BMI'] > 30) & (df['BMI']< 40), new_cat] = "obese"
df.drop('BMI', axis=1, inplace=True)
df[new_cat] = df[new_cat].astype('category')
def create_new_glucose(df):
new_cat = "NEW_GLUCOSE_CAT"
df.loc[(df['Glucose'] < 70), new_cat] = "low"
df.loc[(df['Glucose'] > 70) & (df['Glucose'] < 99), new_cat] = "normal"
df.loc[(df['Glucose'] > 99) & (df['Glucose'] < 126), new_cat] = "high"
df.loc[(df['Glucose'] > 126) & (df['Glucose'] < 200), new_cat] = "very_high"
df[new_cat] = df[new_cat].astype('category')
def create_new_skinthickness(df):
new_cat = "NEW_SKIN_THICKNESS"
df.loc[df['SkinThickness'] < 30, new_cat] = "normal"
df.loc[df['SkinThickness'] >= 70, new_cat] = "highfat"
df[new_cat] = df[new_cat].astype('category')
def create_new_pregnancies(df):
new_cat = "NEW_PREGNANCIES"
df.loc[df['Pregnancies'] == 0, new_cat] = "no_pregnancies"
df.loc[(df['Pregnancies'] > 0) & df['Pregnancies'] <= 4, new_cat] = "std_pregnancies"
df.loc[(df['Pregnancies'] > 4), new_cat] = "over_pregnancies"
df[new_cat] = df[new_cat].astype('category')
def create_circulation_level(df):
new_cat = "NEW_CIRCULATION_LEVEL"
df.loc[(df['SkinThickness'] < 30) & (df['BloodPressure'] < 80), new_cat] = "normal"
df.loc[(df['SkinThickness'] > 30) & (df['BloodPressure']>= 80), new_cat] = "high_risk"
df.loc[((df['SkinThickness']< 30) & (df['BloodPressure'] >=80)) | ((df['SkinThickness']> 30) & (df['BloodPressure'] <80)), new_cat] = "me
df[new_cat] = df[new_cat].astype('category')
df.drop('SkinThickness', axis=1, inplace=True)
def create_other_features(df):
df['PRE_AGE_CAT'] = df['Age'] * df['Pregnancies']
df['INSULIN_GLUCOSE_CAT'] = df['Insulin'] * df['Glucose']
df.drop('Pregnancies', axis=1, inplace=True)
df.drop('Glucose', axis=1, inplace=True)
create_new_bmi(df)
create_new_glucose(df)
create_new_pregnancies(df)
create_new_skinthickness(df)
create_circulation_level(df)
create_other_features(df)
df

https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 5/8
6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.ipynb - Colaboratory

BloodPressure Insulin DiabetesPedigreeFunction Age Outcome NEW_BMI_CAT

0 72.0 169.5 0.627 50 1 obese

1 66.0 102.5 0.351 31 0 overweight

2 64.0 169.5 0.672 32 1 normal


df['NEW_CIRCULATION_LEVEL'].dtype.name == 'category'
3 66.0 94.0 0.167 21 0 overweight
True
4 40.0 168.0 2.288 33 1 NaN

... ... ...


len(df['NEW_CIRCULATION_LEVEL'].unique()) ... ... ... ...

763 76.0 180.0 0.171 63 0 obese


4
764 70.0 102.5 0.340 27 0 obese
label_encoder
765 = preprocessing.LabelEncoder()
72.0 112.0 0.245 30 0 overweight
df['Outcome'] = label_encoder.fit_transform(df['Outcome'])
766 60.0 169.5 0.349 47 1 obese

767 = [col for70.0


categ_cols col in 102.5 0.315
df.columns if df[col].dtype.name 23 0
== 'category'] obese
print(categ_cols)
768 rows × 12 columns
['NEW_BMI_CAT', 'NEW_GLUCOSE_CAT', 'NEW_PREGNANCIES', 'NEW_SKIN_THICKNESS', 'NEW_CIRCULATION_LEVEL']

def one_hot_encoder(df, columns):


df_dummy = df.copy()
df_dummy = pd.get_dummies(df, columns=columns, drop_first=True)
return df_dummy

result = one_hot_encoder(df, categ_cols)


result

BloodPressure Insulin DiabetesPedigreeFunction Age Outcome PRE_AGE_CAT

0 72.0 169.5 0.627 50 1 300

1 66.0 102.5 0.351 31 0 31

2 64.0 169.5 0.672 32 1 256

3 66.0 94.0 0.167 21 0 21

4 40.0 168.0 2.288 33 1 0

... ... ... ... ... ... ...

763 76.0 180.0 0.171 63 0 630

764 70.0 102.5 0.340 27 0 54

765 72.0 112.0 0.245 30 0 150

766 60.0 169.5 0.349 47 1 47

767 70.0 102.5 0.315 23 0 23

768 rows × 17 columns

from sklearn.model_selection import train_test_split

X = result.drop('Outcome', axis=1)
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression


from sklearn.metrics import mean_squared_error

rmse = lambda labels, predictions: np.sqrt(mean_squared_error(labels, predictions))

lg_model = LogisticRegression(max_iter=1000,C=0.01).fit(X_train, y_train)


lg_predictions = lg_model.predict(X_test)
rmse(y_test, lg_predictions)

0.4558423058385518

https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 6/8
6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.ipynb - Colaboratory

from sklearn.metrics import accuracy_score


from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score

print(accuracy_score(y_test, lg_predictions))
print(roc_auc_score(y_test, lg_predictions))

0.7922077922077922
0.7494949494949495

from sklearn.metrics import classification_report

print(classification_report(y_test, lg_predictions))

precision recall f1-score support

0 0.80 0.90 0.85 99


1 0.77 0.60 0.67 55

accuracy 0.79 154


macro avg 0.78 0.75 0.76 154
weighted avg 0.79 0.79 0.79 154

from sklearn.ensemble import RandomForestClassifier


from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [400, 500, 600],


'max_leaf_nodes': [14, 15, 16]}
random_forest = RandomForestClassifier(n_jobs=-1)

grid_search_rf = GridSearchCV(random_forest,
param_grid=param_grid,
cv=3,
scoring='recall',
return_train_score=True)
grid_search_rf.fit(X_train, y_train)
grid_search_rf.best_params_

{'max_leaf_nodes': 14, 'n_estimators': 500}

print(classification_report(y_test, grid_search_rf.predict(X_test)))
print(recall_score(y_test, grid_search_rf.predict(X_test)))
print(roc_auc_score(y_test, grid_search_rf.predict_proba(X_test)[:, 1]))

precision recall f1-score support

0 0.89 0.85 0.87 99


1 0.75 0.82 0.78 55

accuracy 0.84 154


macro avg 0.82 0.83 0.83 154
weighted avg 0.84 0.84 0.84 154

0.8181818181818182
0.9263544536271808

https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 7/8
6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.ipynb - Colaboratory

check 0s completed at 4:19 PM

https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 8/8

You might also like