SVM - RF - Diabetes - CSV - 26 - 6 - 2023.ipynb - Colaboratory

6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.
ipynb - Colaboratory
from google.colab import drive

drive.mount('/content/drive')
Mounted at /content/drive
#Program for SVM(Support Vector Machine)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn import svm
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv files/diabetes.csv')
df.head()
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedig
0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1
df.describe()
Pregnancies Glucose BloodPressure SkinThickness Insulin BM
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000
df.isnull().values.any()
False
zero_not_allowed = ["Glucose","BloodPressure","SkinThickness"]
for column in zero_not_allowed:

df[column] = df[column].replace(0, np.NaN)
mean = int(df[column].mean(skipna = True))
df[column] = df[column].replace(np.NaN, mean)
x = df.iloc[:, :-2]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.2)
clf = svm.SVC(kernel='rbf')
clf.fit(x_train,y_train)
https://colab.research.google.com/drive/1A0OglTKaWFaL81tRzcsF_pLPI4xvd0vA#scrollTo=kaIiTkNy4Wpc&printMode=true 1/8
6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.ipynb - Colaboratory
y_pred = clf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
Accuracy: 0.7922077922077922
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)
array([[98, 9],
[23, 24]])
#Program for RF(Random Forest)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sbs
sbs.set_theme()
# %matplotlib qt
%matplotlib inline
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/csv files/diabetes.csv")

df.head()
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedig
0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
df.describe()
Pregnancies Glucose BloodPressure SkinThickness Insulin BM

df[['Glucose',
count 'BloodPressure',
768.000000 'SkinThickness',
768.000000 'Insulin',
768.000000 'BMI']]768.000000
768.000000 = df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].rep
768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578

_ = df.hist(bins=50, figsize=(20, 15))
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000
Healthy = df[ df['Outcome'] == 0 ]

Diabetic = df[ df['Outcome'] == 1 ]
h_diab = pd.Series({'healthy':Healthy.shape[0],
'Diabetic':Diabetic.shape[0]})
h_diab.plot.bar(alpha=0.7)
<Axes: >
h_diab.plot.pie(startangle=90,
explode=[0, 0.1],
autopct='%1.1f%%',
colors=['C3', 'C4'])
plt.title('Relative % of females diabetic ')
plt.ylabel('')
_ = plt.axis('equal')
df.isnull().sum()
Pregnancies 0
Glucose 5
BloodPressure 35
SkinThickness 227
Insulin 374
BMI 11
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64
df.pivot_table(index=['Outcome'] )
Age BMI BloodPressure DiabetesPedigreeFunction Glucose
Outcome
0 31.190000 30.859674 70.877339 0.429734 110.643863
1 37.067164 35.406767 75.321429 0.550500 142.319549
def replace_null_values(df):
for col in df.columns:
df.loc[(df['Outcome']==0) & (df[col].isnull()), col] = df[df['Outcome'] == 0][col].median()
df.loc[(df['Outcome']==1) & (df[col].isnull()), col] = df[ df['Outcome'] == 1][col].median()
print(df.isnull().sum())
replace_null_values(df)
Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
BMI 0
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64
def create_new_bmi(df):
new_cat = "NEW_BMI_CAT"
df.loc[(df['BMI'] < 18.5), new_cat] = "underweight"
df.loc[(df['BMI'] > 18.5) & (df['BMI'] < 25), new_cat] = "normal"
df.loc[(df['BMI'] > 24) & (df['BMI'] < 30), new_cat] = "overweight"
df.loc[(df['BMI'] > 30) & (df['BMI']< 40), new_cat] = "obese"
df.drop('BMI', axis=1, inplace=True)
df[new_cat] = df[new_cat].astype('category')
def create_new_glucose(df):
new_cat = "NEW_GLUCOSE_CAT"
df.loc[(df['Glucose'] < 70), new_cat] = "low"
df.loc[(df['Glucose'] > 70) & (df['Glucose'] < 99), new_cat] = "normal"
df.loc[(df['Glucose'] > 99) & (df['Glucose'] < 126), new_cat] = "high"
df.loc[(df['Glucose'] > 126) & (df['Glucose'] < 200), new_cat] = "very_high"
def create_new_skinthickness(df):
new_cat = "NEW_SKIN_THICKNESS"
df.loc[df['SkinThickness'] < 30, new_cat] = "normal"
df.loc[df['SkinThickness'] >= 70, new_cat] = "highfat"
def create_new_pregnancies(df):
new_cat = "NEW_PREGNANCIES"
df.loc[df['Pregnancies'] == 0, new_cat] = "no_pregnancies"
df.loc[(df['Pregnancies'] > 0) & df['Pregnancies'] <= 4, new_cat] = "std_pregnancies"
df.loc[(df['Pregnancies'] > 4), new_cat] = "over_pregnancies"
def create_circulation_level(df):
new_cat = "NEW_CIRCULATION_LEVEL"
df.loc[(df['SkinThickness'] < 30) & (df['BloodPressure'] < 80), new_cat] = "normal"
df.loc[(df['SkinThickness'] > 30) & (df['BloodPressure']>= 80), new_cat] = "high_risk"
df.loc[((df['SkinThickness']< 30) & (df['BloodPressure'] >=80)) | ((df['SkinThickness']> 30) & (df['BloodPressure'] <80)), new_cat] = "me
df.drop('SkinThickness', axis=1, inplace=True)
def create_other_features(df):
df['PRE_AGE_CAT'] = df['Age'] * df['Pregnancies']
df['INSULIN_GLUCOSE_CAT'] = df['Insulin'] * df['Glucose']
df.drop('Pregnancies', axis=1, inplace=True)
df.drop('Glucose', axis=1, inplace=True)
create_new_bmi(df)
create_new_glucose(df)
create_new_pregnancies(df)
create_new_skinthickness(df)
create_circulation_level(df)
create_other_features(df)
df
BloodPressure Insulin DiabetesPedigreeFunction Age Outcome NEW_BMI_CAT
0 72.0 169.5 0.627 50 1 obese
1 66.0 102.5 0.351 31 0 overweight
2 64.0 169.5 0.672 32 1 normal

df['NEW_CIRCULATION_LEVEL'].dtype.name == 'category'
3 66.0 94.0 0.167 21 0 overweight
True
4 40.0 168.0 2.288 33 1 NaN
... ... ...

len(df['NEW_CIRCULATION_LEVEL'].unique()) ... ... ... ...
763 76.0 180.0 0.171 63 0 obese

4
764 70.0 102.5 0.340 27 0 obese
label_encoder
765 = preprocessing.LabelEncoder()
72.0 112.0 0.245 30 0 overweight
df['Outcome'] = label_encoder.fit_transform(df['Outcome'])
766 60.0 169.5 0.349 47 1 obese
767 = [col for70.0

categ_cols col in 102.5 0.315
df.columns if df[col].dtype.name 23 0
== 'category'] obese
print(categ_cols)
768 rows × 12 columns
['NEW_BMI_CAT', 'NEW_GLUCOSE_CAT', 'NEW_PREGNANCIES', 'NEW_SKIN_THICKNESS', 'NEW_CIRCULATION_LEVEL']
def one_hot_encoder(df, columns):

df_dummy = df.copy()
df_dummy = pd.get_dummies(df, columns=columns, drop_first=True)
return df_dummy
result = one_hot_encoder(df, categ_cols)

result
BloodPressure Insulin DiabetesPedigreeFunction Age Outcome PRE_AGE_CAT
0 72.0 169.5 0.627 50 1 300
1 66.0 102.5 0.351 31 0 31
2 64.0 169.5 0.672 32 1 256
3 66.0 94.0 0.167 21 0 21
4 40.0 168.0 2.288 33 1 0
... ... ... ... ... ... ...
763 76.0 180.0 0.171 63 0 630
764 70.0 102.5 0.340 27 0 54
765 72.0 112.0 0.245 30 0 150
766 60.0 169.5 0.349 47 1 47
767 70.0 102.5 0.315 23 0 23
768 rows × 17 columns
from sklearn.model_selection import train_test_split
X = result.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_squared_error
rmse = lambda labels, predictions: np.sqrt(mean_squared_error(labels, predictions))
lg_model = LogisticRegression(max_iter=1000,C=0.01).fit(X_train, y_train)

lg_predictions = lg_model.predict(X_test)
rmse(y_test, lg_predictions)
0.4558423058385518
from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score
print(accuracy_score(y_test, lg_predictions))
print(roc_auc_score(y_test, lg_predictions))
0.7922077922077922
0.7494949494949495
from sklearn.metrics import classification_report
print(classification_report(y_test, lg_predictions))
precision recall f1-score support
0 0.80 0.90 0.85 99

1 0.77 0.60 0.67 55
accuracy 0.79 154

macro avg 0.78 0.75 0.76 154
weighted avg 0.79 0.79 0.79 154
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [400, 500, 600],

'max_leaf_nodes': [14, 15, 16]}
random_forest = RandomForestClassifier(n_jobs=-1)
grid_search_rf = GridSearchCV(random_forest,
param_grid=param_grid,
cv=3,
scoring='recall',
return_train_score=True)
grid_search_rf.fit(X_train, y_train)
grid_search_rf.best_params_
{'max_leaf_nodes': 14, 'n_estimators': 500}
print(classification_report(y_test, grid_search_rf.predict(X_test)))
print(recall_score(y_test, grid_search_rf.predict(X_test)))
print(roc_auc_score(y_test, grid_search_rf.predict_proba(X_test)[:, 1]))
precision recall f1-score support
0 0.89 0.85 0.87 99

1 0.75 0.82 0.78 55
accuracy 0.84 154

macro avg 0.82 0.83 0.83 154
weighted avg 0.84 0.84 0.84 154
0.8181818181818182
0.9263544536271808
check 0s completed at 4:19 PM

SVM - RF - Diabetes - CSV - 26 - 6 - 2023.ipynb - Colaboratory

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

SVM - RF - Diabetes - CSV - 26 - 6 - 2023.ipynb - Colaboratory

Uploaded by

Copyright:

Available Formats

6/27/23, 4:21 PM SVM_RF_Diabetes_CSV_26/6/2023.

from google.colab import drive

#Program for SVM(Support Vector Machine)

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv files/diabetes.csv')

Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedig

4 0 137 40 35 168 43.1

Pregnancies Glucose BloodPressure SkinThickness Insulin BM

count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000

mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578

std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160

min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000

50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000

75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000

max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000

for column in zero_not_allowed:

print("Accuracy:", accuracy_score(y_test, y_pred))

from sklearn.metrics import confusion_matrix

#Program for RF(Random Forest)

df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/csv files/diabetes.csv")

Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedig

4 0 137 40 35 168 43.1

Pregnancies Glucose BloodPressure SkinThickness Insulin BM

mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578

min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000

50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000

75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000

max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000

Healthy = df[ df['Outcome'] == 0 ]

Age BMI BloodPressure DiabetesPedigreeFunction Glucose

0 31.190000 30.859674 70.877339 0.429734 110.643863

1 37.067164 35.406767 75.321429 0.550500 142.319549

BloodPressure Insulin DiabetesPedigreeFunction Age Outcome NEW_BMI_CAT

0 72.0 169.5 0.627 50 1 obese

1 66.0 102.5 0.351 31 0 overweight

2 64.0 169.5 0.672 32 1 normal

... ... ...

763 76.0 180.0 0.171 63 0 obese

767 = [col for70.0

def one_hot_encoder(df, columns):

result = one_hot_encoder(df, categ_cols)

BloodPressure Insulin DiabetesPedigreeFunction Age Outcome PRE_AGE_CAT

0 72.0 169.5 0.627 50 1 300

1 66.0 102.5 0.351 31 0 31

2 64.0 169.5 0.672 32 1 256

3 66.0 94.0 0.167 21 0 21

4 40.0 168.0 2.288 33 1 0

... ... ... ... ... ... ...

763 76.0 180.0 0.171 63 0 630

764 70.0 102.5 0.340 27 0 54

765 72.0 112.0 0.245 30 0 150

766 60.0 169.5 0.349 47 1 47

767 70.0 102.5 0.315 23 0 23

768 rows × 17 columns

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression

rmse = lambda labels, predictions: np.sqrt(mean_squared_error(labels, predictions))

lg_model = LogisticRegression(max_iter=1000,C=0.01).fit(X_train, y_train)

from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report

precision recall f1-score support