You are on page 1of 90

credit-card-approve-predict-bynvd

February 19, 2024

[1]: from IPython.display import set_matplotlib_formats


set_matplotlib_formats('pdf', 'svg')

<ipython-input-1-d867fd071f7d>:2: DeprecationWarning: `set_matplotlib_formats`


is deprecated since IPython 7.23, directly use
`matplotlib_inline.backend_inline.set_matplotlib_formats()`
set_matplotlib_formats('pdf', 'svg')

1 Import libraries and data


[2]: import numpy as np
import pandas as pd
import missingno as msno
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path #?
from scipy.stats import probplot, chi2_contingency, chi2
from sklearn.model_selection import train_test_split, GridSearchCV,␣
↪RandomizedSearchCV, cross_val_score, cross_val_predict #RandomizedSearchCV?

from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV #?
from sklearn.compose import ColumnTransformer #?
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.metrics import ConfusionMatrixDisplay, classification_report,␣
↪roc_curve, roc_auc_score, accuracy_score, f1_score, precision_score,␣

↪recall_score, RocCurveDisplay#roc_curve?

from imblearn.over_sampling import SMOTE #?


from sklearn.linear_model import SGDClassifier, LogisticRegression #SGDC?
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance #?
from yellowbrick.model_selection import FeatureImportances
import scipy.stats as stats

1
import joblib
import os
%matplotlib inline

[3]: data_record = pd.read_csv("/content/drive/MyDrive/Data upload/


↪application_record.csv")

credit_record = pd.read_csv("/content/drive/MyDrive/Data upload/credit_record.


↪csv")

print(data_record.info())
print(credit_record.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 438557 non-null int64
1 CODE_GENDER 438557 non-null object
2 FLAG_OWN_CAR 438557 non-null object
3 FLAG_OWN_REALTY 438557 non-null object
4 CNT_CHILDREN 438557 non-null int64
5 AMT_INCOME_TOTAL 438557 non-null float64
6 NAME_INCOME_TYPE 438557 non-null object
7 NAME_EDUCATION_TYPE 438557 non-null object
8 NAME_FAMILY_STATUS 438557 non-null object
9 NAME_HOUSING_TYPE 438557 non-null object
10 DAYS_BIRTH 438557 non-null int64
11 DAYS_EMPLOYED 438557 non-null int64
12 FLAG_MOBIL 438557 non-null int64
13 FLAG_WORK_PHONE 438557 non-null int64
14 FLAG_PHONE 438557 non-null int64
15 FLAG_EMAIL 438557 non-null int64
16 OCCUPATION_TYPE 304354 non-null object
17 CNT_FAM_MEMBERS 438557 non-null float64
dtypes: float64(2), int64(8), object(8)
memory usage: 60.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 1048575 non-null int64
1 MONTHS_BALANCE 1048575 non-null int64
2 STATUS 1048575 non-null object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB

2
None

[4]: data_record.head()

[4]: ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN \


0 5008804 M Y Y 0
1 5008805 M Y Y 0
2 5008806 M Y Y 0
3 5008808 F N Y 0
4 5008809 F N Y 0

AMT_INCOME_TOTAL NAME_INCOME_TYPE NAME_EDUCATION_TYPE \


0 427500.0 Working Higher education
1 427500.0 Working Higher education
2 112500.0 Working Secondary / secondary special
3 270000.0 Commercial associate Secondary / secondary special
4 270000.0 Commercial associate Secondary / secondary special

NAME_FAMILY_STATUS NAME_HOUSING_TYPE DAYS_BIRTH DAYS_EMPLOYED \


0 Civil marriage Rented apartment -12005 -4542
1 Civil marriage Rented apartment -12005 -4542
2 Married House / apartment -21474 -1134
3 Single / not married House / apartment -19110 -3051
4 Single / not married House / apartment -19110 -3051

FLAG_MOBIL FLAG_WORK_PHONE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE \


0 1 1 0 0 NaN
1 1 1 0 0 NaN
2 1 0 0 0 Security staff
3 1 0 1 1 Sales staff
4 1 0 1 1 Sales staff

CNT_FAM_MEMBERS
0 2.0
1 2.0
2 2.0
3 1.0
4 1.0

[5]: credit_record.head()
#MONTHS_BALANCE
#debt age
#STATUS
#0: 1-29 days past due
#1: 30-59 days past due
#2: 60-89 days overdue
#3: 90-119 days overdue

3
#4: 120-149 days overdue
#5: Overdue or bad debts, write-offs for more than 150 days
#C: paid off that month = pay debt in full
#X: No loan for the month

[5]: ID MONTHS_BALANCE STATUS


0 5001711 0 X
1 5001711 -1 0
2 5001711 -2 0
3 5001711 -3 0
4 5001712 0 C

[6]: begin_month = pd.DataFrame(credit_record.groupby(['ID'])['MONTHS_BALANCE'].


↪agg(min))

begin_month = begin_month.rename(columns={"MONTHS_BALANCE ": "Account␣


↪age"})

[7]: begin_month

[7]: MONTHS_BALANCE
ID
5001711 -3
5001712 -18
5001713 -21
5001714 -14
5001715 -59
… …
5150482 -28
5150483 -17
5150484 -12
5150485 -1
5150487 -29

[45985 rows x 1 columns]

[8]: data_record = pd.merge(data_record, begin_month, how="left", on="ID")

[9]: data_record.head()

[9]: ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN \


0 5008804 M Y Y 0
1 5008805 M Y Y 0
2 5008806 M Y Y 0
3 5008808 F N Y 0
4 5008809 F N Y 0

AMT_INCOME_TOTAL NAME_INCOME_TYPE NAME_EDUCATION_TYPE \

4
0 427500.0 Working Higher education
1 427500.0 Working Higher education
2 112500.0 Working Secondary / secondary special
3 270000.0 Commercial associate Secondary / secondary special
4 270000.0 Commercial associate Secondary / secondary special

NAME_FAMILY_STATUS NAME_HOUSING_TYPE DAYS_BIRTH DAYS_EMPLOYED \


0 Civil marriage Rented apartment -12005 -4542
1 Civil marriage Rented apartment -12005 -4542
2 Married House / apartment -21474 -1134
3 Single / not married House / apartment -19110 -3051
4 Single / not married House / apartment -19110 -3051

FLAG_MOBIL FLAG_WORK_PHONE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE \


0 1 1 0 0 NaN
1 1 1 0 0 NaN
2 1 0 0 0 Security staff
3 1 0 1 1 Sales staff
4 1 0 1 1 Sales staff

CNT_FAM_MEMBERS MONTHS_BALANCE
0 2.0 -15.0
1 2.0 -14.0
2 2.0 -29.0
3 1.0 -4.0
4 1.0 -26.0

[10]: credit_record["STATUS"][credit_record["ID"]== 5001712]


# count 19 obsers but dep_value is 0 mean this customer has no bad debt

[10]: 4 C
5 C
6 C
7 C
8 C
9 C
10 C
11 C
12 C
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0

5
21 0
22 0
Name: STATUS, dtype: object

[11]: #Create new column dep_value


credit_record["dep_value"] = None
# Bad debt?
credit_record["dep_value"][credit_record['STATUS'] == "2"] = "Yes"
credit_record["dep_value"][credit_record['STATUS'] == "3"] = "Yes"
credit_record["dep_value"][credit_record['STATUS'] == "4"] = "Yes"
credit_record["dep_value"][credit_record['STATUS'] == "5"] = "Yes"
# Count the bad debt
cpunt = credit_record.groupby("ID").count()
cpunt["dep_value"][cpunt["dep_value"] > 0] = "Yes"
cpunt["dep_value"][cpunt["dep_value"] == 0] = "No"
cpunt = cpunt[["dep_value"]]
# Merge data
data_record = pd.merge(data_record, cpunt, how="inner", on="ID")
data_record["Is High Risk"] = data_record["dep_value"]
data_record.loc[data_record["Is High Risk"] == "Yes", "Is High Risk"] == 1
data_record.loc[data_record["Is High Risk"] == "No", "Is High Risk"] == 0
data_record.drop(columns="dep_value", axis=1, inplace=True)

<ipython-input-11-5fce52290614>:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
credit_record["dep_value"][credit_record['STATUS'] == "2"] = "Yes"
<ipython-input-11-5fce52290614>:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
credit_record["dep_value"][credit_record['STATUS'] == "3"] = "Yes"
<ipython-input-11-5fce52290614>:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
credit_record["dep_value"][credit_record['STATUS'] == "4"] = "Yes"
<ipython-input-11-5fce52290614>:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
credit_record["dep_value"][credit_record['STATUS'] == "5"] = "Yes"

6
[12]: data_record.columns

[12]: Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',


'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'MONTHS_BALANCE',
'Is High Risk'],
dtype='object')

[13]: # Rename the features


data_record = data_record.rename(columns={
'CODE_GENDER':'Gender',
'FLAG_OWN_CAR':'Has a car',
'FLAG_OWN_REALTY':'Has a property',
'CNT_CHILDREN':'Children count',
'AMT_INCOME_TOTAL':'Annual income',
'NAME_INCOME_TYPE':'Employment status',
'NAME_EDUCATION_TYPE':'Education level',
'NAME_FAMILY_STATUS':'Marital status',
'NAME_HOUSING_TYPE':'Dwelling',
'DAYS_BIRTH':'Age',
'DAYS_EMPLOYED': 'Employment length',
'FLAG_MOBIL': 'Has a mobile phone',
'FLAG_WORK_PHONE': 'Has a work phone',
'FLAG_PHONE': 'Has a phone',
'FLAG_EMAIL': 'Has an email',
'OCCUPATION_TYPE': 'Job title',
'CNT_FAM_MEMBERS': 'Family member count',
'MONTHS_BALANCE': 'Account age'
})

[14]: data_record.head()

[14]: ID Gender Has a car Has a property Children count Annual income \
0 5008804 M Y Y 0 427500.0
1 5008805 M Y Y 0 427500.0
2 5008806 M Y Y 0 112500.0
3 5008808 F N Y 0 270000.0
4 5008809 F N Y 0 270000.0

Employment status Education level Marital status \


0 Working Higher education Civil marriage
1 Working Higher education Civil marriage
2 Working Secondary / secondary special Married
3 Commercial associate Secondary / secondary special Single / not married
4 Commercial associate Secondary / secondary special Single / not married

7
Dwelling Age Employment length Has a mobile phone \
0 Rented apartment -12005 -4542 1
1 Rented apartment -12005 -4542 1
2 House / apartment -21474 -1134 1
3 House / apartment -19110 -3051 1
4 House / apartment -19110 -3051 1

Has a work phone Has a phone Has an email Job title \


0 1 0 0 NaN
1 1 0 0 NaN
2 0 0 0 Security staff
3 0 1 1 Sales staff
4 0 1 1 Sales staff

Family member count Account age Is High Risk


0 2.0 -15.0 No
1 2.0 -14.0 No
2 2.0 -29.0 No
3 1.0 -4.0 No
4 1.0 -26.0 No

[15]: def data_split(df, test_size):


df_train, df_test = train_test_split(df, test_size = test_size,␣
↪random_state=42)

return df_train.reset_index(drop=True), df_test.reset_index(drop=True)


# reset index with drop=True mean we not keep the old index

[16]: data_record_train_original, data_record_test_original = data_split(data_record,␣


↪test_size=0.2)

[17]: data_record_train_original.shape

[17]: (29165, 20)

[18]: data_record_test_original.shape

[18]: (7292, 20)

[19]: # So we can wrangle data, but not touching the original data
data_record_train_copy = data_record_train_original.copy()
data_record_test_copy = data_record_test_original.copy()

8
2 Automate Data Function
[20]: data_record.head()
#Age, Employment length is counted in day, account age is counted in month

[20]: ID Gender Has a car Has a property Children count Annual income \
0 5008804 M Y Y 0 427500.0
1 5008805 M Y Y 0 427500.0
2 5008806 M Y Y 0 112500.0
3 5008808 F N Y 0 270000.0
4 5008809 F N Y 0 270000.0

Employment status Education level Marital status \


0 Working Higher education Civil marriage
1 Working Higher education Civil marriage
2 Working Secondary / secondary special Married
3 Commercial associate Secondary / secondary special Single / not married
4 Commercial associate Secondary / secondary special Single / not married

Dwelling Age Employment length Has a mobile phone \


0 Rented apartment -12005 -4542 1
1 Rented apartment -12005 -4542 1
2 House / apartment -21474 -1134 1
3 House / apartment -19110 -3051 1
4 House / apartment -19110 -3051 1

Has a work phone Has a phone Has an email Job title \


0 1 0 0 NaN
1 1 0 0 NaN
2 0 0 0 Security staff
3 0 1 1 Sales staff
4 0 1 1 Sales staff

Family member count Account age Is High Risk


0 2.0 -15.0 No
1 2.0 -14.0 No
2 2.0 -29.0 No
3 1.0 -4.0 No
4 1.0 -26.0 No

[21]: data_record.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36457 entries, 0 to 36456
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 36457 non-null int64

9
1 Gender 36457 non-null object
2 Has a car 36457 non-null object
3 Has a property 36457 non-null object
4 Children count 36457 non-null int64
5 Annual income 36457 non-null float64
6 Employment status 36457 non-null object
7 Education level 36457 non-null object
8 Marital status 36457 non-null object
9 Dwelling 36457 non-null object
10 Age 36457 non-null int64
11 Employment length 36457 non-null int64
12 Has a mobile phone 36457 non-null int64
13 Has a work phone 36457 non-null int64
14 Has a phone 36457 non-null int64
15 Has an email 36457 non-null int64
16 Job title 25134 non-null object
17 Family member count 36457 non-null float64
18 Account age 36457 non-null float64
19 Is High Risk 36457 non-null object
dtypes: float64(3), int64(8), object(9)
memory usage: 5.8+ MB

[22]: data_record.describe()

[22]: ID Children count Annual income Age \


count 3.645700e+04 36457.000000 3.645700e+04 36457.000000
mean 5.078227e+06 0.430315 1.866857e+05 -15975.173382
std 4.187524e+04 0.742367 1.017892e+05 4200.549944
min 5.008804e+06 0.000000 2.700000e+04 -25152.000000
25% 5.042028e+06 0.000000 1.215000e+05 -19438.000000
50% 5.074614e+06 0.000000 1.575000e+05 -15563.000000
75% 5.115396e+06 1.000000 2.250000e+05 -12462.000000
max 5.150487e+06 19.000000 1.575000e+06 -7489.000000

Employment length Has a mobile phone Has a work phone Has a phone \
count 36457.000000 36457.0 36457.000000 36457.000000
mean 59262.935568 1.0 0.225526 0.294813
std 137651.334859 0.0 0.417934 0.455965
min -15713.000000 1.0 0.000000 0.000000
25% -3153.000000 1.0 0.000000 0.000000
50% -1552.000000 1.0 0.000000 0.000000
75% -408.000000 1.0 0.000000 1.000000
max 365243.000000 1.0 1.000000 1.000000

Has an email Family member count Account age


count 36457.000000 36457.000000 36457.000000
mean 0.089722 2.198453 -26.164193

10
max
75%
50%
25%
min
std

0.0
0.2
0.4
0.6
0.8
1.0
36457
1
ID
36
45
Ge 7 ID
nd

plt.show()
plt.show()
er
36
Ha 45
sa 7 Ge
ca nd
Ha r er
sa 36
45 Ha
pro
pe 7 sa
rty ca
Ch
ild 36 r
45 Ha
1.000000
0.000000
0.000000
0.000000
0.000000
0.285787

ren
co 7 sa
u nt pro
An p

[24]: msno.bar(data_record)
nu 36 Ch ert
y
al 45 ild
Em inc 7 ren
om
e co
msno.matrix(data_record)
[23]: # Checking missing value

plo un
ym 36 An t
e nt 45 nu
sta 7 al
inc
Ed tus
Em o me
uc 36 plo
ati 45
on 7 ym
lev en
el Ed ts
Ma 36 uc tat
rita 45 ati us
ls 7 on
tat
us Ma l e ve
rita l
36 ls
Dw 45
ell 7 tat
us
ing Dw
ell
36 ing
45
7
20.000000
3.000000
2.000000
2.000000
1.000000
0.911686

Em Ag
Ag e

11
plo e
ym 36
e nt 45
Ha len 7 Em
sa gth plo
mo ym
36 en
bil
ep 45
7 Ha t len
Ha ho sa gth
sa ne mo
wo 36 bil
ep
rk 45
7
Ha
ho
ph sa
o ne wo ne
Ha 36 rk
sa 45 Ha ph
7 on
sa e
-12.000000
-24.000000
-39.000000
-60.000000
16.501854

0.000000

ph ph
on
Ha e on
e
sa 36 Ha
45 sa
ne 7 ne
ma ma
il Jo il
36 bt
45 itle
Fa
mi
Jo
bt 7
ly itle
m em
Fa
25 mi
ly
be 13
rc 4 me
ou mb
n t Ac er
Ac 36 co co
co 45 un un
u nt 7 ta
ge t
ag Is
e Hig
Is 36 hR
Hig 45
hR 7 isk
19

isk
36
45
7

0
7291
14582
21874
29165
36457
20
[25]: # Task 1: Convert day in Age, Employment length, age account to positive.
def val_count_and_fre(df, feature):
val_count = df[feature].value_counts()
val_fre = df[feature].value_counts(normalize=True)*100
count_and_fre = pd.concat([val_count, val_fre], axis=1)
count_and_fre.columns = ["Counts", "Frequency (%)"]
return count_and_fre

[26]: def get_info_feat(df, feature):


match feature:
case "Age":
print("Description:\n{}".format(np.abs(df[feature]/365.25).describe()))␣
↪#\n:newline

print("-"*50)
print("Object type:{}".format(df[feature].dtype))
case "Employment length":
employed = data_record_train_copy["Employment␣
↪length"][data_record_train_copy["Employment length"] < 0]

employed_year = np.abs(employed/365.25)
print("Description:\n{}".format((employed_year).describe()))
print("-"*50)
print("Object type:{}".format(employed_year.dtype))
case "Account age":
print("Description:\n{}".format(np.abs(df[feature]).describe()))␣
↪#f-string can not handle(create a newline)?

print("-"*50)
print("Object type:{}".format(df[feature].dtype))
case _:
print("Description:\n{}".format(df[feature].describe()))
print("-"*50)
print("Object type:{}".format(df[feature].dtype))
print("-"*50)
value_count = val_count_and_fre(df, feature)
print("Value Counts:\n{}".format(value_count))

[27]: #function create pie chart


def plot_pie_chart(df, feature):
match feature:
case "Dwelling" | "Education level": # countable feature, but how about␣
↪marial status?

ratio_size = val_count_and_fre(df, feature)


ratio_size_len = len(ratio_size.index)
ratio_list = []
for i in range(ratio_size_len):

12
ratio_list.append(ratio_size.iloc[i]["Frequency (%)"])
fig, ax = plt.subplots(figsize=(8,8))
plt.pie(ratio_list, startangle=90, wedgeprops={"edgecolor" :"black"})
plt.title(f"Pie chart of {feature}")
plt.legend(loc="best",labels=ratio_size.index)
plt.axis("equal")
return plt.show()
case _:
ratio_size = val_count_and_fre(df, feature)
ratio_size_len = len(ratio_size.index)
ratio_list = []
for i in range(ratio_size_len):
ratio_list.append(ratio_size.iloc[i]["Frequency (%)"])
fig, ax = plt.subplots(figsize=(8,8))
# %1.2f%% display decimals in the pie chart with 2 decimal places
plt.pie(ratio_list, labels = ratio_size.index, startangle=90, autopct="%1.
↪2f%%", wedgeprops={"edgecolor" :"black"})

plt.title(f"Pie chart of {feature}")


plt.legend(loc="best")
plt.axis("equal")
return plt.show()

[28]: # function to create bar chart


def plot_bar_chart(df, feature):
match feature:
case "Marital status" | "Dwelling" | "Job title" | "Employment status" |␣
↪"Education level":

fig, ax = plt.subplots(figsize=(6,10))
sns.barplot(x=val_count_and_fre(df, feature).index,␣
↪y=val_count_and_fre(df, feature).values[:,0])

ax.set_xticklabels(labels=val_count_and_fre(df, feature).index,␣
↪rotation=45, ha="right") #ha: horizontal alignment

plt.xlabel(f"{feature}")
plt.ylabel("Count")
plt.title(f"{feature} count")
return plt.show()
case _:
fig, ax = plt.subplots(figsize=(6,10))
sns.barplot(x=val_count_and_fre(df, feature).index,␣
↪y=val_count_and_fre(df, feature).values[:,0])

plt.xlabel(f"{feature}")
plt.ylabel("Count")
plt.title(f"{feature} count")
return plt.show()

13
[29]: # function to create box plot
def create_box_plot(df, feature):
match feature:
case "Age":
fig, ax = plt.subplots(figsize=(2, 8))
sns.boxplot(y=np.abs(df[feature]/365.25))
plt.title(f"Distribution(Boxplot) of {feature}")
return plt.show()
case "Children count":
fig, ax = plt.subplots(figsize=(2, 8))
sns.boxplot(y=df[feature])
plt.title(f"Distribution(Boxplot) of {feature}")
plt.yticks(np.arange(0, df[feature].max(), 1)) # arange(start, end, step)
return plt.show()
case "Employment length":
fig, ax = plt.subplots(figsize=(2, 8))
employed = data_record_train_copy["Employment␣
↪length"][data_record_train_copy["Employment length"]<0]

employed_year = np.abs(employed)/365.25
sns.boxplot(y=employed_year)
plt.title(f"Distribution(Boxplot) of {feature}")
plt.yticks(np.arange(0, employed_year.max(), 2))
return plt.show()
case "Annual income":
fig, ax = plt.subplots(figsize=(2, 8))
sns.boxplot(y=df[feature])
plt.title(f"Distribution(Boxplot) of {feature}")
# suppress scientific notation
ax.get_yaxis().set_major_formatter(
matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ",")))
return plt.show()
case "Account age":
fig, ax = plt.subplots(figsize=(2, 8))
sns.boxplot(y=np.abs(df[feature]))
plt.title(f"Distribution(Boxplot) of {feature}")
return plt.show()
case _:
fig, ax = plt.subplots(figsize=(2, 8))
sns.boxplot(y=df[feature])
plt.title(f"Distribution(Boxplot) of {feature}")
return plt.show()

[30]: # function to create histogram


def create_hist_plot(df, feature, n_bins=30):
match feature:
case "Age":
fig, ax = plt.subplots(figsize=(18, 10))

14
sns.histplot(np.abs(df[feature])/365.25, bins=n_bins, kde=True) #kde:
↪probability density of the underlying distribution of the data
plt.title(f"Distribution(Histogram) of {feature}")
return plt.show()
case "Annual income":
fig, ax = plt.subplots(figsize=(18,10))
sns.histplot(df[feature], bins=n_bins, kde=True)
plt.title(f"Distribution(Histogram) of {feature}")
ax.get_yaxis().set_major_formatter(
matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ",")))
return plt.show()
case "Employment length":
fig, ax = plt.subplots(figsize=(18, 10))
employed = data_record_train_copy["Employment␣
↪length"][data_record_train_copy["Employment length"] < 0]

employed_year = np.abs(employed)/365.25
sns.histplot(employed_year, bins=n_bins, kde=True)
plt.title(f"Distribution(Histogram) of {feature}")
return plt.show()
case "Account age":
fig, ax = plt.subplots(figsize=(18, 10))
sns.histplot(np.abs(df[feature]), bins=n_bins, kde=True)
plt.title(f"Distribution(Histogram) of {feature}")
return plt.show()
case _:
fig, ax = plt.subplots(figsize=(18, 10))
sns.histplot(df[feature], bins=n_bins, kde=True)
plt.title(f"Distribution(Histogram) of {feature}")
return plt.show()

[31]: # create function of high-low risk box plot


def hl_risk_box_plot(df, feature):
match feature:
case "Age":
print(np.abs(df.groupby("Is High Risk")[feature].mean()/365.25))
fig, ax = plt.subplots(figsize=(5,8))
sns.boxplot(y=np.abs(df[feature])/365.25, x=df["Is High Risk"])
plt.xticks(ticks=[0,1], labels=["No", "Yes"])
plt.title(f"High risk individuals grouped by {feature}")
return plt.show()
case "Employment length":
employed = data_record_train_copy["Employment␣
↪length"][data_record_train_copy["Employment length"]<0]

employed_year = np.abs(employed/365.25) #array, but we will build on␣


↪dataframe

employed_year_df = data_record_train_copy.iloc[employed_year.
↪index][["Employment length", "Is High Risk"]]

15
employed_year_is_high_risk = employed_year_df.groupby("Is High␣
↪Risk")["Employment length"].mean()
print(np.abs(employed_year_is_high_risk/365.25))
fig, ax = plt.subplots(figsize=(5,8))
sns.boxplot(y=employed_year, x=df["Is High Risk"])
plt.xticks(ticks=[0,1], labels=["No", "Yes"])
plt.title(f"High risk individuals grouped by {feature}")
return plt.show()
case "Annual income":
print(np.abs(df.groupby('Is High Risk')[feature].mean()))
fig, ax = plt.subplots(figsize=(5,8))
sns.boxplot(y=np.abs(df[feature]),x=df['Is High Risk'])
plt.xticks(ticks=[0,1],labels=['No','Yes'])
# suppress scientific notation
ax.get_yaxis().set_major_formatter(
matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.title('High risk individuals grouped by {}'.format(feature))
return plt.show()
case _:
print(np.abs(df.groupby("Is High Risk")[feature].mean()))
fig, ax = plt.subplots(figsize=(5,8))
sns.boxplot(y=np.abs(df[feature]), x=df["Is High Risk"])
plt.xticks(ticks=[0,1], labels=["No", "Yes"])
plt.title(f"High risk individuals grouped by {feature}")
return plt.show()

[32]: group_high_risk = data_record_train_copy.groupby("Marital status")["Is High␣


↪Risk"]

group_high_risk.value_counts()

[32]: Marital status Is High Risk


Civil marriage No 2278
Yes 34
Married No 19724
Yes 320
Separated No 1688
Yes 24
Single / not married No 3777
Yes 87
Widow No 1199
Yes 34
Name: Is High Risk, dtype: int64

[33]: # create high low bar plot


# This means, each type of marital status, how many individuals is high risk =>␣
↪visual into bar

def hl_bar_plot(df, feature):

16
df["Is High Risk"] = df["Is High Risk"].map({"Yes": 1, "No": 0}) #****
group_high_risk = df.groupby(feature)["Is High Risk"].sum() #sum the high␣
↪risk(Yes), but in this case, "No" don't mean 0

sort_gr_h_risk = group_high_risk.sort_values(ascending=False)
print(dict(sort_gr_h_risk))
fig, ax = plt.subplots(figsize=(8,15))
sns.barplot(x=sort_gr_h_risk.index, y=sort_gr_h_risk.values)
ax.set_xticklabels(labels=sort_gr_h_risk.index, rotation=45, ha="right") #ha:␣
↪horizental align

plt.ylabel("Count")
plt.title(f"High risk group of {feature}")
return plt.show()

[34]: data_record_train_copy.columns

[34]: Index(['ID', 'Gender', 'Has a car', 'Has a property', 'Children count',


'Annual income', 'Employment status', 'Education level',
'Marital status', 'Dwelling', 'Age', 'Employment length',
'Has a mobile phone', 'Has a work phone', 'Has a phone', 'Has an email',
'Job title', 'Family member count', 'Account age', 'Is High Risk'],
dtype='object')

3 Explore data

4 Univariate analysis
1. Gender
[35]: get_info_feat(data_record_train_copy, "Gender")

Description:
count 29165
unique 2
top F
freq 19549
Name: Gender, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
F 19549 67.028973
M 9616 32.971027

[36]: plot_pie_chart(data_record_train_copy, "Gender")

17
Pie chart of Gender
F
M

32.97%

67.03%

[37]: plot_bar_chart(data_record_train_copy, "Gender")

18
Gender count
20000

17500

15000

12500
Count

10000

7500

5000

2500

0
F M
Gender

19
Almost our observed induviduals are women (67.0.3%)
2. Age
[38]: get_info_feat(data_record_train_copy, "Age")

Description:
count 29165.000000
mean 43.749425
std 11.507180
min 21.095140
25% 34.154689
50% 42.614648
75% 53.234771
max 68.862423
Name: Age, dtype: float64
--------------------------------------------------
Object type:int64

[39]: create_box_plot(data_record_train_copy, "Age")

20
Distribution(Boxplot) of Age
70

60

50
Age

40

30

20

[40]: create_hist_plot(data_record_train_copy, "Age")

21
Distribution(Histogram) of Age

1400

1200

1000

800
Count

600

400

200

0
20 30 40 50 60 70
Age

[41]: hl_risk_box_plot(data_record_train_copy, "Age")

Is High Risk
No 43.753103
Yes 43.538148
Name: Age, dtype: float64

22
High risk individuals grouped by Age
70

60

50
Age

40

30

20
No Yes
Is High Risk

• No much difference between age of two group: High risk or Not


• Most of applicants are over 40, data is not Normal, it’s Positive Skew
• This age can tell us, all the applicants are in the working age, no more student.
3. Marital status
[42]: get_info_feat(data_record_train_copy, "Marital status")

23
Description:
count 29165
unique 5
top Married
freq 20044
Name: Marital status, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Married 20044 68.726213
Single / not married 3864 13.248757
Civil marriage 2312 7.927310
Separated 1712 5.870050
Widow 1233 4.227670

[43]: plot_pie_chart(data_record_train_copy, "Marital status")

Pie chart of Marital status


Widow
Separated

Civil marriage
4.23%
5.87%
7.93%

13.25% Single / not married

68.73%

Married
Married
Single / not married
Civil marriage
Separated
Widow

24
[44]: plot_bar_chart(data_record_train_copy, "Marital status")

<ipython-input-28-cd296eb7d4d5>:7: UserWarning: FixedFormatter should only be


used together with FixedLocator
ax.set_xticklabels(labels=val_count_and_fre(df, feature).index, rotation=45,
ha="right") #ha: horizontal alignment

25
Marital status count

20000

17500

15000

12500
Count

10000

7500

5000

2500

0
d

w
d

d
rrie

iag
rrie

do
rat

Wi
arr
Ma

ma

pa
il m

Se
ot
/n

Civ
gle
Sin

Marital status
26
[45]: hl_bar_plot(data_record_train_copy, "Marital status")

{'Married': 320, 'Single / not married': 87, 'Civil marriage': 34, 'Widow': 34,
'Separated': 24}
<ipython-input-33-6f446414d3e9>:10: UserWarning: FixedFormatter should only be
used together with FixedLocator
ax.set_xticklabels(labels=sort_gr_h_risk.index, rotation=45, ha="right") #ha:
horizental align

27
High risk group of Marital status

300

250

200
Count

150

100

50

0
d

e
d

ed
rrie

iag
rrie

do

rat
Wi
arr
Ma

ma

pa
il m

Se
ot

28
/n

Civ
gle
Sin

Marital status
• As we quick glance at Age data, we can predict most of our Applicants got married.
• We got Separated Applicants more than Widow ones, but Widow seem riskier than Separat-
eds.
4. Family member count
[46]: data_record_train_copy.columns

[46]: Index(['ID', 'Gender', 'Has a car', 'Has a property', 'Children count',


'Annual income', 'Employment status', 'Education level',
'Marital status', 'Dwelling', 'Age', 'Employment length',
'Has a mobile phone', 'Has a work phone', 'Has a phone', 'Has an email',
'Job title', 'Family member count', 'Account age', 'Is High Risk'],
dtype='object')

[47]: get_info_feat(data_record_train_copy, "Family member count")

Description:
count 29165.000000
mean 2.197531
std 0.912189
min 1.000000
25% 2.000000
50% 2.000000
75% 3.000000
max 20.000000
Name: Family member count, dtype: float64
--------------------------------------------------
Object type:float64
--------------------------------------------------
Value Counts:
Counts Frequency (%)
2.0 15552 53.324190
1.0 5613 19.245671
3.0 5121 17.558718
4.0 2503 8.582205
5.0 309 1.059489
6.0 48 0.164581
7.0 14 0.048003
9.0 2 0.006858
15.0 2 0.006858
20.0 1 0.003429

[48]: create_box_plot(data_record_train_copy, "Family member count")

29
Distribution(Boxplot) of Family member count
20.0

17.5

Family member count 15.0

12.5

10.0

7.5

5.0

2.5

[49]: hl_risk_box_plot(data_record_train_copy, "Family member count")

Is High Risk
0 2.198284
1 2.154309
Name: Family member count, dtype: float64

30
High risk individuals grouped by Family member count
20.0

17.5

15.0
Family member count

12.5

10.0

7.5

5.0

2.5

No Yes
Is High Risk

[50]: plot_bar_chart(data_record_train_copy, "Family member count")

31
Family member count count
16000

14000

12000

10000
Count

8000

6000

4000

2000

0
1.0 2.0 3.0 4.0 5.0 6.0 7.0 9.0 15.0 20.0
Family member count

32
5. Children count
[51]: get_info_feat(data_record_train_copy, "Children count")

Description:
count 29165.000000
mean 0.430790
std 0.741882
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 19.000000
Name: Children count, dtype: float64
--------------------------------------------------
Object type:int64
--------------------------------------------------
Value Counts:
Counts Frequency (%)
0 20143 69.065661
1 6003 20.582890
2 2624 8.997086
3 323 1.107492
4 52 0.178296
5 15 0.051432
7 2 0.006858
14 2 0.006858
19 1 0.003429

[52]: create_box_plot(data_record_train_copy, "Children count")

33
Distribution(Boxplot) of Children count

18
17
16
15
14
13
12
11
Children count

10
9
8
7
6
5
4
3
2
1
0

[53]: plot_bar_chart(data_record_train_copy, "Children count")

34
Children count count

20000

17500

15000

12500
Count

10000

7500

5000

2500

0
0 1 2 3 4 5 7 14 19
Children count

35
6. Dwelling type
[54]: get_info_feat(data_record_train_copy, "Dwelling")

Description:
count 29165
unique 6
top House / apartment
freq 26059
Name: Dwelling, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
House / apartment 26059 89.350249
With parents 1406 4.820847
Municipal apartment 912 3.127036
Rented apartment 453 1.553232
Office apartment 208 0.713184
Co-op apartment 127 0.435453

[55]: plot_pie_chart(data_record_train_copy, "Dwelling")

36
Pie chart of Dwelling
House / apartment
With parents
Municipal apartment
Rented apartment
Office apartment
Co­op apartment

[56]: plot_bar_chart(data_record_train_copy, "Dwelling")

<ipython-input-28-cd296eb7d4d5>:7: UserWarning: FixedFormatter should only be


used together with FixedLocator
ax.set_xticklabels(labels=val_count_and_fre(df, feature).index, rotation=45,
ha="right") #ha: horizontal alignment

37
Count
Ho
us

10000
15000
20000
25000

0
5000
e/
ap
art
me
nt
Wi
th
p are
Mu nts
nic
ipa
la
pa
rtm
en
Re t

38
nte
da
pa
rtm

Dwelling
en
Of t
Dwelling count

fic
e ap
art
me
nt
Co
­op
ap
art
me
nt
Most applicants live in house/apartment.
7. Annual income
[57]: pd.set_option('display.float_format', lambda x: '%.2f' % x)
get_info_feat(data_record_train_copy, "Annual income")

Description:
count 29165.00
mean 186890.39
std 101409.64
min 27000.00
25% 121500.00
50% 157500.00
75% 225000.00
max 1575000.00
Name: Annual income, dtype: float64
--------------------------------------------------
Object type:float64
--------------------------------------------------
Value Counts:
Counts Frequency (%)
135000.00 3468 11.89
180000.00 2487 8.53
157500.00 2469 8.47
225000.00 2373 8.14
112500.00 2359 8.09
… … …
164250.00 1 0.00
42300.00 1 0.00
432000.00 1 0.00
145350.00 1 0.00
82350.00 1 0.00

[259 rows x 2 columns]

[58]: create_hist_plot(data_record_train_copy, "Annual income")

39
Distribution(Histogram) of Annual income

10,000

8,000

6,000
Count

4,000

2,000

0
0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6
Annual income 1e6

[59]: create_box_plot(data_record_train_copy, "Annual income")

40
Distribution(Boxplot) of Annual income
1,600,000

1,400,000

1,200,000

1,000,000
Annual income

800,000

600,000

400,000

200,000

[60]: hl_risk_box_plot(data_record_train_copy, "Annual income")

Is High Risk
0 186913.94
1 185537.26
Name: Annual income, dtype: float64

41
High risk individuals grouped by Annual income
1,600,000

1,400,000

1,200,000

1,000,000
Annual income

800,000

600,000

400,000

200,000

0
No Yes
Is High Risk

• There is no different between group high risk and group no high risk.
• Most of applicants have annual income in a range from “120k” to “210k”.
8. Job title
[61]: get_info_feat(data_record_train_copy, "Job title")

Description:

42
count 20138
unique 18
top Laborers
freq 5004
Name: Job title, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Laborers 5004 24.85
Core staff 2866 14.23
Sales staff 2773 13.77
Managers 2422 12.03
Drivers 1722 8.55
High skill tech staff 1133 5.63
Accountants 998 4.96
Medicine staff 956 4.75
Cooking staff 521 2.59
Security staff 464 2.30
Cleaning staff 425 2.11
Private service staff 287 1.43
Low-skill Laborers 138 0.69
Waiters/barmen staff 127 0.63
Secretaries 122 0.61
HR staff 72 0.36
Realty agents 60 0.30
IT staff 48 0.24

[62]: missing_val_pct = data_record_train_copy["Job title"].isnull().sum()/


↪len(data_record_train_copy) * 100

print(f"Percentage of missing value in Job title: {missing_val_pct.round(2)}%")

Percentage of missing value in Job title: 30.95%

[63]: plot_bar_chart(data_record_train_copy, "Job title")

<ipython-input-28-cd296eb7d4d5>:7: UserWarning: FixedFormatter should only be


used together with FixedLocator
ax.set_xticklabels(labels=val_count_and_fre(df, feature).index, rotation=45,
ha="right") #ha: horizontal alignment

43
Count
La
b

0
1000
2000
3000
4000
5000

Co orer
r s
Sa e sta
le ff
Ma s sta
Hig na ff
hs ge
kill Driv rs
t e
Ac ech rs
s
Me coun taff
di ta
Co cine nts
o s
Se king taff
c s

44
Pr Cle urity taff
iva a s
t n t
Lo e se ing s aff

Job title
Wa w­s rvic taf
ite kill e s f
Job title count

rs/ La taf
ba bo f
rm rer
Se en s s
cre taff
ta
Re HR ries
alt st
y a aff
ge
IT nts
sta
ff
9. Employment status
[64]: get_info_feat(data_record_train_copy, "Employment status")

Description:
count 29165
unique 5
top Working
freq 15056
Name: Employment status, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Working 15056 51.62
Commercial associate 6801 23.32
Pensioner 4920 16.87
State servant 2381 8.16
Student 7 0.02

[65]: plot_bar_chart(data_record_train_copy, "Employment status")

<ipython-input-28-cd296eb7d4d5>:7: UserWarning: FixedFormatter should only be


used together with FixedLocator
ax.set_xticklabels(labels=val_count_and_fre(df, feature).index, rotation=45,
ha="right") #ha: horizontal alignment

45
Employment status count

14000

12000

10000
Count

8000

6000

4000

2000

0
te

er

nt
ing

an
cia

de
ion
rk

erv

Stu
so
Wo

ns

s
as

Pe

te
Sta
ial
erc
mm
Co

Employment
46
status
[66]: plot_pie_chart(data_record_train_copy, "Employment status")

Pie chart of Employment


Student
status
State servant

0.02%8.16%
Pensioner

16.87%

Working 51.62%

23.32%

Working Commercial associate


Commercial associate
Pensioner
State servant
Student

• Most applicants are worker.


• Only 2 applicants are student.
10. Education level
[67]: get_info_feat(data_record_train_copy, "Education level")

Description:
count 29165
unique 5
top Secondary / secondary special
freq 19803
Name: Education level, dtype: object
--------------------------------------------------

47
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Secondary / secondary special 19803 67.90
Higher education 7910 27.12
Incomplete higher 1129 3.87
Lower secondary 298 1.02
Academic degree 25 0.09

[68]: plot_pie_chart(data_record_train_copy, "Education level")

Pie chart of Education level

Secondary / secondary special


Higher education
Incomplete higher
Lower secondary
Academic degree

48
[69]: plot_bar_chart(data_record_train_copy, "Education level")

<ipython-input-28-cd296eb7d4d5>:7: UserWarning: FixedFormatter should only be


used together with FixedLocator
ax.set_xticklabels(labels=val_count_and_fre(df, feature).index, rotation=45,
ha="right") #ha: horizontal alignment

49
Se
co
nd
ary Count
/s
ec

10000
12500
15000
17500
20000

0
2500
5000
7500
on
da
ry
sp
ec
ial
Hig
he
re
du
ca
tio
n
Inc
om
ple

50
te
h igh
er
Lo
we
rs

Education level
ec
on
Education level count

da
ry
Ac
ad
em
ic
de
gre
e
The majority of applicants have completed their secondary degree, ¼ completed their higher edu-
cation
11. Employment length
[70]: get_info_feat(data_record_train_copy, "Employment length")

Description:
count 24257.00
mean 7.26
std 6.46
min 0.05
25% 2.68
50% 5.45
75% 9.60
max 43.02
Name: Employment length, dtype: float64
--------------------------------------------------
Object type:float64

[71]: create_box_plot(data_record_train_copy, "Employment length")

51
Distribution(Boxplot) of Employment length

42
40
38
36
34
32
30
28
26
Employment length

24
22
20
18
16
14
12
10
8
6
4
2
0

[72]: create_hist_plot(data_record_train_copy, "Employment length")

52
Distribution(Histogram) of Employment length
3500

3000

2500

2000
Count

1500

1000

500

0
0 10 20 30 40
Employment length

[73]: hl_risk_box_plot(data_record_train_copy, "Employment length")

Is High Risk
0 7.29
1 5.75
Name: Employment length, dtype: float64

53
High risk individuals grouped by Employment length

40

30
Employment length

20

10

No Yes
Is High Risk

• The group label high risk has a shorter employment length than group label no high risk.
12. Has a car
[74]: get_info_feat(data_record_train_copy, "Has a car")

Description:
count 29165

54
unique 2
top N
freq 18128
Name: Has a car, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
N 18128 62.16
Y 11037 37.84

[75]: plot_pie_chart(data_record_train_copy, "Has a car")

Pie chart of Has a car


N
Y

37.84%

62.16%

55
Most applicants do not own car.
13. Has a property
[76]: get_info_feat(data_record_train_copy, "Has a property")

Description:
count 29165
unique 2
top Y
freq 19557
Name: Has a property, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Y 19557 67.06
N 9608 32.94

[77]: plot_bar_chart(data_record_train_copy, "Has a property")

56
Has a property count
20000

17500

15000

12500
Count

10000

7500

5000

2500

0
Y N
Has a property

57
[78]: plot_pie_chart(data_record_train_copy, "Has a property")

Pie chart of Has a property


Y
N

32.94%

67.06%

Most of applicants has a property.


14. Account age
[79]: get_info_feat(data_record_train_copy, "Account age")

Description:
count 29165.00
mean 26.14
std 16.49
min 0.00

58
25% 12.00
50% 24.00
75% 39.00
max 60.00
Name: Account age, dtype: float64
--------------------------------------------------
Object type:float64

[80]: create_box_plot(data_record_train_copy, "Account age")

Distribution(Boxplot) of Account age


60

50

40
Account age

30

20

10

59
[81]: create_hist_plot(data_record_train_copy, "Account age")

Distribution(Histogram) of Account age


1400

1200

1000

800
Count

600

400

200

0
0 10 20 30 40 50 60
Account age

[82]: hl_risk_box_plot(data_record_train_copy, "Account age")

Is High Risk
0 26.00
1 34.04
Name: Account age, dtype: float64

60
High risk individuals grouped by Account age
60

50

40
Account age

30

20

10

No Yes
Is High Risk

15. Is High Risk (Target Variable)

[83]: get_info_feat(data_record_train_copy, "Is High Risk")

Description:
count 29165.00
mean 0.02

61
std 0.13
min 0.00
25% 0.00
50% 0.00
75% 0.00
max 1.00
Name: Is High Risk, dtype: float64
--------------------------------------------------
Object type:int64
--------------------------------------------------
Value Counts:
Counts Frequency (%)
0 28666 98.29
1 499 1.71

[84]: plot_bar_chart(data_record_train_copy, "Is High Risk")

62
Is High Risk count
30000

25000

20000
Count

15000

10000

5000

0
0 1
Is High Risk

63
This target data is very imbalance.

5 Preprocessing data
1. Remove Outlier
[85]: class OutlierRemover(BaseEstimator, TransformerMixin):
def __init__(self,feat_with_outliers = ['Family member count','Annual␣
↪income', 'Employment length']):

self.feat_with_outliers = feat_with_outliers
def fit(self,df):
return self
def transform(self,df):
if (set(self.feat_with_outliers).issubset(df.columns)):
# 25% quantile
Q1 = df[self.feat_with_outliers].quantile(.25)
# 75% quantile
Q3 = df[self.feat_with_outliers].quantile(.75)
IQR = Q3 - Q1
# keep the data within 3 IQR
df = df[~((df[self.feat_with_outliers] < (Q1 - 3 * IQR)) |(df[self.
↪feat_with_outliers] > (Q3 + 3 * IQR))).any(axis=1)]

return df
else:
print("One or more features are not in the dataframe")
return df

2. Drop features
[86]: class DropFeatures(BaseEstimator,TransformerMixin):
def __init__(self,feature_to_drop = ['ID','Has a mobile phone','Children␣
↪count','Job title','Account age']):

self.feature_to_drop = feature_to_drop
def fit(self,df):
return self
def transform(self,df):
if (set(self.feature_to_drop).issubset(df.columns)):
df.drop(self.feature_to_drop,axis=1,inplace=True)
return df
else:
print("One or more features are not in the dataframe")
return df

3. Time convert (Conver Age, Employment length to positive value)

[87]: class TimeConversionHandler(BaseEstimator, TransformerMixin):


def __init__(self, feat_with_days = ['Employment length', 'Age']):
self.feat_with_days = feat_with_days

64
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
if (set(self.feat_with_days).issubset(X.columns)):
# convert days to absolute value
X[['Employment length','Age']] = np.abs(X[['Employment␣
↪length','Age']])

return X
else:
print("One or more features are not in the dataframe")
return X

4. Outlier Employment length


[88]: class RetireeHandler(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self, df):
return self
def transform(self, df):
if 'Employment length' in df.columns:
# select rows with employment length is 365243 which corresponds to␣
↪retirees

df_ret_idx = df['Employment length'][df['Employment length'] ==␣


↪365243].index

# change 365243 to 0
df.loc[df_ret_idx,'Employment length'] = 0
return df
else:
print("Employment length is not in the dataframe")
return df

5. Handle skew
[89]: class SkewnessHandler(BaseEstimator, TransformerMixin):
def __init__(self,feat_with_skewness=['Annual income','Age']):
self.feat_with_skewness = feat_with_skewness
def fit(self,df):
return self
def transform(self,df):
if (set(self.feat_with_skewness).issubset(df.columns)):
# Handle skewness with cubic root transformation
df[self.feat_with_skewness] = np.cbrt(df[self.feat_with_skewness])
return df
else:
print("One or more features are not in the dataframe")
return df

65
6. Binning
[90]: class BinningNumToYN(BaseEstimator, TransformerMixin):
def __init__(self,feat_with_num_enc=['Has a work phone','Has a phone','Has␣
↪an email']):

self.feat_with_num_enc = feat_with_num_enc
def fit(self,df):
return self
def transform(self,df):
if (set(self.feat_with_num_enc).issubset(df.columns)):
# Change 0 to N and 1 to Y for all the features in feat_with_num_enc
for ft in self.feat_with_num_enc:
df[ft] = df[ft].map({1:'Y',0:'N'})
return df
else:
print("One or more features are not in the dataframe")
return df

7. One hot encoding


[91]: class OneHotWithFeatNames(BaseEstimator,TransformerMixin):
def __init__(self,one_hot_enc_ft = ['Gender', 'Marital status', 'Dwelling',␣
↪'Employment status', 'Has a car', 'Has a property', 'Has a work phone', 'Has␣

↪a phone', 'Has an email']):

self.one_hot_enc_ft = one_hot_enc_ft
def fit(self,df):
return self
def transform(self,df):
if (set(self.one_hot_enc_ft).issubset(df.columns)):
# function to one hot encode the features in one_hot_enc_ft
def one_hot_enc(df,one_hot_enc_ft):
one_hot_enc = OneHotEncoder()
one_hot_enc.fit(df[one_hot_enc_ft])
# get the result of the one hot encoding columns names
feat_names_one_hot_enc = one_hot_enc.
↪get_feature_names_out(one_hot_enc_ft)

# change the array of the one hot encoding to a dataframe with␣


↪the column names

df = pd.DataFrame(one_hot_enc.transform(df[self.
↪one_hot_enc_ft]).toarray(),columns=feat_names_one_hot_enc,index=df.index)

return df
# function to concatenat the one hot encoded features with the rest␣
↪of features that were not encoded

def concat_with_rest(df,one_hot_enc_df,one_hot_enc_ft):
# get the rest of the features
rest_of_features = [ft for ft in df.columns if ft not in␣
↪one_hot_enc_ft]

66
# concatenate the rest of the features with the one hot encoded␣
↪features
df_concat = pd.concat([one_hot_enc_df,␣
↪df[rest_of_features]],axis=1)
return df_concat
# one hot encoded dataframe
one_hot_enc_df = one_hot_enc(df,self.one_hot_enc_ft)
# returns the concatenated dataframe
full_df_one_hot_enc = concat_with_rest(df,one_hot_enc_df,self.
↪one_hot_enc_ft)

return full_df_one_hot_enc
else:
print("One or more features are not in the dataframe")
return df

8. Ordinal Feature Names


[92]: class OrdinalFeatNames(BaseEstimator,TransformerMixin):
def __init__(self,ordinal_enc_ft = ['Education level']):
self.ordinal_enc_ft = ordinal_enc_ft
def fit(self,df):
return self
def transform(self,df):
if 'Education level' in df.columns:
ordinal_enc = OrdinalEncoder()
df[self.ordinal_enc_ft] = ordinal_enc.fit_transform(df[self.
↪ordinal_enc_ft])

return df
else:
print("Education level is not in the dataframe")
return df

10. Min-Max Scaling


[93]: class MinMaxWithFeatNames(BaseEstimator,TransformerMixin):
def __init__(self,min_max_scaler_ft = ['Age', 'Annual income', 'Employment␣
↪length']):

self.min_max_scaler_ft = min_max_scaler_ft
def fit(self,df):
return self
def transform(self,df):
if (set(self.min_max_scaler_ft).issubset(df.columns)):
min_max_enc = MinMaxScaler()
df[self.min_max_scaler_ft] = min_max_enc.fit_transform(df[self.
↪min_max_scaler_ft])

return df
else:

67
print("One or more features are not in the dataframe")
return df

11. Change the data types of target feature “Is High Risk”
[94]: class ChangeToNumTarget(BaseEstimator,TransformerMixin):
def __init__(self):
pass
def fit(self,df):
return self
def transform(self,df):
if 'Is High Risk' in df.columns:
df['Is High Risk'] = pd.to_numeric(df['Is High Risk'],␣
↪errors='coerce')

return df
else:
print("Is High Risk is not in the dataframe")
return df

12. Over Sampling target feature


[95]: class Oversample(BaseEstimator,TransformerMixin):
def __init__(self):
pass
def fit(self,df):
return self
def transform(self,df):
if 'Is High Risk' in df.columns:
# smote function to oversample the minority class to fix the␣
↪imbalance data

oversample = SMOTE(sampling_strategy='minority')
X_bal, y_bal = oversample.fit_resample(df.loc[:, df.columns != 'Is␣
↪High Risk'],df['Is High Risk'])

df_bal = pd.concat([pd.DataFrame(X_bal),pd.DataFrame(y_bal)],axis=1)
return df_bal
else:
print("Is high risk is not in the dataframe")
return df

[96]: def full_pipeline(df):


# Create the pipeline that will call all the class from OutlierRemoval to␣
↪OversampleSMOTE in one go

pipeline = Pipeline([
('outlier_remover', OutlierRemover()),
('feature_dropper', DropFeatures()),
('time_conversion_handler', TimeConversionHandler()),
('retiree_handler', RetireeHandler()),

68
('skewness_handler', SkewnessHandler()),
('binning_num_to_yn', BinningNumToYN()),
('one_hot_with_feat_names', OneHotWithFeatNames()),
('ordinal_feat_names', OrdinalFeatNames()),
('min_max_with_feat_names', MinMaxWithFeatNames()),
('change_to_num_target', ChangeToNumTarget()),
('oversample', Oversample())
])
df_pipe_prep = pipeline.fit_transform(df)
return df_pipe_prep

[97]: pd.options.mode.chained_assignment = None # Hide the copy warning


df_prep_train = full_pipeline(data_record_train_copy)
df_prep_train

[97]: Gender_F Gender_M Marital status_Civil marriage \


0 0.00 1.00 0.00
1 1.00 0.00 0.00
2 1.00 0.00 0.00
3 1.00 0.00 0.00
4 1.00 0.00 0.00
… … … …
46539 1.00 0.00 0.00
46540 1.00 0.00 0.00
46541 1.00 0.00 0.80
46542 0.00 1.00 0.00
46543 1.00 0.00 0.00

Marital status_Married Marital status_Separated \


0 1.00 0.00
1 0.00 0.00
2 1.00 0.00
3 0.00 0.00
4 0.00 1.00
… … …
46539 1.00 0.00
46540 1.00 0.00
46541 0.20 0.00
46542 1.00 0.00
46543 1.00 0.00

Marital status_Single / not married Marital status_Widow \


0 0.00 0.00
1 1.00 0.00
2 0.00 0.00
3 1.00 0.00
4 0.00 0.00

69
… … …
46539 0.00 0.00
46540 0.00 0.00
46541 0.00 0.00
46542 0.00 0.00
46543 0.00 0.00

Dwelling_Co-op apartment Dwelling_House / apartment \


0 0.00 0.00
1 0.00 1.00
2 0.00 1.00
3 0.00 1.00
4 0.00 1.00
… … …
46539 0.00 1.00
46540 0.00 1.00
46541 0.00 1.00
46542 0.00 0.00
46543 0.00 1.00

Dwelling_Municipal apartment … Has a phone_N Has a phone_Y \


0 0.00 … 1.00 0.00
1 0.00 … 1.00 0.00
2 0.00 … 1.00 0.00
3 0.00 … 1.00 0.00
4 0.00 … 0.00 1.00
… … … … …
46539 0.00 … 1.00 0.00
46540 0.00 … 1.00 0.00
46541 0.00 … 1.00 0.00
46542 0.00 … 1.00 0.00
46543 0.00 … 1.00 0.00

Has an email_N Has an email_Y Annual income Education level Age \


0 1.00 0.00 0.42 4.00 0.60
1 1.00 0.00 0.42 1.00 0.20
2 1.00 0.00 0.52 4.00 0.39
3 0.00 1.00 0.81 1.00 0.84
4 1.00 0.00 0.68 4.00 0.60
… … … … … …
46539 1.00 0.00 0.42 4.00 0.46
46540 1.00 0.00 0.43 4.00 0.66
46541 1.00 0.00 0.34 1.00 0.29
46542 1.00 0.00 0.52 1.66 0.21
46543 1.00 0.00 0.36 4.00 0.33

Employment length Family member count Is High Risk

70
0 0.27 2.00 0
1 0.14 2.00 0
2 0.50 4.00 0
3 0.18 1.00 0
4 0.04 1.00 0
… … … …
46539 0.27 4.00 1
46540 0.49 2.00 1
46541 0.19 2.00 1
46542 0.17 2.34 1
46543 0.38 4.79 1

[46544 rows x 34 columns]

[98]: pd.set_option('display.max_columns', None)


df_prep_train.head()

[98]: Gender_F Gender_M Marital status_Civil marriage Marital status_Married \


0 0.00 1.00 0.00 1.00
1 1.00 0.00 0.00 0.00
2 1.00 0.00 0.00 1.00
3 1.00 0.00 0.00 0.00
4 1.00 0.00 0.00 0.00

Marital status_Separated Marital status_Single / not married \


0 0.00 0.00
1 0.00 1.00
2 0.00 0.00
3 0.00 1.00
4 1.00 0.00

Marital status_Widow Dwelling_Co-op apartment Dwelling_House / apartment \


0 0.00 0.00 0.00
1 0.00 0.00 1.00
2 0.00 0.00 1.00
3 0.00 0.00 1.00
4 0.00 0.00 1.00

Dwelling_Municipal apartment Dwelling_Office apartment \


0 0.00 0.00
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00

Dwelling_Rented apartment Dwelling_With parents \


0 0.00 1.00

71
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00

Employment status_Commercial associate Employment status_Pensioner \


0 0.00 0.00
1 1.00 0.00
2 1.00 0.00
3 1.00 0.00
4 0.00 0.00

Employment status_State servant Employment status_Student \


0 0.00 0.00
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00

Employment status_Working Has a car_N Has a car_Y Has a property_N \


0 1.00 0.00 1.00 0.00
1 0.00 0.00 1.00 1.00
2 0.00 1.00 0.00 0.00
3 0.00 0.00 1.00 0.00
4 1.00 1.00 0.00 1.00

Has a property_Y Has a work phone_N Has a work phone_Y Has a phone_N \
0 1.00 1.00 0.00 1.00
1 0.00 1.00 0.00 1.00
2 1.00 1.00 0.00 1.00
3 1.00 1.00 0.00 1.00
4 0.00 1.00 0.00 0.00

Has a phone_Y Has an email_N Has an email_Y Annual income \


0 0.00 1.00 0.00 0.42
1 0.00 1.00 0.00 0.42
2 0.00 1.00 0.00 0.52
3 0.00 0.00 1.00 0.81
4 1.00 1.00 0.00 0.68

Education level Age Employment length Family member count Is High Risk
0 4.00 0.60 0.27 2.00 0
1 1.00 0.20 0.14 2.00 0
2 4.00 0.39 0.50 4.00 0
3 1.00 0.84 0.18 1.00 0
4 4.00 0.60 0.04 1.00 0

72
[99]: X_train, y_train = df_prep_train.loc[:, df_prep_train.columns != 'Is High␣
↪Risk'], df_prep_train['Is High Risk'].astype('int64')

[100]: data_record_test_copy["Is High Risk"] = data_record_test_copy["Is High Risk"].


↪map({"No":'0',"Yes":'1'})

[101]: df_test_prep = full_pipeline(data_record_test_copy)


df_test_prep

[101]: Gender_F Gender_M Marital status_Civil marriage \


0 1.00 0.00 0.00
1 1.00 0.00 1.00
2 1.00 0.00 0.00
3 1.00 0.00 0.00
4 1.00 0.00 0.00
… … … …
11649 0.00 1.00 0.00
11650 1.00 0.00 0.00
11651 1.00 0.00 0.00
11652 0.00 1.00 0.00
11653 1.00 0.00 0.00

Marital status_Married Marital status_Separated \


0 0.00 1.00
1 0.00 0.00
2 1.00 0.00
3 1.00 0.00
4 1.00 0.00
… … …
11649 1.00 0.00
11650 1.00 0.00
11651 1.00 0.00
11652 1.00 0.00
11653 1.00 0.00

Marital status_Single / not married Marital status_Widow \


0 0.00 0.00
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00
… … …
11649 0.00 0.00
11650 0.00 0.00
11651 0.00 0.00
11652 0.00 0.00
11653 0.00 0.00

73
Dwelling_Co-op apartment Dwelling_House / apartment \
0 0.00 1.00
1 0.00 1.00
2 0.00 1.00
3 0.00 0.00
4 0.00 1.00
… … …
11649 0.00 1.00
11650 0.00 1.00
11651 0.00 1.00
11652 0.00 1.00
11653 0.00 0.03

Dwelling_Municipal apartment Dwelling_Office apartment \


0 0.00 0.00
1 0.00 0.00
2 0.00 0.00
3 1.00 0.00
4 0.00 0.00
… … …
11649 0.00 0.00
11650 0.00 0.00
11651 0.00 0.00
11652 0.00 0.00
11653 0.97 0.00

Dwelling_Rented apartment Dwelling_With parents \


0 0.00 0.00
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00
… … …
11649 0.00 0.00
11650 0.00 0.00
11651 0.00 0.00
11652 0.00 0.00
11653 0.00 0.00

Employment status_Commercial associate Employment status_Pensioner \


0 0.00 0.00
1 0.00 0.00
2 1.00 0.00
3 0.00 0.00
4 0.00 0.00
… … …

74
11649 0.88 0.00
11650 0.00 0.00
11651 0.00 0.00
11652 0.65 0.00
11653 0.03 0.00

Employment status_State servant Employment status_Student \


0 1.00 0.00
1 1.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00
… … …
11649 0.00 0.00
11650 0.00 0.00
11651 0.00 0.00
11652 0.00 0.00
11653 0.00 0.00

Employment status_Working Has a car_N Has a car_Y Has a property_N \


0 0.00 1.00 0.00 0.00
1 0.00 1.00 0.00 1.00
2 0.00 0.00 1.00 0.00
3 1.00 0.00 1.00 1.00
4 1.00 1.00 0.00 0.00
… … … … …
11649 0.12 1.00 0.00 0.00
11650 1.00 1.00 0.00 1.00
11651 1.00 1.00 0.00 0.00
11652 0.35 0.00 1.00 0.00
11653 0.97 0.97 0.03 1.00

Has a property_Y Has a work phone_N Has a work phone_Y \


0 1.00 1.00 0.00
1 0.00 1.00 0.00
2 1.00 1.00 0.00
3 0.00 1.00 0.00
4 1.00 1.00 0.00
… … … …
11649 1.00 1.00 0.00
11650 0.00 0.00 1.00
11651 1.00 1.00 0.00
11652 1.00 1.00 0.00
11653 0.00 0.00 1.00

Has a phone_N Has a phone_Y Has an email_N Has an email_Y \


0 1.00 0.00 1.00 0.00

75
1 1.00 0.00 0.00 1.00
2 1.00 0.00 1.00 0.00
3 1.00 0.00 1.00 0.00
4 1.00 0.00 1.00 0.00
… … … … …
11649 1.00 0.00 1.00 0.00
11650 0.95 0.05 1.00 0.00
11651 1.00 0.00 0.87 0.13
11652 1.00 0.00 0.35 0.65
11653 0.97 0.03 1.00 0.00

Annual income Education level Age Employment length \


0 0.56 4.00 0.64 0.15
1 0.61 1.00 0.89 0.47
2 0.90 1.00 0.38 0.04
3 0.60 4.00 0.36 0.06
4 0.30 4.00 0.70 0.05
… … … … …
11649 0.65 4.00 0.41 0.22
11650 0.36 4.00 0.22 0.05
11651 0.39 1.00 0.43 0.32
11652 0.53 1.00 0.59 0.18
11653 0.36 2.00 0.55 0.22

Family member count Is High Risk


0 1.00 0
1 2.00 0
2 2.00 0
3 2.00 0
4 2.00 0
… … …
11649 2.00 1
11650 3.00 1
11651 2.00 1
11652 2.65 1
11653 4.97 1

[11654 rows x 34 columns]

[102]: X_test, y_test = df_test_prep.loc[:, df_test_prep.columns != "Is High Risk"],␣


↪df_test_prep["Is High Risk"]

[103]: X_train.head()

[103]: Gender_F Gender_M Marital status_Civil marriage Marital status_Married \


0 0.00 1.00 0.00 1.00
1 1.00 0.00 0.00 0.00

76
2 1.00 0.00 0.00 1.00
3 1.00 0.00 0.00 0.00
4 1.00 0.00 0.00 0.00

Marital status_Separated Marital status_Single / not married \


0 0.00 0.00
1 0.00 1.00
2 0.00 0.00
3 0.00 1.00
4 1.00 0.00

Marital status_Widow Dwelling_Co-op apartment Dwelling_House / apartment \


0 0.00 0.00 0.00
1 0.00 0.00 1.00
2 0.00 0.00 1.00
3 0.00 0.00 1.00
4 0.00 0.00 1.00

Dwelling_Municipal apartment Dwelling_Office apartment \


0 0.00 0.00
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00

Dwelling_Rented apartment Dwelling_With parents \


0 0.00 1.00
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00

Employment status_Commercial associate Employment status_Pensioner \


0 0.00 0.00
1 1.00 0.00
2 1.00 0.00
3 1.00 0.00
4 0.00 0.00

Employment status_State servant Employment status_Student \


0 0.00 0.00
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00

Employment status_Working Has a car_N Has a car_Y Has a property_N \

77
0 1.00 0.00 1.00 0.00
1 0.00 0.00 1.00 1.00
2 0.00 1.00 0.00 0.00
3 0.00 0.00 1.00 0.00
4 1.00 1.00 0.00 1.00

Has a property_Y Has a work phone_N Has a work phone_Y Has a phone_N \
0 1.00 1.00 0.00 1.00
1 0.00 1.00 0.00 1.00
2 1.00 1.00 0.00 1.00
3 1.00 1.00 0.00 1.00
4 0.00 1.00 0.00 0.00

Has a phone_Y Has an email_N Has an email_Y Annual income \


0 0.00 1.00 0.00 0.42
1 0.00 1.00 0.00 0.42
2 0.00 1.00 0.00 0.52
3 0.00 0.00 1.00 0.81
4 1.00 1.00 0.00 0.68

Education level Age Employment length Family member count


0 4.00 0.60 0.27 2.00
1 1.00 0.20 0.14 2.00
2 4.00 0.39 0.50 4.00
3 1.00 0.84 0.18 1.00
4 4.00 0.60 0.04 1.00

6 Model Training
[104]: classifiers = {
'sgd':SGDClassifier(random_state=42,loss='perceptron'),
'logistic_regression':LogisticRegression(random_state=42,max_iter=1000),
'decision_tree':DecisionTreeClassifier(random_state=42),
'random_forest':RandomForestClassifier(random_state=42),
'k_nearest_neighbors':KNeighborsClassifier(),
'gradient_boosting':GradientBoostingClassifier(random_state=42)
}

[105]: # Set the option for displaying float values


pd.set_option('display.float_format', lambda x: '%.3f' % x)

for i in range(len(list(classifiers))):
model = list(classifiers.values())[i]
model.fit(X_train, y_train)
# make predictions
y_train_pred = model.predict(X_train)

78
y_test_pred = model.predict(X_test)
# training set performance
model_train_accuracy = accuracy_score(y_train, y_train_pred)
model_train_f1 = f1_score(y_train, y_train_pred, average="weighted")
model_train_precision = precision_score(y_train, y_train_pred)
model_train_recall = recall_score(y_train, y_train_pred)
# test set performance
model_test_accuracy = accuracy_score(y_test, y_test_pred)
model_test_f1 = f1_score(y_test, y_test_pred, average="weighted")
model_test_precision = precision_score(y_test, y_test_pred)
model_test_recall = recall_score(y_test, y_test_pred)

print(list(classifiers.keys())[i])
print("Model performance for training")
print(f"- Accuracy: {model_train_accuracy:.3f}")
print(f"- F1 score: {model_train_f1:.3f}")
print(f"- Precision: {model_train_precision:.3f}")
print(f"- Recall: {model_train_recall:.3f}")

print("-" * 20)
print("Model performance for testing")
print(f"- Accuracy: {model_test_accuracy:.3f}")
print(f"- F1 score: {model_test_f1:.3f}")
print(f"- Precision: {model_test_precision:.3f}")
print(f"- Recall: {model_test_recall:.3f}")
print("=" * 35)
print('\n')

sgd
Model performance for training
- Accuracy: 0.542
- F1 score: 0.439
- Precision: 0.523
- Recall: 0.970
--------------------
Model performance for testing
- Accuracy: 0.531
- F1 score: 0.429
- Precision: 0.517
- Recall: 0.953
===================================

logistic_regression
Model performance for training
- Accuracy: 0.589
- F1 score: 0.588

79
- Precision: 0.584
- Recall: 0.615
--------------------
Model performance for testing
- Accuracy: 0.558
- F1 score: 0.558
- Precision: 0.558
- Recall: 0.556
===================================

decision_tree
Model performance for training
- Accuracy: 0.994
- F1 score: 0.994
- Precision: 0.993
- Recall: 0.995
--------------------
Model performance for testing
- Accuracy: 0.836
- F1 score: 0.835
- Precision: 0.900
- Recall: 0.756
===================================

random_forest
Model performance for training
- Accuracy: 0.994
- F1 score: 0.994
- Precision: 0.992
- Recall: 0.995
--------------------
Model performance for testing
- Accuracy: 0.890
- F1 score: 0.889
- Precision: 0.979
- Recall: 0.798
===================================

k_nearest_neighbors
Model performance for training
- Accuracy: 0.956
- F1 score: 0.956
- Precision: 0.980
- Recall: 0.930
--------------------

80
Model performance for testing
- Accuracy: 0.754
- F1 score: 0.742
- Precision: 0.944
- Recall: 0.539
===================================

gradient_boosting
Model performance for training
- Accuracy: 0.904
- F1 score: 0.904
- Precision: 0.900
- Recall: 0.909
--------------------
Model performance for testing
- Accuracy: 0.843
- F1 score: 0.842
- Precision: 0.794
- Recall: 0.927
===================================

SGD Classifier
[106]: model = SGDClassifier(random_state=42,loss='perceptron')
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
report = classification_report(y_test, y_test_pred)
print("\033[1mSGD Classification Report\033[0m")
print(report)
print("="*50)

disp = ConfusionMatrixDisplay.from_estimator(model, X_test,␣


↪y_test,cmap='Blues',values_format='d')

plt.grid(visible=None)
plt.title("Confusion Matrix")
plt.show()
#remove the grid
print('\n')
print("="*50)
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=None)
plt.show()

SGD Classification Report


precision recall f1-score support

81
0 0.70 0.11 0.19 5827
1 0.52 0.95 0.67 5827

accuracy 0.53 11654


macro avg 0.61 0.53 0.43 11654
weighted avg 0.61 0.53 0.43 11654

==================================================

Confusion Matrix

5000

0 629 5198
4000
True label

3000

2000
1 272 5555

1000

0 1
Predicted label

==================================================

82
1.0

0.8
True Positive Rate (Positive label: 1)

0.6

0.4

0.2

0.0 SGDClassifier (AUC = 0.55)


0.0 0.2 0.4 0.6 0.8 1.0
False Positive Rate (Positive label: 1)

Gradient Boosting
[107]: model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
report = classification_report(y_test, y_test_pred)
print("\033[1mGradient Boosting Classification Report\033[0m")
print(report)
print("="*50)

disp = ConfusionMatrixDisplay.from_estimator(model, X_test,␣


↪y_test,cmap='Blues',values_format='d')

plt.grid(visible=None)
plt.title("Confusion Matrix")
plt.show()
#remove the grid
print('\n')
print("="*50)
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=None)
plt.show()

Gradient Boosting Classification Report

83
precision recall f1-score support

0 0.91 0.76 0.83 5827


1 0.79 0.93 0.86 5827

accuracy 0.84 11654


macro avg 0.85 0.84 0.84 11654
weighted avg 0.85 0.84 0.84 11654

==================================================

Confusion Matrix

5000

0 4421 1406
4000
True label

3000

2000
1 423 5404

1000

0 1
Predicted label

==================================================

84
1.0

0.8
True Positive Rate (Positive label: 1)

0.6

0.4

0.2

0.0 GradientBoostingClassifier (AUC = 0.95)


0.0 0.2 0.4 0.6 0.8 1.0
False Positive Rate (Positive label: 1)

KNeighbors
[108]: model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
report = classification_report(y_test, y_test_pred)
print("\033[1mKNeighbors Classification Report\033[0m")
print(report)
print("="*50)

disp = ConfusionMatrixDisplay.from_estimator(model, X_test,␣


↪y_test,cmap='Blues',values_format='d')

plt.grid(visible=None)
plt.title("Confusion Matrix")
plt.show()
#remove the grid
print('\n')
print("="*50)
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=None)
plt.show()

KNeighbors Classification Report

85
precision recall f1-score support

0 0.68 0.97 0.80 5827


1 0.94 0.54 0.69 5827

accuracy 0.75 11654


macro avg 0.81 0.75 0.74 11654
weighted avg 0.81 0.75 0.74 11654

==================================================

Confusion Matrix

5000

0 5641 186
4000
True label

3000

2000
1 2686 3141
1000

0 1
Predicted label

==================================================

86
1.0

0.8
True Positive Rate (Positive label: 1)

0.6

0.4

0.2

0.0 KNeighborsClassifier (AUC = 0.78)


0.0 0.2 0.4 0.6 0.8 1.0
False Positive Rate (Positive label: 1)

Random Forest Classifier


[109]: model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
report = classification_report(y_test, y_test_pred)
print("\033[1mRandom Forest Classification Report\033[0m")
print(report)
print("="*50)

disp = ConfusionMatrixDisplay.from_estimator(model, X_test,␣


↪y_test,cmap='Blues',values_format='d')

plt.grid(visible=None)
plt.title("Confusion Matrix")
plt.show()
#remove the grid
print('\n')
print("="*50)
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=None)
plt.show()

Random Forest Classification Report

87
precision recall f1-score support

0 0.83 0.98 0.90 5827


1 0.98 0.80 0.88 5827

accuracy 0.89 11654


macro avg 0.90 0.89 0.89 11654
weighted avg 0.90 0.89 0.89 11654

==================================================

Confusion Matrix

5000

0 5726 101
4000
True label

3000

2000
1 1179 4648
1000

0 1
Predicted label

==================================================

88
1.0

0.8
True Positive Rate (Positive label: 1)

0.6

0.4

0.2

0.0 RandomForestClassifier (AUC = 0.98)


0.0 0.2 0.4 0.6 0.8 1.0
False Positive Rate (Positive label: 1)

[110]: rfc = RandomForestClassifier(random_state=42)


Knb = KNeighborsClassifier()
sgd = SGDClassifier(random_state=42,loss='perceptron')
gradient = GradientBoostingClassifier(random_state=42)
rfc.fit(X_train, y_train)
Knb.fit(X_train, y_train)
sgd.fit(X_train, y_train)
gradient.fit(X_train, y_train)
ax = plt.gca()
rfc_disp = RocCurveDisplay.from_estimator(rfc, X_test, y_test, ax=ax, alpha=0.8)
Knb_disp = RocCurveDisplay.from_estimator(Knb, X_test, y_test, ax=ax, alpha=0.8)
sgd_disp = RocCurveDisplay.from_estimator(sgd, X_test, y_test, ax=ax, alpha=0.8)
gradient_disp = RocCurveDisplay.from_estimator(gradient, X_test, y_test, ax=ax,␣
↪alpha=0.8)

Knb_disp.plot(ax=ax, alpha=0.8)
sgd_disp.plot(ax=ax, alpha=0.8)
gradient_disp.plot(ax=ax, alpha=0.8)

[110]: <sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x7ba6b9880ac0>

89
1.0

0.8
True Positive Rate (Positive label: 1)

0.6

0.4
RandomForestClassifier (AUC = 0.98)
KNeighborsClassifier (AUC = 0.78)
0.2 SGDClassifier (AUC = 0.55)
GradientBoostingClassifier (AUC = 0.95)
KNeighborsClassifier (AUC = 0.78)
SGDClassifier (AUC = 0.55)
0.0 GradientBoostingClassifier (AUC = 0.95)
0.0 0.2 0.4 0.6 0.8 1.0
False Positive Rate (Positive label: 1)

[114]: from google.colab import drive


drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call


drive.mount("/content/drive", force_remount=True).

[ ]: !jupyter nbconvert --to html /content/KNN.ipynb

90

You might also like