import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, RocCurveDisplay, roc_curve
import joblib
import os
%matplotlib inline
print(data_record.info())
print(credit_record.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 438557 non-null int64
1 CODE_GENDER 438557 non-null object
2 FLAG_OWN_CAR 438557 non-null object
3 FLAG_OWN_REALTY 438557 non-null object
4 CNT_CHILDREN 438557 non-null int64
5 AMT_INCOME_TOTAL 438557 non-null float64
6 NAME_INCOME_TYPE 438557 non-null object
7 NAME_EDUCATION_TYPE 438557 non-null object
8 NAME_FAMILY_STATUS 438557 non-null object
9 NAME_HOUSING_TYPE 438557 non-null object
10 DAYS_BIRTH 438557 non-null int64
11 DAYS_EMPLOYED 438557 non-null int64
12 FLAG_MOBIL 438557 non-null int64
13 FLAG_WORK_PHONE 438557 non-null int64
14 FLAG_PHONE 438557 non-null int64
15 FLAG_EMAIL 438557 non-null int64
16 OCCUPATION_TYPE 304354 non-null object
17 CNT_FAM_MEMBERS 438557 non-null float64
dtypes: float64(2), int64(8), object(8)
memory usage: 60.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 1048575 non-null int64
1 MONTHS_BALANCE 1048575 non-null int64
2 STATUS 1048575 non-null object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB
None
[4]: data_record.head()
CNT_FAM_MEMBERS
0 2.0
1 2.0
2 2.0
3 1.0
4 1.0
[5]: credit_record.head()
#MONTHS_BALANCE: month of the record relative to extraction (0 = current month, -1 = previous month, ...)
#STATUS codes:
#0: 1-29 days past due
#1: 30-59 days past due
#2: 60-89 days past due
#3: 90-119 days past due
#4: 120-149 days past due
#5: overdue or bad debt, written off for more than 150 days
#C: paid off that month (debt paid in full)
#X: no loan for the month
[7]: begin_month
[7]: MONTHS_BALANCE
ID
5001711 -3
5001712 -18
5001713 -21
5001714 -14
5001715 -59
… …
5150482 -28
5150483 -17
5150484 -12
5150485 -1
5150487 -29
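The `begin_month` table above (the opening month of each account, i.e. the earliest `MONTHS_BALANCE` per `ID`) can be reproduced with a groupby. The cell that built it is not shown, so this is a sketch on toy data:

```python
import pandas as pd

# Toy stand-in for credit_record: one row per (ID, month), with months counted
# backwards from the extraction month (0 = current, -1 = last month, ...).
credit_record = pd.DataFrame({
    "ID": [5001711, 5001711, 5001712, 5001712, 5001712],
    "MONTHS_BALANCE": [0, -3, -1, -10, -18],
    "STATUS": ["C", "0", "C", "1", "X"],
})

# The most negative MONTHS_BALANCE per ID marks when the account was opened.
begin_month = credit_record.groupby("ID")[["MONTHS_BALANCE"]].min()
print(begin_month)
```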
[9]: data_record.head()
0 427500.0 Working Higher education
1 427500.0 Working Higher education
2 112500.0 Working Secondary / secondary special
3 270000.0 Commercial associate Secondary / secondary special
4 270000.0 Commercial associate Secondary / secondary special
CNT_FAM_MEMBERS MONTHS_BALANCE
0 2.0 -15.0
1 2.0 -14.0
2 2.0 -29.0
3 1.0 -4.0
4 1.0 -26.0
[10]: 4 C
5 C
6 C
7 C
8 C
9 C
10 C
11 C
12 C
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
Name: STATUS, dtype: object
<ipython-input-11-5fce52290614>:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
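The warning above comes from assigning through chained indexing, where pandas cannot tell whether the target is a view or a copy. A minimal illustration (toy data, not the original cell) of the warning-free patterns:

```python
import pandas as pd

df = pd.DataFrame({"STATUS": ["C", "0", "2"], "ID": [1, 2, 3]})

# Chained indexing like df[df["ID"] > 1]["STATUS"] = ... triggers
# SettingWithCopyWarning because the assignment may act on a temporary copy.

# 1) a single .loc call writes to the original frame
df.loc[df["ID"] > 1, "STATUS"] = "C"

# 2) an explicit .copy() when an independent slice is intended
subset = df[df["ID"] > 1].copy()
subset["STATUS"] = "X"  # modifies only the copy

print(df["STATUS"].tolist())      # ['C', 'C', 'C']
print(subset["STATUS"].tolist())  # ['X', 'X']
```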
[12]: data_record.columns
[14]: data_record.head()
[14]: ID Gender Has a car Has a property Children count Annual income \
0 5008804 M Y Y 0 427500.0
1 5008805 M Y Y 0 427500.0
2 5008806 M Y Y 0 112500.0
3 5008808 F N Y 0 270000.0
4 5008809 F N Y 0 270000.0
Dwelling Age Employment length Has a mobile phone \
0 Rented apartment -12005 -4542 1
1 Rented apartment -12005 -4542 1
2 House / apartment -21474 -1134 1
3 House / apartment -19110 -3051 1
4 House / apartment -19110 -3051 1
[17]: data_record_train_original.shape
[18]: data_record_test_original.shape
[19]: # Work on copies so the original data stays untouched
data_record_train_copy = data_record_train_original.copy()
data_record_test_copy = data_record_test_original.copy()
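The cell that split `data_record` into `data_record_train_original` and `data_record_test_original` is not shown; it was presumably an 80/20 split (e.g. sklearn's `train_test_split(..., test_size=0.2, random_state=...)`). A dependency-light sketch of the same idea with a seeded sample:

```python
import pandas as pd

# Hypothetical stand-in for data_record; column names are illustrative only.
data_record = pd.DataFrame({"ID": range(10), "Age": range(20, 30)})

# 80% of rows go to train, the remainder to test; the seed makes it repeatable.
train = data_record.sample(frac=0.8, random_state=42)
test = data_record.drop(train.index)

print(train.shape, test.shape)  # (8, 2) (2, 2)
```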
2 Automated Data Functions
[20]: data_record.head()
#Age and Employment length are counted in days; Account age is counted in months
[20]: ID Gender Has a car Has a property Children count Annual income \
0 5008804 M Y Y 0 427500.0
1 5008805 M Y Y 0 427500.0
2 5008806 M Y Y 0 112500.0
3 5008808 F N Y 0 270000.0
4 5008809 F N Y 0 270000.0
[21]: data_record.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 36457 entries, 0 to 36456
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 36457 non-null int64
1 Gender 36457 non-null object
2 Has a car 36457 non-null object
3 Has a property 36457 non-null object
4 Children count 36457 non-null int64
5 Annual income 36457 non-null float64
6 Employment status 36457 non-null object
7 Education level 36457 non-null object
8 Marital status 36457 non-null object
9 Dwelling 36457 non-null object
10 Age 36457 non-null int64
11 Employment length 36457 non-null int64
12 Has a mobile phone 36457 non-null int64
13 Has a work phone 36457 non-null int64
14 Has a phone 36457 non-null int64
15 Has an email 36457 non-null int64
16 Job title 25134 non-null object
17 Family member count 36457 non-null float64
18 Account age 36457 non-null float64
19 Is High Risk 36457 non-null object
dtypes: float64(3), int64(8), object(9)
memory usage: 5.8+ MB
[22]: data_record.describe()
Employment length Has a mobile phone Has a work phone Has a phone \
count 36457.000000 36457.0 36457.000000 36457.000000
mean 59262.935568 1.0 0.225526 0.294813
std 137651.334859 0.0 0.417934 0.455965
min -15713.000000 1.0 0.000000 0.000000
25% -3153.000000 1.0 0.000000 0.000000
50% -1552.000000 1.0 0.000000 0.000000
75% -408.000000 1.0 0.000000 1.000000
max 365243.000000 1.0 1.000000 1.000000
[remaining describe() columns (ID, Children count, Annual income, Age, Has an email, Family member count, Account age) omitted in export]

[23]: # Checking missing values
      msno.matrix(data_record)
      plt.show()

[24]: msno.bar(data_record)
      plt.show()

[missingno matrix and bar charts: every column is complete (36457 non-null) except Job title (25134 non-null)]
[25]: # Task 1: Convert the day counts in Age and Employment length, and the month counts in Account age, to positive values.
      def val_count_and_fre(df, feature):
          val_count = df[feature].value_counts()
          val_fre = df[feature].value_counts(normalize=True) * 100
          count_and_fre = pd.concat([val_count, val_fre], axis=1)
          count_and_fre.columns = ["Counts", "Frequency (%)"]
          return count_and_fre

[26]: # describe a feature, converting day/month offsets to positive years/months first
      def get_info_feat(df, feature):
          match feature:
              case "Age":
                  print("Description:\n{}".format(np.abs(df[feature] / 365.25).describe()))
                  print("-" * 50)
                  print("Object type:{}".format(df[feature].dtype))
              case "Employment length":
                  employed = data_record_train_copy["Employment length"][data_record_train_copy["Employment length"] < 0]
                  employed_year = np.abs(employed / 365.25)
                  print("Description:\n{}".format(employed_year.describe()))
                  print("-" * 50)
                  print("Object type:{}".format(employed_year.dtype))
              case "Account age":
                  # .format used here since the f-string would need the newline escaped
                  print("Description:\n{}".format(np.abs(df[feature]).describe()))
                  print("-" * 50)
                  print("Object type:{}".format(df[feature].dtype))
              case _:
                  print("Description:\n{}".format(df[feature].describe()))
                  print("-" * 50)
                  print("Object type:{}".format(df[feature].dtype))
          print("-" * 50)
          value_count = val_count_and_fre(df, feature)
          print("Value Counts:\n{}".format(value_count))
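For example, on a toy frame the `val_count_and_fre` helper returns counts and percentage shares side by side:

```python
import pandas as pd

def val_count_and_fre(df, feature):
    # counts and percentage share of each category, side by side
    val_count = df[feature].value_counts()
    val_fre = df[feature].value_counts(normalize=True) * 100
    count_and_fre = pd.concat([val_count, val_fre], axis=1)
    count_and_fre.columns = ["Counts", "Frequency (%)"]
    return count_and_fre

toy = pd.DataFrame({"Gender": ["F", "F", "F", "M"]})
out = val_count_and_fre(toy, "Gender")
print(out)  # F: 3 rows (75%), M: 1 row (25%)
```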
[27]: # function to create a pie chart
      def plot_pie_chart(df, feature):
          match feature:
              # Dwelling has several thin slices, so its labels go into a legend
              case "Dwelling":
                  ratio_size = val_count_and_fre(df, feature)
                  ratio_list = []
                  for i in range(len(ratio_size.index)):
                      ratio_list.append(ratio_size.iloc[i]["Frequency (%)"])
                  fig, ax = plt.subplots(figsize=(8, 8))
                  plt.pie(ratio_list, startangle=90, wedgeprops={"edgecolor": "black"})
                  plt.title(f"Pie chart of {feature}")
                  plt.legend(loc="best", labels=ratio_size.index)
                  plt.axis("equal")
                  return plt.show()
              case _:
                  ratio_size = val_count_and_fre(df, feature)
                  ratio_size_len = len(ratio_size.index)
                  ratio_list = []
                  for i in range(ratio_size_len):
                      ratio_list.append(ratio_size.iloc[i]["Frequency (%)"])
                  fig, ax = plt.subplots(figsize=(8, 8))
                  # %1.2f%% displays slice percentages with 2 decimal places
                  plt.pie(ratio_list, labels=ratio_size.index, startangle=90, autopct="%1.2f%%", wedgeprops={"edgecolor": "black"})
                  plt.title(f"Pie chart of {feature}")
                  plt.axis("equal")
                  return plt.show()

[28]: # function to create a bar chart
      def plot_bar_chart(df, feature):
          match feature:
              # features with long category names get rotated tick labels
              case "Marital status" | "Dwelling" | "Employment status" | "Education level" | "Job title":
                  fig, ax = plt.subplots(figsize=(6, 10))
                  sns.barplot(x=val_count_and_fre(df, feature).index, y=val_count_and_fre(df, feature).values[:, 0])
                  ax.set_xticklabels(labels=val_count_and_fre(df, feature).index, rotation=45, ha="right")  # ha: horizontal alignment
                  plt.xlabel(f"{feature}")
                  plt.ylabel("Count")
                  plt.title(f"{feature} count")
                  return plt.show()
              case _:
                  fig, ax = plt.subplots(figsize=(6, 10))
                  sns.barplot(x=val_count_and_fre(df, feature).index, y=val_count_and_fre(df, feature).values[:, 0])
                  plt.xlabel(f"{feature}")
                  plt.ylabel("Count")
                  plt.title(f"{feature} count")
                  return plt.show()
[29]: # function to create a box plot
      def create_box_plot(df, feature):
          match feature:
              case "Age":
                  fig, ax = plt.subplots(figsize=(2, 8))
                  sns.boxplot(y=np.abs(df[feature] / 365.25))
                  plt.title(f"Distribution(Boxplot) of {feature}")
                  return plt.show()
              case "Children count":
                  fig, ax = plt.subplots(figsize=(2, 8))
                  sns.boxplot(y=df[feature])
                  plt.title(f"Distribution(Boxplot) of {feature}")
                  plt.yticks(np.arange(0, df[feature].max(), 1))  # arange(start, end, step)
                  return plt.show()
              case "Employment length":
                  fig, ax = plt.subplots(figsize=(2, 8))
                  employed = data_record_train_copy["Employment length"][data_record_train_copy["Employment length"] < 0]
                  employed_year = np.abs(employed) / 365.25
                  sns.boxplot(y=employed_year)
                  plt.title(f"Distribution(Boxplot) of {feature}")
                  plt.yticks(np.arange(0, employed_year.max(), 2))
                  return plt.show()
              case "Annual income":
                  fig, ax = plt.subplots(figsize=(2, 8))
                  sns.boxplot(y=df[feature])
                  plt.title(f"Distribution(Boxplot) of {feature}")
                  # suppress scientific notation
                  ax.get_yaxis().set_major_formatter(
                      matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ",")))
                  return plt.show()
              case "Account age":
                  fig, ax = plt.subplots(figsize=(2, 8))
                  sns.boxplot(y=np.abs(df[feature]))
                  plt.title(f"Distribution(Boxplot) of {feature}")
                  return plt.show()
              case _:
                  fig, ax = plt.subplots(figsize=(2, 8))
                  sns.boxplot(y=df[feature])
                  plt.title(f"Distribution(Boxplot) of {feature}")
                  return plt.show()
[30]: # function to create a histogram
      def create_hist_plot(df, feature, n_bins="auto"):
          match feature:
              case "Age":
                  fig, ax = plt.subplots(figsize=(18, 10))
                  # kde: estimate of the probability density of the underlying distribution
                  sns.histplot(np.abs(df[feature]) / 365.25, bins=n_bins, kde=True)
                  plt.title(f"Distribution(Histogram) of {feature}")
                  return plt.show()
              case "Annual income":
                  fig, ax = plt.subplots(figsize=(18, 10))
                  sns.histplot(df[feature], bins=n_bins, kde=True)
                  plt.title(f"Distribution(Histogram) of {feature}")
                  ax.get_yaxis().set_major_formatter(
                      matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ",")))
                  return plt.show()
              case "Employment length":
                  fig, ax = plt.subplots(figsize=(18, 10))
                  employed = data_record_train_copy["Employment length"][data_record_train_copy["Employment length"] < 0]
                  employed_year = np.abs(employed) / 365.25
                  sns.histplot(employed_year, bins=n_bins, kde=True)
                  plt.title(f"Distribution(Histogram) of {feature}")
                  return plt.show()
              case "Account age":
                  fig, ax = plt.subplots(figsize=(18, 10))
                  sns.histplot(np.abs(df[feature]), bins=n_bins, kde=True)
                  plt.title(f"Distribution(Histogram) of {feature}")
                  return plt.show()
              case _:
                  fig, ax = plt.subplots(figsize=(18, 10))
                  sns.histplot(df[feature], bins=n_bins, kde=True)
                  plt.title(f"Distribution(Histogram) of {feature}")
                  return plt.show()
# function to compare a feature's distribution across the two target classes
def hl_box_plot(df, feature):
    match feature:
        case "Employment length":
            employed = data_record_train_copy["Employment length"][data_record_train_copy["Employment length"] < 0]
            employed_year = np.abs(employed) / 365.25
            employed_year_df = data_record_train_copy.loc[employed_year.index][["Employment length", "Is High Risk"]]
            employed_year_is_high_risk = employed_year_df.groupby("Is High Risk")["Employment length"].mean()
            print(np.abs(employed_year_is_high_risk / 365.25))
            fig, ax = plt.subplots(figsize=(5, 8))
            sns.boxplot(y=employed_year, x=df["Is High Risk"])
            plt.xticks(ticks=[0, 1], labels=["No", "Yes"])
            plt.title(f"High risk individuals grouped by {feature}")
            return plt.show()
        case "Annual income":
            print(np.abs(df.groupby("Is High Risk")[feature].mean()))
            fig, ax = plt.subplots(figsize=(5, 8))
            sns.boxplot(y=np.abs(df[feature]), x=df["Is High Risk"])
            plt.xticks(ticks=[0, 1], labels=["No", "Yes"])
            # suppress scientific notation
            ax.get_yaxis().set_major_formatter(
                matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ",")))
            plt.title("High risk individuals grouped by {}".format(feature))
            return plt.show()
        case _:
            print(np.abs(df.groupby("Is High Risk")[feature].mean()))
            fig, ax = plt.subplots(figsize=(5, 8))
            sns.boxplot(y=np.abs(df[feature]), x=df["Is High Risk"])
            plt.xticks(ticks=[0, 1], labels=["No", "Yes"])
            plt.title(f"High risk individuals grouped by {feature}")
            return plt.show()
[33]: # bar chart of high-risk counts per category
      def hl_bar_plot(df, feature):
          df["Is High Risk"] = df["Is High Risk"].map({"Yes": 1, "No": 0})
          # summing counts the high-risk ("Yes") rows per category
          group_high_risk = df.groupby(feature)["Is High Risk"].sum()
          sort_gr_h_risk = group_high_risk.sort_values(ascending=False)
          print(dict(sort_gr_h_risk))
          fig, ax = plt.subplots(figsize=(8, 15))
          sns.barplot(x=sort_gr_h_risk.index, y=sort_gr_h_risk.values)
          ax.set_xticklabels(labels=sort_gr_h_risk.index, rotation=45, ha="right")  # ha: horizontal alignment
          plt.ylabel("Count")
          plt.title(f"High risk group of {feature}")
          return plt.show()
[34]: data_record_train_copy.columns
3 Explore data
4 Univariate analysis
1. Gender
[35]: get_info_feat(data_record_train_copy, "Gender")
Description:
count 29165
unique 2
top F
freq 19549
Name: Gender, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
F 19549 67.028973
M 9616 32.971027
[Pie chart of Gender: F 67.03%, M 32.97%]

[Bar chart: Gender count]
Most of our observed individuals are women (67.03%).
2. Age
[38]: get_info_feat(data_record_train_copy, "Age")
Description:
count 29165.000000
mean 43.749425
std 11.507180
min 21.095140
25% 34.154689
50% 42.614648
75% 53.234771
max 68.862423
Name: Age, dtype: float64
--------------------------------------------------
Object type:int64
[Boxplot: Distribution of Age]
[Histogram: Distribution of Age]
Is High Risk
No 43.753103
Yes 43.538148
Name: Age, dtype: float64
[Boxplot: High risk individuals grouped by Age]
3. Marital status
Description:
count 29165
unique 5
top Married
freq 20044
Name: Marital status, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Married 20044 68.726213
Single / not married 3864 13.248757
Civil marriage 2312 7.927310
Separated 1712 5.870050
Widow 1233 4.227670
[Pie chart of Marital status: Married 68.73%, Single / not married 13.25%, Civil marriage 7.93%, Separated 5.87%, Widow 4.23%]
[44]: plot_bar_chart(data_record_train_copy, "Marital status")
[Bar chart: Marital status count]
[45]: hl_bar_plot(data_record_train_copy, "Marital status")
{'Married': 320, 'Single / not married': 87, 'Civil marriage': 34, 'Widow': 34,
'Separated': 24}
<ipython-input-33-6f446414d3e9>:10: UserWarning: FixedFormatter should only be
used together with FixedLocator
ax.set_xticklabels(labels=sort_gr_h_risk.index, rotation=45, ha="right") #ha:
horizontal alignment
[Bar chart: High risk group of Marital status]
• A quick glance at the Marital status data shows that most of our applicants are married.
• There are more Separated applicants than Widowed ones, yet the Widowed group appears riskier than the Separated group.
4. Family member count
[46]: data_record_train_copy.columns
Description:
count 29165.000000
mean 2.197531
std 0.912189
min 1.000000
25% 2.000000
50% 2.000000
75% 3.000000
max 20.000000
Name: Family member count, dtype: float64
--------------------------------------------------
Object type:float64
--------------------------------------------------
Value Counts:
Counts Frequency (%)
2.0 15552 53.324190
1.0 5613 19.245671
3.0 5121 17.558718
4.0 2503 8.582205
5.0 309 1.059489
6.0 48 0.164581
7.0 14 0.048003
9.0 2 0.006858
15.0 2 0.006858
20.0 1 0.003429
[Boxplot: Distribution of Family member count]
Is High Risk
0 2.198284
1 2.154309
Name: Family member count, dtype: float64
[Boxplot: High risk individuals grouped by Family member count]
[Bar chart: Family member count]
5. Children count
[51]: get_info_feat(data_record_train_copy, "Children count")
Description:
count 29165.000000
mean 0.430790
std 0.741882
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 19.000000
Name: Children count, dtype: float64
--------------------------------------------------
Object type:int64
--------------------------------------------------
Value Counts:
Counts Frequency (%)
0 20143 69.065661
1 6003 20.582890
2 2624 8.997086
3 323 1.107492
4 52 0.178296
5 15 0.051432
7 2 0.006858
14 2 0.006858
19 1 0.003429
[Boxplot: Distribution of Children count]
[Bar chart: Children count]
6. Dwelling type
[54]: get_info_feat(data_record_train_copy, "Dwelling")
Description:
count 29165
unique 6
top House / apartment
freq 26059
Name: Dwelling, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
House / apartment 26059 89.350249
With parents 1406 4.820847
Municipal apartment 912 3.127036
Rented apartment 453 1.553232
Office apartment 208 0.713184
Co-op apartment 127 0.435453
[Pie chart of Dwelling, with legend: House / apartment, With parents, Municipal apartment, Rented apartment, Office apartment, Co-op apartment]
[Bar chart: Dwelling count]
Most applicants live in a house or apartment.
7. Annual income
[57]: pd.set_option('display.float_format', lambda x: '%.2f' % x)
get_info_feat(data_record_train_copy, "Annual income")
Description:
count 29165.00
mean 186890.39
std 101409.64
min 27000.00
25% 121500.00
50% 157500.00
75% 225000.00
max 1575000.00
Name: Annual income, dtype: float64
--------------------------------------------------
Object type:float64
--------------------------------------------------
Value Counts:
Counts Frequency (%)
135000.00 3468 11.89
180000.00 2487 8.53
157500.00 2469 8.47
225000.00 2373 8.14
112500.00 2359 8.09
… … …
164250.00 1 0.00
42300.00 1 0.00
432000.00 1 0.00
145350.00 1 0.00
82350.00 1 0.00
[Histogram: Distribution of Annual income]
[Boxplot: Distribution of Annual income]
Is High Risk
0 186913.94
1 185537.26
Name: Annual income, dtype: float64
[Boxplot: High risk individuals grouped by Annual income]
• There is no notable difference in annual income between the high-risk and non-high-risk groups.
• Most applicants have an annual income roughly between 121.5k and 225k (the interquartile range).
8. Job title
[61]: get_info_feat(data_record_train_copy, "Job title")
Description:
count 20138
unique 18
top Laborers
freq 5004
Name: Job title, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Laborers 5004 24.85
Core staff 2866 14.23
Sales staff 2773 13.77
Managers 2422 12.03
Drivers 1722 8.55
High skill tech staff 1133 5.63
Accountants 998 4.96
Medicine staff 956 4.75
Cooking staff 521 2.59
Security staff 464 2.30
Cleaning staff 425 2.11
Private service staff 287 1.43
Low-skill Laborers 138 0.69
Waiters/barmen staff 127 0.63
Secretaries 122 0.61
HR staff 72 0.36
Realty agents 60 0.30
IT staff 48 0.24
[Bar chart: Job title count]
9. Employment status
[64]: get_info_feat(data_record_train_copy, "Employment status")
Description:
count 29165
unique 5
top Working
freq 15056
Name: Employment status, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Working 15056 51.62
Commercial associate 6801 23.32
Pensioner 4920 16.87
State servant 2381 8.16
Student 7 0.02
[Bar chart: Employment status count]
[66]: plot_pie_chart(data_record_train_copy, "Employment status")
[Pie chart of Employment status: Working 51.62%, Commercial associate 23.32%, Pensioner 16.87%, State servant 8.16%, Student 0.02%]
10. Education level
Description:
count 29165
unique 5
top Secondary / secondary special
freq 19803
Name: Education level, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Secondary / secondary special 19803 67.90
Higher education 7910 27.12
Incomplete higher 1129 3.87
Lower secondary 298 1.02
Academic degree 25 0.09
[69]: plot_bar_chart(data_record_train_copy, "Education level")
[Bar chart: Education level count]
The majority of applicants completed their secondary education; about a quarter completed higher education.
11. Employment length
[70]: get_info_feat(data_record_train_copy, "Employment length")
Description:
count 24257.00
mean 7.26
std 6.46
min 0.05
25% 2.68
50% 5.45
75% 9.60
max 43.02
Name: Employment length, dtype: float64
--------------------------------------------------
Object type:float64
[Boxplot: Distribution of Employment length]
[Histogram: Distribution of Employment length]
Is High Risk
0 7.29
1 5.75
Name: Employment length, dtype: float64
[Boxplot: High risk individuals grouped by Employment length]
• The high-risk group has a shorter average employment length (5.75 years) than the non-high-risk group (7.29 years).
12. Has a car
[74]: get_info_feat(data_record_train_copy, "Has a car")
Description:
count 29165
unique 2
top N
freq 18128
Name: Has a car, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
N 18128 62.16
Y 11037 37.84
[Pie chart of Has a car: N 62.16%, Y 37.84%]
Most applicants do not own a car.
13. Has a property
[76]: get_info_feat(data_record_train_copy, "Has a property")
Description:
count 29165
unique 2
top Y
freq 19557
Name: Has a property, dtype: object
--------------------------------------------------
Object type:object
--------------------------------------------------
Value Counts:
Counts Frequency (%)
Y 19557 67.06
N 9608 32.94
[Bar chart: Has a property count]
[78]: plot_pie_chart(data_record_train_copy, "Has a property")
[Pie chart of Has a property: Y 67.06%, N 32.94%]
14. Account age
Description:
count 29165.00
mean 26.14
std 16.49
min 0.00
25% 12.00
50% 24.00
75% 39.00
max 60.00
Name: Account age, dtype: float64
--------------------------------------------------
Object type:float64
[Boxplot: Distribution of Account age]
[81]: create_hist_plot(data_record_train_copy, "Account age")
[Histogram: Distribution of Account age]
Is High Risk
0 26.00
1 34.04
Name: Account age, dtype: float64
[Boxplot: High risk individuals grouped by Account age]
15. Is High Risk
Description:
count 29165.00
mean 0.02
std 0.13
min 0.00
25% 0.00
50% 0.00
75% 0.00
max 1.00
Name: Is High Risk, dtype: float64
--------------------------------------------------
Object type:int64
--------------------------------------------------
Value Counts:
Counts Frequency (%)
0 28666 98.29
1 499 1.71
[Bar chart: Is High Risk count]
The target variable is heavily imbalanced (98.29% vs 1.71%).
5 Preprocessing data
1. Remove Outlier
[85]: class OutlierRemover(BaseEstimator, TransformerMixin):
          def __init__(self, feat_with_outliers=['Family member count', 'Annual income', 'Employment length']):
              self.feat_with_outliers = feat_with_outliers
          def fit(self, df):
              return self
          def transform(self, df):
              if set(self.feat_with_outliers).issubset(df.columns):
                  # 25% quantile
                  Q1 = df[self.feat_with_outliers].quantile(.25)
                  # 75% quantile
                  Q3 = df[self.feat_with_outliers].quantile(.75)
                  IQR = Q3 - Q1
                  # keep the data within 3 IQR of the quartiles
                  df = df[~((df[self.feat_with_outliers] < (Q1 - 3 * IQR)) | (df[self.feat_with_outliers] > (Q3 + 3 * IQR))).any(axis=1)]
                  return df
              else:
                  print("One or more features are not in the dataframe")
                  return df
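The 3×IQR rule used by `OutlierRemover`, traced on a single toy column: rows outside `[Q1 - 3*IQR, Q3 + 3*IQR]` are dropped.

```python
import pandas as pd

df = pd.DataFrame({"Annual income": [100, 110, 120, 130, 140, 10000]})

Q1 = df["Annual income"].quantile(.25)
Q3 = df["Annual income"].quantile(.75)
IQR = Q3 - Q1
# keep rows within [Q1 - 3*IQR, Q3 + 3*IQR]; the 10000 outlier falls outside
kept = df[~((df["Annual income"] < Q1 - 3 * IQR) | (df["Annual income"] > Q3 + 3 * IQR))]
print(kept["Annual income"].tolist())  # [100, 110, 120, 130, 140]
```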
2. Drop features
[86]: class DropFeatures(BaseEstimator, TransformerMixin):
          def __init__(self, feature_to_drop=['ID', 'Has a mobile phone', 'Children count', 'Job title', 'Account age']):
              self.feature_to_drop = feature_to_drop
          def fit(self, df):
              return self
          def transform(self, df):
              if set(self.feature_to_drop).issubset(df.columns):
                  df.drop(self.feature_to_drop, axis=1, inplace=True)
                  return df
              else:
                  print("One or more features are not in the dataframe")
                  return df
3. Convert time columns to absolute values
[87]: class TimeConversionHandler(BaseEstimator, TransformerMixin):
          def __init__(self, feat_with_days=['Employment length', 'Age']):
              self.feat_with_days = feat_with_days
          def fit(self, X, y=None):
              return self
          def transform(self, X, y=None):
              if set(self.feat_with_days).issubset(X.columns):
                  # convert day counts to absolute values
                  X[['Employment length', 'Age']] = np.abs(X[['Employment length', 'Age']])
                  return X
              else:
                  print("One or more features are not in the dataframe")
                  return X

4. Handle retirees
[88]: class RetireeHandler(BaseEstimator, TransformerMixin):
          def __init__(self):
              pass
          def fit(self, df):
              return self
          def transform(self, df):
              if 'Employment length' in df.columns:
                  # rows carrying the 365243 sentinel (retirees)
                  df_ret_idx = df['Employment length'][df['Employment length'] == 365243].index
                  # change 365243 to 0
                  df.loc[df_ret_idx, 'Employment length'] = 0
                  return df
              else:
                  print("Employment length is not in the dataframe")
                  return df
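The sentinel replacement above (rewriting the 365243 placeholder, which also dominated the `max` in the earlier `describe()` output, to 0) traced on a toy column:

```python
import pandas as pd

# 365243 is the dataset's placeholder for "not employed" (pensioners)
df = pd.DataFrame({"Employment length": [-4542, 365243, -1134]})

ret_idx = df["Employment length"][df["Employment length"] == 365243].index
df.loc[ret_idx, "Employment length"] = 0
print(df["Employment length"].tolist())  # [-4542, 0, -1134]
```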
5. Handle skew
[89]: class SkewnessHandler(BaseEstimator, TransformerMixin):
          def __init__(self, feat_with_skewness=['Annual income', 'Age']):
              self.feat_with_skewness = feat_with_skewness
          def fit(self, df):
              return self
          def transform(self, df):
              if set(self.feat_with_skewness).issubset(df.columns):
                  # handle skewness with a cube root transformation
                  df[self.feat_with_skewness] = np.cbrt(df[self.feat_with_skewness])
                  return df
              else:
                  print("One or more features are not in the dataframe")
                  return df
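The cube root pulls in a long right tail while, unlike a log transform, staying defined at 0 and for negative values. A quick check on skewed income-like values:

```python
import numpy as np

# A right-skewed sample: after np.cbrt the spread between the smallest and
# largest value shrinks dramatically.
income = np.array([27000.0, 157500.0, 225000.0, 1575000.0])
transformed = np.cbrt(income)
print(transformed.round(1))
```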
6. Binning
[90]: class BinningNumToYN(BaseEstimator, TransformerMixin):
          def __init__(self, feat_with_num_enc=['Has a work phone', 'Has a phone', 'Has an email']):
              self.feat_with_num_enc = feat_with_num_enc
          def fit(self, df):
              return self
          def transform(self, df):
              if set(self.feat_with_num_enc).issubset(df.columns):
                  # change 0 to N and 1 to Y for all the features in feat_with_num_enc
                  for ft in self.feat_with_num_enc:
                      df[ft] = df[ft].map({1: 'Y', 0: 'N'})
                  return df
              else:
                  print("One or more features are not in the dataframe")
                  return df
7. One-hot encoding
[91]: class OneHotWithFeatNames(BaseEstimator, TransformerMixin):
          def __init__(self, one_hot_enc_ft=['Gender', 'Marital status', 'Dwelling', 'Employment status', 'Has a car', 'Has a property', 'Has a work phone', 'Has a phone', 'Has an email']):
              self.one_hot_enc_ft = one_hot_enc_ft
          def fit(self, df):
              return self
          def transform(self, df):
              if set(self.one_hot_enc_ft).issubset(df.columns):
                  # function to one hot encode the features in one_hot_enc_ft
                  def one_hot_enc(df, one_hot_enc_ft):
                      one_hot_enc = OneHotEncoder()
                      one_hot_enc.fit(df[one_hot_enc_ft])
                      # get the names of the one hot encoded columns
                      feat_names_one_hot_enc = one_hot_enc.get_feature_names_out(one_hot_enc_ft)
                      df = pd.DataFrame(one_hot_enc.transform(df[one_hot_enc_ft]).toarray(), columns=feat_names_one_hot_enc, index=df.index)
                      return df
                  # function to concatenate the one hot encoded features with the features that were not encoded
                  def concat_with_rest(df, one_hot_enc_df, one_hot_enc_ft):
                      # get the rest of the features
                      rest_of_features = [ft for ft in df.columns if ft not in one_hot_enc_ft]
                      # concatenate the rest of the features with the one hot encoded features
                      df_concat = pd.concat([one_hot_enc_df, df[rest_of_features]], axis=1)
                      return df_concat
                  # one hot encoded dataframe
                  one_hot_enc_df = one_hot_enc(df, self.one_hot_enc_ft)
                  # return the concatenated dataframe
                  full_df_one_hot_enc = concat_with_rest(df, one_hot_enc_df, self.one_hot_enc_ft)
                  return full_df_one_hot_enc
              else:
                  print("One or more features are not in the dataframe")
                  return df
8. Ordinal encoding
[92]: class OrdinalFeatNames(BaseEstimator, TransformerMixin):
          def __init__(self, ordinal_enc_ft=['Education level']):
              self.ordinal_enc_ft = ordinal_enc_ft
          def fit(self, df):
              return self
          def transform(self, df):
              if 'Education level' in df.columns:
                  ordinal_enc = OrdinalEncoder()
                  df[self.ordinal_enc_ft] = ordinal_enc.fit_transform(df[self.ordinal_enc_ft])
                  return df
              else:
                  print("Education level is not in the dataframe")
                  return df
9. Min-max scaling
[93]: class MinMaxWithFeatNames(BaseEstimator, TransformerMixin):
          def __init__(self, min_max_scaler_ft=['Age', 'Annual income', 'Employment length']):
              self.min_max_scaler_ft = min_max_scaler_ft
          def fit(self, df):
              return self
          def transform(self, df):
              if set(self.min_max_scaler_ft).issubset(df.columns):
                  min_max_enc = MinMaxScaler()
                  df[self.min_max_scaler_ft] = min_max_enc.fit_transform(df[self.min_max_scaler_ft])
                  return df
              else:
                  print("One or more features are not in the dataframe")
                  return df
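`MinMaxScaler`, used above, maps each feature to [0, 1] via (x - min) / (max - min). The same computation written out by hand on a toy column:

```python
import pandas as pd

df = pd.DataFrame({"Age": [21.0, 43.0, 69.0]})

# column-wise min-max scaling: smallest value -> 0, largest -> 1
scaled = (df["Age"] - df["Age"].min()) / (df["Age"].max() - df["Age"].min())
print(scaled.round(4).tolist())  # [0.0, 0.4583, 1.0]
```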
11. Change the data types of target feature “Is High Risk”
[94]: class ChangeToNumTarget(BaseEstimator, TransformerMixin):
          def __init__(self):
              pass
          def fit(self, df):
              return self
          def transform(self, df):
              if 'Is High Risk' in df.columns:
                  df['Is High Risk'] = pd.to_numeric(df['Is High Risk'], errors='coerce')
                  return df
              else:
                  print("Is High Risk is not in the dataframe")
                  return df
12. Oversample with SMOTE
[95]: class Oversample(BaseEstimator, TransformerMixin):
          def __init__(self):
              pass
          def fit(self, df):
              return self
          def transform(self, df):
              if 'Is High Risk' in df.columns:
                  # SMOTE resamples the minority class up to the majority class size
                  oversample = SMOTE(sampling_strategy='minority')
                  X_bal, y_bal = oversample.fit_resample(df.loc[:, df.columns != 'Is High Risk'], df['Is High Risk'])
                  df_bal = pd.concat([pd.DataFrame(X_bal), pd.DataFrame(y_bal)], axis=1)
                  return df_bal
              else:
                  print("Is high risk is not in the dataframe")
                  return df
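SMOTE synthesizes minority samples by interpolating between a minority point and one of its minority-class nearest neighbors: x_new = x + u · (x_neighbor - x) with u drawn uniformly from [0, 1). A hand-rolled single interpolation step (illustrative only; the class above delegates the real work to imblearn's `SMOTE`):

```python
import numpy as np

rng = np.random.default_rng(42)

# two minority-class points that happen to be nearest neighbors
x = np.array([1.0, 2.0])
x_neighbor = np.array([3.0, 6.0])

# a synthetic sample lies somewhere on the segment between them
u = rng.uniform()
x_new = x + u * (x_neighbor - x)
print(x_new)
```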
def full_pipeline(df):
    pipeline = Pipeline([
        ('outlier_remover', OutlierRemover()),
        ('feature_dropper', DropFeatures()),
        ('time_conversion_handler', TimeConversionHandler()),
        ('retiree_handler', RetireeHandler()),
        ('skewness_handler', SkewnessHandler()),
        ('binning_num_to_yn', BinningNumToYN()),
        ('one_hot_with_feat_names', OneHotWithFeatNames()),
        ('ordinal_feat_names', OrdinalFeatNames()),
        ('min_max_with_feat_names', MinMaxWithFeatNames()),
        ('change_to_num_target', ChangeToNumTarget()),
        ('oversample', Oversample())
    ])
    df_pipe_prep = pipeline.fit_transform(df)
    return df_pipe_prep
… … …
46539 0.00 0.00
46540 0.00 0.00
46541 0.00 0.00
46542 0.00 0.00
46543 0.00 0.00
0 0.27 2.00 0
1 0.14 2.00 0
2 0.50 4.00 0
3 0.18 1.00 0
4 0.04 1.00 0
… … … …
46539 0.27 4.00 1
46540 0.49 2.00 1
46541 0.19 2.00 1
46542 0.17 2.34 1
46543 0.38 4.79 1
1 0.00 0.00
2 0.00 0.00
3 0.00 0.00
4 0.00 0.00
Has a property_Y Has a work phone_N Has a work phone_Y Has a phone_N \
0 1.00 1.00 0.00 1.00
1 0.00 1.00 0.00 1.00
2 1.00 1.00 0.00 1.00
3 1.00 1.00 0.00 1.00
4 0.00 1.00 0.00 0.00
Education level Age Employment length Family member count Is High Risk
0 4.00 0.60 0.27 2.00 0
1 1.00 0.20 0.14 2.00 0
2 4.00 0.39 0.50 4.00 0
3 1.00 0.84 0.18 1.00 0
4 4.00 0.60 0.04 1.00 0
[99]: X_train, y_train = df_prep_train.loc[:, df_prep_train.columns != 'Is High Risk'], df_prep_train['Is High Risk'].astype('int64')
Dwelling_Co-op apartment Dwelling_House / apartment \
0 0.00 1.00
1 0.00 1.00
2 0.00 1.00
3 0.00 0.00
4 0.00 1.00
… … …
11649 0.00 1.00
11650 0.00 1.00
11651 0.00 1.00
11652 0.00 1.00
11653 0.00 0.03
11649 0.88 0.00
11650 0.00 0.00
11651 0.00 0.00
11652 0.65 0.00
11653 0.03 0.00
1 1.00 0.00 0.00 1.00
2 1.00 0.00 1.00 0.00
3 1.00 0.00 1.00 0.00
4 1.00 0.00 1.00 0.00
… … … … …
11649 1.00 0.00 1.00 0.00
11650 0.95 0.05 1.00 0.00
11651 1.00 0.00 0.87 0.13
11652 1.00 0.00 0.35 0.65
11653 0.97 0.03 1.00 0.00
[103]: X_train.head()
2 1.00 0.00 0.00 1.00
3 1.00 0.00 0.00 0.00
4 1.00 0.00 0.00 0.00
0 1.00 0.00 1.00 0.00
1 0.00 0.00 1.00 1.00
2 0.00 1.00 0.00 0.00
3 0.00 0.00 1.00 0.00
4 1.00 1.00 0.00 1.00
Has a property_Y Has a work phone_N Has a work phone_Y Has a phone_N \
0 1.00 1.00 0.00 1.00
1 0.00 1.00 0.00 1.00
2 1.00 1.00 0.00 1.00
3 1.00 1.00 0.00 1.00
4 0.00 1.00 0.00 0.00
6 Model Training
[104]: classifiers = {
    'sgd': SGDClassifier(random_state=42, loss='perceptron'),
    'logistic_regression': LogisticRegression(random_state=42, max_iter=1000),
    'decision_tree': DecisionTreeClassifier(random_state=42),
    'random_forest': RandomForestClassifier(random_state=42),
    'k_nearest_neighbors': KNeighborsClassifier(),
    'gradient_boosting': GradientBoostingClassifier(random_state=42)
}
for name, model in classifiers.items():
    model.fit(X_train, y_train)
    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average="weighted")
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    # test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average="weighted")
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    print(name)
    print("Model performance for training")
    print(f"- Accuracy: {model_train_accuracy:.3f}")
    print(f"- F1 score: {model_train_f1:.3f}")
    print(f"- Precision: {model_train_precision:.3f}")
    print(f"- Recall: {model_train_recall:.3f}")
    print("-" * 20)
    print("Model performance for testing")
    print(f"- Accuracy: {model_test_accuracy:.3f}")
    print(f"- F1 score: {model_test_f1:.3f}")
    print(f"- Precision: {model_test_precision:.3f}")
    print(f"- Recall: {model_test_recall:.3f}")
    print("=" * 35)
    print('\n')
sgd
Model performance for training
- Accuracy: 0.542
- F1 score: 0.439
- Precision: 0.523
- Recall: 0.970
--------------------
Model performance for testing
- Accuracy: 0.531
- F1 score: 0.429
- Precision: 0.517
- Recall: 0.953
===================================
logistic_regression
Model performance for training
- Accuracy: 0.589
- F1 score: 0.588
- Precision: 0.584
- Recall: 0.615
--------------------
Model performance for testing
- Accuracy: 0.558
- F1 score: 0.558
- Precision: 0.558
- Recall: 0.556
===================================
decision_tree
Model performance for training
- Accuracy: 0.994
- F1 score: 0.994
- Precision: 0.993
- Recall: 0.995
--------------------
Model performance for testing
- Accuracy: 0.836
- F1 score: 0.835
- Precision: 0.900
- Recall: 0.756
===================================
random_forest
Model performance for training
- Accuracy: 0.994
- F1 score: 0.994
- Precision: 0.992
- Recall: 0.995
--------------------
Model performance for testing
- Accuracy: 0.890
- F1 score: 0.889
- Precision: 0.979
- Recall: 0.798
===================================
k_nearest_neighbors
Model performance for training
- Accuracy: 0.956
- F1 score: 0.956
- Precision: 0.980
- Recall: 0.930
--------------------
Model performance for testing
- Accuracy: 0.754
- F1 score: 0.742
- Precision: 0.944
- Recall: 0.539
===================================
gradient_boosting
Model performance for training
- Accuracy: 0.904
- F1 score: 0.904
- Precision: 0.900
- Recall: 0.909
--------------------
Model performance for testing
- Accuracy: 0.843
- F1 score: 0.842
- Precision: 0.794
- Recall: 0.927
===================================
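Rather than scanning the long printout, the test-set numbers can be collected into a single sorted table. A sketch, with the accuracy and recall values transcribed from the output above:

```python
# Sketch: the per-model test metrics printed above, gathered into one
# DataFrame and sorted by test accuracy for easier comparison.
import pandas as pd

results = pd.DataFrame({
    'model': ['sgd', 'logistic_regression', 'decision_tree',
              'random_forest', 'k_nearest_neighbors', 'gradient_boosting'],
    'test_accuracy': [0.531, 0.558, 0.836, 0.890, 0.754, 0.843],
    'test_recall':   [0.953, 0.556, 0.756, 0.798, 0.539, 0.927],
}).sort_values('test_accuracy', ascending=False)
print(results.to_string(index=False))
```

Sorted this way, the random forest leads on test accuracy (0.890), while gradient boosting offers the best recall among the strong models (0.927), which matters when missing a high-risk applicant is the costlier error.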
SGD Classifier
[106]: model = SGDClassifier(random_state=42,loss='perceptron')
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
report = classification_report(y_test, y_test_pred)
print("\033[1mSGD Classification Report\033[0m")
print(report)
print("="*50)
# plot the confusion matrix for the test set
# (ConfusionMatrixDisplay is from sklearn.metrics)
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=False)  # remove the grid
plt.title("Confusion Matrix")
plt.show()
print('\n')
print("="*50)
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=False)  # remove the grid
plt.show()
              precision    recall  f1-score   support
           0       0.70      0.11      0.19      5827
           1       0.52      0.95      0.67      5827
==================================================
[Figure: Confusion Matrix
              predicted 0   predicted 1
   true 0            629          5198
   true 1            272          5555 ]
==================================================
[Figure: ROC curve for the SGD classifier on the test set]
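The per-class figures in the SGD report follow directly from the confusion matrix: with true labels as rows, class-1 precision is TP over all predicted positives and recall is TP over all actual positives. A quick check with the matrix values above:

```python
# Deriving the SGD report's figures from its confusion matrix
# ([[629, 5198], [272, 5555]], rows = true labels, columns = predictions).
tn, fp, fn, tp = 629, 5198, 272, 5555

precision_1 = tp / (tp + fp)   # 5555 / 10753
recall_1    = tp / (tp + fn)   # 5555 / 5827
recall_0    = tn / (tn + fp)   # 629  / 5827
print(f"precision_1={precision_1:.2f} recall_1={recall_1:.2f} recall_0={recall_0:.2f}")
# matches the 0.52 / 0.95 / 0.11 figures in the report
```

The near-0.5 precision with 0.95 recall shows the SGD model is labelling almost everything high-risk, which is barely better than chance on this balanced test set.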
Gradient Boosting
[107]: model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
report = classification_report(y_test, y_test_pred)
print("\033[1mGradient Boosting Classification Report\033[0m")
print(report)
print("="*50)
# plot the confusion matrix for the test set
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=False)  # remove the grid
plt.title("Confusion Matrix")
plt.show()
print('\n')
print("="*50)
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=False)  # remove the grid
plt.show()
              precision    recall  f1-score   support
==================================================
[Figure: Confusion Matrix
              predicted 0   predicted 1
   true 0           4421          1406
   true 1            423          5404 ]
==================================================
[Figure: ROC curve for the gradient boosting classifier on the test set]
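`RocCurveDisplay.from_estimator` reports the AUC in its legend; the same scalar can be computed directly with `sklearn.metrics.roc_auc_score` from class-1 scores. A sketch with tiny synthetic data standing in for the notebook's `X_test`/`y_test`:

```python
# Sketch: computing ROC AUC directly from predicted scores.
# The four-point example is synthetic; in the notebook the scores would come
# from model.predict_proba(X_test)[:, 1].
from sklearn.metrics import roc_auc_score

y_true   = [0, 0, 1, 1]
y_scores = [0.1, 0.4, 0.35, 0.8]
print(roc_auc_score(y_true, y_scores))  # -> 0.75
```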
KNeighbors
[108]: model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
report = classification_report(y_test, y_test_pred)
print("\033[1mKNeighbors Classification Report\033[0m")
print(report)
print("="*50)
# plot the confusion matrix for the test set
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=False)  # remove the grid
plt.title("Confusion Matrix")
plt.show()
print('\n')
print("="*50)
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=False)  # remove the grid
plt.show()
              precision    recall  f1-score   support
==================================================
[Figure: Confusion Matrix
              predicted 0   predicted 1
   true 0           5641           186
   true 1           2686          3141 ]
==================================================
[Figure: ROC curve for the k-nearest neighbors classifier on the test set]
Random Forest
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
report = classification_report(y_test, y_test_pred)
print("\033[1mRandom Forest Classification Report\033[0m")
print(report)
print("="*50)
# plot the confusion matrix for the test set
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=False)  # remove the grid
plt.title("Confusion Matrix")
plt.show()
print('\n')
print("="*50)
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.grid(visible=False)  # remove the grid
plt.show()
              precision    recall  f1-score   support
==================================================
[Figure: Confusion Matrix
              predicted 0   predicted 1
   true 0           5726           101
   true 1           1179          4648 ]
==================================================
[Figure: ROC curve for the random forest classifier on the test set]
Knb_disp.plot(ax=ax, alpha=0.8)
sgd_disp.plot(ax=ax, alpha=0.8)
gradient_disp.plot(ax=ax, alpha=0.8)
[Figure: overlaid test-set ROC curves:
 RandomForestClassifier (AUC = 0.98), GradientBoostingClassifier (AUC = 0.95),
 KNeighborsClassifier (AUC = 0.78), SGDClassifier (AUC = 0.55)]