Professional Documents
Culture Documents
Work
Work
import pandas as pd
df = pd.read_csv('work.txt')
Unnamed:
id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... hardship_start_date hardship
0
36
0 0 1077501 5000 5000 4975.0 10.65% 162.87 B B2 ... NaN
months
60
1 1 1077430 2500 2500 2500.0 15.27% 59.83 C C4 ... NaN
months
36
2 2 1077175 2400 2400 2400.0 15.96% 84.33 C C5 ... NaN
months
36
3 3 1076863 10000 10000 10000.0 13.49% 339.31 C C1 ... NaN
months
60
4 4 1075358 3000 3000 3000.0 12.69% 67.79 B B5 ... NaN
months
id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title ... hardship_start_date hards
36
0 1077501 5000 5000 4975.0 10.65% 162.87 B B2 NaN ... NaN
months
60
1 1077430 2500 2500 2500.0 15.27% 59.83 C C4 Ryder ... NaN
months
36
2 1077175 2400 2400 2400.0 15.96% 84.33 C C5 NaN ... NaN
months
AIR
36
3 1076863 10000 10000 10000.0 13.49% 339.31 C C1 RESOURCES ... NaN
months
BOARD
University
60
4 1075358 3000 3000 3000.0 12.69% 67.79 B B5 Medical ... NaN
months
Group
df_filtered
id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title ... collections_12_mths_ex_
36
0 1077501 5000 5000 4975.0 10.65% 162.87 B B2 NaN ...
months
60
1 1077430 2500 2500 2500.0 15.27% 59.83 C C4 Ryder ...
months
36
2 1077175 2400 2400 2400.0 15.96% 84.33 C C5 NaN ...
months
AIR
36
3 1076863 10000 10000 10000.0 13.49% 339.31 C C1 RESOURCES ...
months
BOARD
University
60
4 1075358 3000 3000 3000.0 12.69% 67.79 B B5 Medical ...
months
Group
... ... ... ... ... ... ... ... ... ... ... ...
36 Gilbert
197 1065350 9000 9000 9000.0 12.69% 301.91 B B5 ...
months Express
36
198 1067028 13250 13250 13250.0 10.65% 431.60 B B2 Talbert House ...
months
36
199 1061877 20000 20000 20000.0 13.49% 678.61 C C1 NaN ...
months
36 Kohls
200 1067018 3000 3000 3000.0 14.65% 103.49 C C3 ...
months Corporation
Hospice
36
201 1067223 7350 7350 7350.0 10.65% 239.42 B B2 Peachtree, ...
months
LLC
df_filtered.isnull().sum()
id 0
loan_amnt 0
funded_amnt 0
funded_amnt_inv 0
term 0
int_rate 0
installment 0
grade 0
sub_grade 0
emp_title 10
emp_length 1
home_ownership 0
annual_inc 0
verification_status 0
issue_d 0
loan_status 0
pymnt_plan 0
url 0
purpose 1
title 1
zip_code 1
addr_state 1
dti 1
delinq_2yrs 1
earliest_cr_line 1
fico_range_low 1
fico_range_high 1
inq_last_6mths 1
mths_since_last_delinq 155
mths_since_last_record 197
open_acc 1
pub_rec 1
revol_bal 1
revol_util 1
total_acc 1
initial_list_status 1
out_prncp 1
out_prncp_inv 1
total_pymnt 1
total_pymnt_inv 1
total_rec_prncp 1
total_rec_int 1
total_rec_late_fee 1
recoveries 1
collection_recovery_fee 1
last_pymnt_d 2
last_pymnt_amnt 1
last_credit_pull_d 1
last_fico_range_high 1
last_fico_range_low 1
collections_12_mths_ex_med 1
policy_code 1
application_type 1
acc_now_delinq 1
chargeoff_within_12_mths 1
delinq_amnt 1
pub_rec_bankruptcies 1
tax_liens 1
hardship_flag 1
debt_settlement_flag 1
dtype: int64
df_filtered.drop(['mths_since_last_delinq',
'mths_since_last_record'],axis = 1, inplace=True)
C:\Users\Simeon\AppData\Local\Temp\ipykernel_9788\2937646126.py:1: SettingWithCopyWarning:
df_filtered.shape
(202, 58)
df_filtered.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 58 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 202 non-null int64
1 loan_amnt 202 non-null int64
2 funded_amnt 202 non-null int64
3 funded_amnt_inv 202 non-null float64
4 term 202 non-null object
5 int_rate 202 non-null object
6 installment 202 non-null float64
7 grade 202 non-null object
8 sub_grade 202 non-null object
9 emp_title 192 non-null object
10 emp_length 201 non-null object
11 home_ownership 202 non-null object
12 annual_inc 202 non-null float64
13 verification_status 202 non-null object
14 issue_d 202 non-null object
15 loan_status 202 non-null object
16 pymnt_plan 202 non-null object
17 url 202 non-null object
18 purpose 201 non-null object
19 title 201 non-null object
20 zip_code 201 non-null object
21 addr_state 201 non-null object
22 dti 201 non-null float64
23 delinq_2yrs 201 non-null float64
24 earliest_cr_line 201 non-null object
25 fico_range_low 201 non-null float64
26 fico_range_high 201 non-null float64
27 inq_last_6mths 201 non-null float64
28 open_acc 201 non-null float64
29 pub_rec 201 non-null float64
30 revol_bal 201 non-null float64
31 revol_util 201 non-null object
32 total_acc 201 non-null float64
33 initial_list_status 201 non-null object
34 out_prncp 201 non-null float64
35 out_prncp_inv 201 non-null float64
36 total_pymnt 201 non-null float64
37 total_pymnt_inv 201 non-null float64
38 total_rec_prncp 201 non-null float64
39 total_rec_int 201 non-null float64
40 total_rec_late_fee 201 non-null float64
41 recoveries 201 non-null float64
42 collection_recovery_fee 201 non-null float64
43 last_pymnt_d 200 non-null object
44 last_pymnt_amnt 201 non-null float64
45 last_credit_pull_d 201 non-null object
46 last_fico_range_high 201 non-null float64
47 last_fico_range_low 201 non-null float64
48 collections_12_mths_ex_med 201 non-null float64
49 policy_code 201 non-null float64
50 application_type 201 non-null object
51 acc_now_delinq 201 non-null float64
52 chargeoff_within_12_mths 201 non-null float64
53 delinq_amnt 201 non-null float64
54 pub_rec_bankruptcies 201 non-null float64
55 tax_liens 201 non-null float64
56 hardship_flag 201 non-null object
57 debt_settlement_flag 201 non-null object
dtypes: float64(31), int64(3), object(24)
memory usage: 91.7+ KB
df_filtered.dropna(inplace =True)
C:\Users\Simeon\AppData\Local\Temp\ipykernel_9788\3027548963.py:1: SettingWithCopyWarning:
#df_filtered.dropna(inplace =True)
df_filtered.shape
(190, 58)
df3 = df_filtered.describe().transpose()
df3
count mean std min 25% 50% 75% max
categorical_columns = df_filtered.select_dtypes(include='object')
categorical_columns
term int_rate grade sub_grade emp_title emp_length home_ownership verification_status issue_d loan_status ... zip_code addr_s
60 Dec-
1 15.27% C C4 Ryder < 1 year RENT Source Verified Charged Off ... 309xx
months 2011
AIR
36 Dec-
3 13.49% C C1 RESOURCES 10+ years RENT Source Verified Fully Paid ... 917xx
months 2011
BOARD
University
60 Dec-
4 12.69% B B5 Medical 1 year RENT Source Verified Fully Paid ... 972xx
months 2011
Group
36 Veolia Dec-
5 7.90% A A4 3 years RENT Source Verified Fully Paid ... 852xx
months Transportaton 2011
... ... ... ... ... ... ... ... ... ... ... ... ...
36 Dec-
195 14.27% C C2 Corning Inc. 8 years MORTGAGE Not Verified Charged Off ... 148xx
months 2011
36 Dec-
196 11.71% B B3 UPS 10+ years MORTGAGE Verified Fully Paid ... 028xx
months 2011
36 Gilbert Dec-
197 12.69% B B5 5 years RENT Source Verified Fully Paid ... 115xx
months Express 2011
36 Dec-
198 10.65% B B2 Talbert House 4 years RENT Not Verified Fully Paid ... 450xx
months 2011
36 Kohls Dec-
200 14.65% C C3 5 years RENT Verified Fully Paid ... 532xx
months Corporation 2011
df5 = categorical_columns.describe().transpose()
import plotly.express as px
for i in df_filtered:
    fig = px.histogram(df[i])
    fig.show()
categorical_columns.columns
C:\Users\Simeon\AppData\Local\Temp\ipykernel_9788\3546400269.py:10: SettingWithCopyWarning:
id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title ... collections_12_mths_ex_med
... ... ... ... ... ... ... ... ... ... ... ...
model.fit(xtrain,ytrain)
DecisionTreeClassifier()
pred = model.predict(xtest)
precision recall f1-score support
accuracy 0.95 38
macro avg 0.97 0.86 0.90 38
weighted avg 0.95 0.95 0.94 38
print(confusion_matrix(ytest,pred))
[[ 5 2]
[ 0 31]]
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js