Cleaning Data

12/24/21, 5:34 PM CLEANING_DATA.ipynb - Colaboratory
1 import pandas as pd
2 import numpy as np
3 import matplotlib.pyplot as plt
4 import seaborn as sns
Loading the datasets and processing some records
1 cr = pd.read_csv("CreditRisk.csv")
1 cr.columns
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',

'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
dtype='object')
1 cr.head() ## top 5 records
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
0 LP001002 Male No 0.0 Graduate No 5849
1 LP001003 Male Yes 1.0 Graduate No 4583
2 LP001005 Male Yes 0.0 Graduate Yes 3000
3 LP001006 Male Yes 0.0 Not Graduate No 2583
1 cr.head(10)
https://colab.research.google.com/drive/1TlZut8ZGAyUfKNV3npX7OuLYVqTj7ws_#scrollTo=0fsSaPwLFAwV&printMode=true 1/18
12/24/21, 5:34 PM CLEANING_DATA.ipynb - Colaboratory
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
3 LP001006 Male Yes 0.0 Not Graduate No 2583
4 LP001008 Male No 0.0 Graduate No 6000
1 cr.tail(10) # last 10 records

Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncom
Not
6
971LP001013
LP002952 Male
Male YesNo 0.00.0 Graduate NoNo 2333
250
Graduate
7
972LP001014
LP002954 Male
Male YesYes 4.02.0 GraduateNot NoNo 3036
313
Graduate
974 LP002965 Female Yes 0.0 Graduate No 855
Not
976 LP002971 Male Yes 4.0 Yes 400
Graduate
980 LP002989 Male No 0.0 Graduate Yes 920
1 cr.shape # number of rows and columns
(981, 13)
1 cr.describe() # gives the description of each column( only numerical columns)
Dependents ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
count 956.000000 981.000000 981.000000 954.000000 961.000000
mean 0.881799 5179.795107 1601.916330 142.511530 342.201873
std 1.255623 5695.104533 2718.772806 77.421743 65.100602

min 0.000000 0.000000 0.000000 9.000000 6.000000
25% 0.000000 2875.000000 0.000000 100.000000 360.000000
50% 0.000000 3800.000000 1110.000000 126.000000 360.000000
75% 2.000000 5516.000000 2365.000000 162.000000 360.000000
max 4.000000 81000.000000 41667.000000 700.000000 480.000000

1 cr.Property_Area.value_counts() # gives the count for each category
Semiurban 349
Urban 342
Rural 290
Name: Property_Area, dtype: int64

1 cr.Gender.value_counts()
Male 775
Female 182
Name: Gender, dtype: int64
1 cr['Gender'].value_counts()
Male 775
Female 182
1 # want to create a DF of applicants earning more than a particular amount (5000)

2
3 aa = cr.ApplicantIncome > 5000
1 df1 = cr[aa] # df1 is the name of new data frame

2 df1

1 df3 = cr[-aa] # remaining records that do not fall under the category ApplicantIncome > 5000
2 df3 Not
20 LP001043 Male Yes 0.0 No 766
Graduate
... Loan_ID
... Gender
... Married
... Dependents
... Education
... Self_Employed
... ApplicantIncom.
1
967 LP001003
LP002920 Male
Male Yes
Yes 1.0
0.0 Graduate
Graduate No
No 458
511
2 LP001005 Male Yes 0.0 Graduate
Not Yes 300
968 LP002921 Male Yes 4.0 No 531
Graduate
Not
3 LP001006 Male Yes 0.0 No 258
969 LP002932 Male Yes 4.0 Graduate
Graduate No 760
974 LP002965 Female Yes 0.0 Not

Graduate No 855
6 LP001013 Male Yes 0.0 No 233
Graduate
285 rows × 13 columns
... ... ... ... ... ... ... .
Not
976 LP002971 Male Yes 4.0 Yes 400
Graduate
1 # ApplicantIncome > 3000 , Gender = Male , should also be married
1 abc = ((cr.ApplicantIncome > 3000) & (cr.Gender == "Male" ) & (cr.Married == "Yes") )
1 df3 = cr[abc]
1 df3
... ... ... ... ... ... ... .
Not
972 LP002954 Male Yes 2.0 No 313
Graduate
Not
976 LP002971 Male Yes 4.0 Yes 400
Graduate

1 cr.ApplicantIncome.mean()
5179.795107033639
1 cr.mean()
Dependents 0.881799
ApplicantIncome 5179.795107
CoapplicantIncome 1601.916330
LoanAmount 142.511530
Loan_Amount_Term 342.201873
Credit_History 0.835920
dtype: float64
1 cr.ApplicantIncome.median()
3800.0
1 cr.ApplicantIncome.sum()
5081379
1 cr.ApplicantIncome.max()
81000
1 cr.ApplicantIncome.min()
0
1 cr.Gender.value_counts()
Male 775
Female 182
1 # GROUP BY FUNCTION
2
3 cr.groupby('Gender').ApplicantIncome.agg(['count' , 'min' ,'max' ,'mean'])
count min max mean
Gender
Female 182 0 19484 4458.906593
Male 775 0 81000 5256.925161
1 cr.groupby('Gender').agg(['min' ,'max' ,'mean'])
Dependents ApplicantIncome CoapplicantIncome LoanAm
min max mean min max mean min max mean min m
Gender
Female 0.0 4.0 0.531073 0 19484 4458.906593 0.0 41667.0 1132.604396 9.0 6
Male 0.0 4.0 0.960317 0 81000 5256.925161 0.0 33837.0 1716.340542 17.0 6
1 cr.groupby(["Gender" , "Married"]).ApplicantIncome.mean()
Gender Married
Female No 4394.153226
Yes 4501.736842
Male No 4912.764151
Yes 5390.440285
Name: ApplicantIncome, dtype: float64
1 cr.rename(columns = {'Gender':'GEN'}, inplace = True)
1 cr.iloc[:,:-1]
Loan_ID GEN Married Dependents Education Self_Employed ApplicantIncome
Not
3 LP001006 Male Yes 0.0 No 2583
Graduate
... ... ... ... ... ... ... ...
Not
976 LP002971 Male Yes 4.0 Yes 4009
Graduate
How to split your records?
1 # slice and dice of data
1 # select only top 10 records and all the columns
1 cr.iloc[1:10] ## iloc index location
Loan_ID GEN Married Dependents Education Self_Employed ApplicantIncome Co

3 LP001006 Male Yes 0.0 Not Graduate No 2583
6 LP001013 Male Yes 0.0 Not Graduate No 2333
7 LP001014 Male Yes 4.0 Graduate No 3036
8 LP001018 Male Yes 2.0 Graduate No 4006
9 LP001020 Male Yes 1.0 Graduate No 12841
1 # top 10 records and columns 1, 3, 5
2
3 cr.iloc[[0,1,2,3,4,5,6,7,8,9] ,[ 1,3,5]]

GEN Dependents Self_Employed
0 Male 0.0 No
1 Male 1.0 No
2 Male 0.0 Yes
3 Male 0.0 No
4 Male 0.0 No
5 Male 2.0 Yes
6 Male 0.0 No
7 Male 4.0 No
8 Male 2.0 No
9 Male 1.0 No
1 cr.iloc[ 0:10 , [ 1,3,5]]
GEN Dependents Self_Employed
0 Male 0.0 No
1 Male 1.0 No
2 Male 0.0 Yes
3 Male 0.0 No
4 Male 0.0 No
5 Male 2.0 Yes
6 Male 0.0 No
7 Male 4.0 No
8 Male 2.0 No
9 Male 1.0 No
1 cr.iloc[:, 2:6]
Married Dependents Education Self_Employed
0 No 0.0 Graduate No
1 Yes 1.0 Graduate No
2 Yes 0.0 Graduate Yes
3 Yes 0.0 Not Graduate No
... ... ... ... ...
976 Yes 4.0 Not Graduate Yes
980 No 0.0 Graduate Yes
981 rows × 4 columns
1 cr.loc[ : , ['ApplicantIncome', 'Self_Employed']]
ApplicantIncome Self_Employed
0 5849 No
1 4583 No
2 3000 Yes
3 2583 No
4 6000 No
... ... ...
976 4009 Yes
977 4158 No
978 3250 No
979 5000 No
980 9200 Yes
# Renaming the columns
1 cr1 = cr.rename(columns= {'ApplicantIncome' : 'ApplicantIncome1'})
1 cr1.head()
Loan_ID GEN Married Dependents Education Self_Employed ApplicantIncome1 C
Not
3 LP001006 Male Yes 0.0 No 2583
Graduate
1 cr.columns
Index(['Loan_ID', 'GEN', 'Married', 'Dependents', 'Education', 'Self_Employed',

'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
dtype='object')
1 cr3 = cr.rename(columns = {'Gender' : 'Gender1' ,

2 'Married' : 'Married1' ,
3 'Dependents' : 'Dependents1'} )
1 cr3.head()
Loan_ID GEN Married1 Dependents1 Education Self_Employed ApplicantIncome
Not
3 LP001006 Male Yes 0.0 No 2583
Graduate
1 cr.head()
Loan_ID GEN Married Dependents Education Self_Employed ApplicantIncome Co

1 cr.rename(columns= {cr.columns[1] : "Gender1" } , inplace = True)
Not
3 LP001006 Male Yes 0.0 No 2583
Graduate
1 cr.head()
Loan_ID Gender1 Married Dependents Education Self_Employed ApplicantIncome
Not
3 LP001006 Male Yes 0.0 No 2583
Graduate
1 cr.rename( columns= {'ApplicantIncome' : 'ApplicantIncome1'} , inplace = True)

2
1 cr.drop("Loan_ID", axis = 1 , inplace = True) # axis = 1: operate on the column side

2 # axis = 0: operate on the row side
1 cr.head()
Gender1 Married Dependents Education Self_Employed ApplicantIncome1 Coapplic
0 Male No 0.0 Graduate No 5849
1 Male Yes 1.0 Graduate No 4583
2 Male Yes 0.0 Graduate Yes 3000
3 Male Yes 0.0 Not Graduate No 2583
4 Male No 0.0 Graduate No 6000
Handling Missing Data
Missing data occurs commonly in many data analysis applications. One of the goals of pandas
is to make working with missing data as painless as possible
3 string_data = pd.Series(['aardvark', 'artichoke', np.nan,'avocado'])
1 string_data
0 aardvark
1 artichoke
2 NaN
3 avocado
dtype: object
1 string_data.isnull()
0 False
1 False
2 True
3 False
dtype: bool
link text
1 # Filtering Out Missing Data

2 from numpy import nan as NA
3 data = pd.Series([1, NA, 3.5, NA, 7])
4 data
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
dtype: float64
1 data.dropna() # data[data.notnull()]
0 1.0
2 3.5
4 7.0
dtype: float64
1 # Passing how='all' will only drop rows that are all NA:
2 data.dropna(how='all')
0 1.0
2 3.5
4 7.0
dtype: float64
1 # To drop columns in the same way, pass axis=1:
2 data[4] = NA
3 data
0 1.0
1 NaN
2 3.5
3 NaN
4 NaN
dtype: float64
1 data.dropna(axis=0, how='all')
0 1.0
2 3.5
dtype: float64
1 # Filling In Missing Data

4 df = pd.DataFrame(np.random.randn(7, 3))
5 df
0 1 2
0 -0.411211 -0.242755 -1.265463
1 -0.041934 0.194562 0.038329
2 0.818116 1.182376 -0.511086
3 -1.128460 1.114058 -1.124696
4 0.098289 0.269538 1.364129
5 0.866626 0.974051 -0.608766
6 2.068476 -0.163661 -0.766034
1 df.iloc[:4, 1] = NA
1 df.iloc[:2, 2] = NA
1 df
0 1 2
0 -0.411211 NaN NaN
1 -0.041934 NaN NaN
2 0.818116 NaN -0.511086
3 -1.128460 NaN -1.124696
4 0.098289 0.269538 1.364129
5 0.866626 0.974051 -0.608766
6 2.068476 -0.163661 -0.766034
1 df.fillna(0)

0 1 2
0 -0.411211 0.000000 0.000000
1 -0.041934 0.000000 0.000000
2 0.818116 0.000000 -0.511086
3 -1.128460 0.000000 -1.124696
4 0.098289 0.269538 1.364129
5 0.866626 0.974051 -0.608766
6 2.068476 -0.163661 -0.766034
1 _ = df.fillna(0, inplace=True)
1 df
0 1 2
0 -0.411211 0.000000 0.000000
1 -0.041934 0.000000 0.000000
2 0.818116 0.000000 -0.511086
3 -1.128460 0.000000 -1.124696
4 0.098289 0.269538 1.364129
5 0.866626 0.974051 -0.608766
6 2.068476 -0.163661 -0.766034
Argument Description
value Scalar value or dict-like object to use to fill missing values
method Interpolation; by default 'ffill' if function called with no other arguments
axis Axis to fill on; default axis=0
inplace Modify the calling object without producing a copy
limit For forward and backward filling, maximum number of consecutive periods to fill
1 df = pd.DataFrame(np.random.randn(6, 3))
2 df.iloc[2:, 1] = NA
3 df.iloc[4:, 2] = NA
4 df
0 1 2
0 -1.161707 -0.616881 0.892303
1 1.684420 1.952207 -1.987142
2 2.378477 NaN 0.940864
3 -0.548917 NaN 0.536736
4 -0.796169 NaN NaN
5 1.590066 NaN NaN
How to find the null values in the records
1 cr.isnull().sum()
2
Gender1 24
Married 3
Dependents 25
Education 0
Self_Employed 55
ApplicantIncome1 0
CoapplicantIncome 0
LoanAmount 27
Loan_Amount_Term 20
Credit_History 79
Property_Area 0
Loan_Status 0
dtype: int64
How to fill the null values with some default value
1 cr.Gender1 = cr.Gender1.fillna("Male")
2
1 cr.Married = cr.Married.fillna("Yes")
2
1 cr.Dependents = cr.Dependents.fillna(0)
1 cr.Self_Employed = cr.Self_Employed.fillna("no")
2
1 cr.LoanAmount = cr.LoanAmount.fillna( cr.LoanAmount.mean() )

2
1 cr.Loan_Amount_Term = cr.Loan_Amount_Term.fillna(cr.Loan_Amount_Term.mean())
2
1 cr.Credit_History = cr.Credit_History.fillna(0)
2
1 cr.isnull().sum()
2
Gender1 0
Married 0
Dependents 0
Education 0
Self_Employed 0
ApplicantIncome1 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 0
Credit_History 0
Property_Area 0
Loan_Status 0
dtype: int64
1 #cr.dropna()
2
Converting the non-numeric columns into numeric for model building and analysis
1 # ONE HOT ENCODING / Dummy Variable Categorization

2
3 # Gender ( Male , Females)
4
5 #Male = 1
6 #Female = 0
7
8 #----------------
9 # Male = 0
10 # Female = 1
11
1 cr.replace({ "Gender1" :{"Male" :0 , "Female" :1} ,

2 "Married" : {"No" :0 , "Yes" :1} } , inplace = True)
1 cr.head(20)
2
Gender1 Married Dependents Education Self_Employed ApplicantIncome1 Coappli
0 0 0 0.0 Graduate No 5849
1 0 1 1.0 Graduate No 4583
2 0 1 0.0 Graduate Yes 3000
3 0 1 0.0 Not Graduate No 2583
4 0 0 0.0 Graduate No 6000
5 0 1 2.0 Graduate Yes 5417
Not
6 0 1 0.0 No 2333
Graduate
7 0 1 4.0 Graduate No 3036
8 0 1 2.0 Graduate No 4006
9 0 1 1.0 Graduate No 12841
10 0 1 2.0 Graduate No 3200
11 0 1 2.0 Graduate no 2500
12 0 1 2.0 Graduate No 3073
13 0 0 0.0 Graduate No 1853
14 0 1 2.0 Graduate No 1299
15 0 0 0.0 Graduate No 4950
Not
16 0 0 1.0 No 3596
Graduate
17 1 0 0.0 Graduate No 3510
Not
18 0 1 0.0 No 4887
Graduate
19 0 1 0.0 Graduate no 2600
1 cr = pd.read_csv(r"CreditRisk.csv")
2
1 cr = pd.get_dummies(cr ,columns = ['Gender' ,'Married', 'Dependents', 'Education'])

2 # One-hot encoding
1 cr.head()
2
Loan_ID Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amo
0 LP001002 No 5849 0.0 NaN
1 LP001003 No 4583 1508.0 128.0
2 LP001005 Yes 3000 0.0 66.0
3 LP001006 No 2583 2358.0 120.0
4 LP001008 No 6000 0.0 141.0
1 cr.shape
2
(981, 20)
check 0s completed at 5:33 PM

Cleaning Data

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Cleaning Data

Uploaded by

Copyright:

Available Formats

12/24/21, 5:34 PM CLEANING_DATA.

Loading the datasets and processing some records

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',

1 cr.head() ## top 5 records

Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome

0 LP001002 Male No 0.0 Graduate No 5849

1 LP001003 Male Yes 1.0 Graduate No 4583

2 LP001005 Male Yes 0.0 Graduate Yes 3000

4 LP001008 Male No 0.0 Graduate No 6000

Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome

0 LP001002 Male No 0.0 Graduate No 5849

1 LP001003 Male Yes 1.0 Graduate No 4583

2 LP001005 Male Yes 0.0 Graduate Yes 3000

5 LP001011 Male Yes 2.0 Graduate Yes 5417

975 LP002969 Male Yes 1.0 Graduate No 226

977 LP002975 Male Yes 0.0 Graduate No 415

978 LP002980 Male No 0.0 Graduate No 325

979 LP002986 Male Yes 0.0 Graduate No 500

980 LP002989 Male No 0.0 Graduate Yes 920

1 cr.shape # number of rows and columns

1 cr.describe() # gives the description of each column( only numerical columns)

Dependents ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term

count 956.000000 981.000000 981.000000 954.000000 961.000000

mean 0.881799 5179.795107 1601.916330 142.511530 342.201873

std 1.255623 5695.104533 2718.772806 77.421743 65.100602

max 4.000000 81000.000000 41667.000000 700.000000 480.000000

1 # want to create a DF of applicants earning more than a particular amount (5000)

1 df1 = cr[aa] # df1 is the name of new data frame

Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncom

0 LP001002 Male No 0.0 Graduate No 584

4 LP001008 Male No 0.0 Graduate No 600

5 LP001011 Male Yes 2.0 Graduate Yes 541

9 LP001020 Male Yes 1.0 Graduate No 1284

974 LP002965 Female Yes 0.0 Not

975 LP002969 Male Yes 1.0 Graduate No 226

977 LP002975 Male Yes 0.0 Graduate No 415

978 LP002980 Male No 0.0 Graduate No 325

979 LP002986 Male Yes 0.0 Graduate No 500

696 rows × 13 columns

1 # ApplicantIncome > 3000 , Gender = Male , should also be married

Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncom

1 LP001003 Male Yes 1.0 Graduate No 458

5 LP001011 Male Yes 2.0 Graduate Yes 541

7 LP001014 Male Yes 4.0 Graduate No 303

8 LP001018 Male Yes 2.0 Graduate No 400

9 LP001020 Male Yes 1.0 Graduate No 1284

... ... ... ... ... ... ... .

970 LP002935 Male Yes 1.0 Graduate No 379

977 LP002975 Male Yes 0.0 Graduate No 415

979 LP002986 Male Yes 0.0 Graduate No 500

408 rows × 13 columns

count min max mean

Female 182 0 19484 4458.906593

Male 775 0 81000 5256.925161

1 cr.groupby('Gender').agg(['min' ,'max' ,'mean'])

Dependents ApplicantIncome CoapplicantIncome LoanAm

1 cr.rename(columns = {'Gender':'GEN'}, inplace = True)

Loan_ID GEN Married Dependents Education Self_Employed ApplicantIncome

0 LP001002 Male No 0.0 Graduate No 5849

1 LP001003 Male Yes 1.0 Graduate No 4583

2 LP001005 Male Yes 0.0 Graduate Yes 3000

4 LP001008 Male No 0.0 Graduate No 6000