You are on page 1 of 7

In 

[1]: import pandas as pd

import numpy as np

In [6]: df=pd.read_csv('Churn_Modelling.csv') # load Churn_Modelling.csv into a DataFrame

type(df),df.shape # object type and (rows, columns) of the loaded frame

(pandas.core.frame.DataFrame, (10000, 14))


Out[6]:

In [7]: df.drop(['HasCrCard'],axis=1,inplace=False) # returns a copy without the column; the result is discarded here and df is unchanged

# df.drop(['France'],axis=0,inplace=False)

new_df=df.drop(['HasCrCard'],axis=1,inplace=False) # drops the column and stores the result in a new DataFrame


new_df.shape # not displayed: only the last expression of the cell is echoed

df.shape # df keeps all of its columns because inplace=False was used

(10000, 14)
Out[7]:

In [8]: df.columns,df.values # column labels and the underlying values as a NumPy array

(Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',

Out[8]:
'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',

'IsActiveMember', 'EstimatedSalary', 'Exited'],

dtype='object'),

array([[1, 15634602, 'Hargrave', ..., 1, 101348.88, 1],

[2, 15647311, 'Hill', ..., 1, 112542.58, 0],

[3, 15619304, 'Onio', ..., 0, 113931.57, 1],

...,

[9998, 15584532, 'Liu', ..., 1, 42085.58, 1],

[9999, 15682355, 'Sabbatini', ..., 0, 92888.52, 1],

[10000, 15628319, 'Walker', ..., 0, 38190.78, 0]], dtype=object))

In [10]: new_dt=pd.read_csv(
    'Churn_Modelling.csv',
    usecols=["Gender","Age","Tenure"],  # load only these three columns
)

new_dt.head(5)

Out[10]: Gender Age Tenure

0 Female 42 2

1 Female 41 1

2 Female 42 8

3 Female 39 1

4 Female 43 2

In [11]: new_dt=new_dt.sample(n=10) # keep 10 randomly chosen rows; no random_state is passed, so the selection differs between runs

new_dt

Out[11]: Gender Age Tenure

6447 Female 31 3

9466 Male 52 2

5569 Male 36 0

5967 Male 37 2

8931 Female 55 5

861 Female 32 2

5356 Male 47 2

6881 Male 29 3

2365 Male 32 5

5191 Female 58 1

In [12]: new_dt=new_dt.sample(frac=0.5) # keeps a random fraction of the rows: 0.5 * 10 rows = 5 rows here

new_dt

Out[12]: Gender Age Tenure

5569 Male 36 0

861 Female 32 2

6881 Male 29 3

5356 Male 47 2

9466 Male 52 2

In [14]: new_dt=pd.read_csv("Churn_Modelling.csv") # reload the full file, replacing the sampled subset held in new_dt

new_dt.isna().sum() # number of missing values in each column

RowNumber 0

Out[14]:
CustomerId 0

Surname 0

CreditScore 0

Geography 0

Gender 0

Age 0

Tenure 0

Balance 0

NumOfProducts 0

HasCrCard 0

IsActiveMember 0

EstimatedSalary 0

Exited 0

dtype: int64

In [15]: missing_ind=np.random.randint(10,size=4) # 4 random integers in [0, 10); unseeded, and the values may repeat

new_dt.loc[missing_ind,['CreditScore','Age','NumOfProducts']]=np.nan # set these three columns to NaN on the chosen row labels

new_dt

Out[15]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance

0 1 15634602 Hargrave 619.0 France Female 42.0 2 0.00

1 2 15647311 Hill 608.0 Spain Female 41.0 1 83807.86

2 3 15619304 Onio NaN France Female NaN 8 159660.80

3 4 15701354 Boni NaN France Female NaN 1 0.00

4 5 15737888 Mitchell 850.0 Spain Female 43.0 2 125510.82

... ... ... ... ... ... ... ... ... .

9995 9996 15606229 Obijiaku 771.0 France Male 39.0 5 0.00

9996 9997 15569892 Johnstone 516.0 France Male 35.0 10 57369.61

9997 9998 15584532 Liu 709.0 France Female 36.0 7 0.00

9998 9999 15682355 Sabbatini 772.0 Germany Male 42.0 3 75075.31

9999 10000 15628319 Walker 792.0 France Female 28.0 4 130142.79

10000 rows × 14 columns

In [16]: new_dt.iloc[missing_ind,-1]=np.nan # set the last column to NaN at the same row positions

new_dt

Out[16]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance

0 1 15634602 Hargrave 619.0 France Female 42.0 2 0.00

1 2 15647311 Hill 608.0 Spain Female 41.0 1 83807.86

2 3 15619304 Onio NaN France Female NaN 8 159660.80

3 4 15701354 Boni NaN France Female NaN 1 0.00

4 5 15737888 Mitchell 850.0 Spain Female 43.0 2 125510.82

... ... ... ... ... ... ... ... ... .

9995 9996 15606229 Obijiaku 771.0 France Male 39.0 5 0.00

9996 9997 15569892 Johnstone 516.0 France Male 35.0 10 57369.61

9997 9998 15584532 Liu 709.0 France Female 36.0 7 0.00

9998 9999 15682355 Sabbatini 772.0 Germany Male 42.0 3 75075.31

9999 10000 15628319 Walker 792.0 France Female 28.0 4 130142.79

10000 rows × 14 columns

In [17]: new_dt.isna().sum() # recount missing values per column after the NaN assignments above

RowNumber 0

Out[17]:
CustomerId 0

Surname 0

CreditScore 4

Geography 0

Gender 0

Age 4

Tenure 0

Balance 0

NumOfProducts 4

HasCrCard 0

IsActiveMember 0

EstimatedSalary 0

Exited 4

dtype: int64

In [18]: mode=new_dt['Geography'].value_counts().index[0] # most frequent value: value_counts() sorts by descending count

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, emits a
# warning on pandas 2.x and does not update new_dt under Copy-on-Write.
new_dt['Geography']=new_dt['Geography'].fillna(value=mode)

new_dt

Out[18]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance

0 1 15634602 Hargrave 619.0 France Female 42.0 2 0.00

1 2 15647311 Hill 608.0 Spain Female 41.0 1 83807.86

2 3 15619304 Onio NaN France Female NaN 8 159660.80

3 4 15701354 Boni NaN France Female NaN 1 0.00

4 5 15737888 Mitchell 850.0 Spain Female 43.0 2 125510.82

... ... ... ... ... ... ... ... ... .

9995 9996 15606229 Obijiaku 771.0 France Male 39.0 5 0.00

9996 9997 15569892 Johnstone 516.0 France Male 35.0 10 57369.61

9997 9998 15584532 Liu 709.0 France Female 36.0 7 0.00

9998 9999 15682355 Sabbatini 772.0 Germany Male 42.0 3 75075.31

9999 10000 15628319 Walker 792.0 France Female 28.0 4 130142.79

10000 rows × 14 columns

In [19]: new_dts=pd.read_csv("Churn_Modelling.csv")

missing_ind=np.random.randint(10,size=4) # draw the row positions before they are used, so the cell does not rely on an earlier cell's value

new_dts.iloc[missing_ind,-1]=np.nan # set the last column to NaN at those row positions

new_dts=new_dts.dropna(axis=0,how='any') # drop every row containing at least one NaN; reassigned rather than mutated in place

new_dts

Out[19]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance

0 1 15634602 Hargrave 619 France Female 42 2 0.00

1 2 15647311 Hill 608 Spain Female 41 1 83807.86

4 5 15737888 Mitchell 850 Spain Female 43 2 125510.82

6 7 15592531 Bartlett 822 France Male 50 7 0.00

7 8 15656148 Obinna 376 Germany Female 29 4 115046.74

... ... ... ... ... ... ... ... ... ..

9995 9996 15606229 Obijiaku 771 France Male 39 5 0.00

9996 9997 15569892 Johnstone 516 France Male 35 10 57369.61

9997 9998 15584532 Liu 709 France Female 36 7 0.00

9998 9999 15682355 Sabbatini 772 Germany Male 42 3 75075.31

9999 10000 15628319 Walker 792 France Female 28 4 130142.79

9996 rows × 14 columns

In [20]: mode=new_dt['NumOfProducts'].value_counts().index[0] # most frequent value: value_counts() sorts by descending count

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, emits a
# warning on pandas 2.x and does not update new_dt under Copy-on-Write.
new_dt['NumOfProducts']=new_dt['NumOfProducts'].fillna(value=mode)

new_dt

Out[20]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance

0 1 15634602 Hargrave 619.0 France Female 42.0 2 0.00

1 2 15647311 Hill 608.0 Spain Female 41.0 1 83807.86

2 3 15619304 Onio NaN France Female NaN 8 159660.80

3 4 15701354 Boni NaN France Female NaN 1 0.00

4 5 15737888 Mitchell 850.0 Spain Female 43.0 2 125510.82

... ... ... ... ... ... ... ... ... .

9995 9996 15606229 Obijiaku 771.0 France Male 39.0 5 0.00

9996 9997 15569892 Johnstone 516.0 France Male 35.0 10 57369.61

9997 9998 15584532 Liu 709.0 France Female 36.0 7 0.00

9998 9999 15682355 Sabbatini 772.0 Germany Male 42.0 3 75075.31

9999 10000 15628319 Walker 792.0 France Female 28.0 4 130142.79

10000 rows × 14 columns

In [21]: avg=new_dt['Age'].mean() # mean of the non-missing ages (mean() skips NaN by default)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, emits a
# warning on pandas 2.x and does not update new_dt under Copy-on-Write.
new_dt['Age']=new_dt['Age'].fillna(value=avg)

new_dt

Out[21]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Ba

0 1 15634602 Hargrave 619.0 France Female 42.000000 2

1 2 15647311 Hill 608.0 Spain Female 41.000000 1 83

2 3 15619304 Onio NaN France Female 38.922169 8 159

3 4 15701354 Boni NaN France Female 38.922169 1

4 5 15737888 Mitchell 850.0 Spain Female 43.000000 2 125

... ... ... ... ... ... ... ... ...

9995 9996 15606229 Obijiaku 771.0 France Male 39.000000 5

9996 9997 15569892 Johnstone 516.0 France Male 35.000000 10 57

9997 9998 15584532 Liu 709.0 France Female 36.000000 7

9998 9999 15682355 Sabbatini 772.0 Germany Male 42.000000 3 75

9999 10000 15628319 Walker 792.0 France Female 28.000000 4 130

10000 rows × 14 columns

In [22]: spaing2=new_dt[(new_dt.Geography=='Spain') & (new_dt.NumOfProducts>2)] # filter the same frame the mask is built from, rather than applying new_dt's mask to df

spaing2.Geography.value_counts() # number of matching rows per Geography value

Spain 73

Out[22]:
Name: Geography, dtype: int64

In [23]: querydt=new_dt[new_dt['CreditScore']>700] # rows whose CreditScore exceeds 700; NaN scores compare False and are left out

querydt

Out[23]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance

4 5 15737888 Mitchell 850.0 Spain Female 43.0 2 125510.82

6 7 15592531 Bartlett 822.0 France Male 50.0 7 0.00

19 20 15568982 Hao 726.0 France Female 24.0 6 0.00

20 21 15577657 McDonald 732.0 France Male 41.0 8 0.00

24 25 15625047 Yen 846.0 France Female 38.0 5 0.00

... ... ... ... ... ... ... ... ... ..

9994 9995 15719294 Wood 800.0 France Female 29.0 2 0.00

9995 9996 15606229 Obijiaku 771.0 France Male 39.0 5 0.00

9997 9998 15584532 Liu 709.0 France Female 36.0 7 0.00

9998 9999 15682355 Sabbatini 772.0 Germany Male 42.0 3 75075.31

9999 10000 15628319 Walker 792.0 France Female 28.0 4 130142.79

3116 rows × 14 columns


In [24]: new_dt.loc[new_dt['Tenure'].isin([2,4,6,8,10])].head(100) # first 100 rows whose Tenure is one of 2, 4, 6, 8, 10

Out[24]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Bal

0 1 15634602 Hargrave 619.0 France Female 42.000000 2

2 3 15619304 Onio NaN France Female 38.922169 8 1596

4 5 15737888 Mitchell 850.0 Spain Female 43.000000 2 1255

5 6 15574012 Chu NaN Spain Male 38.922169 8 1137

7 8 15656148 Obinna 376.0 Germany Female 29.000000 4 11504

... ... ... ... ... ... ... ... ...

214 215 15785542 Kornilova 572.0 Germany Male 26.000000 4 1182

219 220 15774854 Fuller 592.0 France Male 54.000000 8

222 223 15727829 McIntyre 567.0 France Male 42.000000 2

223 224 15733247 Stevenson 850.0 France Male 33.000000 10

224 225 15568748 Poole 671.0 Germany Male 45.000000 6 995

100 rows × 14 columns

In [25]: df[['Surname','Gender','Age','Balance','Exited']].groupby(['Surname','Balance']).mean(numeric_only=True) # numeric_only=True skips the text column Gender, leaving the Age and Exited means

Out[25]: Age Exited

Surname Balance

Abazu 0.00 28.0 0.0

85534.83 32.0 0.0

Abbie 186796.37 31.0 0.0

Abbott 0.00 29.0 0.0

34547.82 24.0 0.0

... ... ... ...

Zuev 116537.60 39.0 0.0

Zuyev 106259.63 39.0 0.0

170491.84 53.0 1.0

Zuyeva 0.00 41.0 0.0

106301.85 32.0 0.0

8088 rows × 2 columns

In [ ]:

You might also like