Professional Documents
Culture Documents
import numpy as np
import pandas as pd
In [6]: df=pd.read_csv('Churn_Modelling.csv')  # load the churn CSV from the current working directory
type(df),df.shape  # show the object's type together with its (rows, columns) shape
# Disabled experiment: drop the row labelled 'France'. With inplace=False the call
# would return a new frame rather than modify df.
# NOTE(review): 'France' is presumably a Geography value, not an index label - confirm before re-enabling.
# df.drop(['France'],axis=0,inplace=False)
df.shape  # shape check after the (disabled) drop
(10000, 14)
Out[7]:
In [8]: df.columns,df.values  # display the column labels and the frame's contents as a NumPy array
Out[8]:
'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
dtype='object'),
...,
In [10]: new_dt=pd.read_csv('Churn_Modelling.csv',usecols=["Gender","Age","Tenure"])  # re-read the CSV, keeping only these three columns
new_dt.head()  # preview the first five rows
0 Female 42 2
1 Female 41 1
2 Female 42 8
3 Female 39 1
4 Female 43 2
new_dt
6447 Female 31 3
9466 Male 52 2
5569 Male 36 0
5967 Male 37 2
8931 Female 55 5
861 Female 32 2
5356 Male 47 2
6881 Male 29 3
2365 Male 32 5
5191 Female 58 1
new_dt
5569 Male 36 0
861 Female 32 2
6881 Male 29 3
5356 Male 47 2
9466 Male 52 2
In [14]: new_dt=pd.read_csv("Churn_Modelling.csv")  # reload the full dataset; this rebinds new_dt to all columns
new_dt.isna().sum()  # per-column count of missing values
RowNumber 0
Out[14]:
CustomerId 0
Surname 0
CreditScore 0
Geography 0
Gender 0
Age 0
Tenure 0
Balance 0
NumOfProducts 0
HasCrCard 0
IsActiveMember 0
EstimatedSalary 0
Exited 0
dtype: int64
In [15]: rng=np.random.default_rng(42)  # fixed seed so the same rows are blanked on every run
# Pick 4 *distinct* row labels from 0-9. np.random.randint samples with replacement,
# so it could return duplicates and blank fewer than 4 rows.
missing_ind=rng.choice(10,size=4,replace=False)
# Inject missing values into three columns so the imputation cells have gaps to fill.
new_dt.loc[missing_ind,['CreditScore','Age','NumOfProducts']]=np.nan
new_dt
Out[15]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance
In [16]: new_dt.iloc[missing_ind,-1]=np.nan  # blank the last column at the row positions held in missing_ind
new_dt
Out[16]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance
In [17]: new_dt.isna().sum()  # re-count missing values per column after the NaN injection
RowNumber 0
Out[17]:
CustomerId 0
Surname 0
CreditScore 4
Geography 0
Gender 0
Age 4
Tenure 0
Balance 0
NumOfProducts 4
HasCrCard 0
IsActiveMember 0
EstimatedSalary 0
Exited 4
dtype: int64
In [18]: mode=new_dt['Geography'].value_counts().index[0]  # most frequent value (value_counts sorts by count, descending)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# new_dt['Geography'] selection: that chained in-place form raises a FutureWarning
# on pandas 2.x and, under Copy-on-Write, leaves new_dt unchanged.
new_dt['Geography']=new_dt['Geography'].fillna(value=mode)
new_dt
Out[18]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance
In [19]: new_dts=pd.read_csv("Churn_Modelling.csv")  # fresh copy of the full dataset
# Draw the row positions BEFORE using them. Previously they were regenerated only
# after the assignment, so the new draw had no effect on this cell.
missing_ind=np.random.choice(10,size=4,replace=False)  # distinct positions, so exactly 4 rows are affected
new_dts.iloc[missing_ind,-1]=np.nan  # blank the last column on those rows
new_dts=new_dts.dropna(axis=0,how='any')  # drop every row that has at least one missing value
new_dts
Out[19]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance
In [20]: mode=new_dt['NumOfProducts'].value_counts().index[0]  # most frequent non-missing value
# Assign the result back rather than using a chained fillna(inplace=True), which
# raises a FutureWarning on pandas 2.x and does not update new_dt under Copy-on-Write.
new_dt['NumOfProducts']=new_dt['NumOfProducts'].fillna(value=mode)
new_dt
Out[20]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance
In [21]: avg=new_dt['Age'].mean()  # mean of the non-missing ages (mean() skips NaN by default)
# Assign the result back rather than using a chained fillna(inplace=True), which
# raises a FutureWarning on pandas 2.x and does not update new_dt under Copy-on-Write.
new_dt['Age']=new_dt['Age'].fillna(value=avg)
new_dt
spaing2.Geography.value_counts()
Spain 73
Out[22]:
Name: Geography, dtype: int64
In [23]: querydt=new_dt.query('CreditScore>700')  # keep only the rows whose CreditScore is greater than 700
querydt
Out[23]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance
Out[24]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Bal
In [25]: df[['Surname','Gender','Age','Balance','Exited']].groupby(['Surname','Balance']).mean(
Surname Balance
In [ ]: