You are on page 1of 11

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:

data=pd.read_excel('customer_churn_large_dataset.xlsx')

In [3]:

data.head()

Out[3]:
CustomerID Name Age Gender Location Subscription_Length_Months
Monthly_Bill Total_Usage_GB Churn
0 1 Customer_1 63 Male Los Angeles 17 73.36 236 0
1 2 Customer_2 62 Female New York 1 48.76 172 0
2 3 Customer_3 24 Female Los Angeles 5 85.47 460
0
3 4 Customer_4 36 Female Miami 3 97.94 297 1
4 5 Customer_5 46 Female Miami 19 58.14 266 0

1. Data Preprocessing:¶ <#1.-Data-Preprocessing:>

In [4]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3208 entries, 0 to 3207
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerID 3208 non-null int64
1 Name 3208 non-null object
2 Age 3208 non-null int64
3 Gender 3208 non-null object
4 Location 3208 non-null object
5 Subscription_Length_Months 3208 non-null int64
6 Monthly_Bill 3208 non-null float64
7 Total_Usage_GB 3208 non-null int64
8 Churn 3208 non-null int64
dtypes: float64(1), int64(5), object(3)
memory usage: 225.7+ KB

In [5]:

# there is no null value in data and no missing value in data.

In [6]:

data.describe()

Out[6]:
CustomerID Age Subscription_Length_Months Monthly_Bill
Total_Usage_GB Churn
count 3208.000000 3208.000000 3208.000000 3208.000000
3208.000000
3208.000000
mean 1604.500000 43.904613 12.363778 64.821397 273.826683 0.507481
std 926.214158 15.175543 6.966723 20.180945 129.579639 0.500022
min 1.000000 18.000000 1.000000 30.020000 50.000000 0.000000
25% 802.750000 31.000000 6.000000 47.202500 161.750000 0.000000
50% 1604.500000 44.000000 12.000000 64.665000 273.000000 1.000000
75% 2406.250000 57.000000 18.000000 82.635000 386.000000 1.000000
max 3208.000000 70.000000 24.000000 99.960000 500.000000 1.000000

In [7]:

plt.plot(data['Gender'],data['Churn'],label='male')
plt.legend()
plt.show()

In [8]:

plt.scatter(data['Age'],data['Churn'])

Out[8]:

<matplotlib.collections.PathCollection at 0x18d5ae35d30>

In [9]:

plt.hist(data['Churn'])
plt.show()

In [10]:

data.corr()

C:\Users\Sonu\AppData\Local\Temp\ipykernel_17892\2627137660.py:1: FutureWarning:
The default value of numeric_only in DataFrame.corr is deprecated. In a future
version, it will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
data.corr()

Out[10]:
CustomerID Age Subscription_Length_Months Monthly_Bill
Total_Usage_GB Churn
CustomerID 1.000000 0.011307 0.008368 0.029753 -0.006244 -0.000607
Age 0.011307 1.000000 -0.009815 0.020243 -0.024137 -0.007344
Subscription_Length_Months 0.008368 -0.009815 1.000000 -0.040562
-0.035997 -0.005481
Monthly_Bill 0.029753 0.020243 -0.040562 1.000000 -0.022275
0.002809
Total_Usage_GB -0.006244 -0.024137 -0.035997 -0.022275 1.000000
0.016984
Churn -0.000607 -0.007344 -0.005481 0.002809 0.016984 1.000000

In [11]:

data['Churn'].value_counts()

Out[11]:
1 1628
0 1580
Name: Churn, dtype: int64

train_test_split¶ <#train_test_split>

In [12]:

from sklearn.model_selection import train_test_split

In [13]:

train_df,test_df=train_test_split(data,test_size=0.2,random_state=42)

In [14]:

train_df.shape

Out[14]:

(2566, 9)

In [15]:

data.shape

Out[15]:

(3208, 9)

In [16]:

test_df.shape

Out[16]:

(642, 9)

In [17]:

input_cols=list(data.columns)[1:-1]

In [18]:

input_cols

Out[18]:

['Name',
'Age',
'Gender',
'Location',
'Subscription_Length_Months',
'Monthly_Bill',
'Total_Usage_GB']

In [19]:
target_cols='Churn'

In [20]:

target_cols

Out[20]:

'Churn'

In [22]:

train_inputs=train_df[input_cols].copy()

In [44]:

test_inputs=test_df[input_cols].copy()

In [58]:

train_targets=train_df[target_cols].copy()

In [69]:

test_targets=test_df[target_cols].copy()

In [25]:

train_inputs

Out[25]:
Name Age Gender Location Subscription_Length_Months Monthly_Bill
Total_Usage_GB
460 Customer_461 18 Male Houston 12 65.83 156
43 Customer_44 20 Female Miami 20 79.84 122
87 Customer_88 50 Male Houston 10 49.00 114
2438 Customer_2439 53 Female New York 9 60.10 285
990 Customer_991 68 Female Miami 22 84.18 228
... ... ... ... ... ... ... ...
1095 Customer_1096 60 Female Miami 10 72.52 191
1130 Customer_1131 36 Female Houston 10 45.99 100
1294 Customer_1295 33 Male Houston 6 75.81 324
860 Customer_861 55 Male Chicago 12 89.19 315
3174 Customer_3175 30 Female Los Angeles 18 99.42 345

2566 rows × 7 columns

In [59]:

train_targets

Out[59]:

460 1
43 1
87 0
2438 0
990 0
..
1095 1
1130 0
1294 0
860 1
3174 1
Name: Churn, Length: 2566, dtype: int64

In [60]:

numerical_cols=train_inputs.select_dtypes(include=np.number).columns.tolist()

In [28]:

categoric_cols=train_inputs.select_dtypes(include='object').columns.tolist()

In [30]:

categoric_cols

Out[30]:

['Name', 'Gender', 'Location']

In [31]:

numerical_cols

Out[31]:

['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']

In [32]:

train_inputs[numerical_cols].describe()

Out[32]:
Age Subscription_Length_Months Monthly_Bill Total_Usage_GB
count 2566.000000 2566.000000 2566.000000 2566.000000
mean 43.594310 12.286048 64.677689 272.079891
std 15.142142 6.999583 20.116820 129.322741
min 18.000000 1.000000 30.020000 50.000000
25% 31.000000 6.000000 47.302500 161.000000
50% 44.000000 12.000000 64.100000 269.000000
75% 56.000000 18.000000 82.390000 383.000000
max 70.000000 24.000000 99.960000 500.000000

OneHotEncoding of categorical columns¶


<#OneHotEncoding-of-categorical-columns>

In [33]:

from sklearn.preprocessing import OneHotEncoder

In [34]:

encoder=OneHotEncoder(sparse=False,handle_unknown='ignore')
In [35]:

encoder.fit(data[categoric_cols])

C:\Users\Sonu\anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py:868:
FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be
removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default
value.
warnings.warn(

Out[35]:

OneHotEncoder(handle_unknown='ignore', sparse=False, sparse_output=False)

*In a Jupyter environment, please rerun this cell to show the HTML
representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try
loading this page with nbviewer.org.*
OneHotEncoder

OneHotEncoder(handle_unknown='ignore', sparse=False, sparse_output=False)

In [36]:

encoder.categories_

Out[36]:

[array(['Customer_1', 'Customer_10', 'Customer_100', ..., 'Customer_997',


'Customer_998', 'Customer_999'], dtype=object),
array(['Female', 'Male'], dtype=object),
array(['Chicago', 'Houston', 'Los Angeles', 'Miami', 'New York'],
dtype=object)]

In [37]:

encoded_cols=list(encoder.get_feature_names_out(categoric_cols))

print(encoded_cols)train_inputs[encoded_cols]=encoder.transform(train_inputs[catego
ric_cols])test_inputs[encoded_cols]=encoder.transform(test_inputs[categoric_cols])
In [47]:

train_inputs[encoded_cols]

Out[47]:
Name_Customer_1 Name_Customer_10 Name_Customer_100
Name_Customer_1000 Name_Customer_1001 Name_Customer_1002
Name_Customer_1003 Name_Customer_1004 Name_Customer_1005
Name_Customer_1006 ... Name_Customer_997 Name_Customer_998
Name_Customer_999 Gender_Female Gender_Male Location_Chicago
Location_Houston Location_Los Angeles Location_Miami Location_New York
460 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
43 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0
1.0 0.0 0.0 0.0 0.0 1.0 0.0
87 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0
0.0 1.0 0.0 1.0 0.0 0.0 0.0
2438 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
990 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
1095 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
1130 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
1294 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
860 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
3174 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0

2566 rows × 3215 columns

In [48]:

train_inputs[numerical_cols+encoded_cols]

Out[48]:
Age Subscription_Length_Months Monthly_Bill Total_Usage_GB
Name_Customer_1 Name_Customer_10 Name_Customer_100 Name_Customer_1000
Name_Customer_1001 Name_Customer_1002 ... Name_Customer_997
Name_Customer_998 Name_Customer_999 Gender_Female Gender_Male
Location_Chicago Location_Houston Location_Los Angeles Location_Miami
Location_New York
460 18 12 65.83 156 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
43 20 20 79.84 122 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
1.0 0.0 0.0 0.0 0.0 1.0 0.0
87 50 10 49.00 114 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
0.0 1.0 0.0 1.0 0.0 0.0 0.0
2438 53 9 60.10 285 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
990 68 22 84.18 228 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
1095 60 10 72.52 191 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
1130 36 10 45.99 100 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
1294 33 6 75.81 324 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
860 55 12 89.19 315 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
3174 30 18 99.42 345 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0

2566 rows × 3219 columns

In [50]:

test_inputs[encoded_cols]

Out[50]:
Name_Customer_1 Name_Customer_10 Name_Customer_100
Name_Customer_1000 Name_Customer_1001 Name_Customer_1002
Name_Customer_1003 Name_Customer_1004 Name_Customer_1005
Name_Customer_1006 ... Name_Customer_997 Name_Customer_998
Name_Customer_999 Gender_Female Gender_Male Location_Chicago
Location_Houston Location_Los Angeles Location_Miami Location_New York
610 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
3051 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
2905 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
2891 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
2886 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
1027 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
1611 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
3122 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
420 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
2353 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0

642 rows × 3215 columns

In [51]:

test_inputs.shape

Out[51]:

(642, 3222)

In [52]:

train_inputs.shape

Out[52]:

(2566, 3222)

In [53]:
test_inputs[numerical_cols+encoded_cols]

Out[53]:
Age Subscription_Length_Months Monthly_Bill Total_Usage_GB
Name_Customer_1 Name_Customer_10 Name_Customer_100 Name_Customer_1000
Name_Customer_1001 Name_Customer_1002 ... Name_Customer_997
Name_Customer_998 Name_Customer_999 Gender_Female Gender_Male
Location_Chicago Location_Houston Location_Los Angeles Location_Miami
Location_New York
610 35 9 33.59 282 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
1.0 0.0 0.0 0.0 0.0 0.0 1.0
3051 20 10 37.27 335 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
2905 67 20 57.08 104 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
2891 63 1 67.51 58 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
1.0 0.0 0.0 1.0 0.0 0.0 0.0
2886 49 16 55.84 202 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
1027 51 23 55.85 306 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
1611 50 5 75.53 466 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
3122 51 6 43.39 300 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
420 46 24 34.14 387 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
2353 68 9 97.83 90 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
0.0 1.0 1.0 0.0 0.0 0.0 0.0

642 rows × 3219 columns

Decision Tree Regression¶ <#Decision-Tree-Regression>

In [54]:

from sklearn.tree import DecisionTreeRegressor

In [55]:

model=DecisionTreeRegressor()

In [61]:

# model fit in data


model.fit(train_inputs[numerical_cols+encoded_cols],train_targets)

Out[61]:

DecisionTreeRegressor()

*In a Jupyter environment, please rerun this cell to show the HTML
representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try
loading this page with nbviewer.org.*
DecisionTreeRegressor

DecisionTreeRegressor()

In [63]:

train_preds=model.predict(train_inputs[numerical_cols+encoded_cols])

Importing metrics to evalute accuracy_score of prediction¶


<#Importing-metrics-to-evalute-accuracy_score-of-prediction>

In [65]:

from sklearn.metrics import r2_score

In [66]:

r2_score(train_targets,train_preds)

Out[66]:

1.0

In [73]:

model.fit(test_inputs[numerical_cols+encoded_cols],test_targets)

Out[73]:

DecisionTreeRegressor()

*In a Jupyter environment, please rerun this cell to show the HTML
representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try
loading this page with nbviewer.org.*
DecisionTreeRegressor

DecisionTreeRegressor()

In [74]:

test_preds=model.predict(test_inputs[numerical_cols+encoded_cols])

In [75]:

r2_score(test_targets,test_preds)

Out[75]:
1.0

In [ ]:

You might also like