
In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell


InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)
In [2]:
# read data
churn = pd.read_csv("telecom_data_for_students.csv")
In [3]:
# look at initial rows of the data
churn.head(10)
Out[3]:
[Out[3] displayed the first 10 rows of the dataframe. The 226-column table was too wide for this export; only garbled header fragments (std_og_t2c_mou_6/7/8 and the like) survived, so the output is omitted here.]
In [4]:
# feature type summary
churn.info(verbose=1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 226 columns):
mobile_number int64
circle_id int64
loc_og_t2o_mou float64
std_og_t2o_mou float64
loc_ic_t2o_mou float64
last_date_of_month_6 object
last_date_of_month_7 object
last_date_of_month_8 object
last_date_of_month_9 object
arpu_6 float64
arpu_7 float64
arpu_8 float64
arpu_9 float64
onnet_mou_6 float64
onnet_mou_7 float64
onnet_mou_8 float64
onnet_mou_9 float64
offnet_mou_6 float64
offnet_mou_7 float64
offnet_mou_8 float64
offnet_mou_9 float64
roam_ic_mou_6 float64
roam_ic_mou_7 float64
roam_ic_mou_8 float64
roam_ic_mou_9 float64
roam_og_mou_6 float64
roam_og_mou_7 float64
roam_og_mou_8 float64
roam_og_mou_9 float64
loc_og_t2t_mou_6 float64
loc_og_t2t_mou_7 float64
loc_og_t2t_mou_8 float64
loc_og_t2t_mou_9 float64
loc_og_t2m_mou_6 float64
loc_og_t2m_mou_7 float64
loc_og_t2m_mou_8 float64
loc_og_t2m_mou_9 float64
loc_og_t2f_mou_6 float64
loc_og_t2f_mou_7 float64
loc_og_t2f_mou_8 float64
loc_og_t2f_mou_9 float64
loc_og_t2c_mou_6 float64
loc_og_t2c_mou_7 float64
loc_og_t2c_mou_8 float64
loc_og_t2c_mou_9 float64
loc_og_mou_6 float64
loc_og_mou_7 float64
loc_og_mou_8 float64
loc_og_mou_9 float64
std_og_t2t_mou_6 float64
std_og_t2t_mou_7 float64
std_og_t2t_mou_8 float64
std_og_t2t_mou_9 float64
std_og_t2m_mou_6 float64
std_og_t2m_mou_7 float64
std_og_t2m_mou_8 float64
std_og_t2m_mou_9 float64
std_og_t2f_mou_6 float64
std_og_t2f_mou_7 float64
std_og_t2f_mou_8 float64
std_og_t2f_mou_9 float64
std_og_t2c_mou_6 float64
std_og_t2c_mou_7 float64
std_og_t2c_mou_8 float64
std_og_t2c_mou_9 float64
std_og_mou_6 float64
std_og_mou_7 float64
std_og_mou_8 float64
std_og_mou_9 float64
isd_og_mou_6 float64
isd_og_mou_7 float64
isd_og_mou_8 float64
isd_og_mou_9 float64
spl_og_mou_6 float64
spl_og_mou_7 float64
spl_og_mou_8 float64
spl_og_mou_9 float64
og_others_6 float64
og_others_7 float64
og_others_8 float64
og_others_9 float64
total_og_mou_6 float64
total_og_mou_7 float64
total_og_mou_8 float64
total_og_mou_9 float64
loc_ic_t2t_mou_6 float64
loc_ic_t2t_mou_7 float64
loc_ic_t2t_mou_8 float64
loc_ic_t2t_mou_9 float64
loc_ic_t2m_mou_6 float64
loc_ic_t2m_mou_7 float64
loc_ic_t2m_mou_8 float64
loc_ic_t2m_mou_9 float64
loc_ic_t2f_mou_6 float64
loc_ic_t2f_mou_7 float64
loc_ic_t2f_mou_8 float64
loc_ic_t2f_mou_9 float64
loc_ic_mou_6 float64
loc_ic_mou_7 float64
loc_ic_mou_8 float64
loc_ic_mou_9 float64
std_ic_t2t_mou_6 float64
std_ic_t2t_mou_7 float64
std_ic_t2t_mou_8 float64
std_ic_t2t_mou_9 float64
std_ic_t2m_mou_6 float64
std_ic_t2m_mou_7 float64
std_ic_t2m_mou_8 float64
std_ic_t2m_mou_9 float64
std_ic_t2f_mou_6 float64
std_ic_t2f_mou_7 float64
std_ic_t2f_mou_8 float64
std_ic_t2f_mou_9 float64
std_ic_t2o_mou_6 float64
std_ic_t2o_mou_7 float64
std_ic_t2o_mou_8 float64
std_ic_t2o_mou_9 float64
std_ic_mou_6 float64
std_ic_mou_7 float64
std_ic_mou_8 float64
std_ic_mou_9 float64
total_ic_mou_6 float64
total_ic_mou_7 float64
total_ic_mou_8 float64
total_ic_mou_9 float64
spl_ic_mou_6 float64
spl_ic_mou_7 float64
spl_ic_mou_8 float64
spl_ic_mou_9 float64
isd_ic_mou_6 float64
isd_ic_mou_7 float64
isd_ic_mou_8 float64
isd_ic_mou_9 float64
ic_others_6 float64
ic_others_7 float64
ic_others_8 float64
ic_others_9 float64
total_rech_num_6 int64
total_rech_num_7 int64
total_rech_num_8 int64
total_rech_num_9 int64
total_rech_amt_6 int64
total_rech_amt_7 int64
total_rech_amt_8 int64
total_rech_amt_9 int64
max_rech_amt_6 int64
max_rech_amt_7 int64
max_rech_amt_8 int64
max_rech_amt_9 int64
date_of_last_rech_6 object
date_of_last_rech_7 object
date_of_last_rech_8 object
date_of_last_rech_9 object
last_day_rch_amt_6 int64
last_day_rch_amt_7 int64
last_day_rch_amt_8 int64
last_day_rch_amt_9 int64
date_of_last_rech_data_6 object
date_of_last_rech_data_7 object
date_of_last_rech_data_8 object
date_of_last_rech_data_9 object
total_rech_data_6 float64
total_rech_data_7 float64
total_rech_data_8 float64
total_rech_data_9 float64
max_rech_data_6 float64
max_rech_data_7 float64
max_rech_data_8 float64
max_rech_data_9 float64
count_rech_2g_6 float64
count_rech_2g_7 float64
count_rech_2g_8 float64
count_rech_2g_9 float64
count_rech_3g_6 float64
count_rech_3g_7 float64
count_rech_3g_8 float64
count_rech_3g_9 float64
av_rech_amt_data_6 float64
av_rech_amt_data_7 float64
av_rech_amt_data_8 float64
av_rech_amt_data_9 float64
vol_2g_mb_6 float64
vol_2g_mb_7 float64
vol_2g_mb_8 float64
vol_2g_mb_9 float64
vol_3g_mb_6 float64
vol_3g_mb_7 float64
vol_3g_mb_8 float64
vol_3g_mb_9 float64
arpu_3g_6 float64
arpu_3g_7 float64
arpu_3g_8 float64
arpu_3g_9 float64
arpu_2g_6 float64
arpu_2g_7 float64
arpu_2g_8 float64
arpu_2g_9 float64
night_pck_user_6 float64
night_pck_user_7 float64
night_pck_user_8 float64
night_pck_user_9 float64
monthly_2g_6 int64
monthly_2g_7 int64
monthly_2g_8 int64
monthly_2g_9 int64
sachet_2g_6 int64
sachet_2g_7 int64
sachet_2g_8 int64
sachet_2g_9 int64
monthly_3g_6 int64
monthly_3g_7 int64
monthly_3g_8 int64
monthly_3g_9 int64
sachet_3g_6 int64
sachet_3g_7 int64
sachet_3g_8 int64
sachet_3g_9 int64
fb_user_6 float64
fb_user_7 float64
fb_user_8 float64
fb_user_9 float64
aon int64
aug_vbc_3g float64
jul_vbc_3g float64
jun_vbc_3g float64
sep_vbc_3g float64
dtypes: float64(179), int64(35), object(12)
memory usage: 172.4+ MB

There are 99,999 rows and 226 columns in the data. Most of the columns are numeric, but
we need to inspect which columns are actually categorical.

In [5]:
# look at data statistics
churn.describe(include='all')
Out[5]:
[Out[5] displayed churn.describe(include='all'). The wide summary table did not survive the export and is omitted here.]
In [6]:
# create backup of data
original = churn.copy()
In [7]:
# create column name list by types of columns
id_cols = ['mobile_number', 'circle_id']

date_cols = ['last_date_of_month_6',
'last_date_of_month_7',
'last_date_of_month_8',
'last_date_of_month_9',
'date_of_last_rech_6',
'date_of_last_rech_7',
'date_of_last_rech_8',
'date_of_last_rech_9',
'date_of_last_rech_data_6',
'date_of_last_rech_data_7',
'date_of_last_rech_data_8',
'date_of_last_rech_data_9'
]

cat_cols = ['night_pck_user_6',
'night_pck_user_7',
'night_pck_user_8',
'night_pck_user_9',
'fb_user_6',
'fb_user_7',
'fb_user_8',
'fb_user_9'
]

num_cols = [column for column in churn.columns
            if column not in id_cols + date_cols + cat_cols]

# print the number of columns in each list
print("#ID cols: %d\n#Date cols:%d\n#Numeric cols:%d\n#Category cols:%d" %
      (len(id_cols), len(date_cols), len(num_cols), len(cat_cols)))

# check if we have missed any column or not
print(len(id_cols) + len(date_cols) + len(num_cols) + len(cat_cols) ==
      churn.shape[1])
#ID cols: 2
#Date cols:12
#Numeric cols:204
#Category cols:8
True

Handling missing values


In [8]:
# look at missing value ratio in each column
churn.isnull().sum()*100/churn.shape[0]
Out[8]:
mobile_number 0.000000
circle_id 0.000000
loc_og_t2o_mou 1.018010
std_og_t2o_mou 1.018010
loc_ic_t2o_mou 1.018010
last_date_of_month_6 0.000000
last_date_of_month_7 0.601006
last_date_of_month_8 1.100011
last_date_of_month_9 1.659017
arpu_6 0.000000
arpu_7 0.000000
arpu_8 0.000000
arpu_9 0.000000
onnet_mou_6 3.937039
onnet_mou_7 3.859039
onnet_mou_8 5.378054
onnet_mou_9 7.745077
offnet_mou_6 3.937039
offnet_mou_7 3.859039
offnet_mou_8 5.378054
offnet_mou_9 7.745077
roam_ic_mou_6 3.937039
roam_ic_mou_7 3.859039
roam_ic_mou_8 5.378054
roam_ic_mou_9 7.745077
roam_og_mou_6 3.937039
roam_og_mou_7 3.859039
roam_og_mou_8 5.378054
roam_og_mou_9 7.745077
loc_og_t2t_mou_6 3.937039
loc_og_t2t_mou_7 3.859039
loc_og_t2t_mou_8 5.378054
loc_og_t2t_mou_9 7.745077
loc_og_t2m_mou_6 3.937039
loc_og_t2m_mou_7 3.859039
loc_og_t2m_mou_8 5.378054
loc_og_t2m_mou_9 7.745077
loc_og_t2f_mou_6 3.937039
loc_og_t2f_mou_7 3.859039
loc_og_t2f_mou_8 5.378054
loc_og_t2f_mou_9 7.745077
loc_og_t2c_mou_6 3.937039
loc_og_t2c_mou_7 3.859039
loc_og_t2c_mou_8 5.378054
loc_og_t2c_mou_9 7.745077
loc_og_mou_6 3.937039
loc_og_mou_7 3.859039
loc_og_mou_8 5.378054
loc_og_mou_9 7.745077
std_og_t2t_mou_6 3.937039
std_og_t2t_mou_7 3.859039
std_og_t2t_mou_8 5.378054
std_og_t2t_mou_9 7.745077
std_og_t2m_mou_6 3.937039
std_og_t2m_mou_7 3.859039
std_og_t2m_mou_8 5.378054
std_og_t2m_mou_9 7.745077
std_og_t2f_mou_6 3.937039
std_og_t2f_mou_7 3.859039
std_og_t2f_mou_8 5.378054
std_og_t2f_mou_9 7.745077
std_og_t2c_mou_6 3.937039
std_og_t2c_mou_7 3.859039
std_og_t2c_mou_8 5.378054
std_og_t2c_mou_9 7.745077
std_og_mou_6 3.937039
std_og_mou_7 3.859039
std_og_mou_8 5.378054
std_og_mou_9 7.745077
isd_og_mou_6 3.937039
isd_og_mou_7 3.859039
isd_og_mou_8 5.378054
isd_og_mou_9 7.745077
spl_og_mou_6 3.937039
spl_og_mou_7 3.859039
spl_og_mou_8 5.378054
spl_og_mou_9 7.745077
og_others_6 3.937039
og_others_7 3.859039
og_others_8 5.378054
og_others_9 7.745077
total_og_mou_6 0.000000
total_og_mou_7 0.000000
total_og_mou_8 0.000000
total_og_mou_9 0.000000
loc_ic_t2t_mou_6 3.937039
loc_ic_t2t_mou_7 3.859039
loc_ic_t2t_mou_8 5.378054
loc_ic_t2t_mou_9 7.745077
loc_ic_t2m_mou_6 3.937039
loc_ic_t2m_mou_7 3.859039
loc_ic_t2m_mou_8 5.378054
loc_ic_t2m_mou_9 7.745077
loc_ic_t2f_mou_6 3.937039
loc_ic_t2f_mou_7 3.859039
loc_ic_t2f_mou_8 5.378054
loc_ic_t2f_mou_9 7.745077
loc_ic_mou_6 3.937039
loc_ic_mou_7 3.859039
loc_ic_mou_8 5.378054
loc_ic_mou_9 7.745077
std_ic_t2t_mou_6 3.937039
std_ic_t2t_mou_7 3.859039
std_ic_t2t_mou_8 5.378054
std_ic_t2t_mou_9 7.745077
std_ic_t2m_mou_6 3.937039
std_ic_t2m_mou_7 3.859039
std_ic_t2m_mou_8 5.378054
std_ic_t2m_mou_9 7.745077
std_ic_t2f_mou_6 3.937039
std_ic_t2f_mou_7 3.859039
std_ic_t2f_mou_8 5.378054
std_ic_t2f_mou_9 7.745077
std_ic_t2o_mou_6 3.937039
std_ic_t2o_mou_7 3.859039
std_ic_t2o_mou_8 5.378054
std_ic_t2o_mou_9 7.745077
std_ic_mou_6 3.937039
std_ic_mou_7 3.859039
std_ic_mou_8 5.378054
std_ic_mou_9 7.745077
total_ic_mou_6 0.000000
total_ic_mou_7 0.000000
total_ic_mou_8 0.000000
total_ic_mou_9 0.000000
spl_ic_mou_6 3.937039
spl_ic_mou_7 3.859039
spl_ic_mou_8 5.378054
spl_ic_mou_9 7.745077
isd_ic_mou_6 3.937039
isd_ic_mou_7 3.859039
isd_ic_mou_8 5.378054
isd_ic_mou_9 7.745077
ic_others_6 3.937039
ic_others_7 3.859039
ic_others_8 5.378054
ic_others_9 7.745077
total_rech_num_6 0.000000
total_rech_num_7 0.000000
total_rech_num_8 0.000000
total_rech_num_9 0.000000
total_rech_amt_6 0.000000
total_rech_amt_7 0.000000
total_rech_amt_8 0.000000
total_rech_amt_9 0.000000
max_rech_amt_6 0.000000
max_rech_amt_7 0.000000
max_rech_amt_8 0.000000
max_rech_amt_9 0.000000
date_of_last_rech_6 1.607016
date_of_last_rech_7 1.767018
date_of_last_rech_8 3.622036
date_of_last_rech_9 4.760048
last_day_rch_amt_6 0.000000
last_day_rch_amt_7 0.000000
last_day_rch_amt_8 0.000000
last_day_rch_amt_9 0.000000
date_of_last_rech_data_6 74.846748
date_of_last_rech_data_7 74.428744
date_of_last_rech_data_8 73.660737
date_of_last_rech_data_9 74.077741
total_rech_data_6 74.846748
total_rech_data_7 74.428744
total_rech_data_8 73.660737
total_rech_data_9 74.077741
max_rech_data_6 74.846748
max_rech_data_7 74.428744
max_rech_data_8 73.660737
max_rech_data_9 74.077741
count_rech_2g_6 74.846748
count_rech_2g_7 74.428744
count_rech_2g_8 73.660737
count_rech_2g_9 74.077741
count_rech_3g_6 74.846748
count_rech_3g_7 74.428744
count_rech_3g_8 73.660737
count_rech_3g_9 74.077741
av_rech_amt_data_6 74.846748
av_rech_amt_data_7 74.428744
av_rech_amt_data_8 73.660737
av_rech_amt_data_9 74.077741
vol_2g_mb_6 0.000000
vol_2g_mb_7 0.000000
vol_2g_mb_8 0.000000
vol_2g_mb_9 0.000000
vol_3g_mb_6 0.000000
vol_3g_mb_7 0.000000
vol_3g_mb_8 0.000000
vol_3g_mb_9 0.000000
arpu_3g_6 74.846748
arpu_3g_7 74.428744
arpu_3g_8 73.660737
arpu_3g_9 74.077741
arpu_2g_6 74.846748
arpu_2g_7 74.428744
arpu_2g_8 73.660737
arpu_2g_9 74.077741
night_pck_user_6 74.846748
night_pck_user_7 74.428744
night_pck_user_8 73.660737
night_pck_user_9 74.077741
monthly_2g_6 0.000000
monthly_2g_7 0.000000
monthly_2g_8 0.000000
monthly_2g_9 0.000000
sachet_2g_6 0.000000
sachet_2g_7 0.000000
sachet_2g_8 0.000000
sachet_2g_9 0.000000
monthly_3g_6 0.000000
monthly_3g_7 0.000000
monthly_3g_8 0.000000
monthly_3g_9 0.000000
sachet_3g_6 0.000000
sachet_3g_7 0.000000
sachet_3g_8 0.000000
sachet_3g_9 0.000000
fb_user_6 74.846748
fb_user_7 74.428744
fb_user_8 73.660737
fb_user_9 74.077741
aon 0.000000
aug_vbc_3g 0.000000
jul_vbc_3g 0.000000
jun_vbc_3g 0.000000
sep_vbc_3g 0.000000
dtype: float64

impute missing values


i) Imputing with zeroes
In [9]:
# some recharge columns have minimum value of 1 while some don't
recharge_cols = ['total_rech_data_6', 'total_rech_data_7',
'total_rech_data_8', 'total_rech_data_9',
'count_rech_2g_6', 'count_rech_2g_7', 'count_rech_2g_8',
'count_rech_2g_9',
'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8',
'count_rech_3g_9',
'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8',
'max_rech_data_9',
'av_rech_amt_data_6', 'av_rech_amt_data_7',
'av_rech_amt_data_8', 'av_rech_amt_data_9',
]
churn[recharge_cols].describe(include='all')
Out[9]:
[Out[9] displayed summary statistics for the recharge columns: each has roughly 25,000-26,400 non-null values (about 74% missing). total_rech_data_* and max_rech_data_* have a minimum of 1, while count_rech_2g_* and count_rech_3g_* have a minimum of 0. The rest of the wide table did not survive the export.]
In [10]:
# It is also observed that the recharge date and the recharge value are
# missing together, which means the customer didn't recharge
churn.loc[churn.total_rech_data_6.isnull() & churn.date_of_last_rech_data_6.isnull(),
          ["total_rech_data_6", "date_of_last_rech_data_6"]].head(20)
Out[10]:
total_rech_data_6 date_of_last_rech_data_6
1 NaN NaN
2 NaN NaN
3 NaN NaN
5 NaN NaN
6 NaN NaN
7 NaN NaN
8 NaN NaN
9 NaN NaN
10 NaN NaN
11 NaN NaN
12 NaN NaN
13 NaN NaN
14 NaN NaN
15 NaN NaN
16 NaN NaN
17 NaN NaN
18 NaN NaN
20 NaN NaN
21 NaN NaN
22 NaN NaN

In the recharge variables where the minimum value is 1, we can impute missing values with
zeroes, since a missing value means the customer didn't recharge their number that month.

In [11]:
# create a list of recharge columns where we will impute missing values with zeroes
zero_impute = ['total_rech_data_6', 'total_rech_data_7',
'total_rech_data_8', 'total_rech_data_9',
'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8',
'av_rech_amt_data_9',
'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8',
'max_rech_data_9'
]
In [12]:
# impute missing values with 0
churn[zero_impute] = churn[zero_impute].apply(lambda x: x.fillna(0))
In [13]:
# now, let's make sure values are imputed correctly
print("Missing value ratio:\n")
print(churn[zero_impute].isnull().sum()*100/churn.shape[0])

# summary
print("\n\nSummary statistics\n")
print(churn[zero_impute].describe(include='all'))
Missing value ratio:

total_rech_data_6 0.0
total_rech_data_7 0.0
total_rech_data_8 0.0
total_rech_data_9 0.0
av_rech_amt_data_6 0.0
av_rech_amt_data_7 0.0
av_rech_amt_data_8 0.0
av_rech_amt_data_9 0.0
max_rech_data_6 0.0
max_rech_data_7 0.0
max_rech_data_8 0.0
max_rech_data_9 0.0
dtype: float64

Summary statistics

total_rech_data_6 total_rech_data_7 total_rech_data_8 \
count 99999.000000 99999.000000 99999.000000
mean 0.619726 0.681837 0.698517
std 1.760541 1.924382 1.963417
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000
75% 1.000000 1.000000 1.000000
max 61.000000 54.000000 60.000000

total_rech_data_9 av_rech_amt_data_6 av_rech_amt_data_7 \
count 99999.000000 99999.000000 99999.000000
mean 0.632806 48.445409 51.393440
std 1.669040 127.743863 132.629365
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000
75% 1.000000 8.250000 17.000000
max 84.000000 7546.000000 4365.000000

av_rech_amt_data_8 av_rech_amt_data_9 max_rech_data_6 \
count 99999.000000 99999.000000 99999.000000
mean 52.027022 49.961089 31.792048
std 131.182609 127.804280 77.248778
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000
75% 23.000000 17.000000 8.000000
max 4076.000000 4061.000000 1555.000000

max_rech_data_7 max_rech_data_8 max_rech_data_9
count 99999.000000 99999.000000 99999.000000
mean 32.406314 33.113011 32.387644
std 78.342435 78.872739 78.818696
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000
75% 14.000000 17.000000 17.000000
max 1555.000000 1555.000000 1555.000000
In [14]:
# drop id and date columns
print("Shape before dropping: ", churn.shape)
churn = churn.drop(id_cols + date_cols, axis=1)
print("Shape after dropping: ", churn.shape)
Shape before dropping: (99999, 226)
Shape after dropping: (99999, 212)
ii) Replace NaN values in categorical variables
We will replace missing values in the categorical columns with '-1', where '-1' will be a new
category.

In [15]:
# replace missing values with '-1' in categorical columns
churn[cat_cols] = churn[cat_cols].apply(lambda x: x.fillna(-1))
In [16]:
# missing value ratio
print("Missing value ratio:\n")
print(churn[cat_cols].isnull().sum()*100/churn.shape[0])
Missing value ratio:

night_pck_user_6 0.0
night_pck_user_7 0.0
night_pck_user_8 0.0
night_pck_user_9 0.0
fb_user_6 0.0
fb_user_7 0.0
fb_user_8 0.0
fb_user_9 0.0
dtype: float64

iii) Drop variables with more than a given threshold of missing values
In [17]:
initial_cols = churn.shape[1]

MISSING_THRESHOLD = 0.7

include_cols = list(churn.apply(lambda column:
    column.isnull().sum()/churn.shape[0] < MISSING_THRESHOLD))

drop_missing = pd.DataFrame({'features': churn.columns,
                             'include': include_cols})
drop_missing.loc[drop_missing.include == True, :]
Out[17]:
features include
0 loc_og_t2o_mou True
1 std_og_t2o_mou True
2 loc_ic_t2o_mou True
3 arpu_6 True
4 arpu_7 True
5 arpu_8 True
6 arpu_9 True
7 onnet_mou_6 True
8 onnet_mou_7 True
9 onnet_mou_8 True
10 onnet_mou_9 True
11 offnet_mou_6 True
12 offnet_mou_7 True
13 offnet_mou_8 True
14 offnet_mou_9 True
15 roam_ic_mou_6 True
16 roam_ic_mou_7 True
17 roam_ic_mou_8 True
18 roam_ic_mou_9 True
19 roam_og_mou_6 True
20 roam_og_mou_7 True
21 roam_og_mou_8 True
22 roam_og_mou_9 True
23 loc_og_t2t_mou_6 True
24 loc_og_t2t_mou_7 True
25 loc_og_t2t_mou_8 True
26 loc_og_t2t_mou_9 True
27 loc_og_t2m_mou_6 True
28 loc_og_t2m_mou_7 True
29 loc_og_t2m_mou_8 True
30 loc_og_t2m_mou_9 True
31 loc_og_t2f_mou_6 True
32 loc_og_t2f_mou_7 True
33 loc_og_t2f_mou_8 True
34 loc_og_t2f_mou_9 True
35 loc_og_t2c_mou_6 True
36 loc_og_t2c_mou_7 True
37 loc_og_t2c_mou_8 True
38 loc_og_t2c_mou_9 True
39 loc_og_mou_6 True
40 loc_og_mou_7 True
41 loc_og_mou_8 True
42 loc_og_mou_9 True
43 std_og_t2t_mou_6 True
44 std_og_t2t_mou_7 True
45 std_og_t2t_mou_8 True
46 std_og_t2t_mou_9 True
47 std_og_t2m_mou_6 True
48 std_og_t2m_mou_7 True
49 std_og_t2m_mou_8 True
50 std_og_t2m_mou_9 True
51 std_og_t2f_mou_6 True
52 std_og_t2f_mou_7 True
53 std_og_t2f_mou_8 True
54 std_og_t2f_mou_9 True
55 std_og_t2c_mou_6 True
56 std_og_t2c_mou_7 True
57 std_og_t2c_mou_8 True
58 std_og_t2c_mou_9 True
59 std_og_mou_6 True
60 std_og_mou_7 True
61 std_og_mou_8 True
62 std_og_mou_9 True
63 isd_og_mou_6 True
64 isd_og_mou_7 True
65 isd_og_mou_8 True
66 isd_og_mou_9 True
67 spl_og_mou_6 True
68 spl_og_mou_7 True
69 spl_og_mou_8 True
70 spl_og_mou_9 True
71 og_others_6 True
72 og_others_7 True
73 og_others_8 True
74 og_others_9 True
75 total_og_mou_6 True
76 total_og_mou_7 True
77 total_og_mou_8 True
78 total_og_mou_9 True
79 loc_ic_t2t_mou_6 True
80 loc_ic_t2t_mou_7 True
81 loc_ic_t2t_mou_8 True
82 loc_ic_t2t_mou_9 True
83 loc_ic_t2m_mou_6 True
84 loc_ic_t2m_mou_7 True
85 loc_ic_t2m_mou_8 True
86 loc_ic_t2m_mou_9 True
87 loc_ic_t2f_mou_6 True
88 loc_ic_t2f_mou_7 True
89 loc_ic_t2f_mou_8 True
90 loc_ic_t2f_mou_9 True
91 loc_ic_mou_6 True
92 loc_ic_mou_7 True
93 loc_ic_mou_8 True
94 loc_ic_mou_9 True
95 std_ic_t2t_mou_6 True
96 std_ic_t2t_mou_7 True
97 std_ic_t2t_mou_8 True
98 std_ic_t2t_mou_9 True
99 std_ic_t2m_mou_6 True
100 std_ic_t2m_mou_7 True
101 std_ic_t2m_mou_8 True
102 std_ic_t2m_mou_9 True
103 std_ic_t2f_mou_6 True
104 std_ic_t2f_mou_7 True
105 std_ic_t2f_mou_8 True
106 std_ic_t2f_mou_9 True
107 std_ic_t2o_mou_6 True
108 std_ic_t2o_mou_7 True
109 std_ic_t2o_mou_8 True
110 std_ic_t2o_mou_9 True
111 std_ic_mou_6 True
112 std_ic_mou_7 True
113 std_ic_mou_8 True
114 std_ic_mou_9 True
115 total_ic_mou_6 True
116 total_ic_mou_7 True
117 total_ic_mou_8 True
118 total_ic_mou_9 True
119 spl_ic_mou_6 True
120 spl_ic_mou_7 True
121 spl_ic_mou_8 True
122 spl_ic_mou_9 True
123 isd_ic_mou_6 True
124 isd_ic_mou_7 True
125 isd_ic_mou_8 True
126 isd_ic_mou_9 True
127 ic_others_6 True
128 ic_others_7 True
129 ic_others_8 True
130 ic_others_9 True
131 total_rech_num_6 True
132 total_rech_num_7 True
133 total_rech_num_8 True
134 total_rech_num_9 True
135 total_rech_amt_6 True
136 total_rech_amt_7 True
137 total_rech_amt_8 True
138 total_rech_amt_9 True
139 max_rech_amt_6 True
140 max_rech_amt_7 True
141 max_rech_amt_8 True
142 max_rech_amt_9 True
143 last_day_rch_amt_6 True
144 last_day_rch_amt_7 True
145 last_day_rch_amt_8 True
146 last_day_rch_amt_9 True
147 total_rech_data_6 True
148 total_rech_data_7 True
149 total_rech_data_8 True
150 total_rech_data_9 True
151 max_rech_data_6 True
152 max_rech_data_7 True
153 max_rech_data_8 True
154 max_rech_data_9 True
163 av_rech_amt_data_6 True
164 av_rech_amt_data_7 True
165 av_rech_amt_data_8 True
166 av_rech_amt_data_9 True
167 vol_2g_mb_6 True
168 vol_2g_mb_7 True
169 vol_2g_mb_8 True
170 vol_2g_mb_9 True
171 vol_3g_mb_6 True
172 vol_3g_mb_7 True
173 vol_3g_mb_8 True
174 vol_3g_mb_9 True
183 night_pck_user_6 True
184 night_pck_user_7 True
185 night_pck_user_8 True
186 night_pck_user_9 True
187 monthly_2g_6 True
188 monthly_2g_7 True
189 monthly_2g_8 True
190 monthly_2g_9 True
191 sachet_2g_6 True
192 sachet_2g_7 True
193 sachet_2g_8 True
194 sachet_2g_9 True
195 monthly_3g_6 True
196 monthly_3g_7 True
197 monthly_3g_8 True
198 monthly_3g_9 True
199 sachet_3g_6 True
200 sachet_3g_7 True
201 sachet_3g_8 True
202 sachet_3g_9 True
203 fb_user_6 True
204 fb_user_7 True
205 fb_user_8 True
206 fb_user_9 True
207 aon True
208 aug_vbc_3g True
209 jul_vbc_3g True
210 jun_vbc_3g True
211 sep_vbc_3g True
In [18]:
# drop columns
churn = churn.loc[:, include_cols]

dropped_cols = initial_cols - churn.shape[1]

print("{0} columns dropped.".format(dropped_cols))
16 columns dropped.
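As an aside, the include-list construction in the two cells above can be collapsed into a single pandas expression; a minimal equivalent sketch under the same threshold:

# equivalent: keep columns whose missing-value fraction is below the threshold
churn = churn.loc[:, churn.isnull().mean() < MISSING_THRESHOLD]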

iv) Imputing using MICE

Install the fancyimpute package by following its installation instructions.

In [19]:
churn_cols = churn.columns

# using MICE technique to impute missing values in the rest of the columns
from fancyimpute import MICE
churn_imputed = MICE(n_imputations=1).complete(churn)
Using TensorFlow backend.
[MICE] Completing matrix with shape (99999, 196)
[MICE] Starting imputation round 1/11, elapsed time 0.887
[MICE] Starting imputation round 2/11, elapsed time 176.649
[MICE] Starting imputation round 3/11, elapsed time 343.774
[MICE] Starting imputation round 4/11, elapsed time 507.450
[MICE] Starting imputation round 5/11, elapsed time 677.217
[MICE] Starting imputation round 6/11, elapsed time 839.576
[MICE] Starting imputation round 7/11, elapsed time 1001.674
[MICE] Starting imputation round 8/11, elapsed time 1174.182
[MICE] Starting imputation round 9/11, elapsed time 1369.406
[MICE] Starting imputation round 10/11, elapsed time 1534.524
[MICE] Starting imputation round 11/11, elapsed time 1683.410
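Note: the MICE class was removed from later fancyimpute releases. If the import above fails, scikit-learn's IterativeImputer implements the same chained-equations idea; a rough equivalent sketch (the max_iter and random_state values here are illustrative):

# IterativeImputer is still experimental, so this enabling import is required
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=4)
churn_imputed = imputer.fit_transform(churn)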
In [20]:
# convert imputed numpy array to pandas dataframe
churn = pd.DataFrame(churn_imputed, columns=churn_cols)
print(churn.isnull().sum()*100/churn.shape[0])
loc_og_t2o_mou 0.0
std_og_t2o_mou 0.0
loc_ic_t2o_mou 0.0
arpu_6 0.0
arpu_7 0.0
arpu_8 0.0
arpu_9 0.0
onnet_mou_6 0.0
onnet_mou_7 0.0
onnet_mou_8 0.0
onnet_mou_9 0.0
offnet_mou_6 0.0
offnet_mou_7 0.0
offnet_mou_8 0.0
offnet_mou_9 0.0
roam_ic_mou_6 0.0
roam_ic_mou_7 0.0
roam_ic_mou_8 0.0
roam_ic_mou_9 0.0
roam_og_mou_6 0.0
roam_og_mou_7 0.0
roam_og_mou_8 0.0
roam_og_mou_9 0.0
loc_og_t2t_mou_6 0.0
loc_og_t2t_mou_7 0.0
loc_og_t2t_mou_8 0.0
loc_og_t2t_mou_9 0.0
loc_og_t2m_mou_6 0.0
loc_og_t2m_mou_7 0.0
loc_og_t2m_mou_8 0.0
loc_og_t2m_mou_9 0.0
loc_og_t2f_mou_6 0.0
loc_og_t2f_mou_7 0.0
loc_og_t2f_mou_8 0.0
loc_og_t2f_mou_9 0.0
loc_og_t2c_mou_6 0.0
loc_og_t2c_mou_7 0.0
loc_og_t2c_mou_8 0.0
loc_og_t2c_mou_9 0.0
loc_og_mou_6 0.0
loc_og_mou_7 0.0
loc_og_mou_8 0.0
loc_og_mou_9 0.0
std_og_t2t_mou_6 0.0
std_og_t2t_mou_7 0.0
std_og_t2t_mou_8 0.0
std_og_t2t_mou_9 0.0
std_og_t2m_mou_6 0.0
std_og_t2m_mou_7 0.0
std_og_t2m_mou_8 0.0
std_og_t2m_mou_9 0.0
std_og_t2f_mou_6 0.0
std_og_t2f_mou_7 0.0
std_og_t2f_mou_8 0.0
std_og_t2f_mou_9 0.0
std_og_t2c_mou_6 0.0
std_og_t2c_mou_7 0.0
std_og_t2c_mou_8 0.0
std_og_t2c_mou_9 0.0
std_og_mou_6 0.0
std_og_mou_7 0.0
std_og_mou_8 0.0
std_og_mou_9 0.0
isd_og_mou_6 0.0
isd_og_mou_7 0.0
isd_og_mou_8 0.0
isd_og_mou_9 0.0
spl_og_mou_6 0.0
spl_og_mou_7 0.0
spl_og_mou_8 0.0
spl_og_mou_9 0.0
og_others_6 0.0
og_others_7 0.0
og_others_8 0.0
og_others_9 0.0
total_og_mou_6 0.0
total_og_mou_7 0.0
total_og_mou_8 0.0
total_og_mou_9 0.0
loc_ic_t2t_mou_6 0.0
loc_ic_t2t_mou_7 0.0
loc_ic_t2t_mou_8 0.0
loc_ic_t2t_mou_9 0.0
loc_ic_t2m_mou_6 0.0
loc_ic_t2m_mou_7 0.0
loc_ic_t2m_mou_8 0.0
loc_ic_t2m_mou_9 0.0
loc_ic_t2f_mou_6 0.0
loc_ic_t2f_mou_7 0.0
loc_ic_t2f_mou_8 0.0
loc_ic_t2f_mou_9 0.0
loc_ic_mou_6 0.0
loc_ic_mou_7 0.0
loc_ic_mou_8 0.0
loc_ic_mou_9 0.0
std_ic_t2t_mou_6 0.0
std_ic_t2t_mou_7 0.0
std_ic_t2t_mou_8 0.0
std_ic_t2t_mou_9 0.0
std_ic_t2m_mou_6 0.0
std_ic_t2m_mou_7 0.0
std_ic_t2m_mou_8 0.0
std_ic_t2m_mou_9 0.0
std_ic_t2f_mou_6 0.0
std_ic_t2f_mou_7 0.0
std_ic_t2f_mou_8 0.0
std_ic_t2f_mou_9 0.0
std_ic_t2o_mou_6 0.0
std_ic_t2o_mou_7 0.0
std_ic_t2o_mou_8 0.0
std_ic_t2o_mou_9 0.0
std_ic_mou_6 0.0
std_ic_mou_7 0.0
std_ic_mou_8 0.0
std_ic_mou_9 0.0
total_ic_mou_6 0.0
total_ic_mou_7 0.0
total_ic_mou_8 0.0
total_ic_mou_9 0.0
spl_ic_mou_6 0.0
spl_ic_mou_7 0.0
spl_ic_mou_8 0.0
spl_ic_mou_9 0.0
isd_ic_mou_6 0.0
isd_ic_mou_7 0.0
isd_ic_mou_8 0.0
isd_ic_mou_9 0.0
ic_others_6 0.0
ic_others_7 0.0
ic_others_8 0.0
ic_others_9 0.0
total_rech_num_6 0.0
total_rech_num_7 0.0
total_rech_num_8 0.0
total_rech_num_9 0.0
total_rech_amt_6 0.0
total_rech_amt_7 0.0
total_rech_amt_8 0.0
total_rech_amt_9 0.0
max_rech_amt_6 0.0
max_rech_amt_7 0.0
max_rech_amt_8 0.0
max_rech_amt_9 0.0
last_day_rch_amt_6 0.0
last_day_rch_amt_7 0.0
last_day_rch_amt_8 0.0
last_day_rch_amt_9 0.0
total_rech_data_6 0.0
total_rech_data_7 0.0
total_rech_data_8 0.0
total_rech_data_9 0.0
max_rech_data_6 0.0
max_rech_data_7 0.0
max_rech_data_8 0.0
max_rech_data_9 0.0
av_rech_amt_data_6 0.0
av_rech_amt_data_7 0.0
av_rech_amt_data_8 0.0
av_rech_amt_data_9 0.0
vol_2g_mb_6 0.0
vol_2g_mb_7 0.0
vol_2g_mb_8 0.0
vol_2g_mb_9 0.0
vol_3g_mb_6 0.0
vol_3g_mb_7 0.0
vol_3g_mb_8 0.0
vol_3g_mb_9 0.0
night_pck_user_6 0.0
night_pck_user_7 0.0
night_pck_user_8 0.0
night_pck_user_9 0.0
monthly_2g_6 0.0
monthly_2g_7 0.0
monthly_2g_8 0.0
monthly_2g_9 0.0
sachet_2g_6 0.0
sachet_2g_7 0.0
sachet_2g_8 0.0
sachet_2g_9 0.0
monthly_3g_6 0.0
monthly_3g_7 0.0
monthly_3g_8 0.0
monthly_3g_9 0.0
sachet_3g_6 0.0
sachet_3g_7 0.0
sachet_3g_8 0.0
sachet_3g_9 0.0
fb_user_6 0.0
fb_user_7 0.0
fb_user_8 0.0
fb_user_9 0.0
aon 0.0
aug_vbc_3g 0.0
jul_vbc_3g 0.0
jun_vbc_3g 0.0
sep_vbc_3g 0.0
dtype: float64

filter high-value customers


calculate total data recharge amount

In [21]:
# total data recharge amount for June and July = number of recharges * average recharge amount
churn['total_data_rech_6'] = churn.total_rech_data_6 * churn.av_rech_amt_data_6
churn['total_data_rech_7'] = churn.total_rech_data_7 * churn.av_rech_amt_data_7

add total data recharge and total recharge to get the total combined recharge amount for a month

In [22]:
# total recharge amount for June and July = call recharge amount + data recharge amount
churn['amt_data_6'] = churn.total_rech_amt_6 + churn.total_data_rech_6
churn['amt_data_7'] = churn.total_rech_amt_7 + churn.total_data_rech_7
In [23]:
# calculate average recharge done by customer in June and July
churn['av_amt_data_6_7'] = (churn.amt_data_6 + churn.amt_data_7)/2
In [24]:
# look at the 70th percentile recharge amount
print("Recharge amount at 70th percentile:
{0}".format(churn.av_amt_data_6_7.quantile(0.7)))
Recharge amount at 70th percentile: 478.0
In [25]:
# retain only customers who have recharged with an amount at or above the 70th percentile
churn_filtered = churn.loc[churn.av_amt_data_6_7 >= churn.av_amt_data_6_7.quantile(0.7), :]
churn_filtered = churn_filtered.reset_index(drop=True)
churn_filtered.shape
Out[25]:
(30001, 201)
In [26]:
# delete variables created to filter high-value customers
churn_filtered = churn_filtered.drop(['total_data_rech_6',
'total_data_rech_7',
'amt_data_6', 'amt_data_7',
'av_amt_data_6_7'], axis=1)
churn_filtered.shape
Out[26]:
(30001, 196)

We're left with 30,001 rows after keeping only the customers whose average recharge amount
is at or above the 70th percentile.

derive churn
In [27]:
# calculate total incoming and outgoing minutes of usage
churn_filtered['total_calls_mou_9'] = churn_filtered.total_ic_mou_9 + churn_filtered.total_og_mou_9
In [28]:
# calculate 2g and 3g data consumption
churn_filtered['total_internet_mb_9'] = churn_filtered.vol_2g_mb_9 + churn_filtered.vol_3g_mb_9
In [29]:
# create churn variable: customers who have used neither calls nor internet
# in the month of September are customers who have churned
# 0 - not churn, 1 - churn
churn_filtered['churn'] = churn_filtered.apply(
    lambda row: 1 if (row.total_calls_mou_9 == 0 and row.total_internet_mb_9 == 0) else 0,
    axis=1)
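A row-wise apply is slow on a frame of this size; the same flag can be computed vectorised. An equivalent sketch using the two helper columns above:

churn_filtered['churn'] = ((churn_filtered.total_calls_mou_9 == 0) &
                           (churn_filtered.total_internet_mb_9 == 0)).astype(int)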
In [30]:
# delete derived variables
churn_filtered = churn_filtered.drop(['total_calls_mou_9',
'total_internet_mb_9'], axis=1)
In [31]:
# change data type to category
churn_filtered.churn = churn_filtered.churn.astype("category")

# print churn ratio
print("Churn Ratio:")
print(churn_filtered.churn.value_counts()*100/churn_filtered.shape[0])
Churn Ratio:
0 91.863605
1 8.136395
Name: churn, dtype: float64

Calculate difference between the 8th and previous months
Let's derive some variables. The most important features in this situation may be the
differences between the 8th month and the previous months, in patterns such as usage or
recharge value. We'll calculate each difference variable as the 8th-month value minus the
average of the 6th- and 7th-month values.

In [32]:
churn_filtered['arpu_diff'] = churn_filtered.arpu_8 - (churn_filtered.arpu_6 + churn_filtered.arpu_7)/2
churn_filtered['onnet_mou_diff'] = churn_filtered.onnet_mou_8 - (churn_filtered.onnet_mou_6 + churn_filtered.onnet_mou_7)/2
churn_filtered['offnet_mou_diff'] = churn_filtered.offnet_mou_8 - (churn_filtered.offnet_mou_6 + churn_filtered.offnet_mou_7)/2
churn_filtered['roam_ic_mou_diff'] = churn_filtered.roam_ic_mou_8 - (churn_filtered.roam_ic_mou_6 + churn_filtered.roam_ic_mou_7)/2
churn_filtered['roam_og_mou_diff'] = churn_filtered.roam_og_mou_8 - (churn_filtered.roam_og_mou_6 + churn_filtered.roam_og_mou_7)/2
churn_filtered['loc_og_mou_diff'] = churn_filtered.loc_og_mou_8 - (churn_filtered.loc_og_mou_6 + churn_filtered.loc_og_mou_7)/2
churn_filtered['std_og_mou_diff'] = churn_filtered.std_og_mou_8 - (churn_filtered.std_og_mou_6 + churn_filtered.std_og_mou_7)/2
churn_filtered['isd_og_mou_diff'] = churn_filtered.isd_og_mou_8 - (churn_filtered.isd_og_mou_6 + churn_filtered.isd_og_mou_7)/2
churn_filtered['spl_og_mou_diff'] = churn_filtered.spl_og_mou_8 - (churn_filtered.spl_og_mou_6 + churn_filtered.spl_og_mou_7)/2
churn_filtered['total_og_mou_diff'] = churn_filtered.total_og_mou_8 - (churn_filtered.total_og_mou_6 + churn_filtered.total_og_mou_7)/2
churn_filtered['loc_ic_mou_diff'] = churn_filtered.loc_ic_mou_8 - (churn_filtered.loc_ic_mou_6 + churn_filtered.loc_ic_mou_7)/2
churn_filtered['std_ic_mou_diff'] = churn_filtered.std_ic_mou_8 - (churn_filtered.std_ic_mou_6 + churn_filtered.std_ic_mou_7)/2
churn_filtered['isd_ic_mou_diff'] = churn_filtered.isd_ic_mou_8 - (churn_filtered.isd_ic_mou_6 + churn_filtered.isd_ic_mou_7)/2
churn_filtered['spl_ic_mou_diff'] = churn_filtered.spl_ic_mou_8 - (churn_filtered.spl_ic_mou_6 + churn_filtered.spl_ic_mou_7)/2
churn_filtered['total_ic_mou_diff'] = churn_filtered.total_ic_mou_8 - (churn_filtered.total_ic_mou_6 + churn_filtered.total_ic_mou_7)/2
churn_filtered['total_rech_num_diff'] = churn_filtered.total_rech_num_8 - (churn_filtered.total_rech_num_6 + churn_filtered.total_rech_num_7)/2
churn_filtered['total_rech_amt_diff'] = churn_filtered.total_rech_amt_8 - (churn_filtered.total_rech_amt_6 + churn_filtered.total_rech_amt_7)/2
churn_filtered['max_rech_amt_diff'] = churn_filtered.max_rech_amt_8 - (churn_filtered.max_rech_amt_6 + churn_filtered.max_rech_amt_7)/2
churn_filtered['total_rech_data_diff'] = churn_filtered.total_rech_data_8 - (churn_filtered.total_rech_data_6 + churn_filtered.total_rech_data_7)/2
churn_filtered['max_rech_data_diff'] = churn_filtered.max_rech_data_8 - (churn_filtered.max_rech_data_6 + churn_filtered.max_rech_data_7)/2
churn_filtered['av_rech_amt_data_diff'] = churn_filtered.av_rech_amt_data_8 - (churn_filtered.av_rech_amt_data_6 + churn_filtered.av_rech_amt_data_7)/2
churn_filtered['vol_2g_mb_diff'] = churn_filtered.vol_2g_mb_8 - (churn_filtered.vol_2g_mb_6 + churn_filtered.vol_2g_mb_7)/2
churn_filtered['vol_3g_mb_diff'] = churn_filtered.vol_3g_mb_8 - (churn_filtered.vol_3g_mb_6 + churn_filtered.vol_3g_mb_7)/2
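The block above is the same expression repeated per feature; it could equally be generated in a loop over the base column names. An equivalent sketch:

diff_cols = ['arpu', 'onnet_mou', 'offnet_mou', 'roam_ic_mou', 'roam_og_mou',
             'loc_og_mou', 'std_og_mou', 'isd_og_mou', 'spl_og_mou', 'total_og_mou',
             'loc_ic_mou', 'std_ic_mou', 'isd_ic_mou', 'spl_ic_mou', 'total_ic_mou',
             'total_rech_num', 'total_rech_amt', 'max_rech_amt', 'total_rech_data',
             'max_rech_data', 'av_rech_amt_data', 'vol_2g_mb', 'vol_3g_mb']
for col in diff_cols:
    # 8th-month value minus the average of the 6th- and 7th-month values
    churn_filtered[col + '_diff'] = (churn_filtered[col + '_8'] -
                                     (churn_filtered[col + '_6'] + churn_filtered[col + '_7'])/2)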
In [33]:
# let's look at summary of one of the difference variables
churn_filtered['total_og_mou_diff'].describe()
Out[33]:
count 30001.000000
mean -67.437337
std 502.630069
min -7213.410000
25% -168.025000
50% -14.625000
75% 67.915000
max 12768.705000
Name: total_og_mou_diff, dtype: float64

delete columns that belong to the churn month (9th month)
In [34]:
# delete all variables relating to 9th month
churn_filtered = churn_filtered.filter(regex='[^9]$', axis=1)
churn_filtered.shape
Out[34]:
(30001, 173)
In [35]:
# extract all names that end with 9
col_9_names = churn.filter(regex='9$', axis=1).columns

# update num_cols and cat_cols column name lists
cat_cols = [col for col in cat_cols if col not in col_9_names]
cat_cols.append('churn')
num_cols = [col for col in churn_filtered.columns if col not in cat_cols]
visualise data
In [36]:
# change column types
churn_filtered[num_cols] = churn_filtered[num_cols].apply(pd.to_numeric)
churn_filtered[cat_cols] = churn_filtered[cat_cols].apply(
    lambda column: column.astype("category"), axis=0)
In [37]:
# create plotting functions
def data_type(variable):
    if variable.dtype == np.int64 or variable.dtype == np.float64:
        return 'numerical'
    elif variable.dtype == 'category':
        return 'categorical'

def univariate(variable, stats=True):
    if data_type(variable) == 'numerical':
        sns.distplot(variable)
        if stats == True:
            print(variable.describe())
    elif data_type(variable) == 'categorical':
        sns.countplot(variable)
        if stats == True:
            print(variable.value_counts())
    else:
        print("Invalid variable passed: either pass a numeric variable or a categorical variable.")

def bivariate(var1, var2):
    if data_type(var1) == 'numerical' and data_type(var2) == 'numerical':
        sns.regplot(var1, var2)
    elif (data_type(var1) == 'categorical' and data_type(var2) == 'numerical') or \
         (data_type(var1) == 'numerical' and data_type(var2) == 'categorical'):
        sns.boxplot(var1, var2)

Univariate EDA
In [38]:
univariate(churn.arpu_6)
count 99999.000000
mean 282.987358
std 328.439770
min -2258.709000
25% 93.411500
50% 197.704000
75% 371.060000
max 27731.088000
Name: arpu_6, dtype: float64
In [39]:
univariate(churn.loc_og_t2o_mou)
count 99999.000000
mean -0.000002
std 0.000312
min -0.009873
25% 0.000000
50% 0.000000
75% 0.000000
max 0.009702
Name: loc_og_t2o_mou, dtype: float64

In [40]:
univariate(churn.std_og_t2o_mou)
count 9.999900e+04
mean -9.957627e-07
std 3.136615e-04
min -1.085887e-02
25% 0.000000e+00
50% 0.000000e+00
75% 0.000000e+00
max 9.418004e-03
Name: std_og_t2o_mou, dtype: float64

In [41]:
univariate(churn.onnet_mou_8)
count 99999.000000
mean 125.972580
std 302.822628
min -700.923704
25% 5.510000
50% 30.760000
75% 109.400000
max 10752.560000
Name: onnet_mou_8, dtype: float64
In [42]:
univariate(churn.offnet_mou_9)
count 99999.000000
mean 176.085292
std 311.955151
min -1407.015438
25% 21.903023
50% 78.360000
75% 204.270000
max 10310.760000
Name: offnet_mou_9, dtype: float64

The variables are highly skewed: most values are concentrated near zero, with long right tails.


Bivariate EDA
In [43]:
bivariate(churn_filtered.churn, churn_filtered.aon)

In [44]:
bivariate(churn_filtered.sep_vbc_3g, churn_filtered.churn)

In [45]:
bivariate(churn_filtered.spl_og_mou_8, churn_filtered.churn)
In [46]:
pd.crosstab(churn_filtered.churn, churn_filtered.night_pck_user_8,
normalize='columns')*100
Out[46]:
night_pck_user_8 -1.0 0.0 1.0
churn
0 85.89123 97.117602 97.360704
1 14.10877 2.882398 2.639296
In [47]:
pd.crosstab(churn_filtered.churn, churn_filtered.sachet_3g_8)
Out[47]:
[Out[47] displayed the churn vs. sachet_3g_8 crosstab of counts (sachet_3g_8 values 0 to 41). The wide table did not survive the export; the bulk of both classes sits at sachet_3g_8 = 0.]

Cap outliers in all numeric variables with k-sigma technique

In [48]:
def cap_outliers(array, k=3):
    # cap values outside [mean - k*std, mean + k*std]; modifies the array in place
    upper_limit = array.mean() + k*array.std()
    lower_limit = array.mean() - k*array.std()
    array[array < lower_limit] = lower_limit
    array[array > upper_limit] = upper_limit
    return array
In [49]:
# example of capping
sample_array = list(range(100))

# add outliers to the data
sample_array[0] = -9999
sample_array[99] = 9999

# cap outliers
sample_array = np.array(sample_array)
print("Array after capping outliers: \n", cap_outliers(sample_array, k=2))
Array after capping outliers:
[-2780 1 2 3 4 5 6 7 8 9 10 11
12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31 32 33 34 35
36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59
60 61 62 63 64 65 66 67 68 69 70 71
72 73 74 75 76 77 78 79 80 81 82 83
84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 2877]
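The same capping can also be written with np.clip, which returns a capped copy instead of mutating in place; a one-line equivalent sketch on the same toy data:

arr = np.array([-9999] + list(range(1, 99)) + [9999])  # same toy data as above
capped = np.clip(arr, arr.mean() - 2*arr.std(), arr.mean() + 2*arr.std())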
In [50]:
# cap outliers in the numeric columns
churn_filtered[num_cols] = churn_filtered[num_cols].apply(cap_outliers, axis=0)

Modelling
i) Making predictions
In [51]:
# import required libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from imblearn.metrics import sensitivity_specificity_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

Preprocessing data
In [52]:
# change churn to numeric
churn_filtered['churn'] = pd.to_numeric(churn_filtered['churn'])
Train Test split

In [53]:
# divide data into train and test
X = churn_filtered.drop("churn", axis = 1)
y = churn_filtered.churn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,
random_state = 4, stratify = y)
In [54]:
# print shapes of train and test sets
X_train.shape
y_train.shape
X_test.shape
y_test.shape
Out[54]:
(22500, 172)
Out[54]:
(22500,)
Out[54]:
(7501, 172)
Out[54]:
(7501,)

Aggregating the categorical columns


In [55]:
train = pd.concat([X_train, y_train], axis=1)

# aggregate the categorical variables
train.groupby('night_pck_user_6').churn.mean()
train.groupby('night_pck_user_7').churn.mean()
train.groupby('night_pck_user_8').churn.mean()
train.groupby('fb_user_6').churn.mean()
train.groupby('fb_user_7').churn.mean()
train.groupby('fb_user_8').churn.mean()
Out[55]:
night_pck_user_6
-1.0 0.099165
0.0 0.066797
1.0 0.087838
Name: churn, dtype: float64
Out[55]:
night_pck_user_7
-1.0 0.115746
0.0 0.055494
1.0 0.051282
Name: churn, dtype: float64
Out[55]:
night_pck_user_8
-1.0 0.141108
0.0 0.029023
1.0 0.016194
Name: churn, dtype: float64
Out[55]:
fb_user_6
-1.0 0.099165
0.0 0.069460
1.0 0.067124
Name: churn, dtype: float64
Out[55]:
fb_user_7
-1.0 0.115746
0.0 0.059305
1.0 0.055082
Name: churn, dtype: float64
Out[55]:
fb_user_8
-1.0 0.141108
0.0 0.066887
1.0 0.024463
Name: churn, dtype: float64
In [56]:
# replace categories with aggregated values in each categorical column
mapping = {'night_pck_user_6' : {-1: 0.099165, 0: 0.066797, 1: 0.087838},
'night_pck_user_7' : {-1: 0.115746, 0: 0.055494, 1: 0.051282},
'night_pck_user_8' : {-1: 0.141108, 0: 0.029023, 1: 0.016194},
'fb_user_6' : {-1: 0.099165, 0: 0.069460, 1: 0.067124},
'fb_user_7' : {-1: 0.115746, 0: 0.059305, 1: 0.055082},
'fb_user_8' : {-1: 0.141108, 0: 0.066887, 1: 0.024463}
}
X_train.replace(mapping, inplace = True)
X_test.replace(mapping, inplace = True)
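This is target (mean) encoding: each category is replaced by the churn rate observed for it in the train set, which is why the mapping comes from train only and is then applied to both splits (no leakage from test). The hard-coded dictionary could also be built programmatically; a sketch, assuming the same train frame as above:

enc_cols = ['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8',
            'fb_user_6', 'fb_user_7', 'fb_user_8']
# per-category churn rates computed on the train set only
mapping = {col: train.groupby(col).churn.mean().to_dict() for col in enc_cols}
X_train = X_train.replace(mapping)
X_test = X_test.replace(mapping)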
In [57]:
# check data type of categorical columns - make sure they are numeric
X_train[[col for col in cat_cols if col not in ['churn']]].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 22500 entries, 4525 to 22754
Data columns (total 6 columns):
night_pck_user_6 22500 non-null float64
night_pck_user_7 22500 non-null float64
night_pck_user_8 22500 non-null float64
fb_user_6 22500 non-null float64
fb_user_7 22500 non-null float64
fb_user_8 22500 non-null float64
dtypes: float64(6)
memory usage: 1.2 MB

PCA
In [58]:
# apply pca to train data
pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])
In [59]:
pca.fit(X_train)
churn_pca = pca.fit_transform(X_train)
Out[59]:
Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True,
with_std=True)), ('pca', PCA(copy=True, iterated_power='auto',
n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False))])
In [60]:
# extract pca model from pipeline
pca = pca.named_steps['pca']
# look at explained variance of PCA components
print(pd.Series(np.round(pca.explained_variance_ratio_.cumsum(), 4)*100))
0 10.91
1 19.80
2 25.31
3 29.95
4 33.80
5 37.30
6 39.90
7 42.33
8 44.51
9 46.52
10 48.42
11 50.23
12 51.91
13 53.54
14 55.03
15 56.40
16 57.70
17 58.95
18 60.19
19 61.41
20 62.54
21 63.65
22 64.71
23 65.72
24 66.70
25 67.61
26 68.49
27 69.34
28 70.17
29 71.00
30 71.77
31 72.54
32 73.30
33 74.05
34 74.77
35 75.46
36 76.15
37 76.82
38 77.49
39 78.15
40 78.79
41 79.42
42 80.04
43 80.65
44 81.23
45 81.80
46 82.36
47 82.89
48 83.43
49 83.94
50 84.45
51 84.95
52 85.45
53 85.93
54 86.40
55 86.85
56 87.30
57 87.75
58 88.19
59 88.61
60 89.02
61 89.41
62 89.78
63 90.13
64 90.47
65 90.81
66 91.13
67 91.45
68 91.75
69 92.05
70 92.34
71 92.62
72 92.89
73 93.16
74 93.43
75 93.69
76 93.93
77 94.17
78 94.41
79 94.64
80 94.86
81 95.05
82 95.25
83 95.43
84 95.62
85 95.80
86 95.97
87 96.15
88 96.31
89 96.48
90 96.64
91 96.80
92 96.96
93 97.11
94 97.25
95 97.39
96 97.53
97 97.66
98 97.78
99 97.90
100 98.02
101 98.14
102 98.25
103 98.34
104 98.43
105 98.52
106 98.60
107 98.68
108 98.76
109 98.83
110 98.90
111 98.95
112 99.01
113 99.06
114 99.12
115 99.17
116 99.22
117 99.26
118 99.30
119 99.34
120 99.38
121 99.42
122 99.45
123 99.48
124 99.51
125 99.53
126 99.56
127 99.58
128 99.61
129 99.63
130 99.65
131 99.67
132 99.69
133 99.71
134 99.72
135 99.74
136 99.76
137 99.77
138 99.79
139 99.80
140 99.82
141 99.83
142 99.84
143 99.85
144 99.86
145 99.87
146 99.88
147 99.89
148 99.90
149 99.91
150 99.92
151 99.92
152 99.93
153 99.94
154 99.94
155 99.95
156 99.95
157 99.96
158 99.97
159 99.97
160 99.97
161 99.98
162 99.98
163 99.98
164 99.99
165 99.99
166 99.99
167 100.00
168 100.00
169 100.00
170 100.00
171 100.00
dtype: float64

~ 60 components explain 90% variance

~ 80 components explain 95% variance
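Rather than reading the component counts off the table, they can be computed directly from the cumulative ratio; a small sketch:

cumvar = np.cumsum(pca.explained_variance_ratio_)
print("Components for 90% variance:", np.argmax(cumvar >= 0.90) + 1)  # first index reaching 90%
print("Components for 95% variance:", np.argmax(cumvar >= 0.95) + 1)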

In [61]:
# plot feature variance
features = range(pca.n_components_)
cumulative_variance = np.round(np.cumsum(pca.explained_variance_ratio_)*100, decimals=4)
plt.figure(figsize=(175/20, 100/20))  # 175 x-units by 100 y-units, scaled down by a factor of 20
plt.plot(cumulative_variance)
Out[61]:
<Figure size 630x360 with 0 Axes>
Out[61]:
[<matplotlib.lines.Line2D at 0x2b6027981d0>]

PCA and Logistic Regression


In [62]:
# create pipeline
PCA_VARS = 60
steps = [('scaler', StandardScaler()),
("pca", PCA(n_components=PCA_VARS)),
("logistic", LogisticRegression(class_weight='balanced'))
]
pipeline = Pipeline(steps)
In [63]:
# fit model
pipeline.fit(X_train, y_train)

# check score on train data
pipeline.score(X_train, y_train)
Out[63]:
Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True,
with_std=True)), ('pca', PCA(copy=True, iterated_power='auto',
n_components=60, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)), ('logistic',
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
fit_intercept=True, intercept_scaling=1, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])
Out[63]:
0.8205333333333333

Evaluate on test data

In [64]:
# predict churn on test data
y_pred = pipeline.predict(X_test)

# create confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# check sensitivity and specificity
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

# check area under curve
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
print("AUC: \t", round(roc_auc_score(y_test, y_pred_prob), 2))
[[5613 1278]
[ 91 519]]
Sensitivity: 0.85
Specificity: 0.81
AUC: 0.9

Hyperparameter tuning - PCA and Logistic Regression

In [65]:
# class imbalance
y_train.value_counts()/y_train.shape
Out[65]:
0 0.918622
1 0.081378
Name: churn, dtype: float64
In [66]:
# PCA
pca = PCA()

# logistic regression - the class weight is used to handle class imbalance;
# it adjusts the cost function
logistic = LogisticRegression(class_weight={0: 0.1, 1: 0.9})

# create pipeline
steps = [("scaler", StandardScaler()),
         ("pca", pca),
         ("logistic", logistic)
        ]

# compile pipeline
pca_logistic = Pipeline(steps)

# hyperparameter space
params = {'pca__n_components': [60, 80],
          'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10],
          'logistic__penalty': ['l1', 'l2']}

# create 5 folds
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# create gridsearch object
model = GridSearchCV(estimator=pca_logistic, cv=folds, param_grid=params,
                     scoring='roc_auc', n_jobs=-1, verbose=1)
In [67]:
# fit model
model.fit(X_train, y_train)
Fitting 5 folds for each of 32 candidates, totalling 160 fits
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 3.0min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 10.3min finished
Out[67]:
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=4, shuffle=True),
error_score='raise',
estimator=Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True,
with_std=True)), ('pca', PCA(copy=True, iterated_power='auto',
n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)), ('logistic',
LogisticRegression(C=1.0, class_weight={0: ...y='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
fit_params=None, iid=True, n_jobs=-1,
param_grid={'pca__n_components': [60, 80], 'logistic__C': [0.1, 0.5,
1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='roc_auc', verbose=1)
In [68]:
# cross validation results
pd.DataFrame(model.cv_results_)
Out[68]:
[Out[68] displayed model.cv_results_: one row per parameter combination, with mean fit/score times, the parameter settings, per-split and mean test ROC-AUC, rank, and train scores. Mean test AUC is roughly 0.887-0.891 across the candidates shown. The wide table did not survive the export.]
54 64 23 20 ic_ 26 80 35 81 29 70 87 08 44 54 78 95 36 11
20 81 8 8 _C' 8 4 1 8 9 9 51 1 5 4 4 9 2 9
4 : 2,
'log
isti
c__
st
m st par para st
me d_ par spl spl spl spl spl me ra spl spl spl spl spl me
ea d_ am_ m_p d_ std
an sc am it0 it1 it2 it3 it4 an nk it0 it1 it2 it3 it4 an
n_ fit logis ca__ pa tes _tr
_sc or _lo _te _te _te _te _te _te _te _tr _tr _tr _tr _tr _tr
fit _t tic_ n_co ra t_ ain
ore e_ gist st_ st_ st_ st_ st_ st_ st_ ain ain ain ain ain ain
_ti i _pe mpo ms sc _sc
_ti ti ic_ sco sco sco sco sco sco sco _sc _sc _sc _sc _sc _sc
m m nalt nent or ore
me m _C re re re re re re re ore ore ore ore ore ore
e e y s e
e
pen
alt
y':
'l2',
...
{'lo
gist
ic_
_C'
12 : 3,
1. 0.0 0.0 0.8 0.8 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.8 0.8 0.8 0.8 0.0
.9 'log
1 97 93 10 93 98 78 81 87 87 00 94 93 97 96 94 95 01
51 3 l1 60 isti 28
6 97 14 15 28 52 55 91 46 95 72 14 02 98 55 24 19 80
00 c__
78 3 1 9 7 5 6 4 0 84 8 7 0 2 8 1 5
0 pen
alt
y':
'l1',
...
{'lo
gist
ic_
_C'
16 : 3,
1. 0.1 0.0 0.8 0.9 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.9 0.9 0.9 0.9 0.0
.6 'log
1 96 09 21 98 02 79 86 87 90 00 99 97 03 01 00 00 02
19 3 l1 80 isti 9
7 72 13 63 62 37 45 58 31 87 84 13 61 48 76 08 41 04
93 c__
24 5 7 8 6 0 5 8 2 14 1 6 4 7 7 7 2
1 pen
alt
y':
'l1',
...
1 14 0. 0.1 0.0 3 l2 60 {'lo 0.8 0.8 0.8 0.8 0.8 0.8 0. 18 0.8 0.8 0.8 0.8 0.8 0.8 0.0
8 .5 38 17 21 gist 93 99 79 82 87 88 00 94 93 98 96 94 95 01
36 05 48 49 ic_ 73 12 51 03 55 39 72 17 07 21 64 12 24 89
84 87 6 6 _C' 8 3 7 3 6 4 61 7 1 0 9 6 7 1
6 : 3,
'log
isti
c__
pen
alt
st
m st par para st
me d_ par spl spl spl spl spl me ra spl spl spl spl spl me
ea d_ am_ m_p d_ std
an sc am it0 it1 it2 it3 it4 an nk it0 it1 it2 it3 it4 an
n_ fit logis ca__ pa tes _tr
_sc or _lo _te _te _te _te _te _te _te _tr _tr _tr _tr _tr _tr
fit _t tic_ n_co ra t_ ain
ore e_ gist st_ st_ st_ st_ st_ st_ st_ ain ain ain ain ain ain
_ti i _pe mpo ms sc _sc
_ti ti ic_ sco sco sco sco sco sco sco _sc _sc _sc _sc _sc _sc
m m nalt nent or ore
me m _C re re re re re re re ore ore ore ore ore ore
e e y s e
e
y':
'l2',
...
{'lo
gist
ic_
_C'
13 : 3,
1. 0.0 0.0 0.8 0.9 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.9 0.9 0.8 0.9 0.0
.3 'log
1 88 95 35 98 02 79 86 87 91 00 99 97 03 01 99 00 02
80 3 l2 80 isti 3
9 92 54 35 86 64 41 36 81 02 85 47 77 53 90 86 50 00
01 c__
65 5 4 6 9 6 5 5 3 25 1 5 4 0 1 8 3
7 pen
alt
y':
'l2',
...
{'lo
gist
ic_
_C'
11 : 4,
1. 0.0 0.0 0.8 0.8 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.8 0.8 0.8 0.8 0.0
.9 'log
2 35 94 28 93 98 77 81 87 87 00 94 93 97 96 94 95 01
94 4 l1 60 isti 30
0 24 94 31 96 91 44 80 21 86 78 26 17 82 87 17 26 77
45 c__
83 8 8 6 1 6 2 9 9 10 1 1 7 1 7 1 3
1 pen
alt
y':
'l1',
...
2 16 3. 0.0 0.0 4 l1 80 {'lo 0.8 0.9 0.8 0.8 0.8 0.8 0. 10 0.8 0.8 0.9 0.9 0.9 0.9 0.0
1 .7 02 72 14 gist 98 02 79 86 87 90 00 99 97 03 01 00 00 02
23 93 40 46 ic_ 52 41 35 37 43 82 84 23 62 55 75 33 50 04
63 06 7 2 _C' 8 4 5 5 1 1 44 0 6 7 6 0 0 1
3 : 4,
'log
isti
c__
pen
alt
y':
'l1',
st
m st par para st
me d_ par spl spl spl spl spl me ra spl spl spl spl spl me
ea d_ am_ m_p d_ std
an sc am it0 it1 it2 it3 it4 an nk it0 it1 it2 it3 it4 an
n_ fit logis ca__ pa tes _tr
_sc or _lo _te _te _te _te _te _te _te _tr _tr _tr _tr _tr _tr
fit _t tic_ n_co ra t_ ain
ore e_ gist st_ st_ st_ st_ st_ st_ st_ ain ain ain ain ain ain
_ti i _pe mpo ms sc _sc
_ti ti ic_ sco sco sco sco sco sco sco _sc _sc _sc _sc _sc _sc
m m nalt nent or ore
me m _C re re re re re re re ore ore ore ore ore ore
e e y s e
e
...
{'lo
gist
ic_
_C'
13 : 4,
1. 0.0 0.0 0.8 0.8 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.8 0.8 0.8 0.8 0.0
.3 'log
2 93 80 20 93 99 78 81 87 88 00 94 93 98 96 94 95 01
56 4 l2 60 isti 20
2 96 70 96 57 28 54 92 88 24 75 35 35 20 74 08 34 82
52 c__
83 8 5 0 1 2 0 5 0 42 7 2 9 5 3 9 8
8 pen
alt
y':
'l2',
...
{'lo
gist
ic_
_C'
13 : 4,
0. 0.0 0.0 0.8 0.9 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.9 0.9 0.9 0.9 0.0
.4 'log
2 88 72 21 98 02 80 86 87 90 00 99 97 03 01 00 00 02
83 4 l2 80 isti 6
3 70 30 71 56 27 64 21 15 96 81 20 52 46 57 34 42 02
16 c__
52 3 2 0 3 6 2 0 9 14 5 8 5 3 1 2 2
0 pen
alt
y':
'l2',
...
{'lo
gist
ic_
_C'
13 : 5,
0. 0.0 0.0 0.8 0.8 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.8 0.8 0.8 0.8 0.0
.0 'log
2 33 62 30 93 98 78 82 87 88 00 94 93 98 96 94 95 01
05 5 l1 60 isti 26
4 79 64 04 74 47 23 18 58 04 73 41 06 17 84 25 34 87
04 c__
82 1 5 4 6 8 6 6 6 76 1 2 9 3 0 9 4
3 pen
alt
y':
'l1',
...
2 12 1. 0.0 0.0 5 l1 80 {'lo 0.8 0.9 0.8 0.8 0.8 0.8 0. 8 0.8 0.8 0.9 0.9 0.9 0.9 0.0
st
m st par para st
me d_ par spl spl spl spl spl me ra spl spl spl spl spl me
ea d_ am_ m_p d_ std
an sc am it0 it1 it2 it3 it4 an nk it0 it1 it2 it3 it4 an
n_ fit logis ca__ pa tes _tr
_sc or _lo _te _te _te _te _te _te _te _tr _tr _tr _tr _tr _tr
fit _t tic_ n_co ra t_ ain
ore e_ gist st_ st_ st_ st_ st_ st_ st_ ain ain ain ain ain ain
_ti i _pe mpo ms sc _sc
_ti ti ic_ sco sco sco sco sco sco sco _sc _sc _sc _sc _sc _sc
m m nalt nent or ore
me m _C re re re re re re re ore ore ore ore ore ore
e e y s e
e
gist
ic_
_C'
: 5,
.9 'log
92 77 20 98 02 79 86 86 90 00 99 97 03 01 00 00 02
76 isti
5 22 72 49 68 68 43 79 88 89 85 30 69 72 74 33 56 06
81 c__
45 1 4 4 0 2 0 1 4 31 8 0 5 2 2 0 3
1 pen
alt
y':
'l1',
...
{'lo
gist
ic_
_C'
13 : 5,
1. 0.0 0.0 0.8 0.8 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.8 0.8 0.8 0.8 0.0
.3 'log
2 09 66 13 93 99 77 81 87 87 00 93 93 98 96 93 95 01
59 5 l2 60 isti 29
6 99 57 85 51 64 65 53 15 90 79 81 29 38 78 88 23 99
46 c__
91 5 7 1 0 4 7 4 0 42 3 1 1 2 6 0 6
7 pen
alt
y':
'l2',
...
{'lo
gist
ic_
_C'
18 : 5,
2. 0.0 0.0 0.8 0.9 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.9 0.9 0.9 0.9 0.0
.3 'log
2 92 69 11 98 02 79 85 87 90 00 99 97 03 01 00 00 02
01 5 l2 80 isti 11
7 66 41 39 79 69 71 56 32 82 85 47 39 40 80 37 49 04
42 c__
47 5 9 0 8 5 9 8 0 76 3 8 9 5 8 3 2
0 pen
alt
y':
'l2',
...
2 13 2. 0.0 0.0 10 l1 60 {'lo 0.8 0.8 0.8 0.8 0.8 0.8 0. 24 0.8 0.8 0.8 0.8 0.8 0.8 0.0
8 .7 87 83 10 gist 93 99 78 81 87 88 00 94 92 97 96 94 95 01
18 81 40 01 ic_ 55 79 11 44 73 12 78 10 98 92 46 09 11 80
st
m st par para st
me d_ par spl spl spl spl spl me ra spl spl spl spl spl me
ea d_ am_ m_p d_ std
an sc am it0 it1 it2 it3 it4 an nk it0 it1 it2 it3 it4 an
n_ fit logis ca__ pa tes _tr
_sc or _lo _te _te _te _te _te _te _te _tr _tr _tr _tr _tr _tr
fit _t tic_ n_co ra t_ ain
ore e_ gist st_ st_ st_ st_ st_ st_ st_ ain ain ain ain ain ain
_ti i _pe mpo ms sc _sc
_ti ti ic_ sco sco sco sco sco sco sco _sc _sc _sc _sc _sc _sc
m m nalt nent or ore
me m _C re re re re re re re ore ore ore ore ore ore
e e y s e
e
_C'
:
10,
'log
isti
04
32 2 6 c__ 2 9 4 0 3 8 84 7 9 9 9 7 8 7
5
pen
alt
y':
'l1',
...
{'lo
gist
ic_
_C'
:
15
3. 0.0 0.0 10, 0.8 0.9 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.9 0.9 0.8 0.9 0.0
.7
2 10 57 11 'log 99 03 79 86 87 90 00 99 97 03 01 99 00 01
53 10 l1 80 5
9 14 84 19 isti 13 06 10 31 25 97 88 52 72 47 73 98 49 96
56
51 5 5 c__ 2 0 6 1 9 4 21 6 8 2 8 5 0 3
6
pen
alt
y':
'l1',
...
{'lo
gist
ic_
_C'
:
15
1. 0.0 0.0 10, 0.8 0.8 0.8 0.8 0.8 0.8 0. 0.8 0.8 0.8 0.8 0.8 0.8 0.0
.2
3 88 95 18 'log 94 98 78 81 87 88 00 94 93 98 96 94 95 01
61 10 l2 60 22
0 14 37 96 isti 18 87 28 72 94 20 76 55 29 00 67 31 37 71
61
16 0 7 c__ 0 9 3 6 1 2 18 2 5 6 7 9 0 7
9
pen
alt
y':
'l2',
...
3 12 4. 0.0 0.0 10 l2 80 {'lo 0.8 0.9 0.8 0.8 0.8 0.8 0. 2 0.8 0.8 0.9 0.9 0.9 0.9 0.0
1 .8 11 78 11 gist 98 02 80 86 87 91 00 99 97 03 01 00 00 01
st
m st par para st
me d_ par spl spl spl spl spl me ra spl spl spl spl spl me
ea d_ am_ m_p d_ std
an sc am it0 it1 it2 it3 it4 an nk it0 it1 it2 it3 it4 an
n_ fit logis ca__ pa tes _tr
_sc or _lo _te _te _te _te _te _te _te _tr _tr _tr _tr _tr _tr
fit _t tic_ n_co ra t_ ain
ore e_ gist st_ st_ st_ st_ st_ st_ st_ ain ain ain ain ain ain
_ti i _pe mpo ms sc _sc
_ti ti ic_ sco sco sco sco sco sco sco _sc _sc _sc _sc _sc _sc
m m nalt nent or ore
me m _C re re re re re re re ore ore ore ore ore ore
e e y s e
e
ic_
_C'
:
10,
'log
12
97 19 50 isti 58 50 53 81 14 11 81 39 75 28 80 20 48 91
69
37 2 3 c__ 1 1 8 1 8 6 43 8 0 2 0 2 6 3
0
pen
alt
y':
'l2',
...
In [69]:
# print best hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)
Best AUC: 0.8911594701125283
Best hyperparameters: {'logistic__C': 0.5, 'logistic__penalty': 'l1',
'pca__n_components': 80}
In [70]:
# predict churn on test data
y_pred = model.predict(X_test)

# create confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# check sensitivity and specificity
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

# check area under curve
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC: \t", round(roc_auc_score(y_test, y_pred_prob),2))
[[5864 1027]
[ 107 503]]
Sensitivity: 0.82
Specificity: 0.85
AUC: 0.91
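
As a quick sanity check (a sketch, not part of the original notebook), both rates follow
directly from the printed confusion matrix, whose rows are actual classes and columns are
predictions: sensitivity = TP/(TP+FN) and specificity = TN/(TN+FP).

In [ ]:
# recompute the rates from the confusion matrix above: [[TN, FP], [FN, TP]]
tn, fp, fn, tp = 5864, 1027, 107, 503
print("Sensitivity:", round(tp / (tp + fn), 2))   # 503 / 610   -> 0.82
print("Specificity:", round(tn / (tn + fp), 2))   # 5864 / 6891 -> 0.85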

Random Forest

In [71]:
# random forest - the class weight is used to handle class imbalance: it adjusts the cost function
forest = RandomForestClassifier(class_weight={0:0.1, 1: 0.9}, n_jobs=-1)

# hyperparameter space
params = {"criterion": ['gini', 'entropy'], "max_features": ['auto', 0.4]}

# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object


model = GridSearchCV(estimator=forest, cv=folds, param_grid=params,
scoring='roc_auc', n_jobs=-1, verbose=1)
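
The class weights scale each sample's contribution to the training loss, so a misclassified
churner (class 1) costs nine times as much as a misclassified non-churner. A minimal sketch
of that weighted log-loss on made-up labels and probabilities (illustrative only, not part
of the original notebook):

In [ ]:
from sklearn.metrics import log_loss

# toy labels and predicted churn probabilities (hypothetical values)
y_true = np.array([0, 0, 1, 1])
y_prob = np.array([0.2, 0.6, 0.3, 0.9])

# per-sample weights mirroring class_weight={0: 0.1, 1: 0.9}
w = np.where(y_true == 1, 0.9, 0.1)

# weighted log-loss: per-sample losses averaged with the class weights
losses = -(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))
print(np.average(losses, weights=w))              # manual computation
print(log_loss(y_true, y_prob, sample_weight=w))  # sklearn gives the same value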
In [72]:
# fit model
model.fit(X_train, y_train)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=-1)]: Done 20 out of 20 | elapsed: 51.8s finished
Out[72]:
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=4, shuffle=True),
error_score='raise',
estimator=RandomForestClassifier(bootstrap=True, class_weight={0:
0.1, 1: 0.9},
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
verbose=0, warm_start=False),
fit_params=None, iid=True, n_jobs=-1,
param_grid={'criterion': ['gini', 'entropy'], 'max_features':
['auto', 0.4]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='roc_auc', verbose=1)
In [73]:
# print best hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)
Best AUC: 0.8821281027498742
Best hyperparameters: {'criterion': 'entropy', 'max_features': 0.4}
In [74]:
# predict churn on test data
y_pred = model.predict(X_test)

# create confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# check sensitivity and specificity
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

# check area under curve
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC: \t", round(roc_auc_score(y_test, y_pred_prob),2))
[[6771 120]
[ 328 282]]
Sensitivity: 0.46
Specificity: 0.98
AUC: 0.89
The random forest has poor sensitivity (0.46) despite its high specificity, so the best model so far is logistic regression on PCA components (sensitivity 0.82, AUC 0.91).
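
For a side-by-side view, a small sketch tabulating the test-set metrics printed above
(values copied from this notebook's outputs):

In [ ]:
# test-set metrics of the two tuned models, as reported above
comparison = pd.DataFrame({
    'model': ['PCA + logistic regression', 'random forest'],
    'sensitivity': [0.82, 0.46],
    'specificity': [0.85, 0.98],
    'auc': [0.91, 0.89]})
print(comparison)

Since the business goal is to catch likely churners, sensitivity is the deciding metric.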

ii) Choosing best features


In [75]:
# run a random forest model on train data
max_features = int(round(np.sqrt(X_train.shape[1])))  # number of variables to consider to split each node
print(max_features)

rf_model = RandomForestClassifier(n_estimators=100, max_features=max_features,
                                  class_weight={0:0.1, 1: 0.9}, oob_score=True,
                                  random_state=4, verbose=1)
13
In [76]:
# fit model
rf_model.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 28.4s finished
Out[76]:
RandomForestClassifier(bootstrap=True, class_weight={0: 0.1, 1: 0.9},
criterion='gini', max_depth=None, max_features=13,
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=1, oob_score=True, random_state=4,
verbose=1, warm_start=False)
In [77]:
# OOB score
rf_model.oob_score_
Out[77]:
0.9426666666666667
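
The OOB score is plain accuracy measured on the samples each tree did not see in its
bootstrap. With only about 8% churners in this data, accuracy is dominated by the majority
class, so it should be read alongside the sensitivity computed below. A hedged check
against held-out accuracy:

In [ ]:
from sklearn.metrics import accuracy_score

# both numbers are inflated by the ~92% majority (non-churn) class
print("OOB accuracy: ", round(rf_model.oob_score_, 3))
print("Test accuracy:", round(accuracy_score(y_test, rf_model.predict(X_test)), 3))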
In [78]:
# predict churn on test data
y_pred = rf_model.predict(X_test)

# create confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# check sensitivity and specificity
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

# check area under curve
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]
print("ROC: \t", round(roc_auc_score(y_test, y_pred_prob),2))
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 0.3s finished
[[6792 99]
[ 329 281]]
Sensitivity: 0.46
Specificity: 0.99
AUC: 0.93
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 0.3s finished

Feature Importance
In [79]:
# predictors
features = churn_filtered.drop('churn', axis=1).columns

# feature_importance
importance = rf_model.feature_importances_

# create dataframe
feature_importance = pd.DataFrame({'variables': features,
                                   'importance_percentage': importance*100})
feature_importance = feature_importance[['variables', 'importance_percentage']]

# sort features
feature_importance = feature_importance.sort_values('importance_percentage',
                                                    ascending=False).reset_index(drop=True)
print("Sum of importance=", feature_importance.importance_percentage.sum())
feature_importance
Sum of importance= 99.99999999999999
Out[79]:
variables importance_percentage
0 total_ic_mou_8 7.051799
1 total_og_mou_8 4.029977
2 last_day_rch_amt_8 3.217914
3 arpu_8 3.112591
4 total_rech_amt_diff 2.908439
5 loc_ic_mou_8 2.801836
6 roam_og_mou_8 2.618070
7 loc_ic_t2m_mou_8 2.574123
8 max_rech_amt_8 2.565527
9 total_rech_amt_8 2.479779
10 arpu_diff 2.433597
11 roam_ic_mou_8 2.313062
12 loc_ic_t2t_mou_8 2.272950
13 total_rech_num_diff 1.684547
14 total_ic_mou_diff 1.616509
15 total_og_mou_diff 1.515810
16 loc_og_mou_8 1.407685
17 av_rech_amt_data_8 1.341360
18 std_og_mou_diff 1.297516
19 roam_ic_mou_diff 1.295492
20 loc_ic_mou_diff 1.231582
21 roam_og_mou_diff 1.196602
22 loc_og_t2m_mou_8 0.927088
23 loc_og_t2t_mou_8 0.918190
24 offnet_mou_8 0.835193
25 offnet_mou_diff 0.787708
26 vol_2g_mb_8 0.767465
27 std_ic_mou_8 0.756777
28 total_ic_mou_7 0.755680
29 std_og_t2m_mou_8 0.750015
30 max_rech_amt_diff 0.727545
31 fb_user_8 0.726221
32 night_pck_user_8 0.693271
33 onnet_mou_diff 0.682034
34 aon 0.657394
35 max_rech_data_8 0.614868
36 loc_og_mou_diff 0.606564
37 std_og_mou_8 0.593622
38 onnet_mou_8 0.575225
39 total_rech_amt_7 0.528601
40 loc_ic_t2m_mou_6 0.518464
41 total_rech_data_8 0.518115
42 arpu_7 0.512462
43 std_ic_mou_diff 0.502147
44 std_og_mou_7 0.492795
45 loc_og_mou_7 0.491933
46 total_rech_num_8 0.488850
47 total_ic_mou_6 0.473363
48 loc_ic_t2m_mou_7 0.466724
49 total_og_mou_7 0.465682
50 loc_og_t2m_mou_6 0.465483
51 loc_ic_mou_7 0.463715
52 vol_2g_mb_diff 0.461875
53 arpu_6 0.459183
54 std_og_t2t_mou_8 0.457876
55 vol_3g_mb_8 0.454195
56 spl_og_mou_diff 0.445280
57 offnet_mou_6 0.445082
58 loc_ic_t2t_mou_7 0.444555
59 av_rech_amt_data_diff 0.443252
60 offnet_mou_7 0.441294
61 loc_ic_mou_6 0.438766
62 loc_og_t2m_mou_7 0.433585
63 loc_og_mou_6 0.427759
64 max_rech_data_diff 0.421121
65 loc_og_t2t_mou_7 0.420918
66 loc_ic_t2t_mou_6 0.417714
67 total_rech_amt_6 0.416888
68 onnet_mou_7 0.397039
69 total_rech_num_7 0.395627
70 std_ic_t2t_mou_8 0.381641
71 std_ic_t2m_mou_8 0.377689
72 total_rech_num_6 0.370274
73 std_og_mou_6 0.369188
74 std_ic_mou_6 0.366826
75 loc_og_t2t_mou_6 0.366003
76 std_ic_t2m_mou_7 0.356541
77 isd_ic_mou_diff 0.354723
78 onnet_mou_6 0.351645
79 loc_ic_t2f_mou_8 0.348421
80 roam_ic_mou_7 0.347550
81 spl_ic_mou_diff 0.340279
82 std_og_t2m_mou_7 0.339827
83 vol_3g_mb_diff 0.338704
84 total_og_mou_6 0.336792
85 std_og_t2m_mou_6 0.336494
86 std_og_t2t_mou_7 0.332075
87 aug_vbc_3g 0.328900
88 loc_ic_t2f_mou_7 0.328640
89 std_ic_mou_7 0.324208
90 spl_og_mou_7 0.316932
91 std_og_t2t_mou_6 0.315337
92 max_rech_amt_7 0.313147
93 isd_ic_mou_8 0.308535
94 loc_og_t2f_mou_8 0.307020
95 std_ic_t2m_mou_6 0.303577
96 total_rech_data_diff 0.303365
97 max_rech_amt_6 0.293195
98 loc_ic_t2f_mou_6 0.290351
99 spl_og_mou_6 0.288099
100 roam_og_mou_7 0.285888
101 last_day_rch_amt_6 0.278386
102 last_day_rch_amt_7 0.276803
103 av_rech_amt_data_7 0.275718
104 loc_og_t2f_mou_6 0.269368
105 spl_og_mou_8 0.266655
106 std_ic_t2t_mou_6 0.266636
107 loc_og_t2f_mou_7 0.263393
108 vol_2g_mb_6 0.257802
109 std_ic_t2t_mou_7 0.254425
110 vol_3g_mb_7 0.243538
111 vol_2g_mb_7 0.240131
112 av_rech_amt_data_6 0.236701
113 roam_og_mou_6 0.233235
114 roam_ic_mou_6 0.228482
115 og_others_6 0.227395
116 vol_3g_mb_6 0.218148
117 max_rech_data_7 0.214674
118 loc_og_t2c_mou_8 0.210770
119 isd_ic_mou_6 0.205554
120 jul_vbc_3g 0.204624
121 isd_og_mou_diff 0.204362
122 spl_ic_mou_6 0.201601
123 ic_others_7 0.199106
124 std_ic_t2f_mou_8 0.191354
125 spl_ic_mou_8 0.186443
126 isd_ic_mou_7 0.186243
127 std_ic_t2f_mou_7 0.180216
128 jun_vbc_3g 0.178941
129 ic_others_8 0.176169
130 ic_others_6 0.176027
131 loc_og_t2c_mou_6 0.172089
132 sep_vbc_3g 0.171285
133 max_rech_data_6 0.170704
134 loc_og_t2c_mou_7 0.164413
135 total_rech_data_7 0.150091
136 std_ic_t2f_mou_6 0.146022
137 isd_og_mou_6 0.145912
138 std_ic_t2o_mou_8 0.139639
139 std_og_t2f_mou_7 0.137777
140 isd_og_mou_7 0.126929
141 total_rech_data_6 0.124994
142 og_others_8 0.117843
143 spl_ic_mou_7 0.114016
144 sachet_2g_7 0.108001
145 isd_og_mou_8 0.105353
146 sachet_2g_8 0.101112
147 std_og_t2f_mou_6 0.098533
148 sachet_2g_6 0.087329
149 std_og_t2f_mou_8 0.083208
150 sachet_3g_7 0.082937
151 sachet_3g_6 0.078567
152 monthly_2g_7 0.058376
153 monthly_3g_6 0.055064
154 fb_user_7 0.053158
155 monthly_3g_7 0.052559
156 night_pck_user_7 0.051557
157 monthly_3g_8 0.050323
158 std_og_t2c_mou_6 0.049699
159 sachet_3g_8 0.047272
160 og_others_7 0.045714
161 std_og_t2c_mou_8 0.040714
162 night_pck_user_6 0.039921
163 monthly_2g_6 0.039866
164 fb_user_6 0.039001
165 monthly_2g_8 0.033863
166 std_ic_t2o_mou_6 0.029129
167 std_ic_t2o_mou_7 0.019845
168 std_og_t2o_mou 0.015844
169 loc_og_t2o_mou 0.011197
170 std_og_t2c_mou_7 0.006985
171 loc_ic_t2o_mou 0.006427

Extracting top 30 features

In [80]:
# extract top 'n' features
top_n = 30
top_features = feature_importance.variables[0:top_n]
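
As an optional visual check (a sketch, not in the original notebook), the top importances
can also be drawn as a bar chart before inspecting their correlations:

In [ ]:
# horizontal bar chart of the top-n importances, most important at the top
top_df = feature_importance.head(top_n)
plt.figure(figsize=(8, 10))
plt.barh(top_df.variables[::-1], top_df.importance_percentage[::-1])
plt.xlabel('importance (%)')
plt.show()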
In [81]:
# plot feature correlation
plt.rcParams["figure.figsize"] = (10,10)
mycmap = sns.diverging_palette(199, 359, s=99, center="light", as_cmap=True)
sns.heatmap(data=X_train[top_features].corr(), center=0.0, cmap=mycmap)
Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b602a162e8>
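
One optional way to turn the heatmap into a concrete shortlist (a sketch, not in the
original notebook; the 0.8 cutoff is an assumption) is to list the highly correlated pairs
and keep only one feature from each:

In [ ]:
# pairs of top-30 features whose absolute correlation exceeds 0.8
corr = X_train[top_features].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
pairs = upper.stack().sort_values(ascending=False)
print(pairs[pairs > 0.8])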
In [82]:
# hand-pick a smaller set of important, mutually less-correlated features, guided by the heatmap above
top_features = ['total_ic_mou_8', 'total_rech_amt_diff', 'total_og_mou_8',
                'arpu_8', 'roam_ic_mou_8', 'roam_og_mou_8',
                'std_ic_mou_8', 'av_rech_amt_data_8', 'std_og_mou_8']
X_train = X_train[top_features]
X_test = X_test[top_features]
In [83]:
# logistic regression
steps = [('scaler', StandardScaler()),
("logistic", LogisticRegression(class_weight={0:0.1, 1:0.9}))
]

# compile pipeline - the scaler is re-fit inside each CV fold, which avoids leakage from the validation fold
logistic = Pipeline(steps)

# hyperparameter space
params = {'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10],
'logistic__penalty': ['l1', 'l2']}
# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object


model = GridSearchCV(estimator=logistic, cv=folds, param_grid=params,
scoring='roc_auc', n_jobs=-1, verbose=1)
In [84]:
# fit model
model.fit(X_train, y_train)
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 10.1s
[Parallel(n_jobs=-1)]: Done 80 out of 80 | elapsed: 14.2s finished
Out[84]:
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=4, shuffle=True),
error_score='raise',
estimator=Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True,
with_std=True)), ('logistic', LogisticRegression(C=1.0, class_weight={0:
0.1, 1: 0.9}, dual=False,
fit_intercept=True, intercept_scaling=1, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
fit_params=None, iid=True, n_jobs=-1,
param_grid={'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10],
'logistic__penalty': ['l1', 'l2']},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='roc_auc', verbose=1)
In [85]:
# print best hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)
Best AUC: 0.8728895343145171
Best hyperparameters: {'logistic__C': 10, 'logistic__penalty': 'l1'}
In [86]:
# predict churn on test data
y_pred = model.predict(X_test)

# create confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# check sensitivity and specificity
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

# check area under curve
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("ROC: \t", round(roc_auc_score(y_test, y_pred_prob),2))
[[5758 1133]
[ 121 489]]
Sensitivity: 0.8
Specificity: 0.84
AUC: 0.88

Extract the intercept and the coefficients from the logistic model

In [87]:
logistic_model = model.best_estimator_.named_steps['logistic']
In [88]:
# intercept
intercept_df = pd.DataFrame(logistic_model.intercept_.reshape((1,1)), columns=['intercept'])
In [89]:
# coefficients (logistic_model.coef_ has shape (1, 9), one column per feature)
logistic_features = list(X_train.columns)
coefficients_df = pd.DataFrame(logistic_model.coef_, columns=logistic_features)
In [90]:
# concatenate dataframes
coefficients = pd.concat([intercept_df, coefficients_df], axis=1)
coefficients
Out[90]:
   intercept  total_ic_mou_8  total_rech_amt_diff  total_og_mou_8   arpu_8  roam_ic_mou_8  roam_og_mou_8  std_ic_mou_8  av_rech_amt_data_8  std_og_mou_8
0  -0.179667        -1.46353            -1.191866       -0.684758  1.09609       0.066651        0.16906      0.025026           -0.795034      0.588839
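
Since the pipeline standardises the features, each coefficient is the change in the
log-odds of churn per one-standard-deviation increase in that feature; exponentiating
converts them to odds ratios. A short sketch (not in the original notebook):

In [ ]:
# odds ratio of churn per one-std-dev increase in each scaled feature
odds_ratios = np.exp(coefficients_df)
print(odds_ratios.round(3))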

Business Insights
 The telecom company needs to pay attention to its roaming rates: it should offer
good deals to customers who use its services from a roaming zone.
 The company should also review its STD and ISD rates; they may be too high, and
dedicated STD/ISD packs could help retain these callers.
 To act on both issues above, the company should collect customer query and
complaint data and adapt its services to customer needs.

In [ ]:
