In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)
In [2]:
# read data
churn = pd.read_csv("telecom_data_for_students.csv")
In [3]:
# look at initial rows of the data
churn.head(10)
Out[3]:
(output: first 10 rows of the dataframe; with 226 columns the table is too wide to reproduce legibly in this export)
In [4]:
# feature type summary
churn.info(verbose=1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 226 columns):
mobile_number int64
circle_id int64
loc_og_t2o_mou float64
std_og_t2o_mou float64
loc_ic_t2o_mou float64
last_date_of_month_6 object
last_date_of_month_7 object
last_date_of_month_8 object
last_date_of_month_9 object
arpu_6 float64
arpu_7 float64
arpu_8 float64
arpu_9 float64
onnet_mou_6 float64
onnet_mou_7 float64
onnet_mou_8 float64
onnet_mou_9 float64
offnet_mou_6 float64
offnet_mou_7 float64
offnet_mou_8 float64
offnet_mou_9 float64
roam_ic_mou_6 float64
roam_ic_mou_7 float64
roam_ic_mou_8 float64
roam_ic_mou_9 float64
roam_og_mou_6 float64
roam_og_mou_7 float64
roam_og_mou_8 float64
roam_og_mou_9 float64
loc_og_t2t_mou_6 float64
loc_og_t2t_mou_7 float64
loc_og_t2t_mou_8 float64
loc_og_t2t_mou_9 float64
loc_og_t2m_mou_6 float64
loc_og_t2m_mou_7 float64
loc_og_t2m_mou_8 float64
loc_og_t2m_mou_9 float64
loc_og_t2f_mou_6 float64
loc_og_t2f_mou_7 float64
loc_og_t2f_mou_8 float64
loc_og_t2f_mou_9 float64
loc_og_t2c_mou_6 float64
loc_og_t2c_mou_7 float64
loc_og_t2c_mou_8 float64
loc_og_t2c_mou_9 float64
loc_og_mou_6 float64
loc_og_mou_7 float64
loc_og_mou_8 float64
loc_og_mou_9 float64
std_og_t2t_mou_6 float64
std_og_t2t_mou_7 float64
std_og_t2t_mou_8 float64
std_og_t2t_mou_9 float64
std_og_t2m_mou_6 float64
std_og_t2m_mou_7 float64
std_og_t2m_mou_8 float64
std_og_t2m_mou_9 float64
std_og_t2f_mou_6 float64
std_og_t2f_mou_7 float64
std_og_t2f_mou_8 float64
std_og_t2f_mou_9 float64
std_og_t2c_mou_6 float64
std_og_t2c_mou_7 float64
std_og_t2c_mou_8 float64
std_og_t2c_mou_9 float64
std_og_mou_6 float64
std_og_mou_7 float64
std_og_mou_8 float64
std_og_mou_9 float64
isd_og_mou_6 float64
isd_og_mou_7 float64
isd_og_mou_8 float64
isd_og_mou_9 float64
spl_og_mou_6 float64
spl_og_mou_7 float64
spl_og_mou_8 float64
spl_og_mou_9 float64
og_others_6 float64
og_others_7 float64
og_others_8 float64
og_others_9 float64
total_og_mou_6 float64
total_og_mou_7 float64
total_og_mou_8 float64
total_og_mou_9 float64
loc_ic_t2t_mou_6 float64
loc_ic_t2t_mou_7 float64
loc_ic_t2t_mou_8 float64
loc_ic_t2t_mou_9 float64
loc_ic_t2m_mou_6 float64
loc_ic_t2m_mou_7 float64
loc_ic_t2m_mou_8 float64
loc_ic_t2m_mou_9 float64
loc_ic_t2f_mou_6 float64
loc_ic_t2f_mou_7 float64
loc_ic_t2f_mou_8 float64
loc_ic_t2f_mou_9 float64
loc_ic_mou_6 float64
loc_ic_mou_7 float64
loc_ic_mou_8 float64
loc_ic_mou_9 float64
std_ic_t2t_mou_6 float64
std_ic_t2t_mou_7 float64
std_ic_t2t_mou_8 float64
std_ic_t2t_mou_9 float64
std_ic_t2m_mou_6 float64
std_ic_t2m_mou_7 float64
std_ic_t2m_mou_8 float64
std_ic_t2m_mou_9 float64
std_ic_t2f_mou_6 float64
std_ic_t2f_mou_7 float64
std_ic_t2f_mou_8 float64
std_ic_t2f_mou_9 float64
std_ic_t2o_mou_6 float64
std_ic_t2o_mou_7 float64
std_ic_t2o_mou_8 float64
std_ic_t2o_mou_9 float64
std_ic_mou_6 float64
std_ic_mou_7 float64
std_ic_mou_8 float64
std_ic_mou_9 float64
total_ic_mou_6 float64
total_ic_mou_7 float64
total_ic_mou_8 float64
total_ic_mou_9 float64
spl_ic_mou_6 float64
spl_ic_mou_7 float64
spl_ic_mou_8 float64
spl_ic_mou_9 float64
isd_ic_mou_6 float64
isd_ic_mou_7 float64
isd_ic_mou_8 float64
isd_ic_mou_9 float64
ic_others_6 float64
ic_others_7 float64
ic_others_8 float64
ic_others_9 float64
total_rech_num_6 int64
total_rech_num_7 int64
total_rech_num_8 int64
total_rech_num_9 int64
total_rech_amt_6 int64
total_rech_amt_7 int64
total_rech_amt_8 int64
total_rech_amt_9 int64
max_rech_amt_6 int64
max_rech_amt_7 int64
max_rech_amt_8 int64
max_rech_amt_9 int64
date_of_last_rech_6 object
date_of_last_rech_7 object
date_of_last_rech_8 object
date_of_last_rech_9 object
last_day_rch_amt_6 int64
last_day_rch_amt_7 int64
last_day_rch_amt_8 int64
last_day_rch_amt_9 int64
date_of_last_rech_data_6 object
date_of_last_rech_data_7 object
date_of_last_rech_data_8 object
date_of_last_rech_data_9 object
total_rech_data_6 float64
total_rech_data_7 float64
total_rech_data_8 float64
total_rech_data_9 float64
max_rech_data_6 float64
max_rech_data_7 float64
max_rech_data_8 float64
max_rech_data_9 float64
count_rech_2g_6 float64
count_rech_2g_7 float64
count_rech_2g_8 float64
count_rech_2g_9 float64
count_rech_3g_6 float64
count_rech_3g_7 float64
count_rech_3g_8 float64
count_rech_3g_9 float64
av_rech_amt_data_6 float64
av_rech_amt_data_7 float64
av_rech_amt_data_8 float64
av_rech_amt_data_9 float64
vol_2g_mb_6 float64
vol_2g_mb_7 float64
vol_2g_mb_8 float64
vol_2g_mb_9 float64
vol_3g_mb_6 float64
vol_3g_mb_7 float64
vol_3g_mb_8 float64
vol_3g_mb_9 float64
arpu_3g_6 float64
arpu_3g_7 float64
arpu_3g_8 float64
arpu_3g_9 float64
arpu_2g_6 float64
arpu_2g_7 float64
arpu_2g_8 float64
arpu_2g_9 float64
night_pck_user_6 float64
night_pck_user_7 float64
night_pck_user_8 float64
night_pck_user_9 float64
monthly_2g_6 int64
monthly_2g_7 int64
monthly_2g_8 int64
monthly_2g_9 int64
sachet_2g_6 int64
sachet_2g_7 int64
sachet_2g_8 int64
sachet_2g_9 int64
monthly_3g_6 int64
monthly_3g_7 int64
monthly_3g_8 int64
monthly_3g_9 int64
sachet_3g_6 int64
sachet_3g_7 int64
sachet_3g_8 int64
sachet_3g_9 int64
fb_user_6 float64
fb_user_7 float64
fb_user_8 float64
fb_user_9 float64
aon int64
aug_vbc_3g float64
jul_vbc_3g float64
jun_vbc_3g float64
sep_vbc_3g float64
dtypes: float64(179), int64(35), object(12)
memory usage: 172.4+ MB
There are 99,999 rows and 226 columns in the data. Most of the columns are numeric, but we need to inspect which of them are actually categorical.
In [5]:
# look at data statistics
churn.describe(include='all')
Out[5]:
(output: descriptive statistics for all 226 columns; the table is too wide to reproduce legibly in this export)
In [6]:
# create backup of data
original = churn.copy()
In [7]:
# create column name list by types of columns
id_cols = ['mobile_number', 'circle_id']
date_cols = ['last_date_of_month_6',
'last_date_of_month_7',
'last_date_of_month_8',
'last_date_of_month_9',
'date_of_last_rech_6',
'date_of_last_rech_7',
'date_of_last_rech_8',
'date_of_last_rech_9',
'date_of_last_rech_data_6',
'date_of_last_rech_data_7',
'date_of_last_rech_data_8',
'date_of_last_rech_data_9'
]
cat_cols = ['night_pck_user_6',
'night_pck_user_7',
'night_pck_user_8',
'night_pck_user_9',
'fb_user_6',
'fb_user_7',
'fb_user_8',
'fb_user_9'
]
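The cells that inspect missing values and handle the date columns are not shown in this export; a minimal sketch of what they could look like, using the column lists defined above (the exact steps are an assumption):
# share of missing values per column (illustrative sketch)
print((churn.isnull().mean() * 100).sort_values(ascending=False).head(20))
# parse the date columns as datetimes (assumed step; format inferred by pandas)
for col in date_cols:
    churn[col] = pd.to_datetime(churn[col], errors='coerce')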
For the recharge variables whose minimum value is 1, we can impute missing values with zero, since a missing value means the customer didn't recharge their number that month.
In [11]:
# create a list of recharge columns where we will impute missing values with zeroes
zero_impute = ['total_rech_data_6', 'total_rech_data_7',
'total_rech_data_8', 'total_rech_data_9',
'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8',
'av_rech_amt_data_9',
'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8',
'max_rech_data_9'
]
In [12]:
# impute missing values with 0
churn[zero_impute] = churn[zero_impute].apply(lambda x: x.fillna(0))
In [13]:
# now, let's make sure values are imputed correctly
print("Missing value ratio:\n")
print(churn[zero_impute].isnull().sum()*100/churn.shape[0])
# summary
print("\n\nSummary statistics\n")
print(churn[zero_impute].describe(include='all'))
Missing value ratio:
total_rech_data_6 0.0
total_rech_data_7 0.0
total_rech_data_8 0.0
total_rech_data_9 0.0
av_rech_amt_data_6 0.0
av_rech_amt_data_7 0.0
av_rech_amt_data_8 0.0
av_rech_amt_data_9 0.0
max_rech_data_6 0.0
max_rech_data_7 0.0
max_rech_data_8 0.0
max_rech_data_9 0.0
dtype: float64
Summary statistics
In [15]:
# replace missing values with '-1' in categorical columns
churn[cat_cols] = churn[cat_cols].apply(lambda x: x.fillna(-1))
In [16]:
# missing value ratio
print("Missing value ratio:\n")
print(churn[cat_cols].isnull().sum()*100/churn.shape[0])
Missing value ratio:
night_pck_user_6 0.0
night_pck_user_7 0.0
night_pck_user_8 0.0
night_pck_user_9 0.0
fb_user_6 0.0
fb_user_7 0.0
fb_user_8 0.0
fb_user_9 0.0
dtype: float64
MISSING_THRESHOLD = 0.7
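The cells that apply this threshold are not shown; a minimal sketch, assuming columns whose missing fraction exceeds MISSING_THRESHOLD are dropped, along with the identifier and date columns, before imputation:
# drop columns with more than 70% missing values (illustrative sketch)
missing_fraction = churn.isnull().mean()
churn = churn.drop(columns=missing_fraction[missing_fraction > MISSING_THRESHOLD].index)
# id and date columns are of no use to a numeric imputer, so drop them as well (assumed step)
churn = churn.drop(columns=[c for c in id_cols + date_cols if c in churn.columns])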
In [19]:
churn_cols = churn.columns
# using MICE technique to impute missing values in the rest of the columns
from fancyimpute import MICE
churn_imputed = MICE(n_imputations=1).complete(churn)
Using TensorFlow backend.
[MICE] Completing matrix with shape (99999, 196)
[MICE] Starting imputation round 1/11, elapsed time 0.887
[MICE] Starting imputation round 2/11, elapsed time 176.649
[MICE] Starting imputation round 3/11, elapsed time 343.774
[MICE] Starting imputation round 4/11, elapsed time 507.450
[MICE] Starting imputation round 5/11, elapsed time 677.217
[MICE] Starting imputation round 6/11, elapsed time 839.576
[MICE] Starting imputation round 7/11, elapsed time 1001.674
[MICE] Starting imputation round 8/11, elapsed time 1174.182
[MICE] Starting imputation round 9/11, elapsed time 1369.406
[MICE] Starting imputation round 10/11, elapsed time 1534.524
[MICE] Starting imputation round 11/11, elapsed time 1683.410
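Note that the MICE class is no longer available in recent versions of fancyimpute; a roughly equivalent sketch with scikit-learn's IterativeImputer (an alternative, not the code used here):
# MICE-style iterative imputation with scikit-learn (alternative sketch)
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=4)
churn_imputed = imputer.fit_transform(churn)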
In [20]:
# convert imputed numpy array to pandas dataframe
churn = pd.DataFrame(churn_imputed, columns=churn_cols)
print(churn.isnull().sum()*100/churn.shape[0])
loc_og_t2o_mou 0.0
std_og_t2o_mou 0.0
loc_ic_t2o_mou 0.0
arpu_6 0.0
arpu_7 0.0
arpu_8 0.0
arpu_9 0.0
onnet_mou_6 0.0
onnet_mou_7 0.0
onnet_mou_8 0.0
onnet_mou_9 0.0
offnet_mou_6 0.0
offnet_mou_7 0.0
offnet_mou_8 0.0
offnet_mou_9 0.0
roam_ic_mou_6 0.0
roam_ic_mou_7 0.0
roam_ic_mou_8 0.0
roam_ic_mou_9 0.0
roam_og_mou_6 0.0
roam_og_mou_7 0.0
roam_og_mou_8 0.0
roam_og_mou_9 0.0
loc_og_t2t_mou_6 0.0
loc_og_t2t_mou_7 0.0
loc_og_t2t_mou_8 0.0
loc_og_t2t_mou_9 0.0
loc_og_t2m_mou_6 0.0
loc_og_t2m_mou_7 0.0
loc_og_t2m_mou_8 0.0
loc_og_t2m_mou_9 0.0
loc_og_t2f_mou_6 0.0
loc_og_t2f_mou_7 0.0
loc_og_t2f_mou_8 0.0
loc_og_t2f_mou_9 0.0
loc_og_t2c_mou_6 0.0
loc_og_t2c_mou_7 0.0
loc_og_t2c_mou_8 0.0
loc_og_t2c_mou_9 0.0
loc_og_mou_6 0.0
loc_og_mou_7 0.0
loc_og_mou_8 0.0
loc_og_mou_9 0.0
std_og_t2t_mou_6 0.0
std_og_t2t_mou_7 0.0
std_og_t2t_mou_8 0.0
std_og_t2t_mou_9 0.0
std_og_t2m_mou_6 0.0
std_og_t2m_mou_7 0.0
std_og_t2m_mou_8 0.0
std_og_t2m_mou_9 0.0
std_og_t2f_mou_6 0.0
std_og_t2f_mou_7 0.0
std_og_t2f_mou_8 0.0
std_og_t2f_mou_9 0.0
std_og_t2c_mou_6 0.0
std_og_t2c_mou_7 0.0
std_og_t2c_mou_8 0.0
std_og_t2c_mou_9 0.0
std_og_mou_6 0.0
std_og_mou_7 0.0
std_og_mou_8 0.0
std_og_mou_9 0.0
isd_og_mou_6 0.0
isd_og_mou_7 0.0
isd_og_mou_8 0.0
isd_og_mou_9 0.0
spl_og_mou_6 0.0
spl_og_mou_7 0.0
spl_og_mou_8 0.0
spl_og_mou_9 0.0
og_others_6 0.0
og_others_7 0.0
og_others_8 0.0
og_others_9 0.0
total_og_mou_6 0.0
total_og_mou_7 0.0
total_og_mou_8 0.0
total_og_mou_9 0.0
loc_ic_t2t_mou_6 0.0
loc_ic_t2t_mou_7 0.0
loc_ic_t2t_mou_8 0.0
loc_ic_t2t_mou_9 0.0
loc_ic_t2m_mou_6 0.0
loc_ic_t2m_mou_7 0.0
loc_ic_t2m_mou_8 0.0
loc_ic_t2m_mou_9 0.0
loc_ic_t2f_mou_6 0.0
loc_ic_t2f_mou_7 0.0
loc_ic_t2f_mou_8 0.0
loc_ic_t2f_mou_9 0.0
loc_ic_mou_6 0.0
loc_ic_mou_7 0.0
loc_ic_mou_8 0.0
loc_ic_mou_9 0.0
std_ic_t2t_mou_6 0.0
std_ic_t2t_mou_7 0.0
std_ic_t2t_mou_8 0.0
std_ic_t2t_mou_9 0.0
std_ic_t2m_mou_6 0.0
std_ic_t2m_mou_7 0.0
std_ic_t2m_mou_8 0.0
std_ic_t2m_mou_9 0.0
std_ic_t2f_mou_6 0.0
std_ic_t2f_mou_7 0.0
std_ic_t2f_mou_8 0.0
std_ic_t2f_mou_9 0.0
std_ic_t2o_mou_6 0.0
std_ic_t2o_mou_7 0.0
std_ic_t2o_mou_8 0.0
std_ic_t2o_mou_9 0.0
std_ic_mou_6 0.0
std_ic_mou_7 0.0
std_ic_mou_8 0.0
std_ic_mou_9 0.0
total_ic_mou_6 0.0
total_ic_mou_7 0.0
total_ic_mou_8 0.0
total_ic_mou_9 0.0
spl_ic_mou_6 0.0
spl_ic_mou_7 0.0
spl_ic_mou_8 0.0
spl_ic_mou_9 0.0
isd_ic_mou_6 0.0
isd_ic_mou_7 0.0
isd_ic_mou_8 0.0
isd_ic_mou_9 0.0
ic_others_6 0.0
ic_others_7 0.0
ic_others_8 0.0
ic_others_9 0.0
total_rech_num_6 0.0
total_rech_num_7 0.0
total_rech_num_8 0.0
total_rech_num_9 0.0
total_rech_amt_6 0.0
total_rech_amt_7 0.0
total_rech_amt_8 0.0
total_rech_amt_9 0.0
max_rech_amt_6 0.0
max_rech_amt_7 0.0
max_rech_amt_8 0.0
max_rech_amt_9 0.0
last_day_rch_amt_6 0.0
last_day_rch_amt_7 0.0
last_day_rch_amt_8 0.0
last_day_rch_amt_9 0.0
total_rech_data_6 0.0
total_rech_data_7 0.0
total_rech_data_8 0.0
total_rech_data_9 0.0
max_rech_data_6 0.0
max_rech_data_7 0.0
max_rech_data_8 0.0
max_rech_data_9 0.0
av_rech_amt_data_6 0.0
av_rech_amt_data_7 0.0
av_rech_amt_data_8 0.0
av_rech_amt_data_9 0.0
vol_2g_mb_6 0.0
vol_2g_mb_7 0.0
vol_2g_mb_8 0.0
vol_2g_mb_9 0.0
vol_3g_mb_6 0.0
vol_3g_mb_7 0.0
vol_3g_mb_8 0.0
vol_3g_mb_9 0.0
night_pck_user_6 0.0
night_pck_user_7 0.0
night_pck_user_8 0.0
night_pck_user_9 0.0
monthly_2g_6 0.0
monthly_2g_7 0.0
monthly_2g_8 0.0
monthly_2g_9 0.0
sachet_2g_6 0.0
sachet_2g_7 0.0
sachet_2g_8 0.0
sachet_2g_9 0.0
monthly_3g_6 0.0
monthly_3g_7 0.0
monthly_3g_8 0.0
monthly_3g_9 0.0
sachet_3g_6 0.0
sachet_3g_7 0.0
sachet_3g_8 0.0
sachet_3g_9 0.0
fb_user_6 0.0
fb_user_7 0.0
fb_user_8 0.0
fb_user_9 0.0
aon 0.0
aug_vbc_3g 0.0
jul_vbc_3g 0.0
jun_vbc_3g 0.0
sep_vbc_3g 0.0
dtype: float64
In [21]:
# calculate the total data recharge amount for June and July --> number of recharges * average recharge amount
churn['total_data_rech_6'] = churn.total_rech_data_6 * churn.av_rech_amt_data_6
churn['total_data_rech_7'] = churn.total_rech_data_7 * churn.av_rech_amt_data_7
Add the total data recharge and the total recharge to get the total combined recharge amount for a month.
In [22]:
# calculate total recharge amount for June and July --> call recharge amount + data recharge amount
churn['amt_data_6'] = churn.total_rech_amt_6 + churn.total_data_rech_6
churn['amt_data_7'] = churn.total_rech_amt_7 + churn.total_data_rech_7
In [23]:
# calculate average recharge done by customer in June and July
churn['av_amt_data_6_7'] = (churn.amt_data_6 + churn.amt_data_7)/2
In [24]:
# look at the 70th percentile recharge amount
print("Recharge amount at 70th percentile:
{0}".format(churn.av_amt_data_6_7.quantile(0.7)))
Recharge amount at 70th percentile: 478.0
In [25]:
# retain only those customers who have recharged their mobiles with more than or equal to the 70th percentile amount
churn_filtered = churn.loc[churn.av_amt_data_6_7 >= churn.av_amt_data_6_7.quantile(0.7), :]
churn_filtered = churn_filtered.reset_index(drop=True)
churn_filtered.shape
Out[25]:
(30001, 201)
In [26]:
# delete variables created to filter high-value customers
churn_filtered = churn_filtered.drop(['total_data_rech_6',
'total_data_rech_7',
'amt_data_6', 'amt_data_7',
'av_amt_data_6_7'], axis=1)
churn_filtered.shape
Out[26]:
(30001, 196)
We're left with 30,001 rows after keeping only the customers whose average recharge amount is greater than or equal to the 70th-percentile value.
Derive churn
In [27]:
# calculate total incoming and outgoing minutes of usage
churn_filtered['total_calls_mou_9'] = churn_filtered.total_ic_mou_9 + churn_filtered.total_og_mou_9
In [28]:
# calculate 2g and 3g data consumption
churn_filtered['total_internet_mb_9'] = churn_filtered.vol_2g_mb_9 + churn_filtered.vol_3g_mb_9
In [29]:
# create churn variable: those who have not used either calls or internet in the month of September are customers who have churned
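The cell that derives the flag is not shown in this export; a minimal sketch based on the rule stated above:
# churn = 1 if the customer used neither calls nor mobile internet in September (illustrative sketch)
churn_filtered['churn'] = np.where((churn_filtered.total_calls_mou_9 == 0) &
                                   (churn_filtered.total_internet_mb_9 == 0), 1, 0)
# the month-9 attributes are dropped in later (omitted) cells so the churn month does not leak into the features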
In [32]:
churn_filtered['arpu_diff'] = churn_filtered.arpu_8 - ((churn_filtered.arpu_6 + churn_filtered.arpu_7)/2)
churn_filtered['onnet_mou_diff'] = churn_filtered.onnet_mou_8 - ((churn_filtered.onnet_mou_6 + churn_filtered.onnet_mou_7)/2)
churn_filtered['offnet_mou_diff'] = churn_filtered.offnet_mou_8 - ((churn_filtered.offnet_mou_6 + churn_filtered.offnet_mou_7)/2)
churn_filtered['roam_ic_mou_diff'] = churn_filtered.roam_ic_mou_8 - ((churn_filtered.roam_ic_mou_6 + churn_filtered.roam_ic_mou_7)/2)
churn_filtered['roam_og_mou_diff'] = churn_filtered.roam_og_mou_8 - ((churn_filtered.roam_og_mou_6 + churn_filtered.roam_og_mou_7)/2)
churn_filtered['loc_og_mou_diff'] = churn_filtered.loc_og_mou_8 - ((churn_filtered.loc_og_mou_6 + churn_filtered.loc_og_mou_7)/2)
churn_filtered['std_og_mou_diff'] = churn_filtered.std_og_mou_8 - ((churn_filtered.std_og_mou_6 + churn_filtered.std_og_mou_7)/2)
churn_filtered['isd_og_mou_diff'] = churn_filtered.isd_og_mou_8 - ((churn_filtered.isd_og_mou_6 + churn_filtered.isd_og_mou_7)/2)
churn_filtered['spl_og_mou_diff'] = churn_filtered.spl_og_mou_8 - ((churn_filtered.spl_og_mou_6 + churn_filtered.spl_og_mou_7)/2)
churn_filtered['total_og_mou_diff'] = churn_filtered.total_og_mou_8 - ((churn_filtered.total_og_mou_6 + churn_filtered.total_og_mou_7)/2)
churn_filtered['loc_ic_mou_diff'] = churn_filtered.loc_ic_mou_8 - ((churn_filtered.loc_ic_mou_6 + churn_filtered.loc_ic_mou_7)/2)
churn_filtered['std_ic_mou_diff'] = churn_filtered.std_ic_mou_8 - ((churn_filtered.std_ic_mou_6 + churn_filtered.std_ic_mou_7)/2)
churn_filtered['isd_ic_mou_diff'] = churn_filtered.isd_ic_mou_8 - ((churn_filtered.isd_ic_mou_6 + churn_filtered.isd_ic_mou_7)/2)
churn_filtered['spl_ic_mou_diff'] = churn_filtered.spl_ic_mou_8 - ((churn_filtered.spl_ic_mou_6 + churn_filtered.spl_ic_mou_7)/2)
churn_filtered['total_ic_mou_diff'] = churn_filtered.total_ic_mou_8 - ((churn_filtered.total_ic_mou_6 + churn_filtered.total_ic_mou_7)/2)
churn_filtered['total_rech_num_diff'] = churn_filtered.total_rech_num_8 - ((churn_filtered.total_rech_num_6 + churn_filtered.total_rech_num_7)/2)
churn_filtered['total_rech_amt_diff'] = churn_filtered.total_rech_amt_8 - ((churn_filtered.total_rech_amt_6 + churn_filtered.total_rech_amt_7)/2)
churn_filtered['max_rech_amt_diff'] = churn_filtered.max_rech_amt_8 - ((churn_filtered.max_rech_amt_6 + churn_filtered.max_rech_amt_7)/2)
churn_filtered['total_rech_data_diff'] = churn_filtered.total_rech_data_8 - ((churn_filtered.total_rech_data_6 + churn_filtered.total_rech_data_7)/2)
churn_filtered['max_rech_data_diff'] = churn_filtered.max_rech_data_8 - ((churn_filtered.max_rech_data_6 + churn_filtered.max_rech_data_7)/2)
churn_filtered['av_rech_amt_data_diff'] = churn_filtered.av_rech_amt_data_8 - ((churn_filtered.av_rech_amt_data_6 + churn_filtered.av_rech_amt_data_7)/2)
churn_filtered['vol_2g_mb_diff'] = churn_filtered.vol_2g_mb_8 - ((churn_filtered.vol_2g_mb_6 + churn_filtered.vol_2g_mb_7)/2)
churn_filtered['vol_3g_mb_diff'] = churn_filtered.vol_3g_mb_8 - ((churn_filtered.vol_3g_mb_6 + churn_filtered.vol_3g_mb_7)/2)
In [33]:
# let's look at summary of one of the difference variables
churn_filtered['total_og_mou_diff'].describe()
Out[33]:
count 30001.000000
mean -67.437337
std 502.630069
min -7213.410000
25% -168.025000
50% -14.625000
75% 67.915000
max 12768.705000
Name: total_og_mou_diff, dtype: float64
# univariate plotting helper (signature reconstructed; a numeric-dtype check stands in
# for the notebook's data_type() helper, which is not shown in this export)
def univariate(variable, stats=True):
    if pd.api.types.is_numeric_dtype(variable):
        sns.distplot(variable)
        if stats == True:
            print(variable.describe())
    else:
        print("Invalid variable passed: either pass a numeric variable or a categorical variable.")
Univariate EDA
In [38]:
univariate(churn.arpu_6)
count 99999.000000
mean 282.987358
std 328.439770
min -2258.709000
25% 93.411500
50% 197.704000
75% 371.060000
max 27731.088000
Name: arpu_6, dtype: float64
In [39]:
univariate(churn.loc_og_t2o_mou)
count 99999.000000
mean -0.000002
std 0.000312
min -0.009873
25% 0.000000
50% 0.000000
75% 0.000000
max 0.009702
Name: loc_og_t2o_mou, dtype: float64
In [40]:
univariate(churn.std_og_t2o_mou)
count 9.999900e+04
mean -9.957627e-07
std 3.136615e-04
min -1.085887e-02
25% 0.000000e+00
50% 0.000000e+00
75% 0.000000e+00
max 9.418004e-03
Name: std_og_t2o_mou, dtype: float64
In [41]:
univariate(churn.onnet_mou_8)
count 99999.000000
mean 125.972580
std 302.822628
min -700.923704
25% 5.510000
50% 30.760000
75% 109.400000
max 10752.560000
Name: onnet_mou_8, dtype: float64
In [42]:
univariate(churn.offnet_mou_9)
count 99999.000000
mean 176.085292
std 311.955151
min -1407.015438
25% 21.903023
50% 78.360000
75% 204.270000
max 10310.760000
Name: offnet_mou_9, dtype: float64
In [44]:
bivariate(churn_filtered.sep_vbc_3g, churn_filtered.churn)
In [45]:
bivariate(churn_filtered.spl_og_mou_8, churn_filtered.churn)
In [46]:
pd.crosstab(churn_filtered.churn, churn_filtered.night_pck_user_8,
normalize='columns')*100
Out[46]:
night_pck_user_8 -1.0 0.0 1.0
churn
0 85.89123 97.117602 97.360704
1 14.10877 2.882398 2.639296
In [47]:
pd.crosstab(churn_filtered.churn, churn_filtered.sachet_3g_8)
Out[47]:
(output: crosstab of churn against sachet_3g_8 recharge counts; the table is too wide to reproduce legibly in this export)
In [48]:
def cap_outliers(array, k=3):
upper_limit = array.mean() + k*array.std()
lower_limit = array.mean() - k*array.std()
array[array<lower_limit] = lower_limit
array[array>upper_limit] = upper_limit
return array
In [49]:
# example of capping
sample_array = list(range(100))
# cap outliers
sample_array = np.array(sample_array)
print("Array after capping outliers: \n", cap_outliers(sample_array, k=2))
Array after capping outliers:
[-2780 1 2 3 4 5 6 7 8 9 10 11
12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31 32 33 34 35
36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59
60 61 62 63 64 65 66 67 68 69 70 71
72 73 74 75 76 77 78 79 80 81 82 83
84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 2877]
In [50]:
# cap outliers in the numeric columns
churn_filtered[num_cols] = churn_filtered[num_cols].apply(cap_outliers,
axis=0)
Modelling
i) Making predictions
In [51]:
# import required libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from imblearn.metrics import sensitivity_specificity_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
Preprocessing data
In [52]:
# change churn to numeric
churn_filtered['churn'] = pd.to_numeric(churn_filtered['churn'])
Train Test split
In [53]:
# divide data into train and test
X = churn_filtered.drop("churn", axis = 1)
y = churn_filtered.churn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,
random_state = 4, stratify = y)
In [54]:
# print shapes of train and test sets
X_train.shape
y_train.shape
X_test.shape
y_test.shape
Out[54]:
(22500, 172)
Out[54]:
(22500,)
Out[54]:
(7501, 172)
Out[54]:
(7501,)
PCA
In [58]:
# apply pca to train data
pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])
In [59]:
pca.fit(X_train)
churn_pca = pca.fit_transform(X_train)
Out[59]:
Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True,
with_std=True)), ('pca', PCA(copy=True, iterated_power='auto',
n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False))])
In [60]:
# extract pca model from pipeline
pca = pca.named_steps['pca']
# look at explainded variance of PCA components
print(pd.Series(np.round(pca.explained_variance_ratio_.cumsum(), 4)*100))
0 10.91
1 19.80
2 25.31
3 29.95
4 33.80
5 37.30
6 39.90
7 42.33
8 44.51
9 46.52
10 48.42
11 50.23
12 51.91
13 53.54
14 55.03
15 56.40
16 57.70
17 58.95
18 60.19
19 61.41
20 62.54
21 63.65
22 64.71
23 65.72
24 66.70
25 67.61
26 68.49
27 69.34
28 70.17
29 71.00
30 71.77
31 72.54
32 73.30
33 74.05
34 74.77
35 75.46
36 76.15
37 76.82
38 77.49
39 78.15
40 78.79
41 79.42
42 80.04
43 80.65
44 81.23
45 81.80
46 82.36
47 82.89
48 83.43
49 83.94
50 84.45
51 84.95
52 85.45
53 85.93
54 86.40
55 86.85
56 87.30
57 87.75
58 88.19
59 88.61
60 89.02
61 89.41
62 89.78
63 90.13
64 90.47
65 90.81
66 91.13
67 91.45
68 91.75
69 92.05
70 92.34
71 92.62
72 92.89
73 93.16
74 93.43
75 93.69
76 93.93
77 94.17
78 94.41
79 94.64
80 94.86
81 95.05
82 95.25
83 95.43
84 95.62
85 95.80
86 95.97
87 96.15
88 96.31
89 96.48
90 96.64
91 96.80
92 96.96
93 97.11
94 97.25
95 97.39
96 97.53
97 97.66
98 97.78
99 97.90
100 98.02
101 98.14
102 98.25
103 98.34
104 98.43
105 98.52
106 98.60
107 98.68
108 98.76
109 98.83
110 98.90
111 98.95
112 99.01
113 99.06
114 99.12
115 99.17
116 99.22
117 99.26
118 99.30
119 99.34
120 99.38
121 99.42
122 99.45
123 99.48
124 99.51
125 99.53
126 99.56
127 99.58
128 99.61
129 99.63
130 99.65
131 99.67
132 99.69
133 99.71
134 99.72
135 99.74
136 99.76
137 99.77
138 99.79
139 99.80
140 99.82
141 99.83
142 99.84
143 99.85
144 99.86
145 99.87
146 99.88
147 99.89
148 99.90
149 99.91
150 99.92
151 99.92
152 99.93
153 99.94
154 99.94
155 99.95
156 99.95
157 99.96
158 99.97
159 99.97
160 99.97
161 99.98
162 99.98
163 99.98
164 99.99
165 99.99
166 99.99
167 100.00
168 100.00
169 100.00
170 100.00
171 100.00
dtype: float64
In [61]:
# plot feature variance
features = range(pca.n_components_)
cumulative_variance = np.round(np.cumsum(pca.explained_variance_ratio_)*100, decimals=4)
plt.figure(figsize=(175/20, 100/20))  # 100 elements on y-axis; 175 elements on x-axis; 20 is a normalising factor
plt.plot(cumulative_variance)
Out[61]:
<Figure size 630x360 with 0 Axes>
Out[61]:
[<matplotlib.lines.Line2D at 0x2b6027981d0>]
In [64]:
# predict churn on test data
y_pred = pipeline.predict(X_test)
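The evaluation cell is not shown; a short sketch using the metrics imported earlier:
# evaluate predictions on the test set (illustrative sketch)
print(classification_report(y_test, y_pred))
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity:", round(sensitivity, 2), "Specificity:", round(specificity, 2))
print("ROC AUC:", round(roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1]), 2))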
In [65]:
# class imbalance
y_train.value_counts()/y_train.shape
Out[65]:
0 0.918622
1 0.081378
Name: churn, dtype: float64
In [66]:
# PCA
pca = PCA()
# create pipeline
steps = [("scaler", StandardScaler()),
("pca", pca),
("logistic", logistic)
]
# compile pipeline
pca_logistic = Pipeline(steps)
# hyperparameter space
params = {'pca__n_components': [60, 80], 'logistic__C': [0.1, 0.5, 1, 2, 3,
4, 5, 10], 'logistic__penalty': ['l1', 'l2']}
# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)
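The cells that actually run this search are not shown; a minimal sketch of how it might be fitted (the name model is an assumption):
# grid search over the PCA + logistic regression hyperparameters (illustrative sketch)
model = GridSearchCV(estimator=pca_logistic, param_grid=params, scoring='roc_auc',
                     cv=folds, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)
print("Best AUC:", model.best_score_, "Best hyperparameters:", model.best_params_)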
Random Forest
In [71]:
# random forest - the class weight is used to handle class imbalance - it adjusts the cost function
forest = RandomForestClassifier(class_weight={0:0.1, 1: 0.9}, n_jobs = -1)
# hyperparameter space
params = {"criterion": ['gini', 'entropy'], "max_features": ['auto', 0.4]}
# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)
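The grid-search cell for the forest is likewise not shown; a sketch, with rf_grid as an assumed name, of how the winning max_features used below could be obtained:
# grid search over the random forest hyperparameters (illustrative sketch)
rf_grid = GridSearchCV(estimator=forest, param_grid=params, scoring='roc_auc',
                       cv=folds, n_jobs=-1, verbose=1)
rf_grid.fit(X_train, y_train)
max_features = rf_grid.best_params_['max_features']
print("Best hyperparameters:", rf_grid.best_params_)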
rf_model = RandomForestClassifier(n_estimators=100,
max_features=max_features, class_weight={0:0.1, 1: 0.9}, oob_score=True,
random_state=4, verbose=1)
In [76]:
# fit model
rf_model.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 28.4s finished
Out[76]:
RandomForestClassifier(bootstrap=True, class_weight={0: 0.1, 1: 0.9},
criterion='gini', max_depth=None, max_features=13,
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=1, oob_score=True, random_state=4,
verbose=1, warm_start=False)
In [77]:
# OOB score
rf_model.oob_score_
Out[77]:
0.9426666666666667
In [78]:
# predict churn on test data
y_pred = rf_model.predict(X_test)
Feature Importance
In [79]:
# predictors
features = churn_filtered.drop('churn', axis=1).columns
# feature_importance
importance = rf_model.feature_importances_
# create dataframe
feature_importance = pd.DataFrame({'variables': features,
'importance_percentage': importance*100})
feature_importance = feature_importance[['variables',
'importance_percentage']]
# sort features
feature_importance = feature_importance.sort_values('importance_percentage', ascending=False).reset_index(drop=True)
print("Sum of importance=", feature_importance.importance_percentage.sum())
feature_importance
Sum of importance= 99.99999999999999
Out[79]:
variables importance_percentage
0 total_ic_mou_8 7.051799
1 total_og_mou_8 4.029977
2 last_day_rch_amt_8 3.217914
3 arpu_8 3.112591
4 total_rech_amt_diff 2.908439
5 loc_ic_mou_8 2.801836
6 roam_og_mou_8 2.618070
7 loc_ic_t2m_mou_8 2.574123
8 max_rech_amt_8 2.565527
9 total_rech_amt_8 2.479779
10 arpu_diff 2.433597
11 roam_ic_mou_8 2.313062
12 loc_ic_t2t_mou_8 2.272950
13 total_rech_num_diff 1.684547
14 total_ic_mou_diff 1.616509
15 total_og_mou_diff 1.515810
16 loc_og_mou_8 1.407685
17 av_rech_amt_data_8 1.341360
18 std_og_mou_diff 1.297516
19 roam_ic_mou_diff 1.295492
20 loc_ic_mou_diff 1.231582
21 roam_og_mou_diff 1.196602
22 loc_og_t2m_mou_8 0.927088
23 loc_og_t2t_mou_8 0.918190
24 offnet_mou_8 0.835193
25 offnet_mou_diff 0.787708
26 vol_2g_mb_8 0.767465
27 std_ic_mou_8 0.756777
28 total_ic_mou_7 0.755680
29 std_og_t2m_mou_8 0.750015
30 max_rech_amt_diff 0.727545
31 fb_user_8 0.726221
32 night_pck_user_8 0.693271
33 onnet_mou_diff 0.682034
34 aon 0.657394
35 max_rech_data_8 0.614868
36 loc_og_mou_diff 0.606564
37 std_og_mou_8 0.593622
38 onnet_mou_8 0.575225
39 total_rech_amt_7 0.528601
40 loc_ic_t2m_mou_6 0.518464
41 total_rech_data_8 0.518115
42 arpu_7 0.512462
43 std_ic_mou_diff 0.502147
44 std_og_mou_7 0.492795
45 loc_og_mou_7 0.491933
46 total_rech_num_8 0.488850
47 total_ic_mou_6 0.473363
48 loc_ic_t2m_mou_7 0.466724
49 total_og_mou_7 0.465682
50 loc_og_t2m_mou_6 0.465483
51 loc_ic_mou_7 0.463715
52 vol_2g_mb_diff 0.461875
53 arpu_6 0.459183
54 std_og_t2t_mou_8 0.457876
55 vol_3g_mb_8 0.454195
56 spl_og_mou_diff 0.445280
57 offnet_mou_6 0.445082
58 loc_ic_t2t_mou_7 0.444555
59 av_rech_amt_data_diff 0.443252
60 offnet_mou_7 0.441294
61 loc_ic_mou_6 0.438766
62 loc_og_t2m_mou_7 0.433585
63 loc_og_mou_6 0.427759
64 max_rech_data_diff 0.421121
65 loc_og_t2t_mou_7 0.420918
66 loc_ic_t2t_mou_6 0.417714
67 total_rech_amt_6 0.416888
68 onnet_mou_7 0.397039
69 total_rech_num_7 0.395627
70 std_ic_t2t_mou_8 0.381641
71 std_ic_t2m_mou_8 0.377689
72 total_rech_num_6 0.370274
73 std_og_mou_6 0.369188
74 std_ic_mou_6 0.366826
75 loc_og_t2t_mou_6 0.366003
76 std_ic_t2m_mou_7 0.356541
77 isd_ic_mou_diff 0.354723
78 onnet_mou_6 0.351645
79 loc_ic_t2f_mou_8 0.348421
80 roam_ic_mou_7 0.347550
81 spl_ic_mou_diff 0.340279
82 std_og_t2m_mou_7 0.339827
83 vol_3g_mb_diff 0.338704
84 total_og_mou_6 0.336792
85 std_og_t2m_mou_6 0.336494
86 std_og_t2t_mou_7 0.332075
87 aug_vbc_3g 0.328900
88 loc_ic_t2f_mou_7 0.328640
89 std_ic_mou_7 0.324208
90 spl_og_mou_7 0.316932
91 std_og_t2t_mou_6 0.315337
92 max_rech_amt_7 0.313147
93 isd_ic_mou_8 0.308535
94 loc_og_t2f_mou_8 0.307020
95 std_ic_t2m_mou_6 0.303577
96 total_rech_data_diff 0.303365
97 max_rech_amt_6 0.293195
98 loc_ic_t2f_mou_6 0.290351
99 spl_og_mou_6 0.288099
100 roam_og_mou_7 0.285888
101 last_day_rch_amt_6 0.278386
102 last_day_rch_amt_7 0.276803
103 av_rech_amt_data_7 0.275718
104 loc_og_t2f_mou_6 0.269368
105 spl_og_mou_8 0.266655
106 std_ic_t2t_mou_6 0.266636
107 loc_og_t2f_mou_7 0.263393
108 vol_2g_mb_6 0.257802
109 std_ic_t2t_mou_7 0.254425
110 vol_3g_mb_7 0.243538
111 vol_2g_mb_7 0.240131
112 av_rech_amt_data_6 0.236701
113 roam_og_mou_6 0.233235
114 roam_ic_mou_6 0.228482
115 og_others_6 0.227395
116 vol_3g_mb_6 0.218148
117 max_rech_data_7 0.214674
118 loc_og_t2c_mou_8 0.210770
119 isd_ic_mou_6 0.205554
120 jul_vbc_3g 0.204624
121 isd_og_mou_diff 0.204362
122 spl_ic_mou_6 0.201601
123 ic_others_7 0.199106
124 std_ic_t2f_mou_8 0.191354
125 spl_ic_mou_8 0.186443
126 isd_ic_mou_7 0.186243
127 std_ic_t2f_mou_7 0.180216
128 jun_vbc_3g 0.178941
129 ic_others_8 0.176169
130 ic_others_6 0.176027
131 loc_og_t2c_mou_6 0.172089
132 sep_vbc_3g 0.171285
133 max_rech_data_6 0.170704
134 loc_og_t2c_mou_7 0.164413
135 total_rech_data_7 0.150091
136 std_ic_t2f_mou_6 0.146022
137 isd_og_mou_6 0.145912
138 std_ic_t2o_mou_8 0.139639
139 std_og_t2f_mou_7 0.137777
140 isd_og_mou_7 0.126929
141 total_rech_data_6 0.124994
142 og_others_8 0.117843
143 spl_ic_mou_7 0.114016
144 sachet_2g_7 0.108001
145 isd_og_mou_8 0.105353
146 sachet_2g_8 0.101112
147 std_og_t2f_mou_6 0.098533
148 sachet_2g_6 0.087329
149 std_og_t2f_mou_8 0.083208
150 sachet_3g_7 0.082937
151 sachet_3g_6 0.078567
152 monthly_2g_7 0.058376
153 monthly_3g_6 0.055064
154 fb_user_7 0.053158
155 monthly_3g_7 0.052559
156 night_pck_user_7 0.051557
157 monthly_3g_8 0.050323
158 std_og_t2c_mou_6 0.049699
159 sachet_3g_8 0.047272
160 og_others_7 0.045714
161 std_og_t2c_mou_8 0.040714
162 night_pck_user_6 0.039921
163 monthly_2g_6 0.039866
164 fb_user_6 0.039001
165 monthly_2g_8 0.033863
166 std_ic_t2o_mou_6 0.029129
167 std_ic_t2o_mou_7 0.019845
168 std_og_t2o_mou 0.015844
169 loc_og_t2o_mou 0.011197
170 std_og_t2c_mou_7 0.006985
171 loc_ic_t2o_mou 0.006427
In [80]:
# extract top 'n' features
top_n = 30
top_features = feature_importance.variables[0:top_n]
In [81]:
# plot feature correlation
import seaborn as sns
plt.rcParams["figure.figsize"] =(10,10)
mycmap = sns.diverging_palette(199, 359, s=99, center="light",
as_cmap=True)
sns.heatmap(data=X_train[top_features].corr(), center=0.0, cmap=mycmap)
Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b602a162e8>
In [82]:
top_features = ['total_ic_mou_8', 'total_rech_amt_diff', 'total_og_mou_8',
'arpu_8', 'roam_ic_mou_8', 'roam_og_mou_8',
'std_ic_mou_8', 'av_rech_amt_data_8', 'std_og_mou_8']
X_train = X_train[top_features]
X_test = X_test[top_features]
In [83]:
# logistic regression
steps = [('scaler', StandardScaler()),
("logistic", LogisticRegression(class_weight={0:0.1, 1:0.9}))
]
# compile pipeline
logistic = Pipeline(steps)
# hyperparameter space
params = {'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10],
'logistic__penalty': ['l1', 'l2']}
# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)
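The cells fitting this search are not shown; a minimal sketch that produces the model object referenced in the next cell:
# grid search over the reduced feature set (illustrative sketch)
model = GridSearchCV(estimator=logistic, param_grid=params, scoring='roc_auc',
                     cv=folds, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)
print("Best AUC:", model.best_score_)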
Extract the intercept and the coefficients from the logistic model
In [87]:
logistic_model = model.best_estimator_.named_steps['logistic']
In [88]:
# intercept
intercept_df = pd.DataFrame(logistic_model.intercept_.reshape((1,1)),
columns = ['intercept'])
In [89]:
# coefficients
coefficients = logistic_model.coef_.reshape((9, 1)).tolist()
coefficients = [val for sublist in coefficients for val in sublist]
coefficients = [round(coefficient, 3) for coefficient in coefficients]
logistic_features = list(X_train.columns)
coefficients_df = pd.DataFrame(logistic_model.coef_,
columns=logistic_features)
In [90]:
# concatenate dataframes
coefficients = pd.concat([intercept_df, coefficients_df], axis=1)
coefficients
Out[90]:
(output: intercept and coefficients of the logistic model for the nine selected features; the table is too wide to reproduce legibly in this export)
Business Insights
The telecom company needs to pay attention to its roaming rates: it should offer attractive plans to customers who use its services from a roaming zone.
The company also needs to focus on its STD and ISD rates; these may be too high, so it should offer dedicated STD and ISD packages to these users.
To act on both of the issues above, the company should collect customer query and complaint data and tune its services to what customers actually need.
In [ ]: