Sunbase Data Assignment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
data=pd.read_excel('customer_churn_large_dataset.xlsx')
In [3]:
data.head()
Out[3]:
CustomerID Name Age Gender Location Subscription_Length_Months
Monthly_Bill Total_Usage_GB Churn
0 1 Customer_1 63 Male Los Angeles 17 73.36 236 0
1 2 Customer_2 62 Female New York 1 48.76 172 0
2 3 Customer_3 24 Female Los Angeles 5 85.47 460
0
3 4 Customer_4 36 Female Miami 3 97.94 297 1
4 5 Customer_5 46 Female Miami 19 58.14 266 0
1. Data Preprocessing:¶ <#1.-Data-Preprocessing:>
In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3208 entries, 0 to 3207
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerID 3208 non-null int64
1 Name 3208 non-null object
2 Age 3208 non-null int64
3 Gender 3208 non-null object
4 Location 3208 non-null object
5 Subscription_Length_Months 3208 non-null int64
6 Monthly_Bill 3208 non-null float64
7 Total_Usage_GB 3208 non-null int64
8 Churn 3208 non-null int64
dtypes: float64(1), int64(5), object(3)
memory usage: 225.7+ KB
In [5]:
# there is no null value in data and no missing value in data.
In [6]:
data.describe()
Out[6]:
CustomerID Age Subscription_Length_Months Monthly_Bill
Total_Usage_GB Churn
count 3208.000000 3208.000000 3208.000000 3208.000000
3208.000000
3208.000000
mean 1604.500000 43.904613 12.363778 64.821397 273.826683 0.507481
std 926.214158 15.175543 6.966723 20.180945 129.579639 0.500022
min 1.000000 18.000000 1.000000 30.020000 50.000000 0.000000
25% 802.750000 31.000000 6.000000 47.202500 161.750000 0.000000
50% 1604.500000 44.000000 12.000000 64.665000 273.000000 1.000000
75% 2406.250000 57.000000 18.000000 82.635000 386.000000 1.000000
max 3208.000000 70.000000 24.000000 99.960000 500.000000 1.000000
In [7]:
plt.plot(data['Gender'],data['Churn'],label='male')
plt.legend()
plt.show()
In [8]:
plt.scatter(data['Age'],data['Churn'])
Out[8]:
<matplotlib.collections.PathCollection at 0x18d5ae35d30>
In [9]:
plt.hist(data['Churn'])
plt.show()
In [10]:
data.corr()
C:\Users\Sonu\AppData\Local\Temp\ipykernel_17892\2627137660.py:1: FutureWarning:
The default value of numeric_only in DataFrame.corr is deprecated. In a future
version, it will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
data.corr()
Out[10]:
CustomerID Age Subscription_Length_Months Monthly_Bill
Total_Usage_GB Churn
CustomerID 1.000000 0.011307 0.008368 0.029753 -0.006244 -0.000607
Age 0.011307 1.000000 -0.009815 0.020243 -0.024137 -0.007344
Subscription_Length_Months 0.008368 -0.009815 1.000000 -0.040562
-0.035997 -0.005481
Monthly_Bill 0.029753 0.020243 -0.040562 1.000000 -0.022275
0.002809
Total_Usage_GB -0.006244 -0.024137 -0.035997 -0.022275 1.000000
0.016984
Churn -0.000607 -0.007344 -0.005481 0.002809 0.016984 1.000000
In [11]:
data['Churn'].value_counts()
Out[11]:
1 1628
0 1580
Name: Churn, dtype: int64
train_test_split¶ <#train_test_split>
In [12]:
from sklearn.model_selection import train_test_split
In [13]:
train_df,test_df=train_test_split(data,test_size=0.2,random_state=42)
In [14]:
train_df.shape
Out[14]:
(2566, 9)
In [15]:
data.shape
Out[15]:
(3208, 9)
In [16]:
test_df.shape
Out[16]:
(642, 9)
In [17]:
input_cols=list(data.columns)[1:-1]
In [18]:
input_cols
Out[18]:
['Name',
'Age',
'Gender',
'Location',
'Subscription_Length_Months',
'Monthly_Bill',
'Total_Usage_GB']
In [19]:
target_cols='Churn'
In [20]:
target_cols
Out[20]:
'Churn'
In [22]:
train_inputs=train_df[input_cols].copy()
In [44]:
test_inputs=test_df[input_cols].copy()
In [58]:
train_targets=train_df[target_cols].copy()
In [69]:
test_targets=test_df[target_cols].copy()
In [25]:
train_inputs
Out[25]:
Name Age Gender Location Subscription_Length_Months Monthly_Bill
Total_Usage_GB
460 Customer_461 18 Male Houston 12 65.83 156
43 Customer_44 20 Female Miami 20 79.84 122
2438 Customer_2439 53 Female New York 9 60.10 285
... ... ... ... ... ... ... ...
1130 Customer_1131 36 Female Houston 10 45.99 100
860 Customer_861 55 Male Chicago 12 89.19 315
3174 Customer_3175 30 Female Los Angeles 18 99.42 345
2566 rows × 7 columns
In [59]:
train_targets
Out[59]:
460 1
43 1
87 0
2438 0
990 0
..
1095 1
1130 0
1294 0
860 1
3174 1
Name: Churn, Length: 2566, dtype: int64
In [60]:
numerical_cols=train_inputs.select_dtypes(include=np.number).columns.tolist()
In [28]:
categoric_cols=train_inputs.select_dtypes(include='object').columns.tolist()
In [30]:
categoric_cols
Out[30]:
['Name', 'Gender', 'Location']
In [31]:
numerical_cols
Out[31]:
['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']
In [32]:
train_inputs[numerical_cols].describe()
Out[32]:
Age Subscription_Length_Months Monthly_Bill Total_Usage_GB
count 2566.000000 2566.000000 2566.000000 2566.000000
mean 43.594310 12.286048 64.677689 272.079891
std 15.142142 6.999583 20.116820 129.322741
min 18.000000 1.000000 30.020000 50.000000
25% 31.000000 6.000000 47.302500 161.000000
50% 44.000000 12.000000 64.100000 269.000000
75% 56.000000 18.000000 82.390000 383.000000
max 70.000000 24.000000 99.960000 500.000000
OneHotEncoding of categorical columns¶

<#OneHotEncoding-of-categorical-columns>
In [33]:
from sklearn.preprocessing import OneHotEncoder
In [34]:
encoder=OneHotEncoder(sparse=False,handle_unknown='ignore')
In [35]:
encoder.fit(data[categoric_cols])
C:\Users\Sonu\anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py:868:
FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be
removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default
value.
warnings.warn(
Out[35]:
OneHotEncoder(handle_unknown='ignore', sparse=False, sparse_output=False)
*In a Jupyter environment, please rerun this cell to show the HTML
representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try
loading this page with nbviewer.org.*
OneHotEncoder
OneHotEncoder(handle_unknown='ignore', sparse=False, sparse_output=False)
In [36]:
encoder.categories_
Out[36]:
[array(['Customer_1', 'Customer_10', 'Customer_100', ..., 'Customer_997',

'Customer_998', 'Customer_999'], dtype=object),
array(['Female', 'Male'], dtype=object),
array(['Chicago', 'Houston', 'Los Angeles', 'Miami', 'New York'],
dtype=object)]
In [37]:
encoded_cols=list(encoder.get_feature_names_out(categoric_cols))
print(encoded_cols)train_inputs[encoded_cols]=encoder.transform(train_inputs[catego
ric_cols])test_inputs[encoded_cols]=encoder.transform(test_inputs[categoric_cols])
In [47]:
train_inputs[encoded_cols]
Out[47]:
Name_Customer_1 Name_Customer_10 Name_Customer_100
Name_Customer_1006 ... Name_Customer_997 Name_Customer_998
Name_Customer_999 Gender_Female Gender_Male Location_Chicago
Location_Houston Location_Los Angeles Location_Miami Location_New York
460 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
43 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0
1.0 0.0 0.0 0.0 0.0 1.0 0.0
87 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0
0.0 1.0 0.0 1.0 0.0 0.0 0.0
2438 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
990 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
1095 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
1130 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
1294 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
860 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
3174 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
In [48]:
train_inputs[numerical_cols+encoded_cols]
Out[48]:
Name_Customer_1 Name_Customer_10 Name_Customer_100 Name_Customer_1000
Name_Customer_1001 Name_Customer_1002 ... Name_Customer_997
Name_Customer_998 Name_Customer_999 Gender_Female Gender_Male
Location_Chicago Location_Houston Location_Los Angeles Location_Miami
Location_New York
460 18 12 65.83 156 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
43 20 20 79.84 122 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
1.0 0.0 0.0 0.0 0.0 1.0 0.0
87 50 10 49.00 114 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
0.0 1.0 0.0 1.0 0.0 0.0 0.0
2438 53 9 60.10 285 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
990 68 22 84.18 228 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
1095 60 10 72.52 191 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
1130 36 10 45.99 100 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
1294 33 6 75.81 324 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
860 55 12 89.19 315 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
3174 30 18 99.42 345 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
In [50]:
test_inputs[encoded_cols]
Out[50]:
Name_Customer_1006 ... Name_Customer_997 Name_Customer_998
Name_Customer_999 Gender_Female Gender_Male Location_Chicago
Location_Houston Location_Los Angeles Location_Miami Location_New York
610 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
3051 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
2905 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
2891 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
2886 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
1027 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
1611 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
3122 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
420 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
2353 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
In [51]:
test_inputs.shape
Out[51]:
(642, 3222)
In [52]:
train_inputs.shape
Out[52]:
(2566, 3222)
In [53]:
test_inputs[numerical_cols+encoded_cols]
Out[53]:
Name_Customer_1 Name_Customer_10 Name_Customer_100 Name_Customer_1000
Name_Customer_1001 Name_Customer_1002 ... Name_Customer_997
Name_Customer_998 Name_Customer_999 Gender_Female Gender_Male
Location_Chicago Location_Houston Location_Los Angeles Location_Miami
Location_New York
610 35 9 33.59 282 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
1.0 0.0 0.0 0.0 0.0 0.0 1.0
3051 20 10 37.27 335 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
2905 67 20 57.08 104 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
2891 63 1 67.51 58 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
1.0 0.0 0.0 1.0 0.0 0.0 0.0
2886 49 16 55.84 202 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
1027 51 23 55.85 306 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
1611 50 5 75.53 466 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
3122 51 6 43.39 300 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
420 46 24 34.14 387 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0
0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
2353 68 9 97.83 90 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0
0.0 0.0
0.0 1.0 1.0 0.0 0.0 0.0 0.0
Decision Tree Regression¶ <#Decision-Tree-Regression>
In [54]:
from sklearn.tree import DecisionTreeRegressor
In [55]:
model=DecisionTreeRegressor()
In [61]:
# model fit in data

model.fit(train_inputs[numerical_cols+encoded_cols],train_targets)
Out[61]:
DecisionTreeRegressor()
DecisionTreeRegressor
In [63]:
train_preds=model.predict(train_inputs[numerical_cols+encoded_cols])
Importing metrics to evalute accuracy_score of prediction¶

<#Importing-metrics-to-evalute-accuracy_score-of-prediction>
In [65]:
from sklearn.metrics import r2_score
In [66]:
r2_score(train_targets,train_preds)
Out[66]:
1.0
In [73]:
model.fit(test_inputs[numerical_cols+encoded_cols],test_targets)
Out[73]:
DecisionTreeRegressor
In [74]:
test_preds=model.predict(test_inputs[numerical_cols+encoded_cols])
In [75]:
r2_score(test_targets,test_preds)
Out[75]:
1.0
In [ ]:

Sunbase Data Assignment

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Sunbase Data Assignment

Uploaded by

Copyright:

Available Formats

In [1]:

1. Data Preprocessing:¶ <#1.-Data-Preprocessing:>

# there is no null value in data and no missing value in data.

from sklearn.model_selection import train_test_split

2566 rows × 7 columns

['Name', 'Gender', 'Location']

['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']

OneHotEncoding of categorical columns¶

from sklearn.preprocessing import OneHotEncoder

OneHotEncoder(handle_unknown='ignore', sparse=False, sparse_output=False)

OneHotEncoder(handle_unknown='ignore', sparse=False, sparse_output=False)

[array(['Customer_1', 'Customer_10', 'Customer_100', ..., 'Customer_997',

2566 rows × 3215 columns

2566 rows × 3219 columns

642 rows × 3215 columns

642 rows × 3219 columns

Decision Tree Regression¶ <#Decision-Tree-Regression>

from sklearn.tree import DecisionTreeRegressor

# model fit in data

Importing metrics to evalute accuracy_score of prediction¶

from sklearn.metrics import r2_score

You might also like