LDA Code


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
In [145]:
# Loading the data set
cmd = pd.read_excel('Contraceptive_method_dataset.xlsx')
In [146]:
cmd.head().T
Out[146]:
                                     0            1            2            3            4
Wife_age                          24.0         45.0         43.0         42.0         36.0
Wife_ education                Primary   Uneducated      Primary    Secondary    Secondary
Husband_education            Secondary    Secondary    Secondary      Primary    Secondary
No_of_children_born                3.0         10.0          7.0          9.0          8.0
Wife_religion              Scientology  Scientology  Scientology  Scientology  Scientology
Wife_Working                        No           No           No           No           No
Husband_Occupation                   2            3            3            3            3
Standard_of_living_index          High    Very High    Very High         High          Low
Media_exposure                 Exposed      Exposed      Exposed      Exposed      Exposed
Contraceptive_method_used           No           No           No           No           No

In [147]:
cmd.shape
Out[147]:
(1473, 10)
In [148]:
cmd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Wife_age 1402 non-null float64
1 Wife_ education 1473 non-null object
2 Husband_education 1473 non-null object
3 No_of_children_born 1452 non-null float64
4 Wife_religion 1473 non-null object
5 Wife_Working 1473 non-null object
6 Husband_Occupation 1473 non-null int64
7 Standard_of_living_index 1473 non-null object
8 Media_exposure 1473 non-null object
9 Contraceptive_method_used 1473 non-null object
dtypes: float64(2), int64(1), object(7)
memory usage: 115.2+ KB
In [149]:
cmd.describe().transpose()
Out[149]:
                      count       mean       std   min   25%   50%   75%   max
Wife_age             1402.0  32.606277  8.274927  16.0  26.0  32.0  39.0  49.0
No_of_children_born  1452.0   3.254132  2.365212   0.0   1.0   3.0   4.0  16.0
Husband_Occupation   1473.0   2.137814  0.864857   1.0   1.0   2.0   3.0   4.0

In [150]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[150]:
Wife_age 71
Wife_ education 0
Husband_education 0
No_of_children_born 21
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64
In [151]:
dups = cmd.duplicated()
print("There are ",cmd.duplicated().sum()," duplicates")
cmd[dups]
There are 80 duplicates
Out[151]:
      Wife_age Wife_ education Husband_education  No_of_children_born    Wife_religion Wife_Working  Husband_Occupation Standard_of_living_index Media_exposure Contraceptive_method_used
79        38.0        Tertiary          Tertiary                  1.0      Scientology          Yes                   1                Very High        Exposed                        No
167       26.0        Tertiary          Tertiary                  1.0      Scientology           No                   1                Very High        Exposed                        No
224       47.0        Tertiary          Tertiary                  4.0      Scientology           No                   1                Very High        Exposed                        No
270       30.0        Tertiary          Tertiary                  2.0      Scientology           No                   1                Very High        Exposed                        No
299       26.0        Tertiary          Tertiary                  1.0      Scientology           No                   1                Very High        Exposed                        No
...        ...             ...               ...                  ...              ...          ...                 ...                      ...            ...                       ...
1367      44.0        Tertiary          Tertiary                  5.0      Scientology          Yes                   1                Very High        Exposed                       Yes
1387       NaN       Secondary          Tertiary                  2.0      Scientology          Yes                   2                Very High        Exposed                       Yes
1423       NaN        Tertiary          Tertiary                  2.0  Non-Scientology           No                   1                Very High        Exposed                       Yes
1440       NaN        Tertiary          Tertiary                  1.0  Non-Scientology          Yes                   2                Very High        Exposed                       Yes
1447       NaN        Tertiary          Tertiary                  2.0  Non-Scientology          Yes                   2                Very High        Exposed                       Yes

80 rows × 10 columns

In [152]:
# Removing the duplicate value
cmd.drop_duplicates(inplace=True)
In [153]:
dups = cmd.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
cmd[dups]
Number of duplicate rows = 0
Out[153]:
Empty DataFrame
Columns: [Wife_age, Wife_ education, Husband_education, No_of_children_born, Wife_religion, Wife_Working, Husband_Occupation, Standard_of_living_index, Media_exposure, Contraceptive_method_used]
Index: []

In [154]:
# To print the number of rows and columns in the dataset after removing the duplicates
print("no.of rows: ", cmd.shape[0], "no.of columns:", cmd.shape[1])
no.of rows: 1393 no.of columns: 10
In [155]:
# To describe the dataset
cmd.describe().T
Out[155]:
                      count       mean       std   min   25%   50%   75%   max
Wife_age             1326.0  32.557315  8.289259  16.0  26.0  32.0  39.0  49.0
No_of_children_born  1372.0   3.290816  2.399697   0.0   1.0   3.0   5.0  16.0
Husband_Occupation   1393.0   2.174444  0.854590   1.0   1.0   2.0   3.0   4.0

In [156]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[156]:
Wife_age 67
Wife_ education 0
Husband_education 0
No_of_children_born 21
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64
In [157]:
# Replace the null values with the column median
cmd[['Wife_age', 'No_of_children_born']] = cmd[['Wife_age', 'No_of_children_born']].fillna(
    cmd[['Wife_age', 'No_of_children_born']].median())
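
As an alternative to the fillna() call above (a minimal sketch, assuming scikit-learn's impute module is acceptable here), SimpleImputer performs the same median fill while keeping the learned medians reusable for new data:

# Sketch: median imputation via SimpleImputer (equivalent to the fillna() above)
from sklearn.impute import SimpleImputer

num_cols = ['Wife_age', 'No_of_children_born']
imputer = SimpleImputer(strategy='median')
# fit_transform learns each column's median and fills its NaNs in one step
cmd[num_cols] = imputer.fit_transform(cmd[num_cols])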
In [158]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[158]:
Wife_age 0
Wife_ education 0
Husband_education 0
No_of_children_born 0
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64

Outlier Treatment
In [85]:
cmd_plot = cmd.select_dtypes(include = ['float64', 'int64'])
In [162]:
# Check for the presence of outliers in each feature
plt.figure(figsize=(10, 8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
    plt.subplot(4, 3, i + 1)
    sns.boxplot(y=cmd_plot[feature_list[i]], data=cmd_plot)
    plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()
In [163]:
# Define a function that returns the lower and upper limits used to detect outliers
def remove_outlier(col):
    Q1, Q3 = col.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range
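
As a quick sanity check (an illustrative call, not part of the original run), the function can be applied to a single column to inspect the bounds it produces:

# Illustrative: print the IQR-based bounds for one column
low, high = remove_outlier(cmd_plot['Wife_age'])
print('Wife_age outlier bounds: [{:.1f}, {:.1f}]'.format(low, high))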
In [164]:
# Cap & floor the values beyond the outlier boundaries
for i in feature_list:
    LL, UL = remove_outlier(cmd_plot[i])
    cmd_plot[i] = np.where(cmd_plot[i] > UL, UL, cmd_plot[i])
    cmd_plot[i] = np.where(cmd_plot[i] < LL, LL, cmd_plot[i])
In [166]:
# Check again for the presence of outliers in each feature
plt.figure(figsize=(10, 8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
    plt.subplot(4, 3, i + 1)
    sns.boxplot(y=cmd_plot[feature_list[i]], data=cmd_plot)
    plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()
In [93]:
# Univariate Analysis
plt.figure(figsize=(12, 15))
feature_list = cmd.columns
for i in range(len(feature_list)):
    plt.subplot(4, 3, i + 1)
    sns.histplot(y=cmd[feature_list[i]], data=cmd)
    plt.title('Distribution of {}'.format(feature_list[i]))
plt.tight_layout()
In [94]:
# Bivariate Analysis: sns.pairplot creates its own figure, so no plt.figure() call is needed
sns.pairplot(cmd, diag_kind="kde")
plt.show()
In [95]:
cmd.corr()
Out[95]:
                     Wife_age  No_of_children_born  Husband_Occupation
Wife_age             1.000000             0.528428           -0.187070
No_of_children_born  0.528428             1.000000           -0.024213
Husband_Occupation  -0.187070            -0.024213            1.000000

In [96]:
# Multivariate Analysis
plt.figure(figsize=(12,8))
sns.heatmap(cmd.iloc[:, 0:10].corr(),annot=True)
plt.show()

In [99]:
# The following code loops through each column, checks if the column type is object,
# and converts those columns into categoricals, with each distinct value becoming a category.
for feature in cmd.columns:
    if cmd[feature].dtype == 'object':
        print('\n')
        print('feature:', feature)
        print(pd.Categorical(cmd[feature].unique()))
        print(pd.Categorical(cmd[feature].unique()).codes)
        cmd[feature] = pd.Categorical(cmd[feature]).codes

feature: Husband_education
['Secondary', 'Primary', 'Tertiary', 'Uneducated']
Categories (4, object): ['Primary', 'Secondary', 'Tertiary', 'Uneducated']
[1 0 2 3]

feature: Wife_religion
['Scientology', 'Non-Scientology']
Categories (2, object): ['Non-Scientology', 'Scientology']
[1 0]

feature: Wife_Working
['No', 'Yes']
Categories (2, object): ['No', 'Yes']
[0 1]

feature: Standard_of_living_index
['High', 'Very High', 'Low', 'Very Low']
Categories (4, object): ['High', 'Low', 'Very High', 'Very Low']
[0 2 1 3]

feature: Media_exposure
['Exposed', 'Not-Exposed']
Categories (2, object): ['Exposed', 'Not-Exposed']
[0 1]

feature: Contraceptive_method_used
['No', 'Yes']
Categories (2, object): ['No', 'Yes']
[0 1]
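
One caveat of encoding in place is that the original labels are discarded. A minimal sketch (illustrative, not what the notebook ran) of the same loop that also records each column's code-to-label mapping before overwriting it:

# Sketch: encode object columns while keeping the code -> label mapping
mappings = {}
for feature in cmd.columns:
    if cmd[feature].dtype == 'object':
        cat = pd.Categorical(cmd[feature])
        mappings[feature] = dict(enumerate(cat.categories))
        cmd[feature] = cat.codes

# e.g. mappings['Wife_religion'] -> {0: 'Non-Scientology', 1: 'Scientology'}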
In [100]:
cmd.head().T
Out[100]:
                              0     1     2     3     4
Wife_age                   24.0  45.0  43.0  42.0  36.0
Wife_ education             2.0   1.0   2.0   3.0   3.0
Husband_education           1.0   1.0   1.0   0.0   1.0
No_of_children_born         3.0  10.0   7.0   9.0   8.0
Wife_religion               1.0   1.0   1.0   1.0   1.0
Wife_Working                0.0   0.0   0.0   0.0   0.0
Husband_Occupation          2.0   3.0   3.0   3.0   3.0
Standard_of_living_index    0.0   2.0   2.0   0.0   1.0
Media_exposure              0.0   0.0   0.0   0.0   0.0
Contraceptive_method_used   0.0   0.0   0.0   0.0   0.0

In [102]:
cmd.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1393 entries, 0 to 1472
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Wife_age 1393 non-null float64
1 Wife_ education 1393 non-null int64
2 Husband_education 1393 non-null int8
3 No_of_children_born 1393 non-null float64
4 Wife_religion 1393 non-null int8
5 Wife_Working 1393 non-null int8
6 Husband_Occupation 1393 non-null int64
7 Standard_of_living_index 1393 non-null int8
8 Media_exposure 1393 non-null int8
9 Contraceptive_method_used 1393 non-null int8
dtypes: float64(2), int64(2), int8(6)
memory usage: 62.6 KB
In [109]:
# Check for the presence of outliers in each feature
cmd_plot = cmd.select_dtypes(include=['float64', 'int64', 'int8'])
plt.figure(figsize=(12, 8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
    plt.subplot(3, 4, i + 1)
    sns.boxplot(y=cmd_plot[feature_list[i]], data=cmd_plot)
    plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()

In [110]:
# Define a function that returns the lower and upper limits used to detect outliers
def remove_outlier(col):
    Q1, Q3 = col.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range
In [111]:
# Cap & floor the values beyond the outlier boundaries
for i in feature_list:
    LL, UL = remove_outlier(cmd[i])
    cmd[i] = np.where(cmd[i] > UL, UL, cmd[i])
    cmd[i] = np.where(cmd[i] < LL, LL, cmd[i])
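
Note that feature_list now covers every column, so the capping also touches the encoded categorical features (e.g. a rare Media_exposure value can be clipped away as an "outlier"). A variant (a sketch, not the original run) that restricts capping to the genuinely continuous columns:

# Sketch: cap only the continuous columns, leaving encoded categoricals untouched
continuous_cols = ['Wife_age', 'No_of_children_born']  # assumed continuous features
for col in continuous_cols:
    LL, UL = remove_outlier(cmd[col])
    cmd[col] = cmd[col].clip(lower=LL, upper=UL)  # clip() is equivalent to the two np.where calls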
In [112]:
# Check again for the presence of outliers in each feature
plt.figure(figsize=(12, 8))
feature_list = cmd.columns
for i in range(len(feature_list)):
    plt.subplot(3, 4, i + 1)
    sns.boxplot(y=cmd[feature_list[i]], data=cmd)
    plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()

In [113]:
# Check the heatmap of correlations
plt.figure(figsize=(12, 8))
sns.heatmap(cmd.iloc[:, 0:10].corr(), annot=True)
plt.show()
CART
In [114]:
#Creating a backup
cmd_1 = cmd.copy()
In [115]:
# Separate the predictors from the target column ("Contraceptive_method_used")
X = cmd_1.drop("Contraceptive_method_used", axis=1)
y = cmd_1.pop("Contraceptive_method_used")
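
Before splitting, it can help to confirm the target classes are reasonably balanced (an illustrative check, not in the original notebook):

# Illustrative: inspect the class balance of the target
print(y.value_counts(normalize=True))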
In [116]:
from sklearn.model_selection import train_test_split

X_train, X_test, train_labels, test_labels = train_test_split(X, y, test_size=0.30,
                                                              random_state=1)
In [117]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion='gini')
In [118]:
dt_model.fit(X_train, train_labels)
Out[118]:
DecisionTreeClassifier()
In [121]:
from sklearn import tree

train_char_label = ['No', 'Yes']

ld_Tree_File = open('ld_Tree_File.dot', 'w')
dot_data = tree.export_graphviz(dt_model,
                                out_file=ld_Tree_File,
                                feature_names=list(X_train),
                                class_names=list(train_char_label))
ld_Tree_File.close()
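
Rendering the .dot file requires Graphviz. As a lighter-weight alternative (a sketch, assuming scikit-learn >= 0.21), the fitted tree can be drawn inline with plot_tree:

# Sketch: draw the fitted tree inline without Graphviz
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(dt_model,
          feature_names=list(X_train.columns),
          class_names=train_char_label,
          filled=True,
          max_depth=3)  # max_depth here only limits the drawing, not the model
plt.show()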
In [122]:
print(pd.DataFrame(dt_model.feature_importances_, columns=["Imp"],
                   index=X_train.columns).sort_values('Imp', ascending=False))
Imp
Wife_age 0.313523
No_of_children_born 0.261942
Standard_of_living_index 0.109489
Wife_ education 0.103750
Husband_Occupation 0.097337
Wife_Working 0.064697
Husband_education 0.049262
Wife_religion 0.000000
Media_exposure 0.000000
In [123]:
y_predict = dt_model.predict(X_test)
In [124]:
y_predict.shape
Out[124]:
(418,)
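
Before regularising, it is worth quantifying how much the unrestricted tree overfits (an illustrative check, not part of the original run):

# Illustrative: compare train vs. test accuracy of the unpruned tree
from sklearn.metrics import accuracy_score

print('Train accuracy:', accuracy_score(train_labels, dt_model.predict(X_train)))
print('Test accuracy :', accuracy_score(test_labels, y_predict))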

Regularising the Decision Tree


Adding Tuning Parameters

In [125]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              # cost complexity parameter: greater values of ccp_alpha
              # increase the number of nodes pruned
              'ccp_alpha': [0.1, 0.01, 0.001],
              'max_depth': [1, 5, 10, 15, 20],
              'min_samples_leaf': [1, 5, 10, 15, 20],
              'criterion': ['gini', 'entropy']}

tree_clas = DecisionTreeClassifier(random_state=1024)
grid_search = GridSearchCV(estimator=tree_clas, param_grid=param_grid,
                           cv=5, verbose=True)
grid_search.fit(X_train, train_labels)
Fitting 5 folds for each of 450 candidates, totalling 2250 fits
Out[125]:
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1024),
             param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 5, 10, 15, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 5, 10, 15, 20]},
             verbose=True)
In [126]:
grid_search.best_estimator_
Out[126]:
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=10,
                       max_features='auto', min_samples_leaf=20,
                       random_state=1024)
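
To read off the winning combination and its cross-validated score directly (an illustrative check, not part of the original run):

# Illustrative: inspect the best hyperparameters and their mean CV accuracy
print(grid_search.best_params_)
print('Best CV accuracy: {:.3f}'.format(grid_search.best_score_))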
In [127]:
# Refit with the chosen parameters (note: max_depth=15 is used here, although the
# grid search selected max_depth=10 with min_samples_leaf=20)
reg_dt_model = DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy',
                                      max_depth=15, max_features='auto',
                                      random_state=1024)
reg_dt_model.fit(X_train, train_labels)
Out[127]:
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=15,
                       max_features='auto', random_state=1024)
In [128]:
ld_tree_regularized = open('ld_tree_regularized.dot', 'w')
# export_graphviz returns None when out_file is given, so no value is kept here
tree.export_graphviz(reg_dt_model, out_file=ld_tree_regularized,
                     feature_names=list(X_train),
                     class_names=list(train_char_label))
ld_tree_regularized.close()
In [129]:
print(pd.DataFrame(reg_dt_model.feature_importances_, columns=["Imp"],
                   index=X_train.columns).sort_values('Imp', ascending=False))
Imp
Wife_age 0.335927
No_of_children_born 0.301169
Standard_of_living_index 0.112077
Husband_Occupation 0.075047
Husband_education 0.066805
Wife_ education 0.062557
Wife_Working 0.046418
Wife_religion 0.000000
Media_exposure 0.000000
In [130]:
ytrain_predict = reg_dt_model.predict(X_train)
ytest_predict = reg_dt_model.predict(X_test)
In [131]:
# AUC and ROC for the training data
# predict probabilities
probs = reg_dt_model.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(train_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(train_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.989

In [132]:
# AUC and ROC for the test data
# predict probabilities
probs = reg_dt_model.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.621
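
The gap between the train AUC (0.989) and test AUC (0.621) shows the tree is still overfitting. The two cells above also duplicate the same logic; a small helper (a sketch, not in the original notebook) makes the train/test comparison one call each:

# Sketch: reusable AUC/ROC helper to avoid duplicating the plotting code
from sklearn.metrics import roc_auc_score, roc_curve

def plot_roc(model, X, y, label):
    probs = model.predict_proba(X)[:, 1]  # keep positive-class probabilities only
    fpr, tpr, _ = roc_curve(y, probs)
    auc = roc_auc_score(y, probs)
    plt.plot(fpr, tpr, marker='.', label='{} (AUC = {:.3f})'.format(label, auc))

plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
plot_roc(reg_dt_model, X_train, train_labels, 'train')
plot_roc(reg_dt_model, X_test, test_labels, 'test')
plt.legend()
plt.show()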
In [133]:
# Import the classification_report & Confusion_matrix library
from sklearn.metrics import classification_report,confusion_matrix

# To view the confusion matrix for train_labels, ytrain_predict
# (note: the bare confusion_matrix(...) call below is discarded by the notebook;
# wrap it in print(...) to actually see the matrix)
confusion_matrix(train_labels, ytrain_predict)
print(classification_report(train_labels, ytrain_predict))
              precision    recall  f1-score   support

         0.0       0.93      0.92      0.93       422
         1.0       0.94      0.95      0.94       553

    accuracy                           0.94       975
   macro avg       0.94      0.93      0.94       975
weighted avg       0.94      0.94      0.94       975

In [134]:
# To view the confusion matrix for test_labels, ytest_predict
# (again, wrap confusion_matrix(...) in print(...) to actually see the matrix)
confusion_matrix(test_labels, ytest_predict)
print(classification_report(test_labels, ytest_predict))
              precision    recall  f1-score   support

         0.0       0.60      0.55      0.57       192
         1.0       0.64      0.69      0.67       226

    accuracy                           0.62       418
   macro avg       0.62      0.62      0.62       418
weighted avg       0.62      0.62      0.62       418
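
To visualize the test confusion matrix graphically (an illustrative addition, not part of the original run):

# Illustrative: show the test confusion matrix as an annotated heatmap
cm = confusion_matrix(test_labels, ytest_predict)
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()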

In [135]:
# To view the model score for X_train, train_labels
reg_dt_model.score(X_train,train_labels)
Out[135]:
0.9364102564102564
In [136]:
# To view the model score for X_test, test_labels
reg_dt_model.score(X_test,test_labels)
Out[136]:
0.6244019138755981

Logistic Regression
In [137]:
# Creating a copy of the original data frame
cmd_2 = cmd.copy()
# Import the LabelEncoder library
from sklearn.preprocessing import LabelEncoder
# Defining a Label Encoder object instance
LE = LabelEncoder()
LE
Out[137]:
LabelEncoder()
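
A minimal continuation sketch (assumed, not from the original notebook): the features are already numerically encoded, so LabelEncoder is optional here, and the LogisticRegression imported at the top can be fit on the same split used for CART:

# Sketch (assumed continuation): fit a logistic regression on the existing split
log_model = LogisticRegression(max_iter=1000)  # larger max_iter helps convergence on unscaled data
log_model.fit(X_train, train_labels)
print('Train accuracy:', log_model.score(X_train, train_labels))
print('Test accuracy :', log_model.score(X_test, test_labels))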
