LDA Code


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
In [145]:
# Loading the data set
cmd = pd.read_excel('Contraceptive_method_dataset.xlsx')
In [146]:
cmd.head().T
Out[146]:
                                     0            1            2            3            4
Wife_age                          24.0         45.0         43.0         42.0         36.0
Wife_ education                Primary   Uneducated      Primary    Secondary    Secondary
Husband_education            Secondary    Secondary    Secondary      Primary    Secondary
No_of_children_born                3.0         10.0          7.0          9.0          8.0
Wife_religion              Scientology  Scientology  Scientology  Scientology  Scientology
Wife_Working                        No           No           No           No           No
Husband_Occupation                   2            3            3            3            3
Standard_of_living_index          High    Very High    Very High         High          Low
Media_exposure                 Exposed      Exposed      Exposed      Exposed      Exposed
Contraceptive_method_used           No           No           No           No           No

In [147]:
cmd.shape
Out[147]:
(1473, 10)
In [148]:
cmd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Wife_age 1402 non-null float64
1 Wife_ education 1473 non-null object
2 Husband_education 1473 non-null object
3 No_of_children_born 1452 non-null float64
4 Wife_religion 1473 non-null object
5 Wife_Working 1473 non-null object
6 Husband_Occupation 1473 non-null int64
7 Standard_of_living_index 1473 non-null object
8 Media_exposure 1473 non-null object
9 Contraceptive_method_used 1473 non-null object
dtypes: float64(2), int64(1), object(7)
memory usage: 115.2+ KB
In [149]:
cmd.describe().transpose()
Out[149]:
                      count       mean       std   min   25%   50%   75%   max
Wife_age             1402.0  32.606277  8.274927  16.0  26.0  32.0  39.0  49.0
No_of_children_born  1452.0   3.254132  2.365212   0.0   1.0   3.0   4.0  16.0
Husband_Occupation   1473.0   2.137814  0.864857   1.0   1.0   2.0   3.0   4.0

In [150]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[150]:
Wife_age 71
Wife_ education 0
Husband_education 0
No_of_children_born 21
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64
In [151]:
dups = cmd.duplicated()
print("There are ",cmd.duplicated().sum()," duplicates")
cmd[dups]
There are 80 duplicates
Out[151]:
      Wife_age Wife_ education Husband_education  No_of_children_born    Wife_religion Wife_Working  Husband_Occupation Standard_of_living_index Media_exposure Contraceptive_method_used
79        38.0        Tertiary          Tertiary                  1.0      Scientology          Yes                   1                Very High        Exposed                        No
167       26.0        Tertiary          Tertiary                  1.0      Scientology           No                   1                Very High        Exposed                        No
224       47.0        Tertiary          Tertiary                  4.0      Scientology           No                   1                Very High        Exposed                        No
270       30.0        Tertiary          Tertiary                  2.0      Scientology           No                   1                Very High        Exposed                        No
299       26.0        Tertiary          Tertiary                  1.0      Scientology           No                   1                Very High        Exposed                        No
...        ...             ...               ...                  ...              ...          ...                 ...                      ...            ...                       ...
1367      44.0        Tertiary          Tertiary                  5.0      Scientology          Yes                   1                Very High        Exposed                       Yes
1387       NaN       Secondary          Tertiary                  2.0      Scientology          Yes                   2                Very High        Exposed                       Yes
1423       NaN        Tertiary          Tertiary                  2.0  Non-Scientology           No                   1                Very High        Exposed                       Yes
1440       NaN        Tertiary          Tertiary                  1.0  Non-Scientology          Yes                   2                Very High        Exposed                       Yes
1447       NaN        Tertiary          Tertiary                  2.0  Non-Scientology          Yes                   2                Very High        Exposed                       Yes

80 rows × 10 columns

In [152]:
# Removing the duplicate value
cmd.drop_duplicates(inplace=True)
In [153]:
dups = cmd.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
cmd[dups]
Number of duplicate rows = 0
Out[153]:
Empty DataFrame
Columns: [Wife_age, Wife_ education, Husband_education, No_of_children_born, Wife_religion, Wife_Working, Husband_Occupation, Standard_of_living_index, Media_exposure, Contraceptive_method_used]
Index: []

In [154]:
# To print the number of rows and columns in the dataset after removing the duplicates
print("no.of rows: ", cmd.shape[0], "no.of columns:", cmd.shape[1])
no.of rows: 1393 no.of columns: 10
In [155]:
# To describe the dataset
cmd.describe().T
Out[155]:
                      count       mean       std   min   25%   50%   75%   max
Wife_age             1326.0  32.557315  8.289259  16.0  26.0  32.0  39.0  49.0
No_of_children_born  1372.0   3.290816  2.399697   0.0   1.0   3.0   5.0  16.0
Husband_Occupation   1393.0   2.174444  0.854590   1.0   1.0   2.0   3.0   4.0

In [156]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[156]:
Wife_age 67
Wife_ education 0
Husband_education 0
No_of_children_born 21
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64
In [157]:
# Replace the null values with the column median
cmd[['Wife_age', 'No_of_children_born']] = cmd[['Wife_age', 'No_of_children_born']].fillna(
    cmd[['Wife_age', 'No_of_children_born']].median())
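
As an alternative to the fillna() call above (a minimal sketch, assuming scikit-learn's impute module is acceptable here), SimpleImputer performs the same median fill while keeping the learned medians reusable for new data:

# Sketch: median imputation via SimpleImputer (equivalent to the fillna() above)
from sklearn.impute import SimpleImputer

num_cols = ['Wife_age', 'No_of_children_born']
imputer = SimpleImputer(strategy='median')
# fit_transform learns each column's median and fills its NaNs in one step
cmd[num_cols] = imputer.fit_transform(cmd[num_cols])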
In [158]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[158]:
Wife_age 0
Wife_ education 0
Husband_education 0
No_of_children_born 0
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64

Outlier Treatment
In [85]:
cmd_plot = cmd.select_dtypes(include = ['float64', 'int64'])
In [162]:
# Check for the presence of outliers in each feature
plt.figure(figsize=(10, 8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
    plt.subplot(4, 3, i + 1)
    sns.boxplot(y=cmd_plot[feature_list[i]], data=cmd_plot)
    plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()
In [163]:
# Define a function that returns the lower and upper limits used to detect outliers
def remove_outlier(col):
    Q1, Q3 = col.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range
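
As a quick sanity check (an illustrative call, not part of the original run), the function can be applied to a single column to inspect the bounds it produces:

# Illustrative: print the IQR-based bounds for one column
low, high = remove_outlier(cmd_plot['Wife_age'])
print('Wife_age outlier bounds: [{:.1f}, {:.1f}]'.format(low, high))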
In [164]:
# Cap & floor the values beyond the outlier boundaries
for i in feature_list:
    LL, UL = remove_outlier(cmd_plot[i])
    cmd_plot[i] = np.where(cmd_plot[i] > UL, UL, cmd_plot[i])
    cmd_plot[i] = np.where(cmd_plot[i] < LL, LL, cmd_plot[i])
In [166]:
# Check again for the presence of outliers in each feature
plt.figure(figsize=(10, 8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
    plt.subplot(4, 3, i + 1)
    sns.boxplot(y=cmd_plot[feature_list[i]], data=cmd_plot)
    plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()
In [93]:
# Univariate Analysis
plt.figure(figsize=(12, 15))
feature_list = cmd.columns
for i in range(len(feature_list)):
    plt.subplot(4, 3, i + 1)
    sns.histplot(y=cmd[feature_list[i]], data=cmd)
    plt.title('Distribution of {}'.format(feature_list[i]))
plt.tight_layout()
In [94]:
# Bivariate Analysis: sns.pairplot creates its own figure, so no plt.figure() call is needed
sns.pairplot(cmd, diag_kind="kde")
plt.show()
In [95]:
cmd.corr()
Out[95]:
                     Wife_age  No_of_children_born  Husband_Occupation
Wife_age             1.000000             0.528428           -0.187070
No_of_children_born  0.528428             1.000000           -0.024213
Husband_Occupation  -0.187070            -0.024213            1.000000

In [96]:
# Multivariate Analysis
plt.figure(figsize=(12,8))
sns.heatmap(cmd.iloc[:, 0:10].corr(),annot=True)
plt.show()

In [99]:
# The following code loops through each column, checks if the column type is object,
# and converts those columns into categoricals, with each distinct value becoming a category.
for feature in cmd.columns:
    if cmd[feature].dtype == 'object':
        print('\n')
        print('feature:', feature)
        print(pd.Categorical(cmd[feature].unique()))
        print(pd.Categorical(cmd[feature].unique()).codes)
        cmd[feature] = pd.Categorical(cmd[feature]).codes

feature: Husband_education
['Secondary', 'Primary', 'Tertiary', 'Uneducated']
Categories (4, object): ['Primary', 'Secondary', 'Tertiary', 'Uneducated']
[1 0 2 3]

feature: Wife_religion
['Scientology', 'Non-Scientology']
Categories (2, object): ['Non-Scientology', 'Scientology']
[1 0]

feature: Wife_Working
['No', 'Yes']
Categories (2, object): ['No', 'Yes']
[0 1]

feature: Standard_of_living_index
['High', 'Very High', 'Low', 'Very Low']
Categories (4, object): ['High', 'Low', 'Very High', 'Very Low']
[0 2 1 3]

feature: Media_exposure
['Exposed', 'Not-Exposed']
Categories (2, object): ['Exposed', 'Not-Exposed']
[0 1]

feature: Contraceptive_method_used
['No', 'Yes']
Categories (2, object): ['No', 'Yes']
[0 1]
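
One caveat of encoding in place is that the original labels are discarded. A minimal sketch (illustrative, not what the notebook ran) of the same loop that also records each column's code-to-label mapping before overwriting it:

# Sketch: encode object columns while keeping the code -> label mapping
mappings = {}
for feature in cmd.columns:
    if cmd[feature].dtype == 'object':
        cat = pd.Categorical(cmd[feature])
        mappings[feature] = dict(enumerate(cat.categories))
        cmd[feature] = cat.codes

# e.g. mappings['Wife_religion'] -> {0: 'Non-Scientology', 1: 'Scientology'}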
In [100]:
cmd.head().T
Out[100]:
                              0     1     2     3     4
Wife_age                   24.0  45.0  43.0  42.0  36.0
Wife_ education             2.0   1.0   2.0   3.0   3.0
Husband_education           1.0   1.0   1.0   0.0   1.0
No_of_children_born         3.0  10.0   7.0   9.0   8.0
Wife_religion               1.0   1.0   1.0   1.0   1.0
Wife_Working                0.0   0.0   0.0   0.0   0.0
Husband_Occupation          2.0   3.0   3.0   3.0   3.0
Standard_of_living_index    0.0   2.0   2.0   0.0   1.0
Media_exposure              0.0   0.0   0.0   0.0   0.0
Contraceptive_method_used   0.0   0.0   0.0   0.0   0.0

In [102]:
cmd.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1393 entries, 0 to 1472
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Wife_age 1393 non-null float64
1 Wife_ education 1393 non-null int64
2 Husband_education 1393 non-null int8
3 No_of_children_born 1393 non-null float64
4 Wife_religion 1393 non-null int8
5 Wife_Working 1393 non-null int8
6 Husband_Occupation 1393 non-null int64
7 Standard_of_living_index 1393 non-null int8
8 Media_exposure 1393 non-null int8
9 Contraceptive_method_used 1393 non-null int8
dtypes: float64(2), int64(2), int8(6)
memory usage: 62.6 KB
In [109]:
# Check for the presence of outliers in each feature
cmd_plot = cmd.select_dtypes(include=['float64', 'int64', 'int8'])
plt.figure(figsize=(12, 8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
    plt.subplot(3, 4, i + 1)
    sns.boxplot(y=cmd_plot[feature_list[i]], data=cmd_plot)
    plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()

In [110]:
# Define a function that returns the lower and upper limits used to detect outliers
def remove_outlier(col):
    Q1, Q3 = col.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range
In [111]:
# Cap & floor the values beyond the outlier boundaries
for i in feature_list:
    LL, UL = remove_outlier(cmd[i])
    cmd[i] = np.where(cmd[i] > UL, UL, cmd[i])
    cmd[i] = np.where(cmd[i] < LL, LL, cmd[i])
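
Note that feature_list now covers every column, so the capping also touches the encoded categorical features (e.g. a rare Media_exposure value can be clipped away as an "outlier"). A variant (a sketch, not the original run) that restricts capping to the genuinely continuous columns:

# Sketch: cap only the continuous columns, leaving encoded categoricals untouched
continuous_cols = ['Wife_age', 'No_of_children_born']  # assumed continuous features
for col in continuous_cols:
    LL, UL = remove_outlier(cmd[col])
    cmd[col] = cmd[col].clip(lower=LL, upper=UL)  # clip() is equivalent to the two np.where calls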
In [112]:
# Check again for the presence of outliers in each feature
plt.figure(figsize=(12, 8))
feature_list = cmd.columns
for i in range(len(feature_list)):
    plt.subplot(3, 4, i + 1)
    sns.boxplot(y=cmd[feature_list[i]], data=cmd)
    plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()

In [113]:
# Check the heatmap of correlations
plt.figure(figsize=(12, 8))
sns.heatmap(cmd.iloc[:, 0:10].corr(), annot=True)
plt.show()
CART
In [114]:
#Creating a backup
cmd_1 = cmd.copy()
In [115]:
# Separate the predictors from the target column ("Contraceptive_method_used")
X = cmd_1.drop("Contraceptive_method_used", axis=1)
y = cmd_1.pop("Contraceptive_method_used")
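
Before splitting, it can help to confirm the target classes are reasonably balanced (an illustrative check, not in the original notebook):

# Illustrative: inspect the class balance of the target
print(y.value_counts(normalize=True))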
In [116]:
from sklearn.model_selection import train_test_split

X_train, X_test, train_labels, test_labels = train_test_split(X, y, test_size=0.30,
                                                              random_state=1)
In [117]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion='gini')
In [118]:
dt_model.fit(X_train, train_labels)
Out[118]:
DecisionTreeClassifier()
In [121]:
from sklearn import tree

train_char_label = ['No', 'Yes']

ld_Tree_File = open('ld_Tree_File.dot', 'w')
dot_data = tree.export_graphviz(dt_model,
                                out_file=ld_Tree_File,
                                feature_names=list(X_train),
                                class_names=list(train_char_label))
ld_Tree_File.close()
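
Rendering the .dot file requires Graphviz. As a lighter-weight alternative (a sketch, assuming scikit-learn >= 0.21), the fitted tree can be drawn inline with plot_tree:

# Sketch: draw the fitted tree inline without Graphviz
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(dt_model,
          feature_names=list(X_train.columns),
          class_names=train_char_label,
          filled=True,
          max_depth=3)  # max_depth here only limits the drawing, not the model
plt.show()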
In [122]:
print(pd.DataFrame(dt_model.feature_importances_, columns=["Imp"],
                   index=X_train.columns).sort_values('Imp', ascending=False))
Imp
Wife_age 0.313523
No_of_children_born 0.261942
Standard_of_living_index 0.109489
Wife_ education 0.103750
Husband_Occupation 0.097337
Wife_Working 0.064697
Husband_education 0.049262
Wife_religion 0.000000
Media_exposure 0.000000
In [123]:
y_predict = dt_model.predict(X_test)
In [124]:
y_predict.shape
Out[124]:
(418,)
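
Before regularising, it is worth quantifying how much the unrestricted tree overfits (an illustrative check, not part of the original run):

# Illustrative: compare train vs. test accuracy of the unpruned tree
from sklearn.metrics import accuracy_score

print('Train accuracy:', accuracy_score(train_labels, dt_model.predict(X_train)))
print('Test accuracy :', accuracy_score(test_labels, y_predict))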

Regularising the Decision Tree


Adding Tuning Parameters

In [125]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              # cost complexity parameter: greater values of ccp_alpha
              # increase the number of nodes pruned
              'ccp_alpha': [0.1, 0.01, 0.001],
              'max_depth': [1, 5, 10, 15, 20],
              'min_samples_leaf': [1, 5, 10, 15, 20],
              'criterion': ['gini', 'entropy']}

tree_clas = DecisionTreeClassifier(random_state=1024)
grid_search = GridSearchCV(estimator=tree_clas, param_grid=param_grid,
                           cv=5, verbose=True)
grid_search.fit(X_train, train_labels)
Fitting 5 folds for each of 450 candidates, totalling 2250 fits
Out[125]:
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1024),
             param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 5, 10, 15, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 5, 10, 15, 20]},
             verbose=True)
In [126]:
grid_search.best_estimator_
Out[126]:
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=10,
                       max_features='auto', min_samples_leaf=20,
                       random_state=1024)
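
To read off the winning combination and its cross-validated score directly (an illustrative check, not part of the original run):

# Illustrative: inspect the best hyperparameters and their mean CV accuracy
print(grid_search.best_params_)
print('Best CV accuracy: {:.3f}'.format(grid_search.best_score_))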
In [127]:
# Refit with the chosen parameters (note: max_depth=15 is used here, although the
# grid search selected max_depth=10 with min_samples_leaf=20)
reg_dt_model = DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy',
                                      max_depth=15, max_features='auto',
                                      random_state=1024)
reg_dt_model.fit(X_train, train_labels)
Out[127]:
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=15,
                       max_features='auto', random_state=1024)
In [128]:
ld_tree_regularized = open('ld_tree_regularized.dot', 'w')
# export_graphviz returns None when out_file is given, so no value is kept here
tree.export_graphviz(reg_dt_model, out_file=ld_tree_regularized,
                     feature_names=list(X_train),
                     class_names=list(train_char_label))
ld_tree_regularized.close()
In [129]:
print(pd.DataFrame(reg_dt_model.feature_importances_, columns=["Imp"],
                   index=X_train.columns).sort_values('Imp', ascending=False))
Imp
Wife_age 0.335927
No_of_children_born 0.301169
Standard_of_living_index 0.112077
Husband_Occupation 0.075047
Husband_education 0.066805
Wife_ education 0.062557
Wife_Working 0.046418
Wife_religion 0.000000
Media_exposure 0.000000
In [130]:
ytrain_predict = reg_dt_model.predict(X_train)
ytest_predict = reg_dt_model.predict(X_test)
In [131]:
# AUC and ROC for the training data
# predict probabilities
probs = reg_dt_model.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(train_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(train_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.989

In [132]:
# AUC and ROC for the test data
# predict probabilities
probs = reg_dt_model.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.621
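
The gap between the train AUC (0.989) and test AUC (0.621) shows the tree is still overfitting. The two cells above also duplicate the same logic; a small helper (a sketch, not in the original notebook) makes the train/test comparison one call each:

# Sketch: reusable AUC/ROC helper to avoid duplicating the plotting code
from sklearn.metrics import roc_auc_score, roc_curve

def plot_roc(model, X, y, label):
    probs = model.predict_proba(X)[:, 1]  # keep positive-class probabilities only
    fpr, tpr, _ = roc_curve(y, probs)
    auc = roc_auc_score(y, probs)
    plt.plot(fpr, tpr, marker='.', label='{} (AUC = {:.3f})'.format(label, auc))

plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
plot_roc(reg_dt_model, X_train, train_labels, 'train')
plot_roc(reg_dt_model, X_test, test_labels, 'test')
plt.legend()
plt.show()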
In [133]:
# Import the classification_report & Confusion_matrix library
from sklearn.metrics import classification_report,confusion_matrix

# To view the confusion matrix for train_labels, ytrain_predict
# (note: the bare confusion_matrix(...) call below is discarded by the notebook;
# wrap it in print(...) to actually see the matrix)
confusion_matrix(train_labels, ytrain_predict)
print(classification_report(train_labels, ytrain_predict))
              precision    recall  f1-score   support

         0.0       0.93      0.92      0.93       422
         1.0       0.94      0.95      0.94       553

    accuracy                           0.94       975
   macro avg       0.94      0.93      0.94       975
weighted avg       0.94      0.94      0.94       975

In [134]:
# To view the confusion matrix for test_labels, ytest_predict
# (again, wrap confusion_matrix(...) in print(...) to actually see the matrix)
confusion_matrix(test_labels, ytest_predict)
print(classification_report(test_labels, ytest_predict))
              precision    recall  f1-score   support

         0.0       0.60      0.55      0.57       192
         1.0       0.64      0.69      0.67       226

    accuracy                           0.62       418
   macro avg       0.62      0.62      0.62       418
weighted avg       0.62      0.62      0.62       418
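
To visualize the test confusion matrix graphically (an illustrative addition, not part of the original run):

# Illustrative: show the test confusion matrix as an annotated heatmap
cm = confusion_matrix(test_labels, ytest_predict)
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()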

In [135]:
# To view the model score for X_train, train_labels
reg_dt_model.score(X_train,train_labels)
Out[135]:
0.9364102564102564
In [136]:
# To view the model score for X_test, test_labels
reg_dt_model.score(X_test,test_labels)
Out[136]:
0.6244019138755981

Logistic Regression
In [137]:
# Creating a copy of the original data frame
cmd_2 = cmd.copy()
# Import the LabelEncoder library
from sklearn.preprocessing import LabelEncoder
# Defining a Label Encoder object instance
LE = LabelEncoder()
LE
Out[137]:
LabelEncoder()
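
A minimal continuation sketch (assumed, not from the original notebook): the features are already numerically encoded, so LabelEncoder is optional here, and the LogisticRegression imported at the top can be fit on the same split used for CART:

# Sketch (assumed continuation): fit a logistic regression on the existing split
log_model = LogisticRegression(max_iter=1000)  # larger max_iter helps convergence on unscaled data
log_model.fit(X_train, train_labels)
print('Train accuracy:', log_model.score(X_train, train_labels))
print('Test accuracy :', log_model.score(X_test, test_labels))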
