
title: "Project 4 - Cars Case Study (Predicitive Modelling)" author: "Abiola E Oladipupo" date: "10/22/2020"

output: html_document
```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE)

```{r}
setwd("/Users/jaggiey/Desktop/Desktop Items/University of Texas, Austin - DSBA/Class Project 4")

PROJECT EXPLANATORY NOTE: The project seeks to understand employees' choice of mode of transportation. Based on that understanding, it aims to predict whether or not an employee will use a car to commute to work, while determining the critical factors or variables driving this preference. It addresses the descriptive, predictive and prescriptive aspects of machine learning and modelling.

```{r}
MoT = read.csv("Code - Joins, Summary Tables and Export.csv")
```

Load Libraries
```{r}
library(scales)
library(ggplot2)
library(DMwR)
library(caTools)
library(corrplot)
library(class)
library(e1071)
library(ROCR)
library(car)
library(caret)
library(ipred)
library(rpart)
library(gbm)
library(xgboost)
```

I. EXPLORATORY DATA ANALYSIS (EDA)

Data Overview and Summary


```{r}
options(scipen=999)
head(MoT)
dim(MoT)
str(MoT)
MoT$Gender = ifelse(MoT$Gender=="Male",1,0)
MoT$Gender = as.factor(MoT$Gender)
MoT$Engineer = as.factor(MoT$Engineer)
MoT$MBA = as.factor(MoT$MBA)
MoT$license = as.factor(MoT$license)
MoT$Transport = as.factor(MoT$Transport)
MoT$Trans.Mode = as.factor(ifelse(MoT$Transport=="Car","Car","Others"))
View(MoT)
summary(MoT)
colSums(is.na(MoT))
```

NOTES ON THE DATASET OVERVIEW:


The data set contains 418 records, each with 9 variables or columns. From the data overview, the following can be gleaned:

1. The age range is between 18 and 43 years.

2. There are more male employees than female employees, distributed as 297 men and 105 women.

3. Of the sampled employees, there are 313 engineers and 109 MBA holders.

4. The travel distance for employees is between 3.2km and 23.4km, with an average commuting distance of 11.29km.

5. Only 85 employees hold driving licenses.

6. Just 35 employees commute to the office by car, while 383 use other means (83 by 2-wheeler and 300 by public transport).

7. The salary range for employees is between 6,500 and 57,000; the maximum appears to be an outlier considering that the median sits near 13,000 (see the quick check below).
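
A quick numeric check of that last point (a minimal sketch using base R's standard 1.5 * IQR whisker rule; not part of the original analysis):

```{r}
# Salary values flagged as outliers by the 1.5*IQR rule, plus the mean/median gap
boxplot.stats(MoT$Salary)$out
mean(MoT$Salary)
median(MoT$Salary)
```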

II. GRAPHICAL ILLUSTRATION OF EDA:

#Univariate Analysis - Distribution of Mode of Transport - Bimodal

```{r}
ggplot(MoT, aes(Trans.Mode, fill=Trans.Mode))+
  geom_bar(fill="blue", color="black", stat="count")+
  stat_count(geom="text", color="white", fontface="bold", aes(label=..count..), position=position_stack(vjust=0.5))+
  ggtitle("Distribution of Mode of Transportation - Bimodal")+
  xlab("Mode of Transport")+ylab("No. of Employees")+
  theme(title=element_text(face="bold", hjust=0.5))
```

Notes: Over 90% of employees prefer modes of transport other than the car.

Distribution of Mode of Transport - Multimodal

```{r}
ggplot(MoT, aes(Transport, fill=Transport))+
  geom_bar(fill="darkgreen", color="black", stat="count")+
  stat_count(geom="text", color="white", fontface="bold", aes(label=..count..), position=position_stack(vjust=0.5))+
  ggtitle("Distribution of Mode of Transportation - Multimodal")+
  xlab("Mode of Transport")+ylab("No. of Employees")+
  theme(title=element_text(face="bold", hjust=0.5))
```

#Age Distribution
```{r}
ggplot(MoT, aes(Age, fill=Age))+
  geom_histogram(fill="aquamarine2", color="blue", stat="count")+
  stat_count(geom="text", color="blue", fontface="bold", aes(label=..count..), position=position_stack(vjust=0.5))+
  ggtitle("Age Distribution of Employees")+
  xlab("Age of Employees")+ylab("No. of Employees")+
  theme(title=element_text(face="bold", hjust=0.5))
```

Notes on Age Distribution: The employees' ages are approximately normally distributed, with few above 35 years. Most employees are under 30 years (80.75%), and the modal age is 26 years.

Salary Distribution of Employees

```{r}
ggplot(MoT, aes(Salary, fill=Salary))+
  geom_histogram(fill="orange", color="red", bins=20)+
  ggtitle("Salary Distribution of Employees")+
  xlab("Salary of Employees")+ylab("No. of Employees")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Notes on Employees' Salary:

Salary is right-tailed, i.e. skewed to the right.
Most employees earn below 20,000.

#Distribution of Commuting Distance of Employees

```{r}
ggplot(MoT, aes(Distance, fill=Distance))+
  geom_histogram(fill="cyan", color="black", bins=20)+
  ggtitle("Distribution of Employees' Commuting Distance")+
  xlab("Commuting Distance in km")+ylab("No. of Employees")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Note on Commuting Distance: The commuting or travel distance appears close to normally distributed. Most employees travel between 6km and 14km; very few travel beyond 20km.

Boxplot
```{r}
# Boxplot of the numeric columns only (the factor columns created above are excluded)
boxplot(MoT[, sapply(MoT, is.numeric)])
```

#Bivariate Analysis:
#Mode of Transport based on Age
```{r}
ggplot(MoT, aes(Age, fill=Trans.Mode))+
  geom_histogram(color="black", bins=20)+
  ggtitle("Mode of Transport by Age")+
  xlab("Age of Employees")+ylab("No. of Employees")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observation: Employees younger than 30 years, who constitute the majority, use other modes of transportation; car use appears only from age 30 upwards. Although there is a mix of modes for employees between 30 and 35 years of age, those aged 36 and above commute solely by car.
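
This age threshold can be quantified with a quick cross-tabulation (a minimal sketch; the age bands below are illustrative and not part of the original analysis):

```{r}
# Share of car users within illustrative age bands
age.band = cut(MoT$Age, breaks=c(17, 29, 35, 43), labels=c("18-29", "30-35", "36-43"))
round(prop.table(table(age.band, MoT$Trans.Mode), margin=1), 3)
```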

Mode of Transport based on Gender

```{r}
ggplot(MoT, aes(Gender, fill=Trans.Mode))+
  geom_bar(color="black", position="dodge")+
  ggtitle("Mode of Transport by Gender")+
  xlab("Gender")+ylab("No. of Employees")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observation: There are more male employees than female employees. Proportionately, more male employees prefer cars as a mode of transport than female employees.

#Mode of Transport by License Ownership

```{r}
ggplot(MoT, aes(license, fill=Trans.Mode))+
  geom_bar(color="black", position="dodge")+
  ggtitle("Mode of Transport by License")+
  xlab("License")+ylab("No. of Employees")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observation: Most employees do not possess a driving license. Very few of the employees without a driver's license commute to work by car, while only about half of the minority who do hold a license drive cars to work.

Mode of Transport based on Commuting Distance

```{r}
ggplot(MoT, aes(Distance, fill=Trans.Mode))+
  geom_histogram(color="black", bins=25)+
  ggtitle("Mode of Transport by Distance")+
  xlab("Commuting Distance")+ylab("No. of Employees")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))

ggplot(MoT, aes(Distance, fill=Trans.Mode))+
  geom_density(color="black", alpha=0.5)+
  ggtitle("Mode of Transport by Distance")+
  xlab("Commuting Distance")+ylab("Density")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observation: Most employees' commuting distance to work is under 14km. Employees who live farther than approximately 14km from work prefer to commute by car.

#Mode of Transport versus Employees' Salary

```{r}
ggplot(MoT, aes(Salary, fill=Trans.Mode))+
  geom_histogram(color="black", bins=25)+
  ggtitle("Mode of Transport versus Salary")+
  xlab("Employee Salary")+ylab("No. of Employees")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))

ggplot(MoT, aes(Salary, fill=Trans.Mode))+
  geom_density(color="black", alpha=0.5)+
  ggtitle("Mode of Transport based on Salary")+
  xlab("Employee Salary")+ylab("Density")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))

ggplot(MoT, aes(Salary, Distance, color=Trans.Mode))+
  geom_point()+
  ggtitle("Employee Salary versus Commuting Distance")+
  xlab("Employee Salary")+ylab("Commuting Distance in km")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observations:

1. The distribution of the employees' salaries is right-skewed, with most below 20,000.00. Almost all employees earning below 30,000.00 commute by modes other than cars; the few exceptions live farther from work, i.e. between 14 and 20km.

2. The few employees earning above 30,000.00 who use other modes live less than 15km from work.

3. Most employees earning 30,000.00 and above live farther away, i.e. 14km and beyond (see the quick check below).
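
The 30,000 salary threshold in these observations can be checked directly (a minimal sketch; the cut-off is taken from the notes above and assumes Salary is recorded in the units quoted there):

```{r}
# Mode of transport split below and above the 30,000 salary mark
# (if the Salary column is stored in thousands, use Salary > 30 instead)
with(MoT, round(prop.table(table(Salary > 30000, Trans.Mode), margin=1), 3))
```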

Boxplots - Numeric Variables: Age, Work Experience, Salary and Distance
```{r}
Numvar = MoT[,c(1,5:7)]
boxplot(Numvar, col=c("red","orange","blue","green"), main="Boxplot of Age, Work Experience, Salary & Distance")
```

While there appear to be outliers among the salary values, this may not be abnormal considering age and years of experience: the higher salaries may simply belong to older, more experienced employees.

#Multivariate Analysis
#Work Experience, Salary and Mode of Transport
```{r}
ggplot(MoT, aes(Work.Exp, Salary, size=Salary, col=Trans.Mode))+
  geom_point()+geom_smooth()+
  ggtitle("Work Experience, Salary and Mode of Transport")+
  xlab("Work Experience")+ylab("Salary")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observations: There is a positive correlation between salary and work experience, i.e. the longer the years of experience, the more the employee earns. Invariably, the high earners can better afford to commute by car than those earning less, i.e. below 30,000.00.

Work Experience, Salary and Gender; and Preferred Mode

```{r}
ggplot(MoT, aes(Work.Exp, Salary, size=Salary, col=Gender))+
  geom_point()+
  ggtitle("Work Experience, Salary and Gender")+
  xlab("Work Experience")+ylab("Salary")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))

ggplot(MoT, aes(Work.Exp, Distance, size=Salary, col=Gender))+
  geom_point()+
  ggtitle("Distribution Across Travel Distance, Work Experience, Salary and Gender")+
  xlab("Work Experience")+ylab("Travel Distance in km")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observations: The workforce is male-dominated, with far fewer women in employment; the higher echelon is even more male-dominated.

In contrast, at the lower end of the pay-versus-experience spectrum, the gender mix is almost balanced.

Most employees, male and female alike, travel less than 15km to the office. More of the lower-earning male employees live beyond 15km, although a few female employees do as well.

#Commuting Distance and Age over Gender

```{r}
ggplot(MoT, aes(Age, Distance, size=Distance, col=Gender))+
  geom_point()+
  ggtitle("Age, Distance and Gender")+
  xlab("Age")+ylab("Distance")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))

ggplot(MoT, aes(Distance, fill=Gender))+
  geom_histogram(color="blue")+
  ggtitle("Age, Distance and Gender")+
  xlab("Distance")+ylab("No. of Employees")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))

ggplot(MoT, aes(Age, Salary, size=Distance, col=Age))+
  geom_point()+
  ggtitle("Age, Distance and Salary")+
  xlab("Age")+ylab("Salary")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observation: Most employees under the age of 30 travel less than 14km to the office, while a proportion of employees above 30 commute longer than 14km.

Most female employees commute less than 14km.

Most young employees with lower earnings travel shorter distances to the office; invariably, they live closer. This may suggest that the mode of transport is greatly influenced by employees' salaries, giving them a reason to live closer to work and reduce travel costs. This logic is, however, speculatively inferred from the exploratory analysis.

Commuting Distance and Age over Mode of Transport

```{r}
ggplot(MoT, aes(Age, Distance, size=Distance, col=Trans.Mode))+
  geom_point()+
  ggtitle("Age, Distance and Mode of Transport")+
  xlab("Age")+ylab("Distance")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))

ggplot(MoT, aes(Age, Distance, size=Distance, col=Transport))+
  geom_point()+geom_smooth()+
  ggtitle("Age, Distance and Mode of Transport - Multimodal")+
  xlab("Age")+ylab("Distance")+
  theme(plot.title=element_text(hjust=0.5, face="bold"), title=element_text(face="bold", hjust=0.5))
```

Observations: Most employees prefer the public transport option, particularly those traveling less than 14km. These are, coincidentally, mostly the younger employees.

III. THE PROJECT CHALLENGE

#Most Challenging Aspect

Going through the data pre-processing, three key challenges were uncovered, viz:

a) Data imbalance, which can be resolved through minority oversampling via SMOTE (Synthetic Minority Oversampling Technique).

b) Determining relevant variables for model building. A few approaches are explored, including a variable importance test (varImp), Chi-square checks for the categorical variables, and correlation analysis.

c) The final and, in my judgment, most challenging aspect of the problem is identifying and resolving multicollinearity among predictors such as Salary, Age and Work Experience.

IV. DATA PREPARATION

#Fixing the Missing Variable

```{r}
sum(is.na(MoT))
colSums(is.na(MoT)) ## 1 missing value in MBA
MoT <- knnImputation(MoT) ## To fix the missing value
sum(is.na(MoT)) ## To check that the missing value is treated
```

Outlier Check and Treatment - Using the Winsorization Approach with 99% and 1% as Upper and Lower Limits respectively.
```{r}
upper.age = quantile(MoT$Age, 0.99)
upper.age
sum(MoT$Age > upper.age)
((sum(MoT$Age > upper.age))/nrow(MoT))*100
## The proportion of outliers in the Age variable is negligible at 0.47%.

upper.workexp = quantile(MoT$Work.Exp, 0.99)
upper.workexp
sum(MoT$Work.Exp > upper.workexp)
(sum(MoT$Work.Exp > upper.workexp)/nrow(MoT))*100
## The proportion of outliers in the Work Experience variable is 0.96% and also immaterial.

upper.dist = quantile(MoT$Distance, 0.99)
upper.dist
sum(MoT$Distance > upper.dist)
(sum(MoT$Distance > upper.dist)/nrow(MoT))*100
## The proportion of outliers in the Distance variable is 1.20%, below the 5% threshold.

upper.pay = quantile(MoT$Salary, 0.99)
upper.pay
sum(MoT$Salary > upper.pay)
(sum(MoT$Salary > upper.pay)/nrow(MoT))*100
## The proportion of outliers in the Salary variable is 0.72% and hence considered inconsequential.
```
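
The chunk above measures the share of extreme values; the capping step itself is not shown. A minimal sketch of how the winsorization named in the heading could be applied (the `winsorize` helper is illustrative, not part of the original code):

```{r}
# Cap a numeric vector at its 1st and 99th percentiles (winsorization)
winsorize = function(x, lower = 0.01, upper = 0.99) {
  q = quantile(x, c(lower, upper))
  pmin(pmax(x, q[1]), q[2])
}
# Example usage: MoT$Salary = winsorize(MoT$Salary)
```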

#Checking for Data Imbalance and Rectifying via SMOTE if Needed


```{r}
table(MoT$Trans.Mode)
prop.table(table(MoT$Trans.Mode))
sum(MoT$Trans.Mode=="Car")/nrow(MoT)

MoT$TMode.binary = as.factor(ifelse(MoT$Trans.Mode=="Car",1,0))
head(MoT$TMode.binary,20)

set.seed(123)
partition = sample.split(MoT$TMode.binary, SplitRatio = 0.7)
MoT.Train =subset(MoT,partition==T)
MoT.Test =subset(MoT,partition==F)
dim(MoT.Train)
dim(MoT.Test)

table(MoT.Train$TMode.binary)
prop.table(table(MoT.Train$TMode.binary))

table(MoT.Test$TMode.binary)
prop.table(table(MoT.Test$TMode.binary))

class(MoT.Train$TMode.binary)

####SMOTE
MoT.Trainsmote = SMOTE(TMode.binary~., data=MoT.Train, perc.over=3000, k=5,perc.under =500 )
dim(MoT.Trainsmote)
table(MoT.Trainsmote$TMode.binary)
prop.table(table(MoT.Trainsmote$TMode.binary))

## Car travel rate equals 0.08. The data set is unbalanced, which may affect model performance, particularly as regards model sensitivity or recall.
```
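
If the balanced frame is to be used downstream, `MoT.Trainsmote` can be substituted for `MoT.Train` in any of the models that follow. A minimal sketch (commented out so it does not alter the results below; the model choice is illustrative):

```{r}
# Example: fit a logistic model on the SMOTE-balanced training data instead
# LogModel.sm = glm(TMode.binary ~ Distance + Salary, data = MoT.Trainsmote, family = binomial)
# summary(LogModel.sm)
```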

Using corrplot and Correlation to Check for Multicollinearity among the Numeric Variables
```{r}
names(MoT)
correl = MoT[,c(1,5:7)]
corplt = function(x){
  par(mfrow=c(1,2))
  corrplot(cor(x), method = "circle")
  corrplot(cor(x), method = "number")
}
corplt(correl)
```

Observation: Age, Salary and Work Experience appear to be highly correlated predictors and will require treatment to improve the performance of the models.

#Check the Significance of Categorical Variables

```{r}
## Hypothesis Statement (chi-square test of independence)
# H0: Transport Mode is independent of (NOT influenced by) the categorical variable
# H1: Transport Mode is associated with (influenced by) the categorical variable

Gender_chistat = chisq.test(MoT$Trans.Mode, MoT$Gender)
Engineer_chistat = chisq.test(MoT$Trans.Mode, MoT$Engineer)
MBA_chistat = chisq.test(MoT$Trans.Mode, MoT$MBA)
license_chistat = chisq.test(MoT$Trans.Mode, MoT$license)
Cat_p.values = c(Gender_chistat$p.value, Engineer_chistat$p.value, MBA_chistat$p.value, license_chistat$p.value)
Cat_parameters = c(Gender_chistat$parameter, Engineer_chistat$parameter, MBA_chistat$parameter, license_chistat$parameter)
Cat_statistics = c(Gender_chistat$statistic, Engineer_chistat$statistic, MBA_chistat$statistic, license_chistat$statistic)
Categorical_names = c("Gender", "Engineer", "MBA", "License")

Cat_relevance = data.frame(Categorical_names, Cat_statistics, Cat_parameters, Cat_p.values)
print(Cat_relevance)
```

From the variable relevance test, only license returns significant: for it, the null hypothesis of independence is rejected, so holding a driving license is relevant in influencing employees' transport mode preference. For Gender, Engineer and MBA we fail to reject the null hypothesis, so they are considered not significant to the model building process.
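
The same conclusion can be read off programmatically (a minimal sketch; the 5% significance level is assumed):

```{r}
# Flag variables whose p-value falls below the 0.05 threshold
Cat_relevance$Significant = Cat_relevance$Cat_p.values < 0.05
print(Cat_relevance)
```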

V. MODEL BUILDING 1 - KNN, NAIVE BAYES AND LOGISTIC REGRESSION

KNN Model
```{r}
KNN = MoT
names(KNN)
head(KNN)
str(KNN)  # Gender was already recoded to 1/0 and converted to a factor above

set.seed(123)
KNNtrain = MoT.Train
dim(KNNtrain)
KNNtest = MoT.Test
dim(KNNtest)
Kmodel = knn(train=KNNtrain[,c(6,7)], test=KNNtest[,c(6,7)], cl=KNNtrain$TMode.binary, k=5)
Kmodel
KNNtest.tab = table(KNNtest$TMode.binary, Kmodel)
KNNtest.tab

################== K-NEAREST NEIGHBOUR MODEL PERFORMANCE CHECK ==################

#------------------------- Accuracy Check - Result: 100% -------------------------#
KNNaccuracy = (KNNtest.tab[1,1]+KNNtest.tab[2,2])/nrow(KNNtest)
KNNaccuracy = percent(KNNaccuracy, accuracy = 0.01)
KNNaccuracy

#------------------------- Sensitivity Check - Result: 100% ----------------------#
KNNsensitivity = KNNtest.tab[2,2]/(KNNtest.tab[2,1]+KNNtest.tab[2,2])
KNNsensitivity = percent(KNNsensitivity, accuracy = 0.01)
KNNsensitivity

#------------------------- Specificity Check - Result: 100% ----------------------#
KNNspecificity = KNNtest.tab[1,1]/(KNNtest.tab[1,1]+KNNtest.tab[1,2])
KNNspecificity = percent(KNNspecificity, accuracy = 0.01)
KNNspecificity

## KNN MODEL SUMMARY TABLE
KNNLabel = c("Performance Metric", "Accuracy", "Sensitivity", "Specificity")
KNNOutput = c("Performance Output", KNNaccuracy, KNNsensitivity, KNNspecificity)
KNNResult_Table = data.frame(KNNLabel, KNNOutput)
print(KNNResult_Table)
```
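
KNN is distance-based, so unscaled Salary values can swamp Distance in the Euclidean metric. A minimal sketch of the same fit on standardized predictors (an added good-practice check, not part of the original analysis):

```{r}
# Standardize the two predictors, applying the training-set scaling to the test set
sc.train = scale(KNNtrain[,c(6,7)])
sc.test = scale(KNNtest[,c(6,7)], center=attr(sc.train,"scaled:center"), scale=attr(sc.train,"scaled:scale"))
Kmodel.sc = knn(train=sc.train, test=sc.test, cl=KNNtrain$TMode.binary, k=5)
table(KNNtest$TMode.binary, Kmodel.sc)
```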
#Naive Bayes Modelling
```{r}
NBTrain= MoT.Train[,-c(9,10)]
NBTest = MoT.Test[,-c(9,10)]

names(NBTrain)
NBModel = naiveBayes(x=NBTrain[,1:8], y=NBTrain[,9])
predMode =predict(NBModel,NBTest)
predModeVal =predict(NBModel,NBTest,type="raw")[,2]
predModeVal = ifelse(predModeVal>0.5,1,0)
NBTest.tab = table(NBTest$TMode.binary,predModeVal)
NBTest.tab

#######################== NAIVE BAYES MODEL PERFORMANCE CHECK ==#################################

#------------------------------Accuracy Check Result- 98.41% -------------------------------------#


NBaccuracy = (NBTest.tab[1,1]+NBTest.tab[2,2])/nrow(NBTest)
NBaccuracy= percent(NBaccuracy,accuracy=0.01)
NBaccuracy

#------------------------------Recall/Sensitivity Check Result- 100.00% -------------------------#


NBsensitivity = NBTest.tab[2,2]/(NBTest.tab[2,1]+NBTest.tab[2,2])
NBsensitivity =percent(NBsensitivity, accuracy=0.01)
NBsensitivity

#------------------------------Specificity Check Result- 98.26% --------------------------------#


NBspecificity = NBTest.tab[1,1]/(NBTest.tab[1,1]+NBTest.tab[1,2])
NBspecificity =percent(NBspecificity, accuracy=0.01)
NBspecificity

NBRoc = prediction(predModeVal,NBTest$TMode.binary)
nb.perf = performance(NBRoc,"tpr", "fpr")
plot(nb.perf,colorize = TRUE, main="Naive Bayes ROC Plot - Mode of Transport")

##$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ AREA UNDER THE CURVE(AUC) - 99.13% $$$$$$$$$$$$$$$$$$$$$$


nb.perf =performance(NBRoc,"auc")
NBauc=nb.perf@y.values
NBauc =percent(as.numeric(NBauc),accuracy = 0.01)

##NAIVE BAYES MODEL PERFORMANCE SUMMARY TABLE#####


NBLabel = c("Performance Metric", "Accuracy", "Sensitivity", "Specificity","AUC")
NBOutput = c("Performance Output", NBaccuracy, NBsensitivity, NBspecificity, NBauc)
NBResult_Table = data.frame(NBLabel, NBOutput)
print(NBResult_Table)
```

Logistic Regression Modelling

```{r}
Logitrain = MoT.Train[,-c(9,10)]
dim(Logitrain)
table(Logitrain$TMode.binary)
prop.table(table(Logitrain$TMode.binary))

Logitest = MoT.Test[,-c(9,10)]
dim(Logitest)
table(Logitest$TMode.binary)
prop.table(table(Logitest$TMode.binary))

str(Logitrain)

LogModel0 = glm(TMode.binary~., data=Logitrain, family=binomial)
summary(LogModel0)

# LogModel0 is impacted by multicollinearity, as Age and Work.Exp highly correlate with Salary.
# The model is therefore improved by dropping both Age and Work.Exp.
LogModel = glm(TMode.binary~.-Age-Work.Exp, data=Logitrain, family=binomial)
summary(LogModel)
vif(LogModel)

Logpred.train = predict(LogModel, newdata=Logitrain, type="response")
LogitrainVal = ifelse(Logpred.train>0.5, 1, 0)
Logtraintab = table(Logitrain$TMode.binary, LogitrainVal)
Logtraintab

############################ Performance Check - LogModel ############################

#----------------- TRAIN DATA - Accuracy, Sensitivity and Specificity -----------------#
Logitrain.accuracy1 = (Logtraintab[1,1]+Logtraintab[2,2])/nrow(Logitrain)      #----- accuracy is 100%
Logitrain.sensitivity1 = Logtraintab[2,2]/(Logtraintab[2,1]+Logtraintab[2,2])  #----- sensitivity is 100%
Logitrain.specificity1 = Logtraintab[1,1]/(Logtraintab[1,1]+Logtraintab[1,2])  #----- specificity is 100%

Logitrain.accuracy1 = percent(Logitrain.accuracy1, accuracy = 0.01)
Logitrain.accuracy1
Logitrain.sensitivity1 = percent(Logitrain.sensitivity1, accuracy = 0.01)
Logitrain.sensitivity1
Logitrain.specificity1 = percent(Logitrain.specificity1, accuracy = 0.01)
Logitrain.specificity1

#----------------- TEST DATA - Accuracy, Sensitivity and Specificity ------------------#
Logpred.test = predict(LogModel, newdata=Logitest, type="response")
LogitestVal = ifelse(Logpred.test>0.5, 1, 0)
Logtest.tab = table(Logitest$TMode.binary, LogitestVal)
Logtest.tab

Logitest.accuracy1 = (Logtest.tab[1,1]+Logtest.tab[2,2])/nrow(Logitest)        #----- accuracy is 99.21%
Logitest.sensitivity1 = Logtest.tab[2,2]/(Logtest.tab[2,1]+Logtest.tab[2,2])   #----- sensitivity is 91.67%
Logitest.specificity1 = Logtest.tab[1,1]/(Logtest.tab[1,1]+Logtest.tab[1,2])   #----- specificity

Logitest.accuracy1 = percent(Logitest.accuracy1, accuracy = 0.01)
Logitest.accuracy1
Logitest.sensitivity1 = percent(Logitest.sensitivity1, accuracy = 0.01)
Logitest.sensitivity1
Logitest.specificity1 = percent(Logitest.specificity1, accuracy = 0.01)
Logitest.specificity1

#------------------------------------- ROC --------------------------------------------#
LogRoc = prediction(Logpred.test, Logitest$TMode.binary)
Log.perf = performance(LogRoc, "tpr", "fpr")
plot(Log.perf, colorize = TRUE, main="ROC Plot LogiTest - Mode of Transport")

#------------------------------------- AUC --------------------------------------------#
Log.perf = performance(LogRoc, "auc")
auc = Log.perf@y.values
auc = percent(as.numeric(auc), accuracy = 0.01)
auc

#--------------------------- Determining Variable Importance --------------------------#
varImp(LogModel)
imp.x = data.frame(varImp(LogModel))
imp.x = cbind(rownames(imp.x), imp.x)
imp.x[with(imp.x, order(-Overall)),]
```

**Note:** Distance and Salary ranked the highest. Both also have the least correlation with each other based on the earlier EDA. LogModel is improved upon below by addressing multicollinearity, to enhance model performance.

```{r}
LogModel2 = glm(TMode.binary~Distance+Salary, data=Logitrain, family=binomial)
summary(LogModel2)
vif(LogModel2)

Logpred.train2 = predict(LogModel2, newdata=Logitrain, type="response")
LogitrainVal2 = ifelse(Logpred.train2>0.5, 1, 0)
Logtraintab2 = table(Logitrain$TMode.binary, LogitrainVal2)
Logtraintab2

############################ Performance Check - LogModel2 ############################

#----------------- TRAIN DATA 2 - Accuracy, Sensitivity and Specificity -----------------#
Logitrain.accuracy2 = (Logtraintab2[1,1]+Logtraintab2[2,2])/nrow(Logitrain)        #----- accuracy is 100%
Logitrain.sensitivity2 = Logtraintab2[2,2]/(Logtraintab2[2,1]+Logtraintab2[2,2])   #----- sensitivity is 100%
Logitrain.specificity2 = Logtraintab2[1,1]/(Logtraintab2[1,1]+Logtraintab2[1,2])   #----- specificity is 100%

Logitrain.accuracy2 = percent(Logitrain.accuracy2, accuracy = 0.01)
Logitrain.accuracy2
Logitrain.sensitivity2 = percent(Logitrain.sensitivity2, accuracy = 0.01)
Logitrain.sensitivity2
Logitrain.specificity2 = percent(Logitrain.specificity2, accuracy = 0.01)
Logitrain.specificity2

#----------------- TEST DATA 2 - Accuracy, Sensitivity and Specificity ------------------#
Logpred.test2 = predict(LogModel2, newdata=Logitest, type="response")
LogitestVal2 = ifelse(Logpred.test2>0.5, 1, 0)
Logtest.tab2 = table(Logitest$TMode.binary, LogitestVal2)
Logtest.tab2

Logitest.accuracy2 = (Logtest.tab2[1,1]+Logtest.tab2[2,2])/nrow(Logitest)          #----- accuracy is 99.21%
Logitest.sensitivity2 = Logtest.tab2[2,2]/(Logtest.tab2[2,1]+Logtest.tab2[2,2])    #----- sensitivity is 91.67%
Logitest.specificity2 = Logtest.tab2[1,1]/(Logtest.tab2[1,1]+Logtest.tab2[1,2])    #----- specificity

Logitest.accuracy2 = percent(Logitest.accuracy2, accuracy = 0.01)
Logitest.accuracy2
Logitest.sensitivity2 = percent(Logitest.sensitivity2, accuracy = 0.01)
Logitest.sensitivity2
Logitest.specificity2 = percent(Logitest.specificity2, accuracy = 0.01)
Logitest.specificity2

#------------------------------------- ROC --------------------------------------------#
LogRoc2 = prediction(Logpred.test2, Logitest$TMode.binary)
Log.perf2 = performance(LogRoc2, "tpr", "fpr")
plot(Log.perf2, colorize = TRUE, main="ROC Plot LogiTest 2 - Mode of Transport")

#------------------------------------- AUC --------------------------------------------#
Log.perf2 = performance(LogRoc2, "auc")
auc2 = Log.perf2@y.values
auc2 = percent(as.numeric(auc2), accuracy = 0.01)
auc2
```

Comparison of Logistic Models

```{r}
Loglabel = as.character(c("Performance Metric","Train Accuracy","Train Sensitivity","Train Specificity","Test Accuracy","Test Sensitivity","Test Specificity","AUC"))

LogModel1.perf = c("Logistic Model 1", Logitrain.accuracy1, Logitrain.sensitivity1, Logitrain.specificity1, Logitest.accuracy1, Logitest.sensitivity1, Logitest.specificity1, auc)

LogModel2.perf = c("Logistic Model 2", Logitrain.accuracy2, Logitrain.sensitivity2, Logitrain.specificity2, Logitest.accuracy2, Logitest.sensitivity2, Logitest.specificity2, auc2)

Logmodel.comparison = data.frame(Loglabel, LogModel1.perf, LogModel2.perf)
print(Logmodel.comparison)
```

Further Model Validation Checks

```{r}
library(lmtest)
library(pscl)

LogModel
lrtest(LogModel)
# The full model's log-likelihood of -13.94 is well above the intercept-only
# log-likelihood of -82.94 (indicative of a good model).
pR2(LogModel)
odds = exp(coef(LogModel))
prob = odds/(1+odds)
odds_prob1 = data.frame(odds, prob)
print(odds_prob1)

LogModel2
lrtest(LogModel2)
# LogModel2's log-likelihood of -14.40 is likewise well above the intercept-only
# log-likelihood of -82.94; it is marginally below LogModel's, while using fewer predictors.
pR2(LogModel2)
odds2 = exp(coef(LogModel2))
prob2 = odds2/(1+odds2)
odds_prob2 = data.frame(odds2, prob2)
print(odds_prob2)
```

The odds-to-probability conversion in both models shows Salary and Distance as the significant variables influencing mode of transport preference.
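
For interpretation, each coefficient's exponent is a per-unit odds multiplier (a minimal sketch reading off the Distance effect; the variable name is as used above):

```{r}
# Odds ratio for Distance: the factor by which the odds of commuting by car
# are multiplied for each additional kilometre, holding Salary constant
exp(coef(LogModel2)["Distance"])
```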
VI. MODEL BUILDING 2 - BAGGING, GRADIENT BOOSTING AND EXTREME GRADIENT BOOSTING

#Bagging
```{r}

MoTBagging = bagging(TMode.binary~.,data=MoT.Train, control =rpart.control(maxdepth = 5, minsplit =3))


MoT.Test$bagpred = predict(MoTBagging, MoT.Test)
head(MoT.Test)

##################### Performance Check - Bagging ###########################

#-----------Test Data - Measure of Accuracy and Sensitivity ------------------#


MoTbag.tab = table(MoT.Test$TMode.binary,MoT.Test$bagpred)
head(MoTbag.tab,10)
tail(MoTbag.tab,10)

##------------Accuracy --100% -------------------##


MoTbagging.Accuracy = (MoTbag.tab[1,1]+MoTbag.tab[2,2])/nrow(MoT.Test)
MoTbagging.Accuracy=percent(MoTbagging.Accuracy, accuracy = 0.01)
MoTbagging.Accuracy

##------------Sensitivity --100% ----------------##


MoTbagging.Sensitivity = MoTbag.tab[2,2]/(MoTbag.tab[2,1] + MoTbag.tab[2,2])
MoTbagging.Sensitivity = percent(MoTbagging.Sensitivity,accuracy = 0.01)
MoTbagging.Sensitivity

##------------Specificity --100% ----------------##


MoTbagging.Specificity = MoTbag.tab[1,1]/(MoTbag.tab[1,1] + MoTbag.tab[1,2])
MoTbagging.Specificity = percent(MoTbagging.Specificity,accuracy = 0.01)
MoTbagging.Specificity

##BAGGING MODEL PERFORMANCE SUMMARY TABLE#####


BaggingLabel = c("Performance Metric", "Accuracy", "Sensitivity", "Specificity")
BaggingOutput = c("Performance Output", MoTbagging.Accuracy, MoTbagging.Sensitivity, MoTbagging.Specificity)
BaggingResult_Table = data.frame(BaggingLabel, BaggingOutput)
print(BaggingResult_Table)
```

Gradient Boosting
```{r}
## Model_6: Gradient Boosting Machines

GBTrain = MoT.Train[,-11]
names(GBTrain)
dim(GBTrain)
GBTest = MoT.Test[,-11]
dim(GBTest)

# Setting up the general parameters for training multiple models
booster = trainControl(
  method = 'repeatedcv',             # k-fold cross-validation
  number = 3,                        # number of folds, k
  repeats = 1,                       # repeats of the k-fold cross-validation
  allowParallel = TRUE,
  classProbs = TRUE,                 # class probabilities should be returned
  summaryFunction = twoClassSummary
)
# metric = "ROC" matches twoClassSummary (caret would otherwise warn and switch to it)
Gb_model = train(Trans.Mode ~ ., data = GBTrain, method = "gbm", trControl = booster, metric = "ROC", verbose = FALSE)

# Predictions: GBM
GBTest.pred = predict(Gb_model, newdata = GBTest, type = "raw")
GBTest.tab = table(GBTest$Trans.Mode, GBTest.pred)
GBTest.tab

############################## Performance Check - Gradient Boosting Model ##############################

#----------- Test Data - Measure of Accuracy, Sensitivity and Specificity -----------#

#------------ Accuracy -- 100% ------------#
GBTest.Accuracy = (GBTest.tab[1,1]+GBTest.tab[2,2])/nrow(GBTest)
GBTest.Accuracy = percent(GBTest.Accuracy, accuracy = 0.01)
GBTest.Accuracy

#------------ Sensitivity -- 100% ---------#
GBTest.Sensitivity = GBTest.tab[1,1]/(GBTest.tab[1,1] + GBTest.tab[1,2])
GBTest.Sensitivity = percent(GBTest.Sensitivity, accuracy = 0.01)
GBTest.Sensitivity

#------------ Specificity -- 100% ---------#
GBTest.Specificity = GBTest.tab[2,2]/(GBTest.tab[2,1] + GBTest.tab[2,2])
GBTest.Specificity = percent(GBTest.Specificity, accuracy = 0.01)
GBTest.Specificity

## GRADIENT BOOSTING MODEL PERFORMANCE SUMMARY TABLE
GBLabel = c("Performance Metric", "Accuracy", "Sensitivity", "Specificity")
GBOutput = c("Performance Output", GBTest.Accuracy, GBTest.Sensitivity, GBTest.Specificity)
GBResult_Table = data.frame(GBLabel, GBOutput)
print(GBResult_Table)
```
#XGB - Extreme Gradient Boosting
```{r}

names(MoT.Train)
xtreme.trainX = as.matrix(MoT.Train[,c(1,5:7)])
# TMode.binary is a factor; xgboost needs a numeric 0/1 label
xtreme.trainY = as.numeric(as.character(MoT.Train$TMode.binary))
xtreme.testX = as.matrix(MoT.Test[,c(1,5:7)])
xtreme.testY = as.numeric(as.character(MoT.Test$TMode.binary))
summary(xtreme.trainX)

# nfold belongs to xgb.cv rather than xgboost(), so it is not passed here (see the CV sketch below)
xgb.tuner = xgboost(
  data = xtreme.trainX,
  label = xtreme.trainY,
  eta = 0.0001,
  max_depth = 3,
  min_child_weight = 3,
  nrounds = 10000,
  objective = "binary:logistic",
  verbose = 0,
  early_stopping_rounds = 10
)
# A matrix has no $ column, so the probability predictions are stored in their own vector
xtreme.pred = predict(xgb.tuner, xtreme.testX)
xtreme.test.tab = table(xtreme.testY, xtreme.pred > 0.5)
xtreme.test.tab

#################################### Performance Check - Extreme Gradient Boosting Model################################

#======================================Test Data - Measure of Accuracy and Sensitivity ==============================#

##-----------------------------------------------------Accuracy --99.21% -------------------------------------------##


xtreme.Test_Accuracy = (xtreme.test.tab[1,1]+xtreme.test.tab[2,2])/nrow(as.data.frame(xtreme.testX))
xtreme.Test_Accuracy = percent(as.numeric(xtreme.Test_Accuracy), accuracy = 0.01)
xtreme.Test_Accuracy

##----------------------------------------------------Sensitivity --100% ------------------------------------------##


xtreme.Test_Sensitivity = xtreme.test.tab[2,2]/(xtreme.test.tab[2,1]+xtreme.test.tab[2,2])
xtreme.Test_Sensitivity = percent(as.numeric(xtreme.Test_Sensitivity), accuracy = 0.01)
xtreme.Test_Sensitivity

##--------------------------------------------------Specificity --99.13% -------------------------------------------##


xtreme.Test_Specificity = xtreme.test.tab[1,1]/(xtreme.test.tab[1,1]+xtreme.test.tab[1,2])
xtreme.Test_Specificity = percent(as.numeric(xtreme.Test_Specificity), accuracy = 0.01)
xtreme.Test_Specificity

## EXTREME GRADIENT BOOSTING MODEL PERFORMANCE SUMMARY TABLE#####


XGBLabel = c("Performance Metric", "Accuracy", "Sensitivity", "Specificity")
XGBOutput = c("Performance Output", xtreme.Test_Accuracy, xtreme.Test_Sensitivity, xtreme.Test_Specificity)
XGBResult_Table = data.frame(XGBLabel, XGBOutput)
print(XGBResult_Table)
```
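
The `nfold` argument in the original call belongs to `xgb.cv`, not `xgboost()`. A minimal sketch of how cross-validation could instead be used to pick `nrounds` (the parameters mirror the model above; this is an added suggestion, not part of the original run):

```{r}
# 5-fold CV to find a good number of boosting rounds
dtrain = xgb.DMatrix(xtreme.trainX, label = xtreme.trainY)
xgb.cv.res = xgb.cv(
  params = list(eta = 0.0001, max_depth = 3, min_child_weight = 3, objective = "binary:logistic"),
  data = dtrain,
  nrounds = 10000,
  nfold = 5,
  early_stopping_rounds = 10,
  verbose = 0
)
xgb.cv.res$best_iteration
```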

VII. COMPARATIVE ANALYSIS OF MODELS AND RECOMMENDATION
```{r}
Model_Name = as.character(c("KNN","Naive Bayes","Logistic Regression","Bagging","Gradient Boosting","Extreme Gradient Boosting"))

Accuracy_Check = c(KNNaccuracy, NBaccuracy, Logitest.accuracy2, MoTbagging.Accuracy, GBTest.Accuracy, xtreme.Test_Accuracy)

Sensitivity_Check = c(KNNsensitivity, NBsensitivity, Logitest.sensitivity2, MoTbagging.Sensitivity, GBTest.Sensitivity, xtreme.Test_Sensitivity)

Specificity_Check = c(KNNspecificity, NBspecificity, Logitest.specificity2, MoTbagging.Specificity, GBTest.Specificity, xtreme.Test_Specificity)

Models_Comparison = data.frame(Model_Name, Accuracy_Check, Sensitivity_Check, Specificity_Check)
print(Models_Comparison)
```

Comparative Notes

Following the building of the bagging and boosting models, and as shown in the performance table above, both the Bagging and standard Gradient Boosting models fit the test data perfectly, while the Extreme Gradient Boosting model lagged slightly behind in both accuracy and specificity. Hence there is no difference in choice between the Bagging model and the standard Gradient Boosting model in this case.

On the other hand, the KNN and Logistic Regression models also perform strongly, with KNN scoring perfectly across all measures and Logistic Regression close behind on the test data.

Comparing the two classes, it is safe to conclude that in this project the best of the two groups of models tie or match in performance across the board.

VIII. SUMMARY OF FINDINGS AND RECOMMENDATION

From the foregoing analysis, the following are the findings:

1. The workforce is largely young, with most employees under the age of 30 years.

2. Most employees commute less than 14 kilometres to the office.

3. Most employees earn less than 20,000.00.

4. Most employees do not choose cars but prefer other modes of transport.

5. Salary earned and distance traveled to work largely influence the choice of the mode of transportation; logically, perhaps a reflection of expediency (suitability to the distance) and means (affordability).

To understand and better predict an individual employee's preference of transportation mode, any of the KNN, Logistic Regression, Bagging or Boosting models may be used, as they produced virtually the same results across the relevant measures.

From the synopsis provided ab initio, it is not clear what decision the project is focused on. However, if it relates to employee welfare, the decision makers may consider:

i. Targeting a busing package to convey employees within a 14-kilometre radius.

ii. Raising the pay of employees earning less than 20,000.
