
> setwd("F:/project")

> library(car)

> library(caret)

> library(class)

> library(devtools)

> library(e1071)

> library(ggplot2)

> library(Hmisc)

> library(klaR)

> library(MASS)

> library(nnet)

> library(plyr)

> library(pROC)

> library(psych)

> library(scatterplot3d)

> library(dplyr)

> library(ElemStatLearn)

> library(rpart)

> library(rpart.plot)

> library(randomForest)

> library(neuralnet)

> mydata = read.csv("Cars.csv",header = TRUE)

> attach(mydata)

The following objects are masked from mydata (pos = 5):

Age, Distance, Engineer, Gender, license, MBA, Salary, Transport,

Work.Exp

The following objects are masked from mydata (pos = 16):


Age, Distance, Engineer, Gender, license, MBA, Salary, Transport,

Work.Exp

> head(mydata)

Age Gender Engineer MBA Work.Exp Salary Distance license Transport

1 28 Male 0 0 4 14.3 3.2 0 Public Transport

2 23 Female 1 0 4 8.3 3.3 0 Public Transport

3 29 Male 1 0 7 13.4 4.1 0 Public Transport

4 28 Female 1 1 5 13.4 4.5 0 Public Transport

5 27 Male 1 0 4 13.4 4.6 0 Public Transport

6 26 Male 1 0 4 12.3 4.8 1 Public Transport

> dim(mydata)

[1] 444 9

> names(mydata)

[1] "Age" "Gender" "Engineer" "MBA" "Work.Exp" "Salary"

[7] "Distance" "license" "Transport"

> str(mydata)

'data.frame': 444 obs. of 9 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 1 2 2 ...

$ Engineer : int 0 1 1 1 1 1 1 1 1 1 ...

$ MBA : int 0 0 0 1 0 0 0 0 0 0 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : int 0 0 0 0 0 1 0 0 0 0 ...

$ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...

> summary(mydata)
Age Gender Engineer MBA Work.Exp

Min. :18.00 Female:128 Min. :0.0000 Min. :0.0000 Min. : 0.0

1st Qu.:25.00 Male :316 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.: 3.0

Median :27.00 Median :1.0000 Median :0.0000 Median : 5.0

Mean :27.75 Mean :0.7545 Mean :0.2528 Mean : 6.3

3rd Qu.:30.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 8.0

Max. :43.00 Max. :1.0000 Max. :1.0000 Max. :24.0

NA's :1

Salary Distance license Transport

Min. : 6.50 Min. : 3.20 Min. :0.0000 2Wheeler : 83

1st Qu.: 9.80 1st Qu.: 8.80 1st Qu.:0.0000 Car : 61

Median :13.60 Median :11.00 Median :0.0000 Public Transport:300

Mean :16.24 Mean :11.32 Mean :0.2342

3rd Qu.:15.72 3rd Qu.:13.43 3rd Qu.:0.0000

Max. :57.00 Max. :23.40 Max. :1.0000

> colSums(is.na(mydata))

Age Gender Engineer MBA Work.Exp Salary Distance license

0 0 0 1 0 0 0 0

Transport

0

> mydata[!complete.cases(mydata), ]

Age Gender Engineer MBA Work.Exp Salary Distance license Transport

145 28 Female 0 NA 6 13.7 9.4 0 Public Transport

> library(DMwR)

> mydata = knnImputation(mydata, 10)

> colSums(is.na(mydata))

Age Gender Engineer MBA Work.Exp Salary Distance license

0 0 0 0 0 0 0 0
Transport

0
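Note: knnImputation() fills the missing MBA flag with a distance-weighted average of the 10 nearest neighbours, so the imputed value is fractional (it resurfaces later as the stray factor level "0.304048449365552"). A minimal guard, assuming MBA should stay a binary flag:

# Sketch: round the kNN-imputed value back to 0/1 before any factor conversion
mydata$MBA = round(mydata$MBA)
table(mydata$MBA)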

> mydata$CarUsage=ifelse(mydata$Transport =='Car',1,0)

> table(mydata$CarUsage)

0 1

383 61

> mydata$Gender=ifelse(mydata$Gender =='Male',0,1)

> table(mydata$Gender)

0 1

316 128

> boxplot(mydata[,c(1,5,7)],col = "red")

> boxplot(mydata[,6],col = "red", main=colnames(mydata[6]))

> mydata_out=mydata[,c(1,5,6,7,10)]

> mydata_out=as.data.frame(mydata_out)

> for (i in (1:5))

+{

+ outval=boxplot.stats(mydata_out[,i])$out

+ cat("Outliers in ",colnames(mydata_out[i]),"\n", "=", outval,"\n")

+}

Outliers in Age

= 39 39 39 38 40 38 38 38 38 40 40 39 40 38 39 38 40 39 38 42 40 43 40 38 39

Outliers in Work.Exp

= 19 16 21 17 16 18 19 18 21 16 19 19 18 19 20 22 16 20 18 21 20 20 16 17 21 18 20 21 19 22 22 19 24
20 19 19 19 21

Outliers in Salary

= 36.6 38.9 25.9 34.8 28.8 39.9 39 28.7 36.9 28.7 34.9 47 28.8 36.9 54 29.9 34.9 36 44 37 24.9 43 37 54
44 34 48 42 51 45 34 28.8 45 42.9 41 40.9 30.9 41.9 43 33 36 33 38 46 45 48 35 51 51 55 45 42 52 38 57
44 45 47 50
Outliers in Distance

= 20.7 20.8 21 21.3 21.4 21.5 21.5 22.8 23.4

Outliers in CarUsage

= 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
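No outlier treatment is applied in this session, and CarUsage, being a 0/1 flag, should not be screened this way (all 61 ones are flagged by construction). If capping the numeric columns were ever desired, one hedged option is winsorizing at the 1.5*IQR whiskers, a sketch only:

# Not applied above: cap a numeric vector at its boxplot whiskers.
cap_outliers = function(x) {
  st = boxplot.stats(x)$stats   # st[1] = lower whisker, st[5] = upper whisker
  pmin(pmax(x, st[1]), st[5])
}
# e.g. mydata_out$Salary = cap_outliers(mydata_out$Salary)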

> par(mfrow = c(1,2))

> boxplot(Age~Transport,col = "blue")

> boxplot(Gender~Transport,col = "blue")

Error in oldClass(stats) <- cl :

adding class "factor" to an invalid object

> boxplot(Engineer~Transport,col = "blue")

> boxplot(MBA~Transport,col = "blue")

> boxplot(Work.Exp~Transport,col = "blue")

> boxplot(Salary~Transport,col = "blue")

> boxplot(Distance~Transport,col = "blue")

> boxplot(license~Transport,col = "blue")

> length(which(Transport=="Car"))*100/nrow(mydata)

[1] 13.73874

> table(Transport)

Transport

2Wheeler Car Public Transport

83 61 300

> table(Gender)

Gender

Female Male

128 316

> prop.table(table(Gender))

Gender

Female Male
0.2882883 0.7117117

> table(Engineer)

Engineer

0 1

109 335

> prop.table(table(Engineer))

Engineer

0 1

0.2454955 0.7545045

> table(MBA)

MBA

0 1

331 112

> prop.table(table(MBA))

MBA

0 1

0.7471783 0.2528217

> table(license)

license

0 1

340 104

> prop.table(table(license))

license

0 1

0.7657658 0.2342342

> table(Gender,Transport)

Transport

Gender 2Wheeler Car Public Transport

Female 38 13 77
Male 45 48 223

> prop.table(table(Gender,Transport))

Transport

Gender 2Wheeler Car Public Transport

Female 0.08558559 0.02927928 0.17342342

Male 0.10135135 0.10810811 0.50225225

> ggplot(mydata, aes(x=Gender)) + geom_bar()

> ggplot(mydata, aes(x= Gender)) + theme_bw()+

+ facet_wrap(~Transport ) + geom_bar()+

+ labs(x="Gender", y= "No. of People", title = "Gender vs. Transport")

> ggplot(mydata, aes(x= Gender)) + theme_bw()+

+ facet_wrap(~CarUsage ) + geom_bar()+

+ labs(x="Gender", y= "No. of People", title = "Gender vs. Car Usage")

> ggplot(mydata, aes(x=Work.Exp)) + geom_histogram(binwidth = 1)

> ggplot(mydata, aes(x=Work.Exp)) + geom_histogram(bins = 20)+

+ facet_wrap(~Transport, scales = 'free_x') +

+ labs(x="Work Exp", y= "No. of People", title = "Work Exp vs. Transport")

> ggplot(mydata, aes(x=Work.Exp)) + geom_histogram(bins = 20)+

+ facet_wrap(~CarUsage, scales = 'free_x') +

+ labs(x="Work Exp", y= "No. of People", title = "Work Exp vs. Car Usage")

> ggplot(mydata, aes(x=Transport)) + geom_bar() + theme_bw()

> ggplot(mydata, aes(x=Age)) + geom_histogram(binwidth = 5)

> ggplot(mydata, aes(x=Age)) + geom_histogram(binwidth = 1) +

+ facet_wrap(~Transport, scales = 'free_x') +

+ labs(x="Age", y= "No. of People", title = "Age vs. Transport")

> ggplot(mydata, aes(x=Age)) + geom_histogram(binwidth = 1) +

+ facet_wrap(~CarUsage, scales = 'free_x') +

+ labs(x="Age", y= "No. of People", title = "Age vs. Car Usage")

> ggplot(mydata, aes(x=license)) + geom_bar()


> ggplot(mydata, aes(x=license)) + geom_bar() +

+ facet_wrap(~Transport) +

+ labs(x="License", y= "No. of People", title = "License vs. Transport")

> ggplot(mydata, aes(x=license)) + geom_bar() +

+ facet_wrap(~CarUsage) +

+ labs(x="License", y= "No. of People", title = "License vs. Car Usage")

> table(license,Transport)

Transport

license 2Wheeler Car Public Transport

0 60 13 267

1 23 48 33

> prop.table(table(license,Transport))

Transport

license 2Wheeler Car Public Transport

0 0.13513514 0.02927928 0.60135135

1 0.05180180 0.10810811 0.07432432

> table(license,mydata$CarUsage)

license 0 1

0 327 13

1 56 48

> prop.table(table(license,mydata$CarUsage))

license 0 1

0 0.73648649 0.02927928

1 0.12612613 0.10810811

> ggplot(mydata, aes(x=Gender, fill= Transport)) + geom_bar()

> ggplot(mydata, aes(x=Engineer, fill= Transport)) + geom_bar()


> ggplot(mydata, aes(x=MBA, fill= Transport)) + geom_bar()

Warning message:

position_stack requires non-overlapping x intervals

> ggplot(mydata, aes(x=license, fill= Transport)) + geom_bar()

> ggplot(mydata, aes(x=Age, fill= Transport)) + geom_bar()

> ggplot(mydata, aes(x=Salary)) + geom_histogram(bins = 20) +

+ facet_wrap(~Transport, scales = 'free_x') +

+ labs(x="Salary", y= "No. of People", title = "Salary vs. Transport")

> ggplot(mydata, aes(x=Salary)) + geom_histogram(bins = 20) +

+ facet_wrap(~CarUsage, scales = 'free_x') +

+ labs(x="Salary", y= "No. of People", title = "Salary vs. Car Usage")

> ggplot(mydata, aes(x=Distance)) + geom_histogram(bins = 20) +

+ facet_wrap(~Transport, scales = 'free_x') +

+ labs(x="Distance travelled", y= "No. of People", title = "Distance vs. Transport")

> ggplot(mydata, aes(x=Distance)) + geom_histogram(bins = 20) +

+ facet_wrap(~CarUsage, scales = 'free_x') +

+ labs(x="Distance travelled", y= "No. of People", title = "Distance vs. Car Usage")

> ggplot(mydata, aes(x=MBA)) + geom_bar() +

+ facet_wrap(~Transport) +

+ labs(x="MBA", y= "No. of People", title = "MBA vs. Transport")

> ggplot(mydata, aes(x=MBA)) + geom_bar() +

+ facet_wrap(~CarUsage) +

+ labs(x="MBA", y= "No. of People", title = "MBA vs. Car Usage")

> table(MBA,Transport)

Transport

MBA 2Wheeler Car Public Transport

0 66 49 216

1 17 12 83

> prop.table(table(MBA,Transport))
Transport

MBA 2Wheeler Car Public Transport

0 0.14898420 0.11060948 0.48758465

1 0.03837472 0.02708804 0.18735892

> table(MBA,mydata$CarUsage)

MBA 0 1

0 282 49

1 100 12

> prop.table(table(MBA,mydata$CarUsage))

MBA 0 1

0 0.63656885 0.11060948

1 0.22573363 0.02708804

> ggplot(mydata, aes(x=Engineer)) + geom_bar() +

+ facet_wrap(~Transport) +

+ labs(x="Engineer", y= "No. of People", title = "Engineer vs. Transport")

> ggplot(mydata, aes(x=Engineer)) + geom_bar() +

+ facet_wrap(~CarUsage) +

+ labs(x="Engineer", y= "No. of People", title = "Engineer vs. Car Usage")

> table(Engineer,Transport)

Transport

Engineer 2Wheeler Car Public Transport

0 23 9 77

1 60 52 223

> prop.table(table(Engineer,Transport))

Transport

Engineer 2Wheeler Car Public Transport

0 0.05180180 0.02027027 0.17342342


1 0.13513514 0.11711712 0.50225225

> table(Engineer,mydata$CarUsage)

Engineer 0 1

0 100 9

1 283 52

> prop.table(table(Engineer,mydata$CarUsage))

Engineer 0 1

0 0.22522523 0.02027027

1 0.63738739 0.11711712

> dev.off()

null device

1

> par(mar = c(3,2,2,1))

> str(mydata)

'data.frame': 444 obs. of 10 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : num 0 1 0 1 0 0 0 1 0 0 ...

$ Engineer : int 0 1 1 1 1 1 1 1 1 1 ...

$ MBA : num 0 0 0 1 0 0 0 0 0 0 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : int 0 0 0 0 0 1 0 0 0 0 ...

$ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...

$ CarUsage : num 0 0 0 0 0 0 0 0 0 0 ...

> mydata$MBA=as.numeric(mydata$MBA)

> library(corrplot)
> dataScatter=subset(mydata[,c(1:8,10)])

> cor(dataScatter)

Age Gender Engineer MBA Work.Exp Salary

Age 1.00000000 -0.09885525 0.09193523 -0.029010747 0.93223639 0.860673177

Gender -0.09885525 1.00000000 -0.01821436 -0.092446493 -0.08602226 -0.096262448

Engineer 0.09193523 -0.01821436 1.00000000 0.063476962 0.08572854 0.086762332

MBA -0.02901075 -0.09244649 0.06347696 1.000000000 0.00849268 -0.007656222

Work.Exp 0.93223639 -0.08602226 0.08572854 0.008492680 1.00000000 0.931974470

Salary 0.86067318 -0.09626245 0.08676233 -0.007656222 0.93197447 1.000000000

Distance 0.35287246 -0.05420649 0.05931640 0.035597805 0.37273497 0.442359088

license 0.45231086 -0.23459399 0.01892418 -0.028239932 0.45286699 0.508094614

CarUsage 0.72128798 -0.06623155 0.09083167 -0.051675771 0.73123159 0.764276968

Distance license CarUsage

Age 0.35287246 0.45231086 0.72128798

Gender -0.05420649 -0.23459399 -0.06623155

Engineer 0.05931640 0.01892418 0.09083167

MBA 0.03559781 -0.02823993 -0.05167577

Work.Exp 0.37273497 0.45286699 0.73123159

Salary 0.44235909 0.50809461 0.76427697

Distance 1.00000000 0.29008445 0.44060090

license 0.29008445 1.00000000 0.52076686

CarUsage 0.44060090 0.52076686 1.00000000

> dev.off()

null device

1

> corrplot(cor(dataScatter), method = "pie", type = "upper")

> mydata$CarUsage=as.numeric(mydata$CarUsage)

> mydata$Gender=as.numeric(mydata$Gender)

> mydata$Engineer=as.numeric(mydata$Engineer)
> mydata$MBA=as.numeric(mydata$MBA)

> mydata$license=as.numeric(mydata$license)

> par(mfrow = c(3,3))

> for (i in (1:8))

+ {

+ h = ifelse(max(mydata[,i])==1,1,round(max(mydata[,i]),0)+1)

+ l = ifelse(min(mydata[,i])==0,0,round(min(mydata[,i]),0)-1)

+ hist(mydata[,i],breaks=seq(l,h,((h-l)/6)),labels=TRUE,

+ include.lowest=TRUE,right=TRUE,

+ col="pink",border=1,

+ main = colnames(mydata[i]), ylab=NULL,xlab = NULL,

+ cex.lab=1, cex.axis=1, cex.main=1, cex.sub=1,xlim = c(l,h),

+ ylim = c(0,400))

+ }

> mydata$CarUsage=as.factor(mydata$CarUsage)

> mydata$Engineer=as.factor(mydata$Engineer)

> mydata$MBA=as.factor(mydata$MBA)

> mydata$license=as.factor(mydata$license)

> str(mydata)

'data.frame': 444 obs. of 10 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : num 0 1 0 1 0 0 0 1 0 0 ...

$ Engineer : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...

$ MBA : Factor w/ 3 levels "0","0.304048449365552",..: 1 1 1 3 1 1 1 1 1 1 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...


$ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...

$ CarUsage : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

> library(caTools)

> mydata[,'train']=ifelse(runif(nrow(mydata))<0.75,1,0)

> str(mydata)

'data.frame': 444 obs. of 11 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : num 0 1 0 1 0 0 0 1 0 0 ...

$ Engineer : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...

$ MBA : Factor w/ 3 levels "0","0.304048449365552",..: 1 1 1 3 1 1 1 1 1 1 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...

$ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...

$ CarUsage : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

$ train : num 1 0 1 1 1 1 1 1 0 1 ...

> data_train=mydata[mydata$train==1,]

> data_test=mydata[mydata$train==0,]

> mydata=mydata[-c(9,11)]

> data_train=data_train[-c(9,11)]

> data_test=data_test[-c(9,11)]

> dim(data_train)

[1] 346 9

> dim(data_test)

[1] 98 9

> prop.table(table(mydata$CarUsage))

0 1
0.8626126 0.1373874

> prop.table(table(data_train$CarUsage))

0 1

0.8699422 0.1300578

> prop.table(table(data_test$CarUsage))

0 1

0.8367347 0.1632653
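The runif() split above is unstratified, which is why the train/test CarUsage proportions drift apart (13.0% vs 16.3%). A stratified alternative, as used for the logit split further down, would be:

# Sketch: caTools::sample.split keeps the class ratio equal in both parts.
library(caTools)
set.seed(1234)
idx = sample.split(mydata$CarUsage, SplitRatio = 0.75)
data_train_strat = subset(mydata, idx == TRUE)
data_test_strat  = subset(mydata, idx == FALSE)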

> logit_model1 = glm(CarUsage ~ ., data = data_train,family = binomial(link="logit"))

> summary(logit_model1)

Call:

glm(formula = CarUsage ~ ., family = binomial(link = "logit"),

data = data_train)

Deviance Residuals:

Min 1Q Median 3Q Max

-1.98562 -0.03113 -0.00580 -0.00047 2.03054

Coefficients:

Estimate Std. Error z value Pr(>|z|)

(Intercept) -82.66624 19.15350 -4.316 1.59e-05 ***

Age 2.66855 0.64589 4.132 3.60e-05 ***

Gender 1.96875 0.96483 2.041 0.041298 *

Engineer1 0.10081 1.07307 0.094 0.925154

MBA1 -1.79362 1.02357 -1.752 0.079719 .

Work.Exp -1.57378 0.45225 -3.480 0.000502 ***

Salary 0.26930 0.09167 2.938 0.003306 **


Distance 0.46379 0.16723 2.773 0.005548 **

license1 2.72229 1.05394 2.583 0.009796 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

Null deviance: 267.456 on 345 degrees of freedom

Residual deviance: 45.946 on 337 degrees of freedom

AIC: 63.946

Number of Fisher Scoring iterations: 10

> library(car)

> vif(logit_model1)

Age Gender Engineer MBA Work.Exp Salary Distance license

12.048654 1.432683 1.124272 1.453715 18.275864 4.089127 1.615216 1.906161
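Age (12.0) and Work.Exp (18.3) sit well above the usual VIF cutoff of 5-10, consistent with their 0.93 pairwise correlation in the matrix above; that is what motivates dropping Work.Exp (and the insignificant Engineer) before refitting below. A quick collinearity check of such a reduced model, as a sketch:

# Sketch: refit without the collinear/insignificant terms and re-inspect VIF.
logit_reduced = glm(CarUsage ~ . - Work.Exp - Engineer,
                    data = data_train, family = binomial())
vif(logit_reduced)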

> anova(logit_model1, test="Chisq")

Analysis of Deviance Table

Model: binomial, link: logit

Response: CarUsage

Terms added sequentially (first to last)

Df Deviance Resid. Df Resid. Dev Pr(>Chi)

NULL 345 267.456

Age 1 176.806 344 90.649 < 2.2e-16 ***


Gender 1 1.344 343 89.305 0.246253

Engineer 1 0.024 342 89.281 0.877576

MBA 1 1.830 341 87.451 0.176166

Work.Exp 1 2.433 340 85.019 0.118842

Salary 1 20.163 339 64.856 7.112e-06 ***

Distance 1 10.273 338 54.583 0.001350 **

license 1 8.637 337 45.946 0.003293 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

> mydata_logit = mydata[-c(3,5)]

> set.seed(1234)

> sample = sample.split(mydata_logit$CarUsage, SplitRatio = 0.75)

> data_train_logit = subset(mydata_logit, sample==TRUE)

> data_test_logit = subset(mydata_logit, sample==FALSE)

> dim(data_train_logit)

[1] 333 7

> dim(data_test_logit)

[1] 111 7

> table(data_train_logit$CarUsage)

0 1

287 46

> table(data_test_logit$CarUsage)

0 1

96 15

> prop.table(table(data_train_logit$CarUsage))

0 1
0.8618619 0.1381381

> prop.table(table(data_test_logit$CarUsage))

0 1

0.8648649 0.1351351

> logit_model2=glm(CarUsage ~ ., data=data_train_logit, family=binomial())

> summary(logit_model2)

Call:

glm(formula = CarUsage ~ ., family = binomial(), data = data_train_logit)

Deviance Residuals:

Min 1Q Median 3Q Max

-2.14502 -0.08214 -0.02041 -0.00386 2.89882

Coefficients:

Estimate Std. Error z value Pr(>|z|)

(Intercept) -40.61417 8.25596 -4.919 8.68e-07 ***

Age 1.09471 0.24863 4.403 1.07e-05 ***

Gender 0.25213 0.85484 0.295 0.76803

MBA0.304048449365552 -10.32649 2399.54509 -0.004 0.99657

MBA1 -1.66327 0.91122 -1.825 0.06795 .

Salary -0.02902 0.05067 -0.573 0.56684

Distance 0.41152 0.12733 3.232 0.00123 **

license1 2.10992 0.80815 2.611 0.00903 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)


Null deviance: 267.445 on 332 degrees of freedom

Residual deviance: 53.665 on 325 degrees of freedom

AIC: 69.665

Number of Fisher Scoring iterations: 15

> vif(logit_model2)

GVIF Df GVIF^(1/(2*Df))

Age 1.840700 1 1.356724

Gender 1.109938 1 1.053536

MBA 1.267861 2 1.061128

Salary 1.473525 1 1.213888

Distance 1.503751 1 1.226275

license 1.301658 1 1.140902

> anova(logit_model2, test="Chisq")

Analysis of Deviance Table

Model: binomial, link: logit

Response: CarUsage

Terms added sequentially (first to last)

Df Deviance Resid. Df Resid. Dev Pr(>Chi)

NULL 332 267.445

Age 1 186.290 331 81.155 < 2.2e-16 ***

Gender 1 0.014 330 81.140 0.904282


MBA 2 0.556 328 80.584 0.757302

Salary 1 2.095 327 78.490 0.147787

Distance 1 17.124 326 61.366 3.502e-05 ***

license 1 7.701 325 53.665 0.005518 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

> data_train_logit_final = data_train_logit[,-c(2,3)]

> data_test_logit_final = data_test_logit[,-c(2,3)]

> logit_model3=glm(CarUsage ~ ., data=data_train_logit_final, family=binomial())

> summary(logit_model3)

Call:

glm(formula = CarUsage ~ ., family = binomial(), data = data_train_logit_final)

Deviance Residuals:

Min 1Q Median 3Q Max

-1.91222 -0.08983 -0.02809 -0.00636 2.89020

Coefficients:

Estimate Std. Error z value Pr(>|z|)

(Intercept) -38.25057 7.65615 -4.996 5.85e-07 ***

Age 1.04299 0.23490 4.440 8.99e-06 ***

Salary -0.02631 0.04801 -0.548 0.58362

Distance 0.32744 0.10872 3.012 0.00260 **

license1 2.00751 0.74857 2.682 0.00732 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)


Null deviance: 267.445 on 332 degrees of freedom

Residual deviance: 57.391 on 328 degrees of freedom

AIC: 67.391

Number of Fisher Scoring iterations: 9

> vif(logit_model3)

Age Salary Distance license

1.775999 1.495108 1.194572 1.198738

> pred_logit = predict.glm(logit_model3, newdata=data_test_logit_final, type="response")

> pred_logit

5 14 16 26 28 29

1.315334e-04 1.313280e-03 2.382703e-05 8.673400e-05 3.320324e-05 6.845859e-04

36 39 40 50 58 60

2.502183e-04 1.055614e-04 1.675927e-06 5.551214e-06 5.895809e-06 1.692712e-05

61 72 74 81 86 90

7.076112e-03 1.807273e-05 1.092898e-03 2.351850e-06 3.209926e-03 4.251727e-04

92 113 116 117 120 121

3.365587e-07 6.956575e-05 3.369932e-03 2.539871e-05 5.241201e-04 1.744132e-03

122 123 124 133 137 139

1.334102e-03 2.096316e-04 1.482651e-03 5.875591e-04 8.012185e-01 2.582092e-02

142 144 151 156 158 160

1.124653e-05 2.319970e-01 2.389607e-04 1.829620e-03 2.414167e-04 2.564754e-04

171 174 181 182 191 193

1.021160e-04 3.929836e-05 1.037100e-04 1.179156e-01 1.827955e-02 9.811994e-01

198 200 201 202 203 205

4.592469e-05 2.536643e-03 1.970838e-02 6.800003e-03 2.035105e-02 7.994245e-04

210 216 219 221 223 227


4.916193e-05 2.348663e-01 7.867423e-01 2.251931e-06 1.506852e-04 3.975917e-04

235 239 242 245 250 255

4.029219e-04 1.669732e-01 1.281620e-03 4.658275e-04 2.461301e-05 2.410030e-05

257 260 261 265 274 282

1.410011e-03 5.275801e-04 1.869691e-01 9.996483e-01 7.795572e-05 4.548890e-03

285 288 292 295 302 303

5.683909e-04 1.257940e-02 1.671107e-03 6.637048e-04 9.192883e-05 3.929207e-06

310 313 316 318 323 326

2.563620e-04 9.219463e-01 4.337766e-02 3.132357e-01 1.078694e-04 1.417465e-05

329 331 332 344 346 354

2.248441e-03 1.156075e-04 1.877994e-03 4.737698e-05 1.317836e-04 9.989957e-01

355 359 361 362 363 367

9.998418e-01 6.490100e-02 6.332216e-02 2.470506e-03 9.921216e-01 3.746041e-01

372 375 378 384 387 390

2.011669e-04 4.169592e-03 7.541128e-05 1.293841e-02 4.863610e-03 3.919313e-02

391 395 402 403 407 409

1.521395e-03 7.194241e-04 1.657707e-03 1.766548e-02 9.004328e-01 3.518718e-04

411 413 417 420 429 431

9.999502e-01 5.576012e-02 8.237702e-03 9.836954e-01 9.998971e-01 9.999956e-01

437 441 442

9.999992e-01 9.999061e-01 9.997264e-01
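These response-scale probabilities can be cut at a threshold to get class labels and a confusion matrix (0.5 is an assumed cutoff here, not one tuned in this session):

# Sketch: hard classification at 0.5 and a caret confusion matrix.
pred_class = factor(ifelse(pred_logit > 0.5, 1, 0), levels = c(0, 1))
caret::confusionMatrix(pred_class, data_test_logit_final$CarUsage,
                       positive = "1")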

> data_test_logit_final$pred = pred_logit

> library(caret)


> library(ROCR)

> pred_ROC = predict(logit_model3, newdata = data_test_logit_final, type = "response")

> pred2 = prediction(pred_ROC, data_test_logit_final$CarUsage)

> perf = performance(pred2, "tpr", "tnr")


> auc(data_test_logit_final$CarUsage,pred_ROC)

[1] 0.9861111

> plot(perf)

> KS_train_logit=max(attr(perf, 'y.values')[[1]]-attr(perf, 'x.values')[[1]])

> KS_train_logit

[1] 1
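A caution on this value: performance(pred2, "tpr", "tnr") pairs sensitivity with specificity, and max(tpr - tnr) is 1 by construction (at the all-positive threshold tpr = 1 and tnr = 0), so the [1] 1 above is an artifact rather than perfect separation; it is also computed on test predictions despite the _train name. The conventional KS uses the tpr/fpr pair:

# Sketch: KS statistic from the standard ROC pairing.
perf_ks = performance(pred2, "tpr", "fpr")
KS_test_logit = max(attr(perf_ks, "y.values")[[1]] -
                    attr(perf_ks, "x.values")[[1]])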

> library(InformationValue)

> ks_plot(actuals=data_test_logit_final$CarUsage, predictedScores=data_test_logit_final$pred)

> fit.results_train=predict(logit_model3,newdata = data_train_logit_final)

> auc(data_train_logit_final$CarUsage,fit.results_train)

[1] 0.9885623

> fit.results_test=predict(logit_model3,newdata = data_test_logit_final)

> auc(data_test_logit_final$CarUsage,fit.results_test)

[1] 0.9861111

> library(ineq)


> gini_train = ineq(data_train_logit_final$CarUsage, type="Gini")

> gini_train

[1] 0.1046059

> gini_test = ineq(data_test_logit_final$CarUsage, type="Gini")

> gini_test

[1] 0.1029601
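Note that ineq() here measures inequality of the 0/1 target vector itself (hence the near-identical train/test values), not model discrimination. The model Gini is conventionally derived from AUC, e.g. on the test set:

# Sketch: Gini coefficient of the classifier from its AUC.
gini_model_test = 2 * auc(data_test_logit_final$CarUsage, pred_logit) - 1
gini_model_test   # = 2 * 0.9861 - 1 = 0.9722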
> library(caret)

> set.seed(400)

> ctrl=trainControl(method="repeatedcv",number = 10, repeats = 3, classProbs=TRUE)

> glmfit=train(make.names(CarUsage) ~ ., data = data_train_logit_final, method = "glm",

+ trControl = ctrl, preProcess = c("center","scale"),

+ family="binomial", tuneLength = 20)

> glmfit

Generalized Linear Model

333 samples

4 predictor

2 classes: 'X0', 'X1'

Pre-processing: centered (4), scaled (4)

Resampling: Cross-Validated (10 fold, repeated 3 times)

Summary of sample sizes: 300, 300, 299, 299, 301, 299, ...

Resampling results:

Accuracy Kappa

0.9548258 0.7992131
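With classProbs = TRUE already set, caret can also select on AUC rather than accuracy by supplying a summary function; a hedged variant of the call above:

# Sketch: optimise ROC AUC instead of accuracy in the repeated CV.
ctrl_roc = trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        classProbs = TRUE, summaryFunction = twoClassSummary)
glmfit_roc = train(make.names(CarUsage) ~ ., data = data_train_logit_final,
                   method = "glm", family = "binomial", metric = "ROC",
                   trControl = ctrl_roc, preProcess = c("center", "scale"))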

> library(ModelMetrics)

> library(e1071)

> NB=naiveBayes(x=data_train[-9], y=as.factor(data_train$CarUsage))

> NB

Naive Bayes Classifier for Discrete Predictors

Call:

naiveBayes.default(x = data_train[-9], y = as.factor(data_train$CarUsage))


A-priori probabilities:

as.factor(data_train$CarUsage)

0 1

0.8699422 0.1300578

Conditional probabilities:

Age

as.factor(data_train$CarUsage) [,1] [,2]

0 26.51827 2.941173

1 35.40000 3.466725

Gender

as.factor(data_train$CarUsage) [,1] [,2]

0 0.2990033 0.4585837

1 0.2444444 0.4346135

Engineer

as.factor(data_train$CarUsage) 0 1

0 0.2757475 0.7242525

1 0.1333333 0.8666667

MBA

as.factor(data_train$CarUsage) 0 0.304048449365552 1

0 0.7375415 0.0000000 0.2624585

1 0.7333333 0.0000000 0.2666667

Work.Exp

as.factor(data_train$CarUsage) [,1] [,2]


0 4.89701 3.241300

1 15.22222 5.026696

Salary

as.factor(data_train$CarUsage) [,1] [,2]

0 13.11395 5.016201

1 34.66444 12.660392

Distance

as.factor(data_train$CarUsage) [,1] [,2]

0 10.80166 3.192298

1 14.74889 3.716525

license

as.factor(data_train$CarUsage) 0 1

0 0.8571429 0.1428571

1 0.2444444 0.7555556

> y_pred.NB=predict(NB,newdata=data_test[-9])

> y_pred.NB

[1] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0

[41] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0

[81] 1 1 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1

Levels: 0 1

> tab.NB=table(data_test[,9],y_pred.NB)

> tab.NB

y_pred.NB

0 1

0 79 3
1 2 14

> accuracy.NB=sum(diag(tab.NB))/sum(tab.NB)

> accuracy.NB

[1] 0.9489796

> loss.NB=tab.NB[2,1]/(tab.NB[2,1]+tab.NB[1,1])

> loss.NB

[1] 0.02469136
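As defined, loss.NB is FN / (FN + TN): the share of predicted non-car-users who actually use a car (the false-omission rate). The more usual miss rate conditions on the actual positives instead:

# Sketch: false-negative rate from the same table (rows = actual, cols = predicted).
fnr.NB = tab.NB[2, 1] / sum(tab.NB[2, ])   # 2 / (2 + 14) = 0.125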

> gini_train = ineq(data_train$CarUsage, type="Gini")

> gini_train

[1] 0.1001212

> gini_test = ineq(data_test$CarUsage, type="Gini")

> gini_test

[1] 0.1174364

> fit.results_train=predict(NB,newdata = data_train)

> auc(data_train$CarUsage,fit.results_train)

[1] 0.8722776

> fit.results_test=predict(NB,newdata = data_test)

> auc(data_test$CarUsage,fit.results_test)

[1] 0.9192073

> library(klaR)

> data_train$CarUsage=as.factor(data_train$CarUsage)

> set.seed(234)

> ctrl=trainControl(method="repeatedcv",number = 10, repeats = 3, classProbs=TRUE)

> nbfit=train(make.names(CarUsage) ~ ., data = data_train, method = "nb",

+ trControl = ctrl, preProcess = c("center","scale"), tuneLength = 20)

> nbfit

Naive Bayes

346 samples
8 predictor

2 classes: 'X0', 'X1'

Pre-processing: centered (9), scaled (9)

Resampling: Cross-Validated (10 fold, repeated 3 times)

Summary of sample sizes: 312, 312, 311, 312, 310, 311, ...

Resampling results across tuning parameters:

usekernel Accuracy Kappa

FALSE NaN NaN

TRUE 0.9529272 0.7743552

Tuning parameter 'fL' was held constant at a value of 0

Tuning parameter

'adjust' was held constant at a value of 1

Accuracy was used to select the optimal model using the largest value.

The final values used for the model were fL = 0, usekernel = TRUE and adjust = 1.

> plot(nbfit)

> varImp(nbfit)

ROC curve variable importance

Importance

Age 100.000

Salary 97.009

Work.Exp 95.119

license 63.857

Distance 60.301

Engineer 14.504

Gender 5.284
MBA 0.000

> plot(varImp(nbfit))

> normalize=function(x){

+ return((x-min(x))/(max(x)-min(x)))}


> mydata$Age_norm=normalize(mydata$Age)

> mydata$Gender_norm=normalize(mydata$Gender)

> mydata$WorkExp_norm=normalize(mydata$Work.Exp)

> mydata$Salary_norm=normalize(mydata$Salary)

> mydata$Distance_norm=normalize(mydata$Distance)

> str(mydata)

'data.frame': 444 obs. of 14 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : num 0 1 0 1 0 0 0 1 0 0 ...

$ Engineer : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...

$ MBA : Factor w/ 3 levels "0","0.304048449365552",..: 1 1 1 3 1 1 1 1 1 1 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...

$ CarUsage : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

$ Age_norm : num 0.4 0.2 0.44 0.4 0.36 0.32 0.4 0.32 0.16 0.36 ...

$ Gender_norm : num 0 1 0 1 0 0 0 1 0 0 ...

$ WorkExp_norm : num 0.167 0.167 0.292 0.208 0.167 ...

$ Salary_norm : num 0.1545 0.0356 0.1366 0.1366 0.1366 ...

$ Distance_norm: num 0 0.00495 0.04455 0.06436 0.06931 ...

> dim(mydata)
[1] 444 12

> set.seed(1234)

> sample = sample.split(mydata$CarUsage, SplitRatio = 0.8)

> data_train_knn = subset(mydata, sample==TRUE)

> data_test_knn = subset(mydata, sample==FALSE)

> train.knn=data_train_knn[,c(9:12)]

> test.knn=data_test_knn[,c(9:12)]

> str(train.knn)

'data.frame': 355 obs. of 4 variables:

$ CarUsage : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

$ Age_norm : num 0.4 0.2 0.44 0.4 0.32 0.4 0.32 0.16 0.36 0.28 ...

$ Gender_norm : num 0 1 0 1 0 0 1 0 0 1 ...

$ WorkExp_norm: num 0.167 0.167 0.292 0.208 0.167 ...

> library(class)

> trControl=trainControl(method = "cv", number = 10)

> fit.knn.cv=train(CarUsage ~ .,

+ method = "knn",

+ tuneGrid = expand.grid(k = 2:20),

+ trControl = trControl,

+ preProcess = c("center","scale"),

+ data = data_train_knn)

> fit.knn.cv

k-Nearest Neighbors

355 samples

11 predictor

2 classes: '0', '1'


Pre-processing: centered (12), scaled (12)

Resampling: Cross-Validated (10 fold)

Summary of sample sizes: 319, 321, 319, 319, 319, 320, ...

Resampling results across tuning parameters:

k Accuracy Kappa

2 0.9492810 0.7814388

3 0.9464192 0.7553723

4 0.9435621 0.7496561

5 0.9520542 0.7886301

6 0.9462558 0.7569246

7 0.9518114 0.7852601

8 0.9519748 0.7827611

9 0.9548319 0.7902236

10 0.9520542 0.7777120

11 0.9520542 0.7777120

12 0.9464986 0.7451239

13 0.9464192 0.7365784

14 0.9464192 0.7365784

15 0.9464192 0.7365784

16 0.9492764 0.7552451

17 0.9436415 0.7179337

18 0.9436415 0.7179337

19 0.9436415 0.7179337

20 0.9436415 0.7179337

Accuracy was used to select the optimal model using the largest value.

The final value used for the model was k = 9.
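The tuned k can also be applied directly with class::knn on the normalized columns (a sketch using the train.knn/test.knn frames built above; k = 9 follows the CV pick):

# Sketch: score the hold-out set with the CV-selected k.
library(class)
knn_pred = knn(train = train.knn[, -1], test = test.knn[, -1],
               cl = train.knn$CarUsage, k = 9)
table(test.knn$CarUsage, knn_pred)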

> ctrl=trainControl(method="repeatedcv",number = 10, repeats = 3, classProbs=TRUE)


> knnFit.rcv=train(CarUsage ~ ., data = data_train_knn, method = "knn",

+ trControl = trControl, preProcess = c("center","scale"), tuneLength = 20)

> knnFit.rcv

k-Nearest Neighbors

355 samples

11 predictor

2 classes: '0', '1'

Pre-processing: centered (12), scaled (12)

Resampling: Cross-Validated (10 fold)

Summary of sample sizes: 319, 319, 320, 319, 319, 320, ...

Resampling results across tuning parameters:

k Accuracy Kappa

5 0.9519048 0.7942147

7 0.9519841 0.7891656

9 0.9548413 0.7937430

11 0.9576984 0.8039249

13 0.9464286 0.7314630

15 0.9407937 0.7024856

17 0.9407937 0.7084543

19 0.9435714 0.7177166

21 0.9435714 0.7177166

23 0.9435714 0.7177166

25 0.9464286 0.7254943

27 0.9464286 0.7254943

29 0.9464286 0.7229489

31 0.9464286 0.7223845
33 0.9436508 0.7071536

35 0.9436508 0.7071536

37 0.9436508 0.7071536

39 0.9436508 0.7071536

41 0.9436508 0.7071536

43 0.9436508 0.7071536

Accuracy was used to select the optimal model using the largest value.

The final value used for the model was k = 11.

> gini_train = ineq(train.knn$CarUsage, type="Gini")

> gini_train

[1] 0.1045461

> gini_test = ineq(test.knn$CarUsage, type="Gini")

> gini_test

[1] 0.1027923

> library(gbm)

> library(xgboost)

> library(ipred)

> set.seed(1234)

> Cars.bagging=bagging(CarUsage~., data = data_train,

+ control=rpart.control(maxdepth = 5, minsplit = 15))

> data_test$pred.bag = predict(Cars.bagging, data_test)

> data_test$pred.bag=ifelse(data_test$pred.bag < 0.5,0,1)

> BagTable = table(data_test$CarUsage,data_test$pred.bag)

> data_test$pred.bag=as.numeric(data_test$pred.bag)

> data_test=data_test[-10]

> str(data_test)

'data.frame': 98 obs. of 9 variables:


$ Age : int 23 22 25 27 32 26 24 25 27 25 ...

$ Gender : num 1 0 1 0 0 1 0 0 0 0 ...

$ Engineer: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 1 ...

$ MBA : Factor w/ 3 levels "0","0.304048449365552",..: 1 1 1 1 1 1 1 1 1 1 ...

$ Work.Exp: int 4 1 4 4 9 4 6 1 7 2 ...

$ Salary : num 8.3 7.5 11.5 13.5 15.5 12.3 10.6 7.6 16.6 8.6 ...

$ Distance: num 3.3 5.1 5.2 5.3 5.5 5.9 6.1 6.3 6.4 6.7 ...

$ license : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 2 ...

$ CarUsage: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

> accuracy_bg = sum(diag(BagTable))/sum(BagTable)

> accuracy_bg

97.5%
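ipred::bagging can also report an out-of-bag error estimate directly, which avoids leaning on a single hold-out split (sketch; coob is an ipred::bagging option):

# Sketch: request the out-of-bag misclassification estimate.
Cars.bagging.oob = bagging(CarUsage ~ ., data = data_train, coob = TRUE,
                           control = rpart.control(maxdepth = 5, minsplit = 15))
Cars.bagging.oob$err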

> gbm.fit <- gbm(formula = CarUsage ~ ., distribution = "bernoulli",

+ data = data_train, n.trees = 10000, interaction.depth = 1,

+ shrinkage = 0.001, cv.folds = 5, n.cores = NULL, verbose = FALSE)

> data_test$pred.car <- predict(gbm.fit, data_test, type="response")

> bstable = table(data_test$CarUsage,data_test$pred.car>0.5)

> bstable

> accuracy_bst = sum(diag(bstable))/sum(bstable)

> accuracy_bst

97%
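predict.gbm is normally given an explicit number of trees; with cv.folds set, the CV-optimal iteration can be recovered first (a sketch; note, too, that gbm's bernoulli family expects a numeric 0/1 response, so the factor CarUsage may need as.numeric(as.character(...)) first, depending on the gbm version in use):

# Sketch: use the cross-validated optimal iteration count when scoring.
best.iter = gbm.perf(gbm.fit, method = "cv")
pred.gbm  = predict(gbm.fit, data_test, n.trees = best.iter, type = "response")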

> library(DMwR)

> table(mydata$CarUsage)

> mydata[,'train']=ifelse(runif(nrow(mydata))<0.80,1,0)

> str(mydata)

> smote.train=mydata[mydata$train==1,]

> smote.test=mydata[mydata$train==0,]

> smote.train=smote.train[-10]

> smote.test=smote.test[-10]

> smote.train$CarUsage=as.factor(smote.train$CarUsage)

> data_smote <- SMOTE(CarUsage ~., smote.train, k=5, perc.over=4000, perc.under=200)

> prop.table(table(data_smote$CarUsage))

> smote_features_train <- as.matrix(data_smote[,1:8])

> smote_label_train <- as.matrix(data_smote$CarUsage)
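as.matrix() on a factor yields a character matrix, while xgboost expects a numeric 0/1 label vector, so an explicit conversion is safer here (sketch):

# Sketch: convert the factor label back to numeric 0/1 for xgboost.
smote_label_train = as.numeric(as.character(data_smote$CarUsage))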

> smote.xgb.fit <- xgboost(

+ data = smote_features_train,

+ label = smote_label_train,

+ eta = 0.7,

+ max_depth = 5,

+ nrounds = 7,

+ nfold = 5,

+ objective = "binary:logistic", # binary classification

+ verbose = 0, # silent

+ early_stopping_rounds = 10 # stop if no improvement for 10 consecutive rounds

+ )

> smote_features_test <- as.matrix(smote.test[,1:8])

> smote.test$smote.pred.class <- predict(smote.xgb.fit, smote_features_test)

> smotexbg = table(smote.test$CarUsage,smote.test$smote.pred.class>=0.5)

> smotexbg

> accuracy_smotexgb = sum(diag(smotexbg))/sum(smotexbg)

> accuracy_smotexgb

96.7%
