
> setwd("F:/project")

> library(car)

> library(caret)

> library(class)

> library(devtools)

> library(e1071)

> library(ggplot2)

> library(Hmisc)

> library(klaR)

> library(MASS)

> library(nnet)

> library(plyr)

> library(pROC)

> library(psych)

> library(scatterplot3d)

> library(dplyr)

> library(ElemStatLearn)

> library(rpart)

> library(rpart.plot)

> library(randomForest)

> library(neuralnet)

> mydata = read.csv("Cars.csv",header = TRUE)

> attach(mydata)

The following objects are masked from mydata (pos = 5):

Age, Distance, Engineer, Gender, license, MBA, Salary, Transport,

Work.Exp

The following objects are masked from mydata (pos = 16):


Age, Distance, Engineer, Gender, license, MBA, Salary, Transport,

Work.Exp

> head(mydata)

Age Gender Engineer MBA Work.Exp Salary Distance license Transport

1 28 Male 0 0 4 14.3 3.2 0 Public Transport

2 23 Female 1 0 4 8.3 3.3 0 Public Transport

3 29 Male 1 0 7 13.4 4.1 0 Public Transport

4 28 Female 1 1 5 13.4 4.5 0 Public Transport

5 27 Male 1 0 4 13.4 4.6 0 Public Transport

6 26 Male 1 0 4 12.3 4.8 1 Public Transport

> dim(mydata)

[1] 444 9

> names(mydata)

[1] "Age" "Gender" "Engineer" "MBA" "Work.Exp" "Salary"

[7] "Distance" "license" "Transport"

> str(mydata)

'data.frame': 444 obs. of 9 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 1 2 2 ...

$ Engineer : int 0 1 1 1 1 1 1 1 1 1 ...

$ MBA : int 0 0 0 1 0 0 0 0 0 0 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : int 0 0 0 0 0 1 0 0 0 0 ...

$ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...

> summary(mydata)
Age Gender Engineer MBA Work.Exp

Min. :18.00 Female:128 Min. :0.0000 Min. :0.0000 Min. : 0.0

1st Qu.:25.00 Male :316 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.: 3.0

Median :27.00 Median :1.0000 Median :0.0000 Median : 5.0

Mean :27.75 Mean :0.7545 Mean :0.2528 Mean : 6.3

3rd Qu.:30.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 8.0

Max. :43.00 Max. :1.0000 Max. :1.0000 Max. :24.0

NA's :1

Salary Distance license Transport

Min. : 6.50 Min. : 3.20 Min. :0.0000 2Wheeler : 83

1st Qu.: 9.80 1st Qu.: 8.80 1st Qu.:0.0000 Car : 61

Median :13.60 Median :11.00 Median :0.0000 Public Transport:300

Mean :16.24 Mean :11.32 Mean :0.2342

3rd Qu.:15.72 3rd Qu.:13.43 3rd Qu.:0.0000

Max. :57.00 Max. :23.40 Max. :1.0000

> colSums(is.na(mydata))

Age Gender Engineer MBA Work.Exp Salary Distance license

0 0 0 1 0 0 0 0

Transport

0

> mydata[!complete.cases(mydata), ]

Age Gender Engineer MBA Work.Exp Salary Distance license Transport

145 28 Female 0 NA 6 13.7 9.4 0 Public Transport

> library(DMwR)

> mydata = knnImputation(mydata, 10)

> colSums(is.na(mydata))

Age Gender Engineer MBA Work.Exp Salary Distance license

0 0 0 0 0 0 0 0
Transport

0
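Note: knnImputation() fills the missing MBA flag with a distance-weighted average of the 10 nearest neighbours, so the imputed value is fractional (it resurfaces later as the stray factor level "0.304048449365552"). A minimal guard, assuming MBA should stay a binary flag:

# Sketch: round the kNN-imputed value back to 0/1 before any factor conversion
mydata$MBA = round(mydata$MBA)
table(mydata$MBA)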

> mydata$CarUsage=ifelse(mydata$Transport =='Car',1,0)

> table(mydata$CarUsage)

0 1

383 61

> mydata$Gender=ifelse(mydata$Gender =='Male',0,1)

> table(mydata$Gender)

0 1

316 128

> boxplot(mydata[,c(1,5,7)],col = "red")

> boxplot(mydata[,6],col = "red", main=colnames(mydata[6]))

> mydata_out=mydata[,c(1,5,6,7,10)]

> mydata_out=as.data.frame(mydata_out)

> for (i in (1:5))

+{

+ outval=boxplot.stats(mydata_out[,i])$out

+ cat("Outliers in ",colnames(mydata_out[i]),"\n", "=", outval,"\n")

+}

Outliers in Age

= 39 39 39 38 40 38 38 38 38 40 40 39 40 38 39 38 40 39 38 42 40 43 40 38 39

Outliers in Work.Exp

= 19 16 21 17 16 18 19 18 21 16 19 19 18 19 20 22 16 20 18 21 20 20 16 17 21 18 20 21 19 22 22 19 24
20 19 19 19 21

Outliers in Salary

= 36.6 38.9 25.9 34.8 28.8 39.9 39 28.7 36.9 28.7 34.9 47 28.8 36.9 54 29.9 34.9 36 44 37 24.9 43 37 54
44 34 48 42 51 45 34 28.8 45 42.9 41 40.9 30.9 41.9 43 33 36 33 38 46 45 48 35 51 51 55 45 42 52 38 57
44 45 47 50
Outliers in Distance

= 20.7 20.8 21 21.3 21.4 21.5 21.5 22.8 23.4

Outliers in CarUsage

= 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
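No outlier treatment is applied in this session, and CarUsage, being a 0/1 flag, should not be screened this way (all 61 ones are flagged by construction). If capping the numeric columns were ever desired, one hedged option is winsorizing at the 1.5*IQR whiskers, a sketch only:

# Not applied above: cap a numeric vector at its boxplot whiskers.
cap_outliers = function(x) {
  st = boxplot.stats(x)$stats   # st[1] = lower whisker, st[5] = upper whisker
  pmin(pmax(x, st[1]), st[5])
}
# e.g. mydata_out$Salary = cap_outliers(mydata_out$Salary)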

> par(mfrow = c(1,2))

> boxplot(Age~Transport,col = "blue")

> boxplot(Gender~Transport,col = "blue")

Error in oldClass(stats) <- cl :

adding class "factor" to an invalid object

> boxplot(Engineer~Transport,col = "blue")

> boxplot(MBA~Transport,col = "blue")

> boxplot(Work.Exp~Transport,col = "blue")

> boxplot(Salary~Transport,col = "blue")

> boxplot(Distance~Transport,col = "blue")

> boxplot(license~Transport,col = "blue")

> length(which(Transport=="Car"))*100/nrow(mydata)

[1] 13.73874

> table(Transport)

Transport

2Wheeler Car Public Transport

83 61 300

> table(Gender)

Gender

Female Male

128 316

> prop.table(table(Gender))

Gender

Female Male
0.2882883 0.7117117

> table(Engineer)

Engineer

0 1

109 335

> prop.table(table(Engineer))

Engineer

0 1

0.2454955 0.7545045

> table(MBA)

MBA

0 1

331 112

> prop.table(table(MBA))

MBA

0 1

0.7471783 0.2528217

> table(license)

license

0 1

340 104

> prop.table(table(license))

license

0 1

0.7657658 0.2342342

> table(Gender,Transport)

Transport

Gender 2Wheeler Car Public Transport

Female 38 13 77
Male 45 48 223

> prop.table(table(Gender,Transport))

Transport

Gender 2Wheeler Car Public Transport

Female 0.08558559 0.02927928 0.17342342

Male 0.10135135 0.10810811 0.50225225

> ggplot(mydata, aes(x=Gender)) + geom_bar()

> ggplot(mydata, aes(x= Gender)) + theme_bw()+

+ facet_wrap(~Transport ) + geom_bar()+

+ labs(x="Gender", y= "No. of People", title = "Gender vs. Transport")

> ggplot(mydata, aes(x= Gender)) + theme_bw()+

+ facet_wrap(~CarUsage ) + geom_bar()+

+ labs(x="Gender", y= "No. of People", title = "Gender vs. Car Usage")

> ggplot(mydata, aes(x=Work.Exp)) + geom_histogram(binwidth = 1)

> ggplot(mydata, aes(x=Work.Exp)) + geom_histogram(bins = 20)+

+ facet_wrap(~Transport, scales = 'free_x') +

+ labs(x="Work Exp", y= "No. of People", title = "Work Exp vs. Transport")

> ggplot(mydata, aes(x=Work.Exp)) + geom_histogram(bins = 20)+

+ facet_wrap(~CarUsage, scales = 'free_x') +

+ labs(x="Work Exp", y= "No. of People", title = "Work Exp vs. Car Usage")

> ggplot(mydata, aes(x=Transport)) + geom_bar() + theme_bw()

> ggplot(mydata, aes(x=Age)) + geom_histogram(binwidth = 5)

> ggplot(mydata, aes(x=Age)) + geom_histogram(binwidth = 1) +

+ facet_wrap(~Transport, scales = 'free_x') +

+ labs(x="Age", y= "No. of People", title = "Age vs. Transport")

> ggplot(mydata, aes(x=Age)) + geom_histogram(binwidth = 1) +

+ facet_wrap(~CarUsage, scales = 'free_x') +

+ labs(x="Age", y= "No. of People", title = "Age vs. Car Usage")

> ggplot(mydata, aes(x=license)) + geom_bar()


> ggplot(mydata, aes(x=license)) + geom_bar() +

+ facet_wrap(~Transport) +

+ labs(x="License", y= "No. of People", title = "License vs. Transport")

> ggplot(mydata, aes(x=license)) + geom_bar() +

+ facet_wrap(~CarUsage) +

+ labs(x="License", y= "No. of People", title = "License vs. Car Usage")

> table(license,Transport)

Transport

license 2Wheeler Car Public Transport

0 60 13 267

1 23 48 33

> prop.table(table(license,Transport))

Transport

license 2Wheeler Car Public Transport

0 0.13513514 0.02927928 0.60135135

1 0.05180180 0.10810811 0.07432432

> table(license,mydata$CarUsage)

license 0 1

0 327 13

1 56 48

> prop.table(table(license,mydata$CarUsage))

license 0 1

0 0.73648649 0.02927928

1 0.12612613 0.10810811

> ggplot(mydata, aes(x=Gender, fill= Transport)) + geom_bar()

> ggplot(mydata, aes(x=Engineer, fill= Transport)) + geom_bar()


> ggplot(mydata, aes(x=MBA, fill= Transport)) + geom_bar()

Warning message:

position_stack requires non-overlapping x intervals

> ggplot(mydata, aes(x=license, fill= Transport)) + geom_bar()

> ggplot(mydata, aes(x=Age, fill= Transport)) + geom_bar()

> ggplot(mydata, aes(x=Salary)) + geom_histogram(bins = 20) +

+ facet_wrap(~Transport, scales = 'free_x') +

+ labs(x="Salary", y= "No. of People", title = "Salary vs. Transport")

> ggplot(mydata, aes(x=Salary)) + geom_histogram(bins = 20) +

+ facet_wrap(~CarUsage, scales = 'free_x') +

+ labs(x="Salary", y= "No. of People", title = "Salary vs. Car Usage")

> ggplot(mydata, aes(x=Distance)) + geom_histogram(bins = 20) +

+ facet_wrap(~Transport, scales = 'free_x') +

+ labs(x="Distance travelled", y= "No. of People", title = "Distance vs. Transport")

> ggplot(mydata, aes(x=Distance)) + geom_histogram(bins = 20) +

+ facet_wrap(~CarUsage, scales = 'free_x') +

+ labs(x="Distance travelled", y= "No. of People", title = "Distance vs. Car Usage")

> ggplot(mydata, aes(x=MBA)) + geom_bar() +

+ facet_wrap(~Transport) +

+ labs(x="MBA", y= "No. of People", title = "MBA vs. Transport")

> ggplot(mydata, aes(x=MBA)) + geom_bar() +

+ facet_wrap(~CarUsage) +

+ labs(x="MBA", y= "No. of People", title = "MBA vs. Car Usage")

> table(MBA,Transport)

Transport

MBA 2Wheeler Car Public Transport

0 66 49 216

1 17 12 83

> prop.table(table(MBA,Transport))
Transport

MBA 2Wheeler Car Public Transport

0 0.14898420 0.11060948 0.48758465

1 0.03837472 0.02708804 0.18735892

> table(MBA,mydata$CarUsage)

MBA 0 1

0 282 49

1 100 12

> prop.table(table(MBA,mydata$CarUsage))

MBA 0 1

0 0.63656885 0.11060948

1 0.22573363 0.02708804

> ggplot(mydata, aes(x=Engineer)) + geom_bar() +

+ facet_wrap(~Transport) +

+ labs(x="Engineer", y= "No. of People", title = "Engineer vs. Transport")

> ggplot(mydata, aes(x=Engineer)) + geom_bar() +

+ facet_wrap(~CarUsage) +

+ labs(x="Engineer", y= "No. of People", title = "Engineer vs. Car Usage")

> table(Engineer,Transport)

Transport

Engineer 2Wheeler Car Public Transport

0 23 9 77

1 60 52 223

> prop.table(table(Engineer,Transport))

Transport

Engineer 2Wheeler Car Public Transport

0 0.05180180 0.02027027 0.17342342


1 0.13513514 0.11711712 0.50225225

> table(Engineer,mydata$CarUsage)

Engineer 0 1

0 100 9

1 283 52

> prop.table(table(Engineer,mydata$CarUsage))

Engineer 0 1

0 0.22522523 0.02027027

1 0.63738739 0.11711712

> dev.off()

null device

1

> par(mar = c(3,2,2,1))

> str(mydata)

'data.frame': 444 obs. of 10 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : num 0 1 0 1 0 0 0 1 0 0 ...

$ Engineer : int 0 1 1 1 1 1 1 1 1 1 ...

$ MBA : num 0 0 0 1 0 0 0 0 0 0 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : int 0 0 0 0 0 1 0 0 0 0 ...

$ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...

$ CarUsage : num 0 0 0 0 0 0 0 0 0 0 ...

> mydata$MBA=as.numeric(mydata$MBA)

> library(corrplot)
> dataScatter=subset(mydata[,c(1:8,10)])

> cor(dataScatter)

Age Gender Engineer MBA Work.Exp Salary

Age 1.00000000 -0.09885525 0.09193523 -0.029010747 0.93223639 0.860673177

Gender -0.09885525 1.00000000 -0.01821436 -0.092446493 -0.08602226 -0.096262448

Engineer 0.09193523 -0.01821436 1.00000000 0.063476962 0.08572854 0.086762332

MBA -0.02901075 -0.09244649 0.06347696 1.000000000 0.00849268 -0.007656222

Work.Exp 0.93223639 -0.08602226 0.08572854 0.008492680 1.00000000 0.931974470

Salary 0.86067318 -0.09626245 0.08676233 -0.007656222 0.93197447 1.000000000

Distance 0.35287246 -0.05420649 0.05931640 0.035597805 0.37273497 0.442359088

license 0.45231086 -0.23459399 0.01892418 -0.028239932 0.45286699 0.508094614

CarUsage 0.72128798 -0.06623155 0.09083167 -0.051675771 0.73123159 0.764276968

Distance license CarUsage

Age 0.35287246 0.45231086 0.72128798

Gender -0.05420649 -0.23459399 -0.06623155

Engineer 0.05931640 0.01892418 0.09083167

MBA 0.03559781 -0.02823993 -0.05167577

Work.Exp 0.37273497 0.45286699 0.73123159

Salary 0.44235909 0.50809461 0.76427697

Distance 1.00000000 0.29008445 0.44060090

license 0.29008445 1.00000000 0.52076686

CarUsage 0.44060090 0.52076686 1.00000000

> dev.off()

null device

1

> corrplot(cor(dataScatter), method = "pie", type = "upper")

> mydata$CarUsage=as.numeric(mydata$CarUsage)

> mydata$Gender=as.numeric(mydata$Gender)

> mydata$Engineer=as.numeric(mydata$Engineer)
> mydata$MBA=as.numeric(mydata$MBA)

> mydata$license=as.numeric(mydata$license)

> par(mfrow = c(3,3))

> for (i in (1:8))

+ {

+ h = ifelse(max(mydata[,i])==1,1,round(max(mydata[,i]),0)+1)

+ l = ifelse(min(mydata[,i])==0,0,round(min(mydata[,i]),0)-1)

+ hist(mydata[,i],breaks=seq(l,h,((h-l)/6)),labels=TRUE,

+ include.lowest=TRUE,right=TRUE,

+ col="pink",border=1,

+ main = colnames(mydata[i]), ylab=NULL,xlab = NULL,

+ cex.lab=1, cex.axis=1, cex.main=1, cex.sub=1,xlim = c(l,h),

+ ylim = c(0,400))

+ }

> mydata$CarUsage=as.factor(mydata$CarUsage)

> mydata$Engineer=as.factor(mydata$Engineer)

> mydata$MBA=as.factor(mydata$MBA)

> mydata$license=as.factor(mydata$license)

> str(mydata)

'data.frame': 444 obs. of 10 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : num 0 1 0 1 0 0 0 1 0 0 ...

$ Engineer : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...

$ MBA : Factor w/ 3 levels "0","0.304048449365552",..: 1 1 1 3 1 1 1 1 1 1 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...


$ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...

$ CarUsage : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

> library(caTools)

> mydata[,'train']=ifelse(runif(nrow(mydata))<0.75,1,0)

> str(mydata)

'data.frame': 444 obs. of 11 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : num 0 1 0 1 0 0 0 1 0 0 ...

$ Engineer : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...

$ MBA : Factor w/ 3 levels "0","0.304048449365552",..: 1 1 1 3 1 1 1 1 1 1 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...

$ Transport: Factor w/ 3 levels "2Wheeler","Car",..: 3 3 3 3 3 3 1 3 3 3 ...

$ CarUsage : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

$ train : num 1 0 1 1 1 1 1 1 0 1 ...

> data_train=mydata[mydata$train==1,]

> data_test=mydata[mydata$train==0,]

> mydata=mydata[-c(9,11)]

> data_train=data_train[-c(9,11)]

> data_test=data_test[-c(9,11)]

> dim(data_train)

[1] 346 9

> dim(data_test)

[1] 98 9

> prop.table(table(mydata$CarUsage))

0 1
0.8626126 0.1373874

> prop.table(table(data_train$CarUsage))

0 1

0.8699422 0.1300578

> prop.table(table(data_test$CarUsage))

0 1

0.8367347 0.1632653
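The runif() split above is unstratified, which is why the train/test CarUsage proportions drift apart (13.0% vs 16.3%). A stratified alternative, as used for the logit split further down, would be:

# Sketch: caTools::sample.split keeps the class ratio equal in both parts.
library(caTools)
set.seed(1234)
idx = sample.split(mydata$CarUsage, SplitRatio = 0.75)
data_train_strat = subset(mydata, idx == TRUE)
data_test_strat  = subset(mydata, idx == FALSE)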

> logit_model1 = glm(CarUsage ~ ., data = data_train,family = binomial(link="logit"))

> summary(logit_model1)

Call:

glm(formula = CarUsage ~ ., family = binomial(link = "logit"),

data = data_train)

Deviance Residuals:

Min 1Q Median 3Q Max

-1.98562 -0.03113 -0.00580 -0.00047 2.03054

Coefficients:

Estimate Std. Error z value Pr(>|z|)

(Intercept) -82.66624 19.15350 -4.316 1.59e-05 ***

Age 2.66855 0.64589 4.132 3.60e-05 ***

Gender 1.96875 0.96483 2.041 0.041298 *

Engineer1 0.10081 1.07307 0.094 0.925154

MBA1 -1.79362 1.02357 -1.752 0.079719 .

Work.Exp -1.57378 0.45225 -3.480 0.000502 ***

Salary 0.26930 0.09167 2.938 0.003306 **


Distance 0.46379 0.16723 2.773 0.005548 **

license1 2.72229 1.05394 2.583 0.009796 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

Null deviance: 267.456 on 345 degrees of freedom

Residual deviance: 45.946 on 337 degrees of freedom

AIC: 63.946

Number of Fisher Scoring iterations: 10

> library(car)

> vif(logit_model1)

Age Gender Engineer MBA Work.Exp Salary Distance license

12.048654 1.432683 1.124272 1.453715 18.275864 4.089127 1.615216 1.906161
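Age (12.0) and Work.Exp (18.3) sit well above the usual VIF cutoff of 5-10, consistent with their 0.93 pairwise correlation in the matrix above; that is what motivates dropping Work.Exp (and the insignificant Engineer) before refitting below. A quick collinearity check of such a reduced model, as a sketch:

# Sketch: refit without the collinear/insignificant terms and re-inspect VIF.
logit_reduced = glm(CarUsage ~ . - Work.Exp - Engineer,
                    data = data_train, family = binomial())
vif(logit_reduced)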

> anova(logit_model1, test="Chisq")

Analysis of Deviance Table

Model: binomial, link: logit

Response: CarUsage

Terms added sequentially (first to last)

Df Deviance Resid. Df Resid. Dev Pr(>Chi)

NULL 345 267.456

Age 1 176.806 344 90.649 < 2.2e-16 ***


Gender 1 1.344 343 89.305 0.246253

Engineer 1 0.024 342 89.281 0.877576

MBA 1 1.830 341 87.451 0.176166

Work.Exp 1 2.433 340 85.019 0.118842

Salary 1 20.163 339 64.856 7.112e-06 ***

Distance 1 10.273 338 54.583 0.001350 **

license 1 8.637 337 45.946 0.003293 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

> mydata_logit = mydata[-c(3,5)]

> set.seed(1234)

> sample = sample.split(mydata_logit$CarUsage, SplitRatio = 0.75)

> data_train_logit = subset(mydata_logit, sample==TRUE)

> data_test_logit = subset(mydata_logit, sample==FALSE)

> dim(data_train_logit)

[1] 333 7

> dim(data_test_logit)

[1] 111 7

> table(data_train_logit$CarUsage)

0 1

287 46

> table(data_test_logit$CarUsage)

0 1

96 15

> prop.table(table(data_train_logit$CarUsage))

0 1
0.8618619 0.1381381

> prop.table(table(data_test_logit$CarUsage))

0 1

0.8648649 0.1351351

> logit_model2=glm(CarUsage ~ ., data=data_train_logit, family=binomial())

> summary(logit_model2)

Call:

glm(formula = CarUsage ~ ., family = binomial(), data = data_train_logit)

Deviance Residuals:

Min 1Q Median 3Q Max

-2.14502 -0.08214 -0.02041 -0.00386 2.89882

Coefficients:

Estimate Std. Error z value Pr(>|z|)

(Intercept) -40.61417 8.25596 -4.919 8.68e-07 ***

Age 1.09471 0.24863 4.403 1.07e-05 ***

Gender 0.25213 0.85484 0.295 0.76803

MBA0.304048449365552 -10.32649 2399.54509 -0.004 0.99657

MBA1 -1.66327 0.91122 -1.825 0.06795 .

Salary -0.02902 0.05067 -0.573 0.56684

Distance 0.41152 0.12733 3.232 0.00123 **

license1 2.10992 0.80815 2.611 0.00903 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)


Null deviance: 267.445 on 332 degrees of freedom

Residual deviance: 53.665 on 325 degrees of freedom

AIC: 69.665

Number of Fisher Scoring iterations: 15

> vif(logit_model2)

GVIF Df GVIF^(1/(2*Df))

Age 1.840700 1 1.356724

Gender 1.109938 1 1.053536

MBA 1.267861 2 1.061128

Salary 1.473525 1 1.213888

Distance 1.503751 1 1.226275

license 1.301658 1 1.140902

> anova(logit_model2, test="Chisq")

Analysis of Deviance Table

Model: binomial, link: logit

Response: CarUsage

Terms added sequentially (first to last)

Df Deviance Resid. Df Resid. Dev Pr(>Chi)

NULL 332 267.445

Age 1 186.290 331 81.155 < 2.2e-16 ***

Gender 1 0.014 330 81.140 0.904282


MBA 2 0.556 328 80.584 0.757302

Salary 1 2.095 327 78.490 0.147787

Distance 1 17.124 326 61.366 3.502e-05 ***

license 1 7.701 325 53.665 0.005518 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

> data_train_logit_final = data_train_logit[,-c(2,3)]

> data_test_logit_final = data_test_logit[,-c(2,3)]

> logit_model3=glm(CarUsage ~ ., data=data_train_logit_final, family=binomial())

> summary(logit_model3)

Call:

glm(formula = CarUsage ~ ., family = binomial(), data = data_train_logit_final)

Deviance Residuals:

Min 1Q Median 3Q Max

-1.91222 -0.08983 -0.02809 -0.00636 2.89020

Coefficients:

Estimate Std. Error z value Pr(>|z|)

(Intercept) -38.25057 7.65615 -4.996 5.85e-07 ***

Age 1.04299 0.23490 4.440 8.99e-06 ***

Salary -0.02631 0.04801 -0.548 0.58362

Distance 0.32744 0.10872 3.012 0.00260 **

license1 2.00751 0.74857 2.682 0.00732 **

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)


Null deviance: 267.445 on 332 degrees of freedom

Residual deviance: 57.391 on 328 degrees of freedom

AIC: 67.391

Number of Fisher Scoring iterations: 9

> vif(logit_model3)

Age Salary Distance license

1.775999 1.495108 1.194572 1.198738

> pred_logit = predict.glm(logit_model3, newdata=data_test_logit_final, type="response")

> pred_logit

5 14 16 26 28 29

1.315334e-04 1.313280e-03 2.382703e-05 8.673400e-05 3.320324e-05 6.845859e-04

36 39 40 50 58 60

2.502183e-04 1.055614e-04 1.675927e-06 5.551214e-06 5.895809e-06 1.692712e-05

61 72 74 81 86 90

7.076112e-03 1.807273e-05 1.092898e-03 2.351850e-06 3.209926e-03 4.251727e-04

92 113 116 117 120 121

3.365587e-07 6.956575e-05 3.369932e-03 2.539871e-05 5.241201e-04 1.744132e-03

122 123 124 133 137 139

1.334102e-03 2.096316e-04 1.482651e-03 5.875591e-04 8.012185e-01 2.582092e-02

142 144 151 156 158 160

1.124653e-05 2.319970e-01 2.389607e-04 1.829620e-03 2.414167e-04 2.564754e-04

171 174 181 182 191 193

1.021160e-04 3.929836e-05 1.037100e-04 1.179156e-01 1.827955e-02 9.811994e-01

198 200 201 202 203 205

4.592469e-05 2.536643e-03 1.970838e-02 6.800003e-03 2.035105e-02 7.994245e-04

210 216 219 221 223 227


4.916193e-05 2.348663e-01 7.867423e-01 2.251931e-06 1.506852e-04 3.975917e-04

235 239 242 245 250 255

4.029219e-04 1.669732e-01 1.281620e-03 4.658275e-04 2.461301e-05 2.410030e-05

257 260 261 265 274 282

1.410011e-03 5.275801e-04 1.869691e-01 9.996483e-01 7.795572e-05 4.548890e-03

285 288 292 295 302 303

5.683909e-04 1.257940e-02 1.671107e-03 6.637048e-04 9.192883e-05 3.929207e-06

310 313 316 318 323 326

2.563620e-04 9.219463e-01 4.337766e-02 3.132357e-01 1.078694e-04 1.417465e-05

329 331 332 344 346 354

2.248441e-03 1.156075e-04 1.877994e-03 4.737698e-05 1.317836e-04 9.989957e-01

355 359 361 362 363 367

9.998418e-01 6.490100e-02 6.332216e-02 2.470506e-03 9.921216e-01 3.746041e-01

372 375 378 384 387 390

2.011669e-04 4.169592e-03 7.541128e-05 1.293841e-02 4.863610e-03 3.919313e-02

391 395 402 403 407 409

1.521395e-03 7.194241e-04 1.657707e-03 1.766548e-02 9.004328e-01 3.518718e-04

411 413 417 420 429 431

9.999502e-01 5.576012e-02 8.237702e-03 9.836954e-01 9.998971e-01 9.999956e-01

437 441 442

9.999992e-01 9.999061e-01 9.997264e-01
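These response-scale probabilities can be cut at a threshold to get class labels and a confusion matrix (0.5 is an assumed cutoff here, not one tuned in this session):

# Sketch: hard classification at 0.5 and a caret confusion matrix.
pred_class = factor(ifelse(pred_logit > 0.5, 1, 0), levels = c(0, 1))
caret::confusionMatrix(pred_class, data_test_logit_final$CarUsage,
                       positive = "1")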

> data_test_logit_final$pred = pred_logit

> library(caret)


> library(ROCR)

> pred_ROC = predict(logit_model3, newdata = data_test_logit_final, type = "response")

> pred2 = prediction(pred_ROC, data_test_logit_final$CarUsage)

> perf = performance(pred2, "tpr", "tnr")


> auc(data_test_logit_final$CarUsage,pred_ROC)

[1] 0.9861111

> plot(perf)

> KS_train_logit=max(attr(perf, 'y.values')[[1]]-attr(perf, 'x.values')[[1]])

> KS_train_logit

[1] 1
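A caution on this value: performance(pred2, "tpr", "tnr") pairs sensitivity with specificity, and max(tpr - tnr) is 1 by construction (at the all-positive threshold tpr = 1 and tnr = 0), so the [1] 1 above is an artifact rather than perfect separation; it is also computed on test predictions despite the _train name. The conventional KS uses the tpr/fpr pair:

# Sketch: KS statistic from the standard ROC pairing.
perf_ks = performance(pred2, "tpr", "fpr")
KS_test_logit = max(attr(perf_ks, "y.values")[[1]] -
                    attr(perf_ks, "x.values")[[1]])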

> library(InformationValue)

> ks_plot(actuals=data_test_logit_final$CarUsage, predictedScores=data_test_logit_final$pred)

> fit.results_train=predict(logit_model3,newdata = data_train_logit_final)

> auc(data_train_logit_final$CarUsage,fit.results_train)

[1] 0.9885623

> fit.results_test=predict(logit_model3,newdata = data_test_logit_final)

> auc(data_test_logit_final$CarUsage,fit.results_test)

[1] 0.9861111

> library(ineq)


> gini_train = ineq(data_train_logit_final$CarUsage, type="Gini")

> gini_train

[1] 0.1046059

> gini_test = ineq(data_test_logit_final$CarUsage, type="Gini")

> gini_test

[1] 0.1029601
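Note that ineq() here measures inequality of the 0/1 target vector itself (hence the near-identical train/test values), not model discrimination. The model Gini is conventionally derived from AUC, e.g. on the test set:

# Sketch: Gini coefficient of the classifier from its AUC.
gini_model_test = 2 * auc(data_test_logit_final$CarUsage, pred_logit) - 1
gini_model_test   # = 2 * 0.9861 - 1 = 0.9722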
> library(caret)

> set.seed(400)

> ctrl=trainControl(method="repeatedcv",number = 10, repeats = 3, classProbs=TRUE)

> glmfit=train(make.names(CarUsage) ~ ., data = data_train_logit_final, method = "glm",

+ trControl = ctrl, preProcess = c("center","scale"),

+ family="binomial", tuneLength = 20)

> glmfit

Generalized Linear Model

333 samples

4 predictor

2 classes: 'X0', 'X1'

Pre-processing: centered (4), scaled (4)

Resampling: Cross-Validated (10 fold, repeated 3 times)

Summary of sample sizes: 300, 300, 299, 299, 301, 299, ...

Resampling results:

Accuracy Kappa

0.9548258 0.7992131
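With classProbs = TRUE already set, caret can also select on AUC rather than accuracy by supplying a summary function; a hedged variant of the call above:

# Sketch: optimise ROC AUC instead of accuracy in the repeated CV.
ctrl_roc = trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        classProbs = TRUE, summaryFunction = twoClassSummary)
glmfit_roc = train(make.names(CarUsage) ~ ., data = data_train_logit_final,
                   method = "glm", family = "binomial", metric = "ROC",
                   trControl = ctrl_roc, preProcess = c("center", "scale"))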

> library(ModelMetrics)

> library(e1071)

> NB=naiveBayes(x=data_train[-9], y=as.factor(data_train$CarUsage))

> NB

Naive Bayes Classifier for Discrete Predictors

Call:

naiveBayes.default(x = data_train[-9], y = as.factor(data_train$CarUsage))


A-priori probabilities:

as.factor(data_train$CarUsage)

0 1

0.8699422 0.1300578

Conditional probabilities:

Age

as.factor(data_train$CarUsage) [,1] [,2]

0 26.51827 2.941173

1 35.40000 3.466725

Gender

as.factor(data_train$CarUsage) [,1] [,2]

0 0.2990033 0.4585837

1 0.2444444 0.4346135

Engineer

as.factor(data_train$CarUsage) 0 1

0 0.2757475 0.7242525

1 0.1333333 0.8666667

MBA

as.factor(data_train$CarUsage) 0 0.304048449365552 1

0 0.7375415 0.0000000 0.2624585

1 0.7333333 0.0000000 0.2666667

Work.Exp

as.factor(data_train$CarUsage) [,1] [,2]


0 4.89701 3.241300

1 15.22222 5.026696

Salary

as.factor(data_train$CarUsage) [,1] [,2]

0 13.11395 5.016201

1 34.66444 12.660392

Distance

as.factor(data_train$CarUsage) [,1] [,2]

0 10.80166 3.192298

1 14.74889 3.716525

license

as.factor(data_train$CarUsage) 0 1

0 0.8571429 0.1428571

1 0.2444444 0.7555556

> y_pred.NB=predict(NB,newdata=data_test[-9])

> y_pred.NB

[1] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0

[41] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0

[81] 1 1 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1

Levels: 0 1

> tab.NB=table(data_test[,9],y_pred.NB)

> tab.NB

y_pred.NB

0 1

0 79 3
1 2 14

> accuracy.NB=sum(diag(tab.NB))/sum(tab.NB)

> accuracy.NB

[1] 0.9489796

> loss.NB=tab.NB[2,1]/(tab.NB[2,1]+tab.NB[1,1])

> loss.NB

[1] 0.02469136
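As defined, loss.NB is FN / (FN + TN): the share of predicted non-car-users who actually use a car (the false-omission rate). The more usual miss rate conditions on the actual positives instead:

# Sketch: false-negative rate from the same table (rows = actual, cols = predicted).
fnr.NB = tab.NB[2, 1] / sum(tab.NB[2, ])   # 2 / (2 + 14) = 0.125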

> gini_train = ineq(data_train$CarUsage, type="Gini")

> gini_train

[1] 0.1001212

> gini_test = ineq(data_test$CarUsage, type="Gini")

> gini_test

[1] 0.1174364

> fit.results_train=predict(NB,newdata = data_train)

> auc(data_train$CarUsage,fit.results_train)

[1] 0.8722776

> fit.results_test=predict(NB,newdata = data_test)

> auc(data_test$CarUsage,fit.results_test)

[1] 0.9192073

> library(klaR)

> data_train$CarUsage=as.factor(data_train$CarUsage)

> set.seed(234)

> ctrl=trainControl(method="repeatedcv",number = 10, repeats = 3, classProbs=TRUE)

> nbfit=train(make.names(CarUsage) ~ ., data = data_train, method = "nb",

+ trControl = ctrl, preProcess = c("center","scale"), tuneLength = 20)

> nbfit

Naive Bayes

346 samples
8 predictor

2 classes: 'X0', 'X1'

Pre-processing: centered (9), scaled (9)

Resampling: Cross-Validated (10 fold, repeated 3 times)

Summary of sample sizes: 312, 312, 311, 312, 310, 311, ...

Resampling results across tuning parameters:

usekernel Accuracy Kappa

FALSE NaN NaN

TRUE 0.9529272 0.7743552

Tuning parameter 'fL' was held constant at a value of 0

Tuning parameter

'adjust' was held constant at a value of 1

Accuracy was used to select the optimal model using the largest value.

The final values used for the model were fL = 0, usekernel = TRUE and adjust = 1.

> plot(nbfit)

> varImp(nbfit)

ROC curve variable importance

Importance

Age 100.000

Salary 97.009

Work.Exp 95.119

license 63.857

Distance 60.301

Engineer 14.504

Gender 5.284
MBA 0.000

> plot(varImp(nbfit))

> normalize=function(x){

+ return((x-min(x))/(max(x)-min(x)))}


> mydata$Age_norm=normalize(mydata$Age)

> mydata$Gender_norm=normalize(mydata$Gender)

> mydata$WorkExp_norm=normalize(mydata$Work.Exp)

> mydata$Salary_norm=normalize(mydata$Salary)

> mydata$Distance_norm=normalize(mydata$Distance)

> str(mydata)

'data.frame': 444 obs. of 14 variables:

$ Age : int 28 23 29 28 27 26 28 26 22 27 ...

$ Gender : num 0 1 0 1 0 0 0 1 0 0 ...

$ Engineer : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...

$ MBA : Factor w/ 3 levels "0","0.304048449365552",..: 1 1 1 3 1 1 1 1 1 1 ...

$ Work.Exp : int 4 4 7 5 4 4 5 3 1 4 ...

$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...

$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...

$ license : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...

$ CarUsage : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

$ Age_norm : num 0.4 0.2 0.44 0.4 0.36 0.32 0.4 0.32 0.16 0.36 ...

$ Gender_norm : num 0 1 0 1 0 0 0 1 0 0 ...

$ WorkExp_norm : num 0.167 0.167 0.292 0.208 0.167 ...

$ Salary_norm : num 0.1545 0.0356 0.1366 0.1366 0.1366 ...

$ Distance_norm: num 0 0.00495 0.04455 0.06436 0.06931 ...

> dim(mydata)
[1] 444 12

> set.seed(1234)

> sample = sample.split(mydata$CarUsage, SplitRatio = 0.8)

> data_train_knn = subset(mydata, sample==TRUE)

> data_test_knn = subset(mydata, sample==FALSE)

> train.knn=data_train_knn[,c(9:12)]

> test.knn=data_test_knn[,c(9:12)]

> str(train.knn)

'data.frame': 355 obs. of 4 variables:

$ CarUsage : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

$ Age_norm : num 0.4 0.2 0.44 0.4 0.32 0.4 0.32 0.16 0.36 0.28 ...

$ Gender_norm : num 0 1 0 1 0 0 1 0 0 1 ...

$ WorkExp_norm: num 0.167 0.167 0.292 0.208 0.167 ...

> library(class)

> trControl=trainControl(method = "cv", number = 10)

> fit.knn.cv=train(CarUsage ~ .,

+ method = "knn",

+ tuneGrid = expand.grid(k = 2:20),

+ trControl = trControl,

+ preProcess = c("center","scale"),

+ data = data_train_knn)

> fit.knn.cv

k-Nearest Neighbors

355 samples

11 predictor

2 classes: '0', '1'


Pre-processing: centered (12), scaled (12)

Resampling: Cross-Validated (10 fold)

Summary of sample sizes: 319, 321, 319, 319, 319, 320, ...

Resampling results across tuning parameters:

k Accuracy Kappa

2 0.9492810 0.7814388

3 0.9464192 0.7553723

4 0.9435621 0.7496561

5 0.9520542 0.7886301

6 0.9462558 0.7569246

7 0.9518114 0.7852601

8 0.9519748 0.7827611

9 0.9548319 0.7902236

10 0.9520542 0.7777120

11 0.9520542 0.7777120

12 0.9464986 0.7451239

13 0.9464192 0.7365784

14 0.9464192 0.7365784

15 0.9464192 0.7365784

16 0.9492764 0.7552451

17 0.9436415 0.7179337

18 0.9436415 0.7179337

19 0.9436415 0.7179337

20 0.9436415 0.7179337

Accuracy was used to select the optimal model using the largest value.

The final value used for the model was k = 9.
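The tuned k can also be applied directly with class::knn on the normalized columns (a sketch using the train.knn/test.knn frames built above; k = 9 follows the CV pick):

# Sketch: score the hold-out set with the CV-selected k.
library(class)
knn_pred = knn(train = train.knn[, -1], test = test.knn[, -1],
               cl = train.knn$CarUsage, k = 9)
table(test.knn$CarUsage, knn_pred)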

> ctrl=trainControl(method="repeatedcv",number = 10, repeats = 3, classProbs=TRUE)


> knnFit.rcv=train(CarUsage ~ ., data = data_train_knn, method = "knn",

+ trControl = trControl, preProcess = c("center","scale"), tuneLength = 20)

> knnFit.rcv

k-Nearest Neighbors

355 samples

11 predictor

2 classes: '0', '1'

Pre-processing: centered (12), scaled (12)

Resampling: Cross-Validated (10 fold)

Summary of sample sizes: 319, 319, 320, 319, 319, 320, ...

Resampling results across tuning parameters:

k Accuracy Kappa

5 0.9519048 0.7942147

7 0.9519841 0.7891656

9 0.9548413 0.7937430

11 0.9576984 0.8039249

13 0.9464286 0.7314630

15 0.9407937 0.7024856

17 0.9407937 0.7084543

19 0.9435714 0.7177166

21 0.9435714 0.7177166

23 0.9435714 0.7177166

25 0.9464286 0.7254943

27 0.9464286 0.7254943

29 0.9464286 0.7229489

31 0.9464286 0.7223845
33 0.9436508 0.7071536

35 0.9436508 0.7071536

37 0.9436508 0.7071536

39 0.9436508 0.7071536

41 0.9436508 0.7071536

43 0.9436508 0.7071536

Accuracy was used to select the optimal model using the largest value.

The final value used for the model was k = 11.

> gini_train = ineq(train.knn$CarUsage, type="Gini")

> gini_train

[1] 0.1045461

> gini_test = ineq(test.knn$CarUsage, type="Gini")

> gini_test

[1] 0.1027923

> library(gbm)

> library(xgboost)

> library(ipred)

> set.seed(1234)

> Cars.bagging=bagging(CarUsage~., data = data_train,

+ control=rpart.control(maxdepth = 5, minsplit = 15))

> data_test$pred.bag = predict(Cars.bagging, data_test)

> data_test$pred.bag=ifelse(data_test$pred.bag < 0.5,0,1)

> BagTable = table(data_test$CarUsage,data_test$pred.bag)

> data_test$pred.bag=as.numeric(data_test$pred.bag)

> data_test=data_test[-10]

> str(data_test)

'data.frame': 98 obs. of 9 variables:


$ Age : int 23 22 25 27 32 26 24 25 27 25 ...

$ Gender : num 1 0 1 0 0 1 0 0 0 0 ...

$ Engineer: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 1 ...

$ MBA : Factor w/ 3 levels "0","0.304048449365552",..: 1 1 1 1 1 1 1 1 1 1 ...

$ Work.Exp: int 4 1 4 4 9 4 6 1 7 2 ...

$ Salary : num 8.3 7.5 11.5 13.5 15.5 12.3 10.6 7.6 16.6 8.6 ...

$ Distance: num 3.3 5.1 5.2 5.3 5.5 5.9 6.1 6.3 6.4 6.7 ...

$ license : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 2 ...

$ CarUsage: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

> accuracy_bg = sum(diag(BagTable))/sum(BagTable)

> accuracy_bg

97.5%
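ipred::bagging can also report an out-of-bag error estimate directly, which avoids leaning on a single hold-out split (sketch; coob is an ipred::bagging option):

# Sketch: request the out-of-bag misclassification estimate.
Cars.bagging.oob = bagging(CarUsage ~ ., data = data_train, coob = TRUE,
                           control = rpart.control(maxdepth = 5, minsplit = 15))
Cars.bagging.oob$err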

> gbm.fit <- gbm(formula = CarUsage ~ ., distribution = "bernoulli",

+ data = data_train, n.trees = 10000, interaction.depth = 1,

+ shrinkage = 0.001, cv.folds = 5, n.cores = NULL, verbose = FALSE)

> data_test$pred.car <- predict(gbm.fit, data_test, type="response")

> bstable = table(data_test$CarUsage,data_test$pred.car>0.5)

> bstable

> accuracy_bst = sum(diag(bstable))/sum(bstable)

> accuracy_bst

97%
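predict.gbm is normally given an explicit number of trees; with cv.folds set, the CV-optimal iteration can be recovered first (a sketch; note, too, that gbm's bernoulli family expects a numeric 0/1 response, so the factor CarUsage may need as.numeric(as.character(...)) first, depending on the gbm version in use):

# Sketch: use the cross-validated optimal iteration count when scoring.
best.iter = gbm.perf(gbm.fit, method = "cv")
pred.gbm  = predict(gbm.fit, data_test, n.trees = best.iter, type = "response")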

> library(DMwR)

> table(mydata$CarUsage)

> mydata[,'train']=ifelse(runif(nrow(mydata))<0.80,1,0)

> str(mydata)

> smote.train=mydata[mydata$train==1,]

> smote.test=mydata[mydata$train==0,]

> smote.train=smote.train[-10]

> smote.test=smote.test[-10]

> smote.train$CarUsage=as.factor(smote.train$CarUsage)

> data_smote <- SMOTE(CarUsage ~., smote.train, k=5, perc.over=4000, perc.under=200)

> prop.table(table(data_smote$CarUsage))

> smote_features_train <- as.matrix(data_smote[,1:8])

> smote_label_train <- as.matrix(data_smote$CarUsage)
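as.matrix() on a factor yields a character matrix, while xgboost expects a numeric 0/1 label vector, so an explicit conversion is safer here (sketch):

# Sketch: convert the factor label back to numeric 0/1 for xgboost.
smote_label_train = as.numeric(as.character(data_smote$CarUsage))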

> smote.xgb.fit <- xgboost(

+ data = smote_features_train,

+ label = smote_label_train,

+ eta = 0.7,

+ max_depth = 5,

+ nrounds = 7,

+ nfold = 5,

+ objective = "binary:logistic", # binary classification

+ verbose = 0, # silent

+ early_stopping_rounds = 10 # stop if no improvement for 10 consecutive rounds

+ )

> smote_features_test <- as.matrix(smote.test[,1:8])

> smote.test$smote.pred.class <- predict(smote.xgb.fit, smote_features_test)

> smotexbg = table(smote.test$CarUsage,smote.test$smote.pred.class>=0.5)

> smotexbg

> accuracy_smotexgb = sum(diag(smotexbg))/sum(smotexbg)

> accuracy_smotexgb

96.7%
