R console transcript: predicting employee car usage / transport mode
Models covered: logistic regression, Naive Bayes, KNN, bagging, boosting, and SMOTE with XGBoost
> library(car)
> library(caret)
> library(class)
> library(devtools)
> library(e1071)
> library(ggplot2)
> library(Hmisc)
> library(klaR)
> library(MASS)
> library(nnet)
> library(plyr)
> library(pROC)
> library(psych)
> library(scatterplot3d)
> library(dplyr)
> library(ElemStatLearn)
> library(rpart)
> library(rpart.plot)
> library(randomForest)
> library(neuralnet)
> attach(mydata)
Work.Exp
Work.Exp
> head(mydata)
> dim(mydata)
[1] 444 9
> names(mydata)
> str(mydata)
$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
> summary(mydata)
Age Gender Engineer MBA Work.Exp
1st Qu.:25.00 Male :316 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.: 3.0
NA's :1
> colSums(is.na(mydata))
      Age    Gender  Engineer       MBA  Work.Exp    Salary  Distance   license
        0         0         0         1         0         0         0         0
Transport
        0
> mydata[!complete.cases(mydata), ]
> library(DMwR)
> colSums(is.na(mydata))
0 0 0 0 0 0 0 0
Transport
> table(mydata$CarUsage)
0 1
383 61
> table(mydata$Gender)
0 1
316 128
> mydata_out=mydata[,c(1,5,6,7,10)]
> mydata_out=as.data.frame(mydata_out)
> for (i in 1:ncol(mydata_out))
+ {
+   outval = boxplot.stats(mydata_out[,i])$out
+   cat("Outliers in", names(mydata_out)[i], "\n=", outval, "\n")
+ }
Outliers in Age
= 39 39 39 38 40 38 38 38 38 40 40 39 40 38 39 38 40 39 38 42 40 43 40 38 39
Outliers in Work.Exp
= 19 16 21 17 16 18 19 18 21 16 19 19 18 19 20 22 16 20 18 21 20 20 16 17 21 18 20 21 19 22 22 19 24
20 19 19 19 21
Outliers in Salary
= 36.6 38.9 25.9 34.8 28.8 39.9 39 28.7 36.9 28.7 34.9 47 28.8 36.9 54 29.9 34.9 36 44 37 24.9 43 37 54
44 34 48 42 51 45 34 28.8 45 42.9 41 40.9 30.9 41.9 43 33 36 33 38 46 45 48 35 51 51 55 45 42 52 38 57
44 45 47 50
Outliers in Distance
Outliers in CarUsage
= 1 1 1 1 1 ... 1   (61 ones — every "car" observation of the 0/1 CarUsage indicator is flagged as an outlier, matching the 383/61 class split above)
par(mfrow = c(1,2))
> length(which(Transport=="Car"))*100/nrow(mydata)
[1] 13.73874
> table(Transport)
Transport
        2Wheeler              Car Public Transport
              83               61              300
> table(Gender)
Gender
Female Male
128 316
> prop.table(table(Gender))
Gender
Female Male
0.2882883 0.7117117
> table(Engineer)
Engineer
0 1
109 335
> prop.table(table(Engineer))
Engineer
0 1
0.2454955 0.7545045
> table(MBA)
MBA
0 1
331 112
> prop.table(table(MBA))
MBA
0 1
0.7471783 0.2528217
> table(license)
license
0 1
340 104
> prop.table(table(license))
license
0 1
0.7657658 0.2342342
> table(Gender,Transport)
Transport
Female 38 13 77
Male 45 48 223
> prop.table(table(Gender,Transport))
Transport
+ facet_wrap(~Transport ) + geom_bar()+
+ facet_wrap(~CarUsage ) + geom_bar()+
+ labs(x="Work Exp", y= "No. of People", title = "Work Exp vs. Car Usage")
+ facet_wrap(~Transport) +
+ facet_wrap(~CarUsage) +
>
> table(license,Transport)
Transport
0 60 13 267
1 23 48 33
> prop.table(table(license,Transport))
Transport
> table(license,mydata$CarUsage)
license 0 1
0 327 13
1 56 48
> prop.table(table(license,mydata$CarUsage))
license 0 1
0 0.73648649 0.02927928
1 0.12612613 0.10810811
Warning message:
+ facet_wrap(~Transport) +
+ facet_wrap(~CarUsage) +
> table(MBA,Transport)
Transport
0 66 49 216
1 17 12 83
> prop.table(table(MBA,Transport))
Transport
> table(MBA,mydata$CarUsage)
MBA 0 1
0 282 49
1 100 12
> prop.table(table(MBA,mydata$CarUsage))
MBA 0 1
0 0.63656885 0.11060948
1 0.22573363 0.02708804
+ facet_wrap(~Transport) +
+ facet_wrap(~CarUsage) +
> table(Engineer,Transport)
Transport
0 23 9 77
1 60 52 223
> prop.table(table(Engineer,Transport))
Transport
> table(Engineer,mydata$CarUsage)
Engineer 0 1
0 100 9
1 283 52
> prop.table(table(Engineer,mydata$CarUsage))
Engineer 0 1
0 0.22522523 0.02027027
1 0.63738739 0.11711712
dev.off()
null device
> str(mydata)
$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
> mydata$MBA=as.numeric(mydata$MBA)
> library(corrplot)
> dataScatter=subset(mydata[,c(1:8,10)])
> cor(dataScatter)
> dev.off()
null device
> mydata$CarUsage=as.numeric(mydata$CarUsage)
> mydata$Gender=as.numeric(mydata$Gender)
> mydata$Engineer=as.numeric(mydata$Engineer)
> mydata$MBA=as.numeric(mydata$MBA)
> mydata$license=as.numeric(mydata$license)
> for (i in (1:8))
+ {
+ h = ifelse(max(mydata[,i])==1,1,round(max(mydata[,i]),0)+1)
+ l = ifelse(min(mydata[,i])==0,0,round(min(mydata[,i]),0)-1)
+ hist(mydata[,i],breaks=seq(l,h,((h-l)/6)),labels=TRUE,
+ include.lowest=TRUE,right=TRUE,
+ col="pink",border=1,
+ ylim = c(0,400))
+ }
mydata$CarUsage=as.factor(mydata$CarUsage)
> mydata$Engineer=as.factor(mydata$Engineer)
> mydata$MBA=as.factor(mydata$MBA)
> mydata$license=as.factor(mydata$license)
> str(mydata)
$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
> library(caTools)
> mydata[,'train']=ifelse(runif(nrow(mydata))<0.75,1,0)
> str(mydata)
$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
> data_train=mydata[mydata$train==1,]
> data_test=mydata[mydata$train==0,]
> mydata=mydata[-c(9,11)]
> data_train=data_train[-c(9,11)]
> data_test=data_test[-c(9,11)]
> dim(data_train)
[1] 346 9
> dim(data_test)
[1] 98 9
> prop.table(table(mydata$CarUsage))
0 1
0.8626126 0.1373874
> prop.table(table(data_train$CarUsage))
0 1
0.8699422 0.1300578
> prop.table(table(data_test$CarUsage))
0 1
0.8367347 0.1632653
> summary(logit_model1)
Call:
data = data_train)
Deviance Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
AIC: 63.946
library(car)
> vif(logit_model1)
Response: CarUsage
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> set.seed(1234)
> dim(data_train_logit)
[1] 333 7
> dim(data_test_logit)
[1] 111 7
> table(data_train_logit$CarUsage)
0 1
287 46
> table(data_test_logit$CarUsage)
0 1
96 15
> prop.table(table(data_train_logit$CarUsage))
0 1
0.8618619 0.1381381
> prop.table(table(data_test_logit$CarUsage))
0 1
0.8648649 0.1351351
> summary(logit_model2)
Call:
Deviance Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
AIC: 69.665
> vif(logit_model2)
GVIF Df GVIF^(1/(2*Df))
Response: CarUsage
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> summary(logit_model3)
Call:
Deviance Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
AIC: 67.391
> vif(logit_model3)
> pred_logit
5 14 16 26 28 29
36 39 40 50 58 60
61 72 74 81 86 90
> library(caret)
> library(caret)
library(ROCR)
[1] 0.9861111
> plot(perf)
> KS_train_logit
[1] 1
> library(InformationValue)
ks_plot(actuals=data_test_logit_final$CarUsage, predictedScores=data_test_logit_final$pred)
> auc(data_train_logit_final$CarUsage,fit.results_train)
[1] 0.9885623
> auc(data_test_logit_final$CarUsage,fit.results_test)
[1] 0.9861111
> library(ineq)
ks_plot(actuals=data_test_logit_final$CarUsage, predictedScores=data_test_logit_final$pred)
> auc(data_train_logit_final$CarUsage,fit.results_train)
[1] 0.9885623
> auc(data_test_logit_final$CarUsage,fit.results_test)
[1] 0.9861111
> library(ineq)
> gini_train
[1] 0.1046059
> gini_test
[1] 0.1029601
> library(caret)
> set.seed(400)
> glmfit
333 samples
4 predictor
Summary of sample sizes: 300, 300, 299, 299, 301, 299, ...
Resampling results:
Accuracy Kappa
0.9548258 0.7992131
library(ModelMetrics)
> library(e1071)
> NB
Call:
as.factor(data_train$CarUsage)
0 1
0.8699422 0.1300578
Conditional probabilities:
Age
0 26.51827 2.941173
1 35.40000 3.466725
Gender
0 0.2990033 0.4585837
1 0.2444444 0.4346135
Engineer
as.factor(data_train$CarUsage) 0 1
0 0.2757475 0.7242525
1 0.1333333 0.8666667
MBA
as.factor(data_train$CarUsage) 0 0.304048449365552 1
Work.Exp
1 15.22222 5.026696
Salary
0 13.11395 5.016201
1 34.66444 12.660392
Distance
0 10.80166 3.192298
1 14.74889 3.716525
license
as.factor(data_train$CarUsage) 0 1
0 0.8571429 0.1428571
1 0.2444444 0.7555556
y_pred.NB=predict(NB,newdata=data_test[-9])
> y_pred.NB
[1] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
[41] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0
[81] 1 1 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1
Levels: 0 1
> tab.NB=table(data_test[,9],y_pred.NB)
> tab.NB
y_pred.NB
0 1
0 79 3
1 2 14
> accuracy.NB=sum(diag(tab.NB))/sum(tab.NB)
> accuracy.NB
[1] 0.9489796
> loss.NB=tab.NB[2,1]/(tab.NB[2,1]+tab.NB[1,1])
> loss.NB
[1] 0.02469136
> gini_train
[1] 0.1001212
> gini_test
[1] 0.1174364
> auc(data_train$CarUsage,fit.results_train)
[1] 0.8722776
> auc(data_test$CarUsage,fit.results_test)
[1] 0.9192073
>
library(klaR)
> data_train$CarUsage=as.factor(data_train$CarUsage)
> set.seed(234)
Naive Bayes
346 samples
8 predictor
Summary of sample sizes: 312, 312, 311, 312, 310, 311, ...
Tuning parameter
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were fL = 0, usekernel = TRUE and adjust = 1.
plot(nbfit)
> varImp(nbfit)
Importance
Age 100.000
Salary 97.009
Work.Exp 95.119
license 63.857
Distance 60.301
Engineer 14.504
Gender 5.284
MBA 0.000
> plot(varImp(nbfit))
> normalize=function(x){
+ return((x-min(x))/(max(x)-min(x)))}
plot(varImp(nbfit))
> normalize=function(x){
+ return((x-min(x))/(max(x)-min(x)))}
> mydata$Age_norm=normalize(mydata$Age)
> mydata$Gender_norm=normalize(mydata$Gender)
> mydata$Engineer_norm=normalize(mydata$Engineer)
mydata$WorkExp_norm=normalize(mydata$Work.Exp)
> mydata$Engineer_norm=normalize(mydata$Engineer)
str(mydata)
$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
$ Age_norm : num 0.4 0.2 0.44 0.4 0.36 0.32 0.4 0.32 0.16 0.36 ...
dim(mydata)
[1] 444 12
> set.seed(1234)
train.knn=data_train_knn[,c(9:12)]
> test.knn=data_test_knn[,c(9:12)]
> str(train.knn)
$ Age_norm : num 0.4 0.2 0.44 0.4 0.32 0.4 0.32 0.16 0.36 0.28 ...
> library(class)
> fit.knn.cv=train(CarUsage ~ .,
+ method = "knn",
+ trControl = trControl,
+ preProcess = c("center","scale"),
+ data = data_train_knn)
fit.knn.cv
k-Nearest Neighbors
355 samples
11 predictor
Summary of sample sizes: 319, 321, 319, 319, 319, 320, ...
k Accuracy Kappa
2 0.9492810 0.7814388
3 0.9464192 0.7553723
4 0.9435621 0.7496561
5 0.9520542 0.7886301
6 0.9462558 0.7569246
7 0.9518114 0.7852601
8 0.9519748 0.7827611
9 0.9548319 0.7902236
10 0.9520542 0.7777120
11 0.9520542 0.7777120
12 0.9464986 0.7451239
13 0.9464192 0.7365784
14 0.9464192 0.7365784
15 0.9464192 0.7365784
16 0.9492764 0.7552451
17 0.9436415 0.7179337
18 0.9436415 0.7179337
19 0.9436415 0.7179337
20 0.9436415 0.7179337
Accuracy was used to select the optimal model using the largest value.
knnFit.rcv
k-Nearest Neighbors
355 samples
11 predictor
Summary of sample sizes: 319, 319, 320, 319, 319, 320, ...
k Accuracy Kappa
5 0.9519048 0.7942147
7 0.9519841 0.7891656
9 0.9548413 0.7937430
11 0.9576984 0.8039249
13 0.9464286 0.7314630
15 0.9407937 0.7024856
17 0.9407937 0.7084543
19 0.9435714 0.7177166
21 0.9435714 0.7177166
23 0.9435714 0.7177166
25 0.9464286 0.7254943
27 0.9464286 0.7254943
29 0.9464286 0.7229489
31 0.9464286 0.7223845
33 0.9436508 0.7071536
35 0.9436508 0.7071536
37 0.9436508 0.7071536
39 0.9436508 0.7071536
41 0.9436508 0.7071536
43 0.9436508 0.7071536
Accuracy was used to select the optimal model using the largest value.
> gini_train
[1] 0.1045461
> gini_test
[1] 0.1027923
library(gbm)
> library(xgboost)
> library(ipred)
> set.seed(1234)
BagTable = table(data_test$CarUsage,data_test$pred.bag)
data_test$pred.bag=as.numeric(data_test$pred.bag)
> data_test=data_test[-10]
> str(data_test)
$ Salary : num 8.3 7.5 11.5 13.5 15.5 12.3 10.6 7.6 16.6 8.6 ...
$ Distance: num 3.3 5.1 5.2 5.3 5.5 5.9 6.1 6.3 6.4 6.7 ...
accuracy_bg = sum(diag(BagTable))/sum(BagTable)
accuracy_bg
97.5%
bstable = table(data_test$CarUsage,data_test$pred.car>0.5)
bstable
accuracy_bst = sum(diag(bstable))/sum(bstable)
accuracy_bst
97%
library(DMwR)
table(mydata$CarUsage)
mydata[,'train']=ifelse(runif(nrow(mydata))<0.80,1,0)
str(mydata)
smote.train=mydata[mydata$train==1,]
smote.test=mydata[mydata$train==0,]
smote.train=smote.train[-10]
smote.test=smote.test[-10]
smote.train$CarUsage=as.factor(smote.train$CarUsage)
prop.table(table(data_smote$CarUsage))
smote_features_train<-as.matrix(data_smote[,1:8])
smote_label_train<-as.matrix(data_smote$CarUsage)
data = smote_features_train,
label = smote_label_train,
eta = 0.7,
max_depth = 5,
nrounds = 7,
nfold = 5,
verbose = 0, # silent,
smote_features_test<-as.matrix(smote.test[,1:8])
smotexbg = table(smote.test$CarUsage,smote.test$smote.pred.class>=0.5)
smotexbg
accuracy_smotexgb = sum(diag(smotexbg))/sum(smotexbg)
accuracy_smotexgb
96.7%