
Final Project

PART A (30 points)


We will develop k-NN with k = 3, Naive Bayes (after binning the continuous
predictors), and a classification tree, then combine them in an ensemble.

a. Fit models to the data for (1) k-nearest neighbors with k = 3, (2) Naive
Bayes and (3) classification trees. Use Personal Loan as the outcome
variable. Report the validation confusion matrix for each of the three
models. (10 points)

bank.df <- read.csv("UniversalBank.csv")

# drop ID and ZIP Code, then move Personal.Loan to the last column
bank.df <- bank.df[ , -c(1, 5)]
bank.df <- bank.df[ , c(1:7, 9:12, 8)]

# 60/40 training/validation split
set.seed(12345)
train.index <- sample(row.names(bank.df), 0.6 * dim(bank.df)[1])
valid.index <- setdiff(row.names(bank.df), train.index)
train.df <- bank.df[train.index, ]
valid.df <- bank.df[valid.index, ]
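A quick sanity check on the 60/40 split (assuming the standard UniversalBank file with 5,000 records):

dim(train.df)   # expected: 3000 rows, 12 columns
dim(valid.df)   # expected: 2000 rows, 12 columns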

## K-NN with k = 3

# one-time installs; comment out after the first run
# install.packages("FNN")
# install.packages("caret")

library(FNN)
library(caret)
knn.pred.valid <- knn(train.df[, 1:11], valid.df[, 1:11],
                      cl = train.df[, 12], k = 3)
confusionMatrix(knn.pred.valid, as.factor(valid.df[, 12]))
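Note that FNN::knn measures Euclidean distance on the raw predictor scales, so wide-ranging variables such as Income can dominate the distance. A minimal sketch of normalizing the predictors first with caret::preProcess (norm.values, train.norm.df and valid.norm.df are illustrative names and are not used in the code below):

# estimate centering/scaling parameters on the training predictors only,
# then apply the same transformation to both partitions
norm.values <- preProcess(train.df[, 1:11], method = c("center", "scale"))
train.norm.df <- predict(norm.values, train.df[, 1:11])
valid.norm.df <- predict(norm.values, valid.df[, 1:11])
knn.pred.norm <- knn(train.norm.df, valid.norm.df, cl = train.df[, 12], k = 3)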
## Naive Bayes

library(e1071)
nb <- naiveBayes(Personal.Loan ~ ., data = train.df)
nb

# training
pred.class <- predict(nb, newdata = train.df)
confusionMatrix(pred.class, as.factor(train.df$Personal.Loan))
# validation
pred.class2 <- predict(nb, newdata = valid.df)
confusionMatrix(pred.class2, as.factor(valid.df$Personal.Loan))
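The brief calls for binning the continuous predictors before Naive Bayes, while the model above was fit on the raw values. A minimal sketch of equal-width binning for one predictor (train.bin.df, valid.bin.df and nb.bin are illustrative names; validation values outside the training range would become NA):

# bin Income into 5 equal-width intervals, with breaks taken from the training data
income.breaks <- seq(min(train.df$Income), max(train.df$Income), length.out = 6)
train.bin.df <- train.df
valid.bin.df <- valid.df
train.bin.df$Income <- cut(train.df$Income, breaks = income.breaks, include.lowest = TRUE)
valid.bin.df$Income <- cut(valid.df$Income, breaks = income.breaks, include.lowest = TRUE)
# repeat the same cut() for Age, Experience, CCAvg and Mortgage, then refit
nb.bin <- naiveBayes(Personal.Loan ~ ., data = train.bin.df)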

## Classification tree

library(rpart)
library(rpart.plot)
default.ct <- rpart(Personal.Loan ~ ., data = train.df, method = "class")
# plot tree
prp(default.ct, type = 1, extra = 1, under = TRUE, split.font = 1, varlen = -10)

default.ct.point.pred.train <- predict(default.ct, train.df, type = "class")
default.ct.point.pred.valid <- predict(default.ct, valid.df, type = "class")
# generate confusion matrices for the training and validation data
confusionMatrix(default.ct.point.pred.train, as.factor(train.df$Personal.Loan))
confusionMatrix(default.ct.point.pred.valid, as.factor(valid.df$Personal.Loan))

In this case, the classification tree shows the best validation accuracy, with a value of 0.9835.

b. Create a data frame with the actual outcome, predicted outcome, and
probability of being a "1" for each of the three models. Report the first 10
rows of this data frame. (5 points)

## Create a data frame

## predict probabilities
knn.pred.prob <- knn(train.df[, 1:11], valid.df[, 1:11], cl = train.df[, 12],
                     k = 3, prob = TRUE)
# attr(, "prob") holds the proportion of votes for the *winning* class,
# so convert it into the probability of being a "1"
knn.prob <- attr(knn.pred.prob, "prob")
knn.prob.1 <- ifelse(knn.pred.prob == "1", knn.prob, 1 - knn.prob)
nbpred.prob <- predict(nb, newdata = valid.df, type = "raw")
ctpred.prob <- predict(default.ct, newdata = valid.df, type = "prob")

## predict class membership
nbpred.class <- predict(nb, newdata = valid.df)
ctpred.class <- predict(default.ct, newdata = valid.df, type = "class")

df2 <- data.frame(actual = valid.df$Personal.Loan,
                  naivebayes = nbpred.class, naivebayes_prob = nbpred.prob,
                  ctclass = ctpred.class, ct_prob = ctpred.prob,
                  knn_class = knn.pred.valid, knn_prob = knn.prob.1)
head(df2, 10)

c. Add two columns to this data frame for (1) a majority vote of predicted
outcomes, and (2) the average of the predicted probabilities. Using the
classifications generated by these two methods, derive a confusion matrix
for each method and report the overall accuracy. (10 points)

df3 <- data.frame(df2, majority_class = 0, ave_prob = 0)
head(df3, 10)

# average of the three predicted probabilities of being a "1"
df3$ave_prob <- (df3$naivebayes_prob.1 + df3$ct_prob.1 + df3$knn_prob) / 3
# majority vote: classify as "1" when at least two of the three models vote 1
votes <- (as.numeric(df3$naivebayes) - 1) + (as.numeric(df3$ctclass) - 1) +
  (as.numeric(df3$knn_class) - 1)
df3$majority_class <- ifelse(votes > 1, 1, 0)

head(df3, 10)
confusionMatrix(as.factor(df3$majority_class), as.factor(valid.df$Personal.Loan))

confusionMatrix(as.factor(ifelse(df3$ave_prob > 0.5, 1, 0)), as.factor(valid.df$Personal.Loan))

d. Compare the error rates for the three individual methods and the two
ensemble methods. (5 points)

## compare error rates (misclassified counts out of the 2,000 validation records)

erknn  <- (71 + 108) / 2000
ernb   <- (157 + 74) / 2000
erct   <- (23 + 10) / 2000
ervote <- (3 + 177) / 2000
erprob <- (20 + 173) / 2000

erknn
ernb
erct
ervote
erprob

The single classification tree has the lowest error rate (0.0165), followed by k-NN (0.0895), the majority vote (0.0900), the averaged probability (0.0965), and Naive Bayes (0.1155); in this case neither ensemble improves on the best individual model.
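Instead of reading the off-diagonal counts off the printed confusion matrices by hand, each error rate can also be taken from a saved confusionMatrix object as 1 minus the overall accuracy (a minimal sketch; cm.knn is an illustrative name):

# error rate = 1 - overall accuracy, extracted from the confusionMatrix object
cm.knn <- confusionMatrix(knn.pred.valid, as.factor(valid.df[, 12]))
1 - cm.knn$overall[["Accuracy"]]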

PART B (5 points)
Use Bagging and Boosted Trees and compare their performance with all the
methodologies we used in PART A.

# bagging
library(adabag)
bank.df$Personal.Loan <- as.factor(bank.df$Personal.Loan)
train.df$Personal.Loan <- as.factor(train.df$Personal.Loan)

bag <- bagging(Personal.Loan ~ ., data = train.df)
pred <- predict(bag, valid.df, type = "class")
confusionMatrix(as.factor(pred$class), as.factor(valid.df$Personal.Loan))

# boosting
boost <- boosting(Personal.Loan ~ ., data = train.df)
pred1 <- predict(boost, valid.df, type = "class")
confusionMatrix(as.factor(pred1$class), as.factor(valid.df$Personal.Loan))
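To compare Part B with Part A directly, the validation accuracies of all five models can be collected in one table (a minimal sketch that reuses the prediction objects created above):

# overall validation accuracy for each of the five models
acc <- c(
  knn   = confusionMatrix(knn.pred.valid, as.factor(valid.df[, 12]))$overall[["Accuracy"]],
  nb    = confusionMatrix(pred.class2, as.factor(valid.df$Personal.Loan))$overall[["Accuracy"]],
  tree  = confusionMatrix(default.ct.point.pred.valid, as.factor(valid.df$Personal.Loan))$overall[["Accuracy"]],
  bag   = confusionMatrix(as.factor(pred$class), as.factor(valid.df$Personal.Loan))$overall[["Accuracy"]],
  boost = confusionMatrix(as.factor(pred1$class), as.factor(valid.df$Personal.Loan))$overall[["Accuracy"]]
)
round(acc, 4)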

In this case, both bagging and boosting achieve higher validation accuracy than the individual models from Part A, with boosting reaching the highest accuracy.

We are interested in predicting the loan acceptance behavior of three new
customers with the following profiles. Use each of the developed methods to
predict these three new customers. (5 points)

## Evaluation of new records

new.df <- data.frame(Age = c(40, 25, 59), Experience = c(10, 6, 30),
                     Income = c(84, 50, 120), Family = c(2, 1, 3),
                     CCAvg = c(2, 1.8, 1.9), Education = c(2, 1, 3),
                     Mortgage = c(0, 1, 0), Securities.Account = c(0, 0, 0),
                     CD.Account = c(0, 0, 1), Online = c(1, 1, 1),
                     CreditCard = c(1, 1, 0))

knn.pred.new <- knn(train.df[, 1:11], new.df[, 1:11],
                    cl = train.df[, 12], k = 3)
knn.pred.new

For k-NN, the model predicts that the first and second new customers will not accept the personal loan, while the third will accept.
pred.class <- predict(nb, newdata = new.df)
pred.class
ctpred.class <- predict(default.ct, newdata = new.df, type = "class")
ctpred.class

Naive Bayes and the classification tree give the same classifications.

pred <- predict(bag, new.df, type = "class")
pred$class
pred1 <- predict(boost, new.df, type = "class")
pred1$class
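To view all five sets of predictions side by side, they can be collected in one data frame (a minimal sketch, run after the predictions above):

# predictions of each model for the three new customers
data.frame(customer = 1:3, knn = knn.pred.new, naive.bayes = pred.class,
           tree = ctpred.class, bagging = pred$class, boosting = pred1$class)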

All of the models lead to the same conclusion: customers 1 and 2 will not accept the personal loan, while customer 3 will accept the bank's offer.
