Professional Documents
Culture Documents
setwd("F:/project")
> getwd()
[1] "F:/project"
+ sheet = "Bank_Personal_Loan_Modelling")
> dim(loandata)
[1] 5000 14
> head(loandata)
# A tibble: 6 x 14
ID `Age (in years)` `Experience (in~ `Income (in K/m~ `ZIP Code`
1 1 25 1 49 91107
2 2 45 19 34 90089
3 3 39 15 11 94720
4 4 35 9 100 94112
5 5 35 8 45 91330
6 6 37 13 29 92121
# ... with 9 more variables: `Family members` <dbl>, CCAvg <dbl>, Education <dbl>,
> summary(loandata)
Min. : 9307 Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0.0
1st Qu.:91911 1st Qu.:1.000 1st Qu.: 0.700 1st Qu.:1.000 1st Qu.: 0.0
Median :93437 Median :2.000 Median : 1.500 Median :2.000 Median : 0.0
Mean :93153 Mean :2.397 Mean : 1.938 Mean :1.881 Mean : 56.5
3rd Qu.:94608 3rd Qu.:3.000 3rd Qu.: 2.500 3rd Qu.:3.000 3rd Qu.:101.0
Max. :96651 Max. :4.000 Max. :10.000 Max. :3.000 Max. :635.0
NA's :18
CreditCard
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.294
3rd Qu.:1.000
Max. :1.000
> names(loandata)
> sum(is.na(loandata))
[1] 18
> sum(is.na(loandata))
[1] 0
> loans=loandata[,c(-1,-3,-5)]
> summary(loans)
1st Qu.:35.00 1st Qu.: 39.00 1st Qu.:1.000 1st Qu.: 0.700
3rd Qu.:55.00 3rd Qu.: 98.00 3rd Qu.:3.000 3rd Qu.: 2.500
> dim(loans)
[1] 5000 11
> d.chebyshev=dist(x=loans,method="maximum")
> cluster.height=res.hclust.euc$height
> lines(cluster.height,lty=2,lwd=2,col="blue")
> par(mfrow=c(2,1))
> str(loans)
$ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
$ Education : num 1 1 1 2 2 2 2 3 2 3 ...
> loans.scaled=scale(loans)
> loans.scaled
Age (in years) Income (in K/month) Family members CCAvg Education
CreditCard
[1,] -0.6452498
[2,] -0.6452498
[3,] -0.6452498
[4,] -0.6452498
[5,] 1.5494774
[6,] -0.6452498
[7,] -0.6452498
[8,] 1.5494774
[9,] -0.6452498
[10,] -0.6452498
[11,] -0.6452498
[12,] -0.6452498
[13,] -0.6452498
[14,] -0.6452498
[15,] -0.6452498
[16,] 1.5494774
[17,] -0.6452498
[18,] -0.6452498
[19,] -0.6452498
[20,] 1.5494774
[21,] -0.6452498
[22,] -0.6452498
[23,] -0.6452498
[24,] -0.6452498
[25,] 1.5494774
[26,] -0.6452498
[27,] -0.6452498
[28,] 1.5494774
[29,] 1.5494774
[30,] 1.5494774
[31,] -0.6452498
[32,] -0.6452498
[33,] -0.6452498
[34,] -0.6452498
[35,] -0.6452498
[36,] -0.6452498
[37,] 1.5494774
[38,] -0.6452498
[39,] -0.6452498
[40,] -0.6452498
[41,] -0.6452498
[42,] -0.6452498
[43,] -0.6452498
[44,] -0.6452498
[45,] 1.5494774
[46,] 1.5494774
[47,] -0.6452498
[48,] 1.5494774
[49,] 1.5494774
[50,] 1.5494774
[51,] -0.6452498
[52,] -0.6452498
[53,] -0.6452498
[54,] -0.6452498
[55,] -0.6452498
[56,] -0.6452498
[57,] -0.6452498
[58,] -0.6452498
[59,] -0.6452498
[60,] -0.6452498
[61,] -0.6452498
[62,] -0.6452498
[63,] -0.6452498
[64,] -0.6452498
[65,] -0.6452498
[66,] 1.5494774
[67,] -0.6452498
[68,] -0.6452498
[69,] 1.5494774
[70,] -0.6452498
[71,] 1.5494774
[72,] -0.6452498
[73,] 1.5494774
[74,] 1.5494774
[75,] 1.5494774
[76,] 1.5494774
[77,] -0.6452498
[78,] -0.6452498
[79,] -0.6452498
[80,] -0.6452498
[81,] 1.5494774
[82,] -0.6452498
[83,] -0.6452498
[84,] -0.6452498
[85,] -0.6452498
[86,] -0.6452498
[87,] -0.6452498
[88,] -0.6452498
[89,] -0.6452498
[90,] 1.5494774
attr(,"scaled:center")
> seed=1000
> set.seed(seed)
> twss=rep(0.5)
> (k in 1:5)
{set.seed(seed)clust2=kmeans(x=loans.scaled,centres=k,nstart=5)twss[k]=clust1$tot.withinss}
> print(twss)
[1] 0.5
> set.seed(seed)
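*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.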
*******************************************************************
*******************************************************************
> table(nc$Best.n[1,])
0 2 3 4 5
2 8 2 12 2
> set.seed(seed)
> clust3=kmeans(x=loans.scaled,centers=4,nstart=5)
> #CART
> set.seed(111)
> str(loans)
$ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
$ Education : num 1 1 1 2 2 2 2 3 2 3 ...
0 1
0.904 0.096
0 1
4520 480
[1] 0.096
> CARTtrain=train.data
> CARTtest=test.data
> cart.model = rpart(formula = `Personal Loan` ~ ., data = CARTtrain, method = "class", control = r.ctrl)
> cart.model
> cartmodel$variable.importance
> cart.model
n= 5000
> library(rpart)
> install.packages("rpart.plot")
> fancyRpartPlot(cart.model)
> cart.model$variable.importance
> view(cart.model$variable.importance)
> round(cart.model$variable.importance,4)
Education Income (in K/month) Family members CCAvg
> cart.model$cptable
> print(cart.model)
n= 5000
> cptable.frame=as.data.frame(cart.model$cptable)
> cptable.frame$cp.deci=round(cptable.frame$CP,4)
> cptable.frame
> predCTrain=predict(cart.model,CARTtrain[,-7])
> sum(diag(tab2))/sum(tab2)
[1] 0.9834
> m1_pruned
n= 5000
> fancyRpartPlot(m1_pruned)
Random Forest
...
## $ SecuritiesAccountExist: num 1 0 0 0 0 0 0
0 0 0 ...
## $ DepositAccountExist : num 0 0 0 0 0 0
0 0 0 0 ...
## $ OnlineBankingExist : num 0 0 0 0 1 0
1 0 1 1 ...
## $ CreditCardExist : num 0 0 0 1 0 1 0
0 0 0 ...
## [1] 0.096
## [1] 0.096 ##
## Call:
## 0 1 class.error
## 0 3156 8 0.002528445
## 1 52 284 0.154761905
## MeanDecreaseGini
## Age 2.96512204
## Experience 1.42467278
## Income 177.99835899
## ZipCode 0.59038829
## FamilySize 78.28873558
## CCAvg 43.85047345
## Education 194.60045363
## Mortgage 1.25291003
## SecuritiesAccountExist 0.09830039
## DepositAccountExist 14.85910450
## OnlineBankingExist 0.15240309
## CreditCardExist 0.06280318
##
## Call:
## 0 1 class.error
## 0 3160 4 0.001264223
## 1 44 292 0.130952381
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## # A tibble: 6 x 15
## 1 25 1 49 91107 4 1.6 1 0
## 2 39 15 11 94720 1 1 1 0
## 4 35 8 45 91330 4 1 2 0
## 6 50 24 22 93943 1 0.3 3 0
## # A tibble: 6 x 16
## 1 25 1 49 91107 4 1.6 1 0
## 2 39 15 11 94720 1 1 1 0
## 4 35 8 45 91330 4 1 2 0
## 6 50 24 22 93943 1 0.3 3 0
## 2: 9 3150 31 3119
##
## ## Balanced Accuracy :
##
## [1] 1500 15
## # A tibble: 6 x 16
## 1 45 19 34 90089 3 1.5 1 0
## 2 53 27 72 91711 2 1.5 2 0
## 5 67 41 112 91741 1 2 1 0
## 6 60 30 22 95054 1 1.5 3 0