Professional Documents
Culture Documents
References
Chapter 8.3 of [ISLR], An Introduction to Statistical Learning (with Applications in R). The book can be
downloaded for free from: http://www-bcf.usc.edu/~gareth/ISL/
To see the help file of a function funcname, type ?funcname.
1. Preparation
2. Regression Trees
Load dataset
# Load the Boston data set explicitly from the MASS package so this step does
# not depend on MASS having been attached beforehand.
data("Boston", package = "MASS") # as usual, predict medv
# Compact overview of the columns and their types.
str(Boston)
Data split
# Fix the RNG state so the random train/test split below is reproducible.
set.seed(seed = 1)
# Row indices of the training set: a random draw (without replacement) of
# roughly 70% of the rows of Boston. The remaining rows serve as the test set.
train <- sample(x = seq_len(nrow(Boston)), size = 0.7 * nrow(Boston))
Tree
# Fit a regression tree for medv on all other columns, using training rows only.
reg.tree <- tree(formula = medv ~ ., data = Boston[train, ])
# Show the fit summary, then the fitted tree object itself.
summary(object = reg.tree)
reg.tree
# Draw the tree and add its text labels.
plot(x = reg.tree)
text(x = reg.tree)
CV for tree
# Run cross-validation on the fitted regression tree and show the result.
cv.reg.tree <- cv.tree(object = reg.tree)
cv.reg.tree
# Plot the CV dev values against tree size (points joined by lines).
plot(x = cv.reg.tree$size, y = cv.reg.tree$dev, type = "b")
# Tree size with the smallest CV dev (the first one if there are ties).
min.cv.reg <- cv.reg.tree$size[which.min(cv.reg.tree$dev)]
min.cv.reg
Prune Tree
1
MH4510 - Statistical Learning and Data Mining - AY1819 S1 Lab 09
Predict
# NOTE(review): prune.reg.tree is not defined in the visible text; presumably
# it is the result of pruning reg.tree to min.cv.reg leaves in the
# "Prune Tree" step -- confirm before running.
# Predictions and observed medv for the rows that are not in train.
medv.pred <- predict(object = prune.reg.tree, newdata = Boston[-train, ])
medv.true <- Boston$medv[-train]
# Predicted vs. observed values, with the line y = x for reference.
plot(x = medv.pred, y = medv.true)
abline(a = 0, b = 1)
# Mean squared error on the held-out rows.
mean((medv.pred - medv.true)^2)
2
MH4510 - Statistical Learning and Data Mining - AY1819 S1 Lab 09
3. Classification Trees
Since Boston doesn’t have a categorical response variable, we will create one and call it cmed. It indicates
whether the crime rate crim is above the median ("Yes") or not ("No"). Note that the median is computed on
the training set only.
# Build the data for classification: a copy of Boston with a two-level factor
# cmed that is "Yes" when crim is strictly above the median of crim over the
# training rows, and "No" otherwise.
Boston.cls <- Boston
Boston.cls$cmed <- factor(
  ifelse(Boston.cls$crim > median(Boston.cls$crim[train]), "Yes", "No"),
  levels = c("No", "Yes") # keep both levels even if one class is absent
)
# Drop crim by name rather than by position, so the right column is removed
# whatever the column order. cmed is derived from crim, so crim must not stay
# in the data as a predictor.
Boston.cls$crim <- NULL
str(Boston.cls)
Tree
# Fit a classification tree for cmed on all other columns, training rows only.
cls.tree <- tree(formula = cmed ~ ., data = Boston.cls[train, ])
# Show the fit summary, then the fitted tree object itself.
summary(object = cls.tree)
cls.tree
# Draw the tree and add its text labels.
plot(x = cls.tree)
text(x = cls.tree)
CV for tree
# Cross-validate the classification tree, pruning with prune.misclass, and
# show the result.
cv.cls.tree <- cv.tree(object = cls.tree, FUN = prune.misclass)
cv.cls.tree
# Plot the CV dev values against tree size (points joined by lines).
plot(x = cv.cls.tree$size, y = cv.cls.tree$dev, type = "b")
# Tree size with the smallest CV dev (the first one if there are ties).
min.cv.cls <- cv.cls.tree$size[which.min(cv.cls.tree$dev)]
min.cv.cls
Prune + Predict
# Prune the classification tree to the size selected by cross-validation.
# prune.misclass is used so the pruning criterion matches the one passed to
# cv.tree (FUN = prune.misclass) when that size was chosen.
prune.cls.tree <- prune.misclass(cls.tree, best = min.cv.cls)
# Predict class labels for the held-out rows with the PRUNED tree. Predicting
# from the unpruned cls.tree would make the pruning step above have no effect
# on the reported results.
cmed.pred <- predict(prune.cls.tree,
                     newdata = Boston.cls[-train, ],
                     type = "class")
cmed.true <- Boston.cls$cmed[-train]
# Confusion matrix (rows: predicted, columns: observed) and accuracy.
table(cmed.pred, cmed.true)
mean(cmed.pred == cmed.true)
4. Random Forest
Predict
# NOTE(review): rf.cls is not defined in the visible text; presumably it is
# the random forest classifier fitted in the "Random Forest" step -- confirm.
# Predicted class labels for the rows that are not in train.
cmed.rf <- predict(
  object = rf.cls,
  newdata = Boston.cls[-train, ],
  type = "class"
)
# Cross-tabulate predictions against cmed.true, then the share that match.
table(cmed.rf, cmed.true)
mean(cmed.rf == cmed.true)
3
MH4510 - Statistical Learning and Data Mining - AY1819 S1 Lab 09
5. Bagging
Predict
# NOTE(review): bag.reg is not defined in the visible text; presumably it is
# the bagged regression model fitted in the "Bagging" step -- confirm.
# Predictions for the rows that are not in train.
medv.bag <- predict(object = bag.reg, newdata = Boston[-train, ])
# Mean squared error against medv.true.
mean((medv.bag - medv.true)^2)
6. Boosting
# Reset the RNG state so the boosted fit is reproducible.
set.seed(seed = 1)
# Boosted regression model for medv on all other columns, training rows only:
# 5000 trees, each with interaction depth 1.
boost.reg <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian", # bernoulli for classification
  n.trees = 5000,
  interaction.depth = 1
)
# Show the fitted object, then its summary.
boost.reg
summary(object = boost.reg)
Predict
# Predictions for the rows that are not in train, using all 5000 trees.
medv.boost <- predict(
  object = boost.reg,
  newdata = Boston[-train, ],
  n.trees = 5000
)
# Mean squared error against medv.true.
mean((medv.boost - medv.true)^2)