# (Scraped page-navigation text removed: "Professional Documents / Culture Documents")
# Sample size and the full-model error variance used as the Cp scale.
# NOTE(review): `rss.c` on the next line is the residual sum of squares of the
# full-model fit from the previous section (not visible in this excerpt).
N1 <- nrow(X1.c)
s2 <- s2.p <- 1/N1 * rss.c
# Forward stepwise regression (continue with the previous homework assignment):
# 6. Select the predictor variables of target variable by the forward stepwise
# regression method using the training data (X1.c, y1.c) and gcv. [p10 - p13]
# (the selected predictor variables are in P1 = P0[J1])
# p10
### Null model: Regression with no predictors
# (y1.c, X1.c) are used for the predictor selection
# Assume b0 = 0
# AIC
df.reg <- 0                          # no fitted coefficients in the null model
py1.c <- rep(0, length(y1.c))        # null-model prediction is 0 (y is centered)
rss.c <- t(y1.c - py1.c) %*% (y1.c - py1.c)
s2 <- rss.c / N1                     # training error variance of the null model
aic.0 <- N1 * log(s2) + 2 * df.reg
# GCV
gcv.0 <- s2 * 1/(1 - df.reg/N1)^2
# Cp
cp.0 <- s2 * (1 + 2 * (df.reg/N1) * s2.p/s2)
# p11
### Stepwise regression: Select the predictors one at a time
# to build the regression model of the least AIC/GCV/Cp using
# the (column-centered) training data.
# p12
# Forward stepwise regression
# J1 = indices of selected predictors, J2 = remaining candidates (from the
# previous section); np.max = maximum number of predictors to select.
cr <- "gcv" # "cp", "aic"
repeat{
  # Criterion value of the model obtained by adding each remaining candidate
  Cp.1 <- Gcv.1 <- Aic.1 <- NULL
  for (i in 1:length(J2)){
    A.i <- as.matrix(X1.c[, union(J1, J2[i])]) # Add a predictor
    M1 <- MASS::ginv(t(A.i) %*% A.i)           # least squares via pseudo-inverse
    b1.i <- M1 %*% t(A.i) %*% y1.c
    py1.c <- A.i %*% b1.i
    rss.c <- t(y1.c - py1.c) %*% (y1.c - py1.c)
    s2 <- rss.c / N1
    df.reg <- ncol(A.i)                        # number of fitted coefficients
    Aic.1 <- c(Aic.1, N1 * log(s2) + 2 * df.reg)
    Gcv.1 <- c(Gcv.1, s2 * 1/(1 - df.reg/N1)^2)
    Cp.1 <- c(Cp.1, s2 * (1 + 2 * (df.reg/N1) * s2.p/s2))
  }
  # Stop when no candidate improves the current criterion value;
  # otherwise add the best candidate and update the criterion.
  if (cr == "aic"){
    if (min(Aic.1) > aic.0){break}
    i1 <- which.min(Aic.1) # Add predictor producing the least aic
    aic.0 <- Aic.1[i1]     # Update aic.0
  }
  if (cr == "gcv"){
    if (min(Gcv.1) > gcv.0){break}
    i1 <- which.min(Gcv.1) # Add predictor producing the least gcv
    gcv.0 <- Gcv.1[i1]     # Update gcv.0
  }
  if (cr == "cp"){
    if (min(Cp.1) > cp.0){break}
    i1 <- which.min(Cp.1)  # Add predictor producing the least cp
    cp.0 <- Cp.1[i1]       # Update cp.0
  }
  J1 <- c(J1, J2[i1])
  J2 <- setdiff(J2, J1)
  if (length(J1) == np.max){break}
  if (length(J2) == 0){break}  # no candidates left
}
P0[J1] # The selected predictors
P1 <- P0[J1]
# 7. Fit the linear regression with the selected predictor variables to the training data
X1 <- Z1[, P1]   # training predictors restricted to the selected columns
y1 <- Z1[, T1]   # training target variable
X1.c <- scale(X1, center = TRUE, scale = FALSE)  # column-centered predictors
y1.c <- scale(y1, center = TRUE, scale = FALSE)  # centered target
# Fitted b0, b1 (least squares on centered data; intercept recovered afterwards):
M1 <- MASS::ginv(t(X1.c) %*% X1.c)
b1 <- M1 %*% t(X1.c) %*% y1.c
b0 <- mean(y1 - X1 %*% b1)  # intercept from the uncentered data
# NOTE(review): `err.te` and `py2` below referenced objects not defined in this
# excerpt (the test-error section appears lost in extraction); commented out.
# err.te
# py2
### Regression tree:
# 1. Assume the predictor variable is YGL209W (P0[1]). Let the target variable
# YGL035C (T1) be modeled as a "binary piecewise defined constant function" of
# this predictor variable. Find the best split point for this binary piecewise
# defined constant function and the corresponding total mean squared error
# (tmse) using the training data. [p6]
# p2
### Training error of constant function model
er.y <- function(y) {
  # y = (y.1,..., y.n)^T = data vector of target variable
  # Returns the MSE of y about its mean; an empty vector contributes 0.
  if (length(y) == 0) {
    return(0)
  }
  fit <- mean(y)          # best constant predictor
  mean((y - fit)^2)       # mean squared training error
}
# er.y(y1)
# p6
### R function for selecting an optimal split point
sp.x <- function(x, y) {
  # x = Data vector of a predictor variable
  # y = Data vector of the target variable
  # Returns list(sp = best split point, tmse = total MSE of the split
  # {x <= sp} / {x > sp}).  Candidate split points are the distinct values
  # of x; x itself is kept intact so indices stay aligned with y.
  x.cand <- unique(x)
  sp <- x.cand[1]
  tmse <- Inf
  for (i in 1:length(x.cand)){
    x.i <- x.cand[i]
    # Computing tmse(x.i)
    I.l <- which(x <= x.i) # left index set
    I.r <- which(x > x.i)  # right index set
    er.l <- er.y(y[I.l])
    er.r <- er.y(y[I.r])
    er <- er.l + er.r      # er = tmse(x_i)
    if (er < tmse){        # keep the split with the least total MSE
      tmse <- er
      sp <- x.i
    }
  }
  return(list(sp = sp, tmse = tmse))
}
# 2. Find the best split point variable from P0 and its associated best split
# point. Also, the tmse of y corresponding to this split. [p9]
# p9
### Recursively build a regression tree (rt)
# The algorithm estimates c_m, R_m (m = 1,..., M) and M at one time
# NOTE(review): this definition is TRUNCATED in this excerpt — the for-loop
# below never closes and the split/recursion code is missing. Do not run as-is;
# recover the remainder from the original document.
rbrt <- function (X, y, d){
# X = Data of given features (predictor variables)
# y = Data of target variable
# d = Maximum depth of regression tree
# Note: (root (d = 0) split> 1, r (d = 1) split>...). d controls
# M = number of leaves = size of rt
# (Find a binary tree diagram on Google.)
# Base case: depth exhausted, or y constant -> leaf returning c_m^ = mean(y)
if (d == 0 || length(unique(y)) == 1){
return (mean (y)) # mean (y) = c_m^ if x is in R_m (m = 1,..., M)
}
# Select the best predictor variable and split point of that variable
for (j in 1:ncol(X)){
x_j <- X[, j] # Data of predictor variable j
al <- sp.x(x_j, y)
sp_j <- al$sp # Chosen split point of x_j
er_j <- al$tmse # Total mse of y corresponding to the split
# NOTE(review): the rest of this loop and function body was lost in extraction.
d=2
# NOTE(review): stray fragment — presumably the call `rbrt(X1, y1, d)` was
# garbled into a re-declaration header during scraping; confirm against source.
rbrt <- function (X1, y1, d)