
Mathematical Programming HW11

# Data for the assignment:


# install.packages("BiocManager")
# BiocManager::install("grndata")
library(grndata)
data(package = "grndata")

# Yeast gene expression data


gnw2000.data <- as.matrix(gnw2000.data)
dim(gnw2000.data)
N0 <- 100
p0 <- 40
Z0 <- gnw2000.data[1:N0, 1:p0]
set.seed(123)
I1 <- sample(1:N0, N0/2)
Z1 <- Z0[I1, 1:p0] # Training data
I2 <- setdiff(1:N0, I1)
Z2 <- Z0[I2, 1:p0] # Test data
T1 <- colnames(Z1)[1] # Target variable
P0 <- setdiff(colnames(Z1), T1) # Predictor variables
# Target and predictor variables are yeast genes.
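
# Optional sanity checks on the split (not part of the assignment): with N0 = 100 and
# p0 = 40, Z1 and Z2 should each be 50 x 40, and P0 should hold the 39 non-target genes.
dim(Z1)
dim(Z2)
length(P0)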

X1 <- Z1[, P0] # Data of predictor variables
y1 <- Z1[, T1] # Data of target variable
y1.c <- scale(y1, center = T, scale = F)
X1.c <- scale(X1, center = T, scale = F)

N1 <- nrow(X1.c)
# s2.p = error variance estimate used in the Cp criterion, taken here from the
# full model with all predictors in P0 (fitted via the pseudoinverse)
b.full <- MASS::ginv(t(X1.c) %*% X1.c) %*% t(X1.c) %*% y1.c
s2 <- s2.p <- sum((y1.c - X1.c %*% b.full)^2) / N1
# Forward stepwise regression (continued from the previous homework assignment):
# 6. Select the predictor variables of the target variable by the forward stepwise
#    regression method using the training data (X1.c, y1.c) and gcv. [p10 - p13]
#    (The selected predictor variables are in P1 = P0[J1].)

# p10
### Null model: Regression with no predictors
# (y1.c, X1.c) are used for the predictor selection
# Assume b0 = 0

# AIC
df.reg <- 0
py1.c <- rep(0, length(y1.c))
rss.c <- t(y1.c - py1.c) %*% (y1.c - py1.c)
s2 <- rss.c / N1
aic.0 <- N1 * log(s2) + 2 * df.reg

# GCV
gcv.0 <- s2 * 1/(1 - df.reg/N1)^2

# Cp
cp.0 <- s2 * (1 + 2 * (df.reg/N1) * s2.p/s2)
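
# The three criteria above can be wrapped in one small helper (a convenience sketch,
# not part of the lecture notes; crit.fun is a name introduced here). It reuses the
# same formulas, so crit.fun(rss.c, 0, N1, s2.p) should reproduce aic.0, gcv.0, cp.0.
crit.fun <- function(rss, df.reg, N, s2.p){
  s2 <- rss / N
  list(aic = N * log(s2) + 2 * df.reg,                  # AIC
       gcv = s2 / (1 - df.reg / N)^2,                   # GCV
       cp  = s2 * (1 + 2 * (df.reg / N) * s2.p / s2))   # Cp
}
# crit.fun(rss.c, df.reg = 0, N = N1, s2.p = s2.p)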

# p11
### Stepwise regression: Select the predictors one at a time
# to build the regression model of the least AIC/GCV/Cp using
# the (column-centered) training data.

# Indices of selected and un-selected predictors/risk factors


J1 <- NULL # Indices of selected predictors.
J2 <- 1:ncol(X1.c) # Indices of unselected predictors.

# Maximum number of predictors of regression <= min(N1, p1)


np.max <- min(nrow(X1.c), ncol(X1.c))

# p12
# Forward stepwise regression
cr <- "gcv" # "cp", "aic"

repeat{
Cp.1 <- Gcv.1 <- Aic.1 <- NULL
for (i in 1:length(J2)){
A.i <- as.matrix(X1.c[, union(J1, J2[i])]) # Add a predictor
M1 <- MASS::ginv(t(A.i) %*% A.i)
b1.i <- M1 %*% t(A.i) %*% y1.c
py1.c <- A.i %*% b1.i
rss.c <- t(y1.c - py1.c) %*% (y1.c - py1.c)
s2 <- rss.c / N1

H1 <- A.i %*% M1 %*% t(A.i)


df.reg <- round(sum(diag(H1))) # df.reg <- ncol(A.i)

aic.1 <- N1 * log(s2) + 2 * df.reg


gcv.1 <- s2 * 1/(1 - df.reg/N1)^2
cp.1 <- s2 * (1 + 2 * df.reg/N1 * s2.p/s2)

Aic.1 <- c(Aic.1, aic.1)


Gcv.1 <- c(Gcv.1, gcv.1)
Cp.1 <- c(Cp.1, cp.1)
}
# p13
if (cr == "aic"){
if (min(Aic.1) > aic.0){break}
i1 <- which.min(Aic.1)# Add predictor producing the least cp
aic.0 <- Aic.1[i1] # Update aic.0
}

if (cr == "gcv"){
if (min(Gcv.1) > gcv.0){break}
i1 <- which.min(Gcv.1) # Add predictor producing the least cp
gcv.0 <- Gcv.1[i1] # Update gcv.0
}
if (cr == "cp"){
if (min(Cp.1) > cp.0){break}
i1 <- which.min(Cp.1) # Add predictor producing the least cp
cp.0 <- Cp.1[i1] # Update cp.0
}
J1 <- c(J1, J2[i1])
J2 <- setdiff(J2, J1)
if (length(J1) == np.max){break}
}
P0[J1] # The selected predictors
P1 <- P0[J1]
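
# Optional cross-check (a sketch, not required by the assignment; it assumes at least
# one predictor was selected, i.e. J1 is non-empty): refit the model on the selected
# columns J1 and recompute its GCV, which should essentially reproduce the final gcv.0.
A.sel <- as.matrix(X1.c[, J1])
M.sel <- MASS::ginv(t(A.sel) %*% A.sel)
rss.sel <- sum((y1.c - A.sel %*% M.sel %*% t(A.sel) %*% y1.c)^2)
df.sel <- round(sum(diag(A.sel %*% M.sel %*% t(A.sel))))
c(gcv.refit = (rss.sel / N1) / (1 - df.sel / N1)^2, gcv.loop = gcv.0)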

# 7. Fit the linear regression with the selected predictor variables to the training data
X1 <- Z1[, P1]
y1 <- Z1[, T1]
X1.c <- scale(X1, center = TRUE, scale = FALSE)
y1.c <- scale(y1, center = TRUE, scale = FALSE)
# Fitted b0, b1:
M1 <- MASS::ginv(t(X1.c) %*% X1.c)
b1 <- M1 %*% t(X1.c) %*% y1.c
b0 <- mean(y1 - X1 %*% b1)
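
# Optional cross-check with lm() (a sketch, not part of the assignment): when
# t(X1.c) %*% X1.c is nonsingular, the pseudoinverse fit above coincides with
# ordinary least squares, so the two coefficient vectors should agree.
fit.lm <- lm(y1 ~ X1) # fit.lm is a name introduced for this check
cbind(pinv = c(b0, b1), lm = coef(fit.lm))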

pred.lm <- function(Z1, b0, b1){
  # Predict the target from Z1 using the fitted intercept b0 and slopes b1
  y1 <- Z1[, T1]
  X1 <- Z1[, P1]
  py1 <- b0 + X1 %*% b1
  mse <- mean((y1 - py1)^2)
  return(list(py1 = py1, mse = mse))
}

a1 <- pred.lm(Z1, b0, b1) # Fitted values and training error
a1
# 8. Find the predicted y2 and compute the test error on (X2, y2):
# X2 <- Z2[, P1]
# y2 <- Z2[, T1]
# py2 <- b0 + X2 %*% b1
# s2 <- mean((y2 - py2)^2)

X2 <- Z2[, P1]
y2 <- Z2[, T1]

# Predict y2 with the intercept and slopes (b0, b1) fitted on the training data
a2 <- pred.lm(Z2, b0, b1)

err.te <- a2$mse # Test error
py2 <- a2$py1    # Predicted y2

err.te
py2
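
# A quick comparison (optional): training error from step 7 next to the test error
# from step 8; the test error is typically somewhat larger.
c(err.tr = a1$mse, err.te = err.te)
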
### Regression tree:
# 1. Assume the predictor variable is YGL209W (P0[1]). Let the target variable
#    YGL035C (T1) be modeled as a "binary piecewise defined constant function" of
#    this predictor variable. Find the best split point for this binary piecewise
#    defined constant function and the corresponding total mean squared error (tmse)
#    using the training data. [p6]

# p2
### Training error of constant function model
er.y <- function(y) {
  # y = (y.1, ..., y.n)^T = data vector of the target variable
  if (length(y) == 0){
    return(0)
  } else {
    py <- mean(y)
    mse <- mean((y - py)^2)
    return(mse)
  }
}
# er.y(y1)
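
# Tiny hand-checkable example of er.y() (optional): for y = (1, 2, 3) the mean is 2
# and the mse is (1 + 0 + 1) / 3 = 2/3.
er.y(c(1, 2, 3)) # 0.6666667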

# p6
### R function for selecting an optimal split point
sp.x <- function(x, y) {
  # x = Data vector of a predictor variable
  # y = Data vector of the target variable

  sp.opt <- NULL # Initial best split point
  er.opt <- Inf  # Total mse corresponding to sp.opt

  x.u <- unique(x) # Candidate split points
  for (i in 1:length(x.u)){
    x.i <- x.u[i]
    # Computing tmse(x.i); the index sets refer to the original x, not to x.u
    I.l <- which(x <= x.i) # Left index set
    I.r <- which(x > x.i)  # Right index set
    er.l <- er.y(y[I.l])
    er.r <- er.y(y[I.r])
    er <- er.l + er.r      # er = tmse(x.i)

    # Update sp.opt and er.opt
    if (er < er.opt) {
      sp.opt <- x.i
      er.opt <- er
    }
  }
  return(list(sp = sp.opt, tmse = er.opt))
}
X1 <- Z1[, P0]
y1 <- Z1[, T1]
sp.x(X1[, 1], y1)
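
# Toy check of sp.x() (optional, made-up data): for a step pattern in y the best split
# separates the two levels exactly, so the returned split point is 5 and the tmse is 0.
sp.x(x = 1:10, y = c(rep(0, 5), rep(1, 5)))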

# 2. Find the best split-point variable from P0 and its associated best split point.
#    Also find the tmse of y corresponding to this split. [p9]
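
# A direct answer to question 2 (an optional sketch that only uses sp.x() over all
# columns of X1 = Z1[, P0]; the names best and a.j are introduced here):
best <- list(j = NA, sp = NA, tmse = Inf)
for (j in 1:ncol(X1)){
  a.j <- sp.x(X1[, j], y1)
  if (a.j$tmse < best$tmse){ best <- list(j = j, sp = a.j$sp, tmse = a.j$tmse) }
}
list(variable = P0[best$j], split = best$sp, tmse = best$tmse)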

# p9
### Recursively build a regression tree (rt)
# The algorithm estimates c_m, R_m (m = 1, ..., M) and M at the same time
rbrt <- function(X, y, d){
  # X = Data of the given features (predictor variables)
  # y = Data of the target variable
  # d = Maximum depth of the regression tree
  # Note: root (depth 0) -> split -> 2 nodes (depth 1) -> split -> ...; d controls
  # M = number of leaves = size of the rt
  # (Find a binary tree diagram on Google.)

  if (d == 0 || length(unique(y)) == 1){
    return(mean(y)) # mean(y) = estimated c_m if x is in R_m (m = 1, ..., M)
  }

  sp.opt <- NULL # Initial best split predictor + split point
  er.opt <- Inf  # Total mse corresponding to sp.opt

  # Select the best predictor variable and the best split point of that variable
  for (j in 1:ncol(X)){
    x_j <- X[, j]      # Data of predictor variable j
    al <- sp.x(x_j, y)
    sp_j <- al$sp      # Chosen split point of x_j
    er_j <- al$tmse    # Total mse of y corresponding to the split

    if (er_j < er.opt){
      er.opt <- er_j
      sp.opt <- list(j = j, sp = sp_j)
    }
  }

  if (is.null(sp.opt)){return(mean(y))}

  # Otherwise, split the block into 2 sub-blocks
  I.l <- which(X[, sp.opt$j] <= sp.opt$sp)
  I.r <- which(X[, sp.opt$j] > sp.opt$sp)

  # Recursions: rbrt() calls rbrt()
  lrt <- rbrt(X[I.l, ], y[I.l], d - 1) # Left regression tree
  rrt <- rbrt(X[I.r, ], y[I.r], d - 1) # Right regression tree

  return(list(j = sp.opt$j, sp = sp.opt$sp, lrt = lrt, rrt = rrt))
}
rbrt(X1, y1, d = 2)
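
# A small prediction helper for the fitted tree (a sketch, not from the lecture notes;
# pred.rt and rt1 are names introduced here). It walks the nested list returned by
# rbrt() for one row x of predictor data and returns the leaf constant.
pred.rt <- function(rt, x){
  if (!is.list(rt)){ return(rt) } # Leaf: fitted constant c_m
  if (x[rt$j] <= rt$sp){ pred.rt(rt$lrt, x) } else { pred.rt(rt$rrt, x) }
}
rt1 <- rbrt(X1, y1, d = 2)
py1.rt <- apply(X1, 1, function(x) pred.rt(rt1, x)) # Fitted values on the training data
mean((y1 - py1.rt)^2)                               # Training mse of the depth-2 tree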
