# You are on page 1 of 4

# package for trees
library(rpart)
# package including data from Elements of Statistical Learning
library(ElemStatLearn)
data(spam)

# make response a 0-1 outcome
# spam$spam = ifelse(spam$spam == "spam", 1, 0)

# Row indices of each class, used for a class-stratified train/test split.
spam.sub <- which(spam$spam == "spam")
nospam.sub <- which(spam$spam == "email")

# Draw two thirds (rounded down) of the supplied row indices at random.
sample_two_thirds <- function(idx) {
  sample(idx, floor(length(idx) * 2 / 3))
}

# Fit a classification tree with the Gini splitting criterion.
# Extra arguments (e.g. `control`) are forwarded to rpart().
fit_gini_tree <- function(train.data, ...) {
  rpart(spam ~ ., data = train.data, method = "class",
        parms = list(split = "gini"), ...)
}

# Draw the tree into a PNG file. post() with filename = "" plots on the
# currently open device, which is the PNG opened just before it.
plot_tree_png <- function(tree, file, height, width) {
  png(file, height = height, width = width)
  on.exit(dev.off(), add = TRUE)
  post(tree, filename = "")
  invisible(file)
}

# Compute the usual accuracy measures for a spam / email labelling.
#
# Arguments:
#   plabels  predicted labels, each "spam" or "email"
#   tlabels  true labels, same length as `plabels`
#
# Returns a list with accuracy (A), the 2x2 confusion matrix (C),
# sensitivity (sens), specificity (spec), the four cell counts
# (TP, FP, FN, TN) and precision (prec).
classification.summary <- function(plabels, tlabels) {
  stopifnot(length(plabels) == length(tlabels))

  # Compare the *arguments*; the labels must not be read from the global
  # `plabels.spam`, otherwise the result ignores what the caller passed in.
  pred.spam <- plabels == "spam"
  pred.email <- plabels == "email"
  true.spam <- tlabels == "spam"
  true.email <- tlabels == "email"

  # true positives: things we labelled spam that are spam
  TP <- sum(pred.spam & true.spam)
  # false positives: things we labelled spam that are email
  FP <- sum(pred.spam & true.email)
  # true negatives: things we labelled email that are email
  TN <- sum(pred.email & true.email)
  # false negatives: things we labelled email that are spam
  FN <- sum(pred.email & true.spam)

  # accuracy
  A <- (TP + TN) / (TP + TN + FP + FN)
  # sensitivity
  sens <- TP / (TP + FN)
  # specificity
  spec <- TN / (TN + FP)
  # precision: of everything labelled spam, the share that really is spam
  prec <- TP / (TP + FP)

  # confusion matrix; filled by row so each cell matches its labels
  C <- matrix(
    c(TP, FN,
      FP, TN),
    nrow = 2, byrow = TRUE,
    dimnames = list(
      c("truly spam", "truly email"),
      c("predicted spam", "predicted email")
    )
  )

  list(A = A, C = C, sens = sens, spec = spec,
       TP = TP, FP = FP, FN = FN, TN = TN, prec = prec)
}

# use 2/3 for training, 1/3 for test
train.spam <- sample_two_thirds(spam.sub)
train.email <- sample_two_thirds(nospam.sub)
train <- c(train.spam, train.email)
train.set <- spam[train, ]
test.set <- spam[-train, ]

rpart.spam <- fit_gini_tree(train.set)

# take a look at the decision rule
print(summary(rpart.spam))

# visualize it (gets difficult for bigger trees)
plot_tree_png(rpart.spam, "spam_tree.png", height = 600, width = 900)

# predict the labels for the test set: pick the class with the highest
# predicted probability in each row
predict.spam <- predict(rpart.spam, test.set)
plabels.spam <- colnames(predict.spam)[apply(predict.spam, 1, which.max)]

# compute the various measures of accuracy
s <- classification.summary(plabels.spam, test.set$spam)
print(s)

# you can control some aspects of the tree building process
# with rpart.control
rpart.spam.deeper <- fit_gini_tree(
  train.set,
  control = rpart.control(cp = 0.00001, xval = 20)
)
plot_tree_png(rpart.spam.deeper, "spam_cptree.png",
              height = 1200, width = 800)

# let's look at the stability of the tree: redraw the split five times and
# refit. The split and tree from the last repeat stay in the workspace and
# are the ones used for the ROC curve below.
for (repeat.id in 0:4) {
  train.spam <- sample_two_thirds(spam.sub)
  train.email <- sample_two_thirds(nospam.sub)
  train <- c(train.spam, train.email)
  train.set <- spam[train, ]
  test.set <- spam[-train, ]
  rpart.spam <- fit_gini_tree(train.set)
  plot_tree_png(rpart.spam, sprintf("spam_repeat%d.png", repeat.id),
                height = 600, width = 600)
}

# ROC curve: sweep the threshold on the predicted spam probability over
# every distinct value seen on the test set
predict.spam <- predict(rpart.spam, test.set)
l <- sort(unique(predict.spam[, "spam"]))
sens <- numeric(length(l))
spec <- numeric(length(l))
for (i in seq_along(l)) {
  ll <- l[i]
  plabels.spam <- rep("email", nrow(predict.spam))
  plabels.spam[predict.spam[, "spam"] >= ll] <- "spam"
  s <- classification.summary(plabels.spam, test.set$spam)
  sens[i] <- s$sens
  spec[i] <- s$spec
}
# add the two trivial classifiers so the curve runs from (1, 1) to (0, 0)
sens <- c(1, sens, 0)
spec <- c(0, spec, 1)

png("spamROC.png", height = 600, width = 600)
plot(1 - spec, sens, type = "l", lwd = 2, col = "red")
# diagonal = performance of random guessing
abline(0, 1, lty = 2, lwd = 2, col = "blue")
dev.off()