You are on page 1 of 1

# Load the raw quiz export, assign descriptive column names, and convert the
# session-level columns to their working types.

# NOTE(review): a hard-coded, machine-specific working directory makes this
# script non-portable; it is kept because the relative file name below
# depends on it.
setwd('c:/Users/i311766/Desktop')

# skip = 46 drops the first 46 lines of the file. read.csv() then consumes the
# next line as a header row (header = TRUE is its default); those names are
# replaced below.
inData <- read.csv('raw_quiz_results_20140212.txt', skip = 46,
                   stringsAsFactors = FALSE)

# Five session-level columns followed by four columns per question (Q1..Q5).
quizColNames <- c(
  'OBS_ID', 'DB_USED', 'QUIZ_LANG', 'T_START_QUIZ', 'T_SHOW_RESULTS',
  'Q1_T', 'Q1_ID', 'Q1_ANS', 'Q1_HIT',
  'Q2_T', 'Q2_ID', 'Q2_ANS', 'Q2_HIT',
  'Q3_T', 'Q3_ID', 'Q3_ANS', 'Q3_HIT',
  'Q4_T', 'Q4_ID', 'Q4_ANS', 'Q4_HIT',
  'Q5_T', 'Q5_ID', 'Q5_ANS', 'Q5_HIT')

# Fail loudly if the file layout does not match the expected 25 columns;
# assigning too few names would otherwise leave columns silently unnamed.
stopifnot(ncol(inData) == length(quizColNames))
colnames(inData) <- quizColNames

inData$DB_USED <- as.factor(inData$DB_USED)
inData$QUIZ_LANG <- as.factor(inData$QUIZ_LANG)

# POSIXct (a plain numeric vector) is the date-time class suited to data frame
# columns; POSIXlt is a list of components. Differences between timestamps
# behave the same for both classes.
inData$T_START_QUIZ <- as.POSIXct(inData$T_START_QUIZ)
inData$T_SHOW_RESULTS <- as.POSIXct(inData$T_SHOW_RESULTS)
# Convert the per-question timestamp columns from text to date-times.
# POSIXct is used because it is the date-time class suited to data frame
# columns; it subtracts cleanly from both POSIXct and POSIXlt values.
inData$Q1_T <- as.POSIXct(inData$Q1_T)
inData$Q2_T <- as.POSIXct(inData$Q2_T)
inData$Q3_T <- as.POSIXct(inData$Q3_T)
inData$Q4_T <- as.POSIXct(inData$Q4_T)
inData$Q5_T <- as.POSIXct(inData$Q5_T)

# Convert the per-question *_HIT columns to logical.
# NOTE(review): presumably these flag whether the answer was correct, and
# as.logical() yields NA for strings such as "1"/"0" -- confirm the raw
# encoding in the export.
inData$Q1_HIT <- as.logical(inData$Q1_HIT)
inData$Q2_HIT <- as.logical(inData$Q2_HIT)
inData$Q3_HIT <- as.logical(inData$Q3_HIT)
inData$Q4_HIT <- as.logical(inData$Q4_HIT)
inData$Q5_HIT <- as.logical(inData$Q5_HIT)

### add features

# Bin the elapsed time for each question into a factor column Qn_SOLVETIME.
# Q1 is measured from the quiz start; Q2..Q5 are measured from the previous
# question's timestamp.
#
# The elapsed time is computed with difftime(units = "secs") rather than
# as.numeric(t2 - t1): plain subtraction picks its units automatically from the
# data, so the same break points could end up being applied to minutes.
# NOTE(review): the break points are assumed to be seconds -- confirm.
#
# cut() builds right-closed intervals (lo, hi] by default, so an elapsed time
# of exactly 0 falls in the first bin for Q1 (lower break -0.01) but becomes NA
# for Q2..Q5 (lower break 0).
inData$Q1_SOLVETIME <- cut(
  as.numeric(difftime(inData$Q1_T, inData$T_START_QUIZ, units = "secs")),
  breaks = c(-0.01, 10, 20, 30, 60, 999999))
inData$Q2_SOLVETIME <- cut(
  as.numeric(difftime(inData$Q2_T, inData$Q1_T, units = "secs")),
  breaks = c(0, 10, 20, 30, 60, 999999))
inData$Q3_SOLVETIME <- cut(
  as.numeric(difftime(inData$Q3_T, inData$Q2_T, units = "secs")),
  breaks = c(0, 10, 20, 30, 60, 999999))
inData$Q4_SOLVETIME <- cut(
  as.numeric(difftime(inData$Q4_T, inData$Q3_T, units = "secs")),
  breaks = c(0, 10, 20, 30, 60, 999999))
inData$Q5_SOLVETIME <- cut(
  as.numeric(difftime(inData$Q5_T, inData$Q4_T, units = "secs")),
  breaks = c(0, 10, 20, 30, 60, 999999))

### exploratory analysis

# For each of the five questions, print a cross-tabulation of DB_USED against
# the question's hit flag and against its binned solve time.
for (i in seq_len(5)) {
  print(paste0('Question ', i))
  # Column names are built from the loop index; `[[` extracts the column by
  # its computed name.
  hitCol <- paste0('Q', i, '_HIT')
  solveTimeCol <- paste0('Q', i, '_SOLVETIME')
  print(table(inData$DB_USED, inData[[hitCol]]))
  print(table(inData$DB_USED, inData[[solveTimeCol]]))
  #plot(table(inData$DB_USED, inData[[solveTimeCol]]))
}
### try
#hist(as.numeric(inData$T_SHOW_RESULTS-inData$T_START_QUIZ), 50)

# Fit a random forest predicting DB_USED from the five binned solve times and
# print its summary. Variable importance and the proximity matrix are
# requested; only two trees are grown.
# NOTE(review): `inDataSmall` is not defined in the visible part of this
# script -- presumably a subset of inData created elsewhere; confirm.
library(randomForest)
oForest <- randomForest(
  DB_USED ~ Q1_SOLVETIME + Q2_SOLVETIME + Q3_SOLVETIME + Q4_SOLVETIME +
    Q5_SOLVETIME,
  data = inDataSmall,
  importance = TRUE,
  proximity = TRUE,
  ntree = 2
)
print(oForest)

# Fit a single classification tree for DB_USED on the five binned solve times,
# show the fitted class for every row, then tabulate fitted class against the
# actual DB_USED value.
library(rpart)
oTree <- rpart(
  DB_USED ~ Q1_SOLVETIME + Q2_SOLVETIME + Q3_SOLVETIME + Q4_SOLVETIME +
    Q5_SOLVETIME,
  data = inData
)
predict(oTree, type = "class")
table(
  predict(oTree, type = "class"),
  inData$DB_USED
)
###
# Explore alternative binning of the Q2 elapsed time with the binr package.
library(binr)

# Elapsed time between the Q1 and Q2 timestamps, forced to seconds so the
# values do not depend on difftime's automatic unit selection.
q2SolveSecs <- as.numeric(difftime(inData$Q2_T, inData$Q1_T, units = "secs"))
head(q2SolveSecs)

# Fixed-width binning, same break points as the Q2_SOLVETIME feature.
a <- cut(q2SolveSecs, breaks = c(0, 10, 20, 30, 60, 999999))

# Data-driven binning. The result is stored as `q2Bins` rather than `cut` so
# it does not mask base::cut, and `minpts` is spelled out in full instead of
# relying on partial argument matching.
q2Bins <- bins(q2SolveSecs, target.bins = 10, minpts = 10)
bins.getvals(q2Bins, minpt = -Inf, maxpt = Inf)
head(q2Bins$xtbl)
q2Bins$binct

# NOTE(review): this tabulates the Q2 elapsed time against Q1_HIT; Q2_HIT may
# have been intended -- confirm.
table(q2SolveSecs, inData$Q1_HIT)

You might also like