Professional Documents
Culture Documents
AST-531
Submitted to:
Dr. Md Hasinur Rahman Khan
Professor
Institute of Statistical Research and Training
Submitted from:
Name: Md. Emarat Hosen
Class Roll: AE-1136
Reg: 2017515117
4th Feb 2023
1
Million song dataset
library(tidyverse)
library(skimr)
library(kableExtra)
# Load the Million Song extract. The file has no header row (header = FALSE),
# so read.csv assigns the default column names V1, V2, ...
data <- read.csv("C:/Users/Downloads/dataset.txt", header = FALSE)

# Sort the rows by V1 in ascending order.
# NOTE(review): V1 is presumably the release year (mean is about 1999 in the
# summary table) - confirm against the dataset description.
data <- data %>%
  arrange(V1)

# Number of rows and columns.
dim(data)
[1] 1385 91
Overall
n 1385
V1 (mean (SD)) 1998.94 (11.76)
V2 (mean (SD)) 43.31 (6.36)
V3 (mean (SD)) 0.58 (51.08)
V4 (mean (SD)) 7.39 (37.19)
V5 (mean (SD)) 5.27 (17.73)
V6 (mean (SD)) -5.97 (23.17)
V7 (mean (SD)) -7.41 (15.63)
V8 (mean (SD)) -1.14 (14.71)
V9 (mean (SD)) -1.71 (8.60)
V10 (mean (SD)) 3.52 (10.14)
V11 (mean (SD)) 2.48 (6.53)
V12 (mean (SD)) 0.09 (4.84)
V13 (mean (SD)) 2.12 (8.54)
V14 (mean (SD)) 31.77 (19.53)
V15 (mean (SD)) 2455.23 (1853.52)
V16 (mean (SD)) 2109.47 (1371.32)
V17 (mean (SD)) 1673.77 (1141.97)
V18 (mean (SD)) 909.39 (513.24)
V19 (mean (SD)) 1008.28 (599.08)
V20 (mean (SD)) 628.86 (355.01)
V21 (mean (SD)) 530.68 (274.16)
2
V22 (mean (SD)) 386.76 (240.11)
V23 (mean (SD)) 334.48 (143.40)
V24 (mean (SD)) 334.74 (216.89)
V25 (mean (SD)) 291.67 (163.16)
V26 (mean (SD)) 51.17 (118.97)
V27 (mean (SD)) 186.87 (826.76)
V28 (mean (SD)) -163.84 (619.64)
V29 (mean (SD)) -44.54 (265.14)
V30 (mean (SD)) 38.52 (174.29)
V31 (mean (SD)) 74.34 (152.03)
V32 (mean (SD)) 7.53 (114.54)
V33 (mean (SD)) 28.55 (69.26)
V34 (mean (SD)) 6.29 (78.90)
V35 (mean (SD)) 8.07 (66.47)
V36 (mean (SD)) -7.51 (47.76)
V37 (mean (SD)) 78.23 (112.47)
V38 (mean (SD)) -50.20 (465.36)
V39 (mean (SD)) 132.38 (491.68)
V40 (mean (SD)) -203.68 (288.88)
V41 (mean (SD)) 17.87 (208.01)
V42 (mean (SD)) 0.04 (124.44)
V43 (mean (SD)) 5.00 (130.84)
V44 (mean (SD)) -49.65 (79.13)
V45 (mean (SD)) -1.80 (41.97)
V46 (mean (SD)) -0.80 (42.26)
V47 (mean (SD)) -1.35 (65.66)
V48 (mean (SD)) 102.06 (462.30)
V49 (mean (SD)) 162.39 (378.04)
V50 (mean (SD)) -103.65 (243.78)
V51 (mean (SD)) 6.28 (153.37)
V52 (mean (SD)) 18.93 (101.79)
V53 (mean (SD)) 23.00 (84.71)
V54 (mean (SD)) 7.77 (82.90)
V55 (mean (SD)) -5.04 (85.70)
V56 (mean (SD)) -8.74 (53.28)
V57 (mean (SD)) 89.32 (311.32)
V58 (mean (SD)) 113.03 (362.00)
V59 (mean (SD)) 2.41 (304.36)
V60 (mean (SD)) 34.82 (180.74)
V61 (mean (SD)) -33.51 (138.90)
V62 (mean (SD)) 12.40 (68.25)
V63 (mean (SD)) 3.66 (51.25)
V64 (mean (SD)) -1.87 (41.95)
V65 (mean (SD)) -146.03 (303.20)
V66 (mean (SD)) -29.43 (240.05)
V67 (mean (SD)) -14.48 (135.80)
V68 (mean (SD)) -3.38 (115.37)
V69 (mean (SD)) 35.83 (136.47)
V70 (mean (SD)) 30.69 (106.55)
V71 (mean (SD)) 3.76 (36.11)
V72 (mean (SD)) 6.00 (286.80)
V73 (mean (SD)) 18.07 (258.22)
V74 (mean (SD)) -20.73 (188.16)
V75 (mean (SD)) -7.08 (67.92)
3
V76 (mean (SD)) -26.89 (70.00)
V77 (mean (SD)) -6.69 (29.30)
V78 (mean (SD)) -29.69 (253.43)
V79 (mean (SD)) 7.07 (149.40)
V80 (mean (SD)) -129.89 (253.63)
V81 (mean (SD)) 22.16 (131.18)
V82 (mean (SD)) 13.95 (29.42)
V83 (mean (SD)) -75.67 (169.42)
V84 (mean (SD)) 38.50 (142.36)
V85 (mean (SD)) 51.49 (111.49)
V86 (mean (SD)) -1.27 (16.94)
V87 (mean (SD)) 14.67 (129.02)
V88 (mean (SD)) -20.19 (176.42)
V89 (mean (SD)) 4.56 (16.02)
V90 (mean (SD)) 15.32 (181.82)
V91 (mean (SD)) 1.71 (21.05)
Census Dataset
# Data
# Read the census training file (first row holds the column names).
data <- read.csv("C:/Users/W.C/Downloads/train/train.csv", header = TRUE)

# Keep complete rows only, then recode the response to numeric 0/1:
# 1 when income_level equals "-50000", 0 otherwise.
data <- data %>%
  na.omit() %>%
  mutate(income_level = ifelse(income_level == "-50000", 1, 0))
FALSE
410000
# Class counts of the recoded 0/1 income_level in the full data.
table(data$income_level)
0 1
600 9400
# Draw a label (1 or 2) for every row, with probabilities 0.7 and 0.3.
# No seed is set, so the split differs from run to run.
ind <- sample(
  2,
  nrow(data),
  replace = TRUE,
  prob = c(0.7, 0.3)
)

# Rows labelled 1 form the training split, rows labelled 2 the test split.
train <- data[ind == 1, ]
test <- data[ind == 2, ]

# Class counts in the training split.
table(train$income_level)
0 1
424 6645
# Data for developing the predictive model: class counts of income_level in
# the training split.
table(train$income_level)
0 1
424 6645
# Class proportions of income_level in the training split.
prop.table(table(train$income_level))
4
0 1
0.0599802 0.9400198
# Per-column summary of the training split.
summary(train)
age class_of_worker industry_code occupation_code
Min. : 0.00 Length:7069 Min. : 0.00 Min. : 0.00
1st Qu.:15.00 Class :character 1st Qu.: 0.00 1st Qu.: 0.00
Median :33.00 Mode :character Median : 2.00 Median : 2.00
Mean :34.22 Mean :15.77 Mean :11.82
3rd Qu.:49.00 3rd Qu.:33.00 3rd Qu.:26.00
Max. :90.00 Max. :51.00 Max. :46.00
education wage_per_hour enrolled_in_edu_inst_lastwk
Length:7069 Min. : 0.00 Length:7069
Class :character 1st Qu.: 0.00 Class :character
Mode :character Median : 0.00 Mode :character
Mean : 54.04
3rd Qu.: 0.00
Max. :4807.00
marital_status major_industry_code major_occupation_code
Length:7069 Length:7069 Length:7069
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
5
d_household_family_stat d_household_summary migration_msa
Length:7069 Length:7069 Length:7069
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
[1] 7069 41
library(ROSE)
# Oversample the training split with ROSE::ovun.sample (method = "over").
# N is the total size of the resampled set. It is derived from the training
# split being resampled, not from the full `data`: the printed class counts
# (3958 + 6645 = 10603) correspond to 1.5 * nrow(train) = 1.5 * 7069, whereas
# nrow(data) * 1.5 would request 15000 rows.
# No seed is passed, so the resample differs from run to run.
over <- ovun.sample(
  income_level ~ .,
  data = train,
  method = "over",
  N = nrow(train) * 1.5
)$data

# Class counts after oversampling.
table(over$income_level)
0 1
3958 6645
6
# Per-column summary of the oversampled training data.
summary(over)
age class_of_worker industry_code occupation_code
Min. : 0.00 Length:10603 Min. : 0.00 Min. : 0.00
1st Qu.:25.00 Class :character 1st Qu.: 0.00 1st Qu.: 0.00
Median :39.00 Mode :character Median :23.00 Median : 4.00
Mean :38.14 Mean :19.93 Mean :11.95
3rd Qu.:51.00 3rd Qu.:36.00 3rd Qu.:24.00
Max. :90.00 Max. :51.00 Max. :46.00
education wage_per_hour enrolled_in_edu_inst_lastwk
Length:10603 Min. : 0.00 Length:10603
Class :character 1st Qu.: 0.00 Class :character
Mode :character Median : 0.00 Mode :character
Mean : 62.54
3rd Qu.: 0.00
Max. :4807.00
marital_status major_industry_code major_occupation_code
Length:10603 Length:10603 Length:10603
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
7
Mode :character Mode :character Mode :character
0 1
424 6291
# Resample the training split with ROSE::ovun.sample using method = "both"
# and p = 0.5; the printed result is close to balanced (3498 vs 3571).
# N is the total size of the resampled set and is taken from the training
# split being resampled: the printed counts sum to 7069 = nrow(train), not to
# nrow(data).
# The original call carried an empty `seed = ` argument that supplied no
# value; it is dropped here. Pass `seed = <integer>` to make the draw
# reproducible.
both <- ovun.sample(
  income_level ~ .,
  data = train,
  method = "both",
  p = 0.5,
  N = nrow(train)
)$data

# Class counts after combined resampling.
table(both$income_level)
0 1
8
3498 3571
# Predictive Model (Random Forest)
library(randomForest)
# Fit one random forest per training variant, each regressing income_level on
# every other column. income_level is a numeric 0/1 column after the recoding,
# which is why predictions are later thresholded at 0.5.
# NOTE(review): `under` is not created anywhere in the visible code;
# presumably an ovun.sample(method = "under") step was lost - confirm before
# running.
rftrain <- randomForest(income_level ~ ., data = train)  # original split
rfover  <- randomForest(income_level ~ ., data = over)   # oversampled
rfunder <- randomForest(income_level ~ ., data = under)  # undersampled
rfboth  <- randomForest(income_level ~ ., data = both)   # both directions
###
## Model fitted on the original (unresampled) training split
# Column 41 is dropped from the test set before predicting.
# NOTE(review): presumably column 41 is income_level (train has 41 columns) -
# confirm; selecting by name would be safer than by position.
pr_t <- predict(rftrain, newdata = test[, -41])

# Threshold the numeric predictions at 0.5 to obtain 0/1 labels.
p_t <- ifelse(pr_t >= 0.5, 1, 0)

# Share of test rows whose predicted label equals the observed one.
mean(p_t == test$income_level)
[1] 0.9535995
## Model fitted on the oversampled training data
# Column 41 is dropped from the test set before predicting.
# NOTE(review): presumably column 41 is income_level - confirm.
pr_o <- predict(rfover, newdata = test[, -41])

# Threshold the numeric predictions at 0.5 to obtain 0/1 labels.
p_o <- ifelse(pr_o >= 0.5, 1, 0)

# Share of test rows whose predicted label equals the observed one.
mean(p_o == test$income_level)
[1] 0.9477994
## Under
[1] 0.954623
## Both
[1] 0.9321051
compar <- data.frame(TRUE_model= mean(p_t == test$income_level), Over_sample = mean(p_o == test$income_
# Show the accuracies as percentages. %>% binds tighter than *, so the table
# is rounded to 4 decimals first and the result is then multiplied by 100.
compar %>%
round(digits = 4)*100
TRUE_model Over_sample Under Both
1 95.36 94.78 95.46 93.21
Question 3
Naive Bayes model
library(readxl)
library(stringr)
# Read the workbook for the Naive Bayes exercise.
dataset <- read_excel("C:/Users/W.C/Downloads/dataset.xlsx")

# Strip the first "-" from each value of the Negative column
# (str_remove(x, p) is str_replace(x, p, "")).
dataset <- mutate(dataset, Negative = str_remove(Negative, "-"))
library(caret)
9
# Shuffle the rows: sample every row exactly once, without replacement.
newdata <- sample_n(dataset, size = nrow(dataset), replace = FALSE)
library(caTools)
# Logical split vector built on the response column with a 0.7 split ratio.
z <- sample.split(newdata$response, SplitRatio = 0.7)

# TRUE rows go to the training part, FALSE rows to the test part.
tr <- newdata[z, ]
tt <- newdata[!z, ]
#### Alternative: dummy-encode the training part
# Build the encoder for response and Negative from the training rows.
td <- dummyVars(~ response + Negative, data = tr)

# Apply the encoder to the training rows.
trn <- predict(td, newdata = tr)

# Note: this reuses the name `train`, replacing the census training split
# created earlier in the script.
train <- as.data.frame(trn)
1
0