You are on page 1 of 10

Assignment-2

AST-531

Submitted to:
Dr. Md Hasinur Rahman Khan
Professor
Institute of Statistical Research and Training

Submitted by:
Name: Md. Emarat Hosen
Class Roll: AE-1136
Reg: 2017515117
4th Feb 2023

1
Million song dataset
# Packages used by the report.
library(tidyverse)
library(skimr)
library(kableExtra)

# Read the header-less file (read.csv names the columns V1, V2, ...)
# and order the rows by the first column.
data <- read.csv("C:/Users/Downloads/dataset.txt", header = FALSE) %>%
  arrange(V1)

# Number of rows and columns.
dim(data)

[1] 1385 91

Analysis of the basic dataset


library(tableone)

# Build the descriptive table for every column of `data`, then print
# only its continuous-variable part (mean and SD per column).
k <- CreateTableOne(data = data)
print(k$ContTable)

Overall
n 1385
V1 (mean (SD)) 1998.94 (11.76)
V2 (mean (SD)) 43.31 (6.36)
V3 (mean (SD)) 0.58 (51.08)
V4 (mean (SD)) 7.39 (37.19)
V5 (mean (SD)) 5.27 (17.73)
V6 (mean (SD)) -5.97 (23.17)
V7 (mean (SD)) -7.41 (15.63)
V8 (mean (SD)) -1.14 (14.71)
V9 (mean (SD)) -1.71 (8.60)
V10 (mean (SD)) 3.52 (10.14)
V11 (mean (SD)) 2.48 (6.53)
V12 (mean (SD)) 0.09 (4.84)
V13 (mean (SD)) 2.12 (8.54)
V14 (mean (SD)) 31.77 (19.53)
V15 (mean (SD)) 2455.23 (1853.52)
V16 (mean (SD)) 2109.47 (1371.32)
V17 (mean (SD)) 1673.77 (1141.97)
V18 (mean (SD)) 909.39 (513.24)
V19 (mean (SD)) 1008.28 (599.08)
V20 (mean (SD)) 628.86 (355.01)
V21 (mean (SD)) 530.68 (274.16)

2
V22 (mean (SD)) 386.76 (240.11)
V23 (mean (SD)) 334.48 (143.40)
V24 (mean (SD)) 334.74 (216.89)
V25 (mean (SD)) 291.67 (163.16)
V26 (mean (SD)) 51.17 (118.97)
V27 (mean (SD)) 186.87 (826.76)
V28 (mean (SD)) -163.84 (619.64)
V29 (mean (SD)) -44.54 (265.14)
V30 (mean (SD)) 38.52 (174.29)
V31 (mean (SD)) 74.34 (152.03)
V32 (mean (SD)) 7.53 (114.54)
V33 (mean (SD)) 28.55 (69.26)
V34 (mean (SD)) 6.29 (78.90)
V35 (mean (SD)) 8.07 (66.47)
V36 (mean (SD)) -7.51 (47.76)
V37 (mean (SD)) 78.23 (112.47)
V38 (mean (SD)) -50.20 (465.36)
V39 (mean (SD)) 132.38 (491.68)
V40 (mean (SD)) -203.68 (288.88)
V41 (mean (SD)) 17.87 (208.01)
V42 (mean (SD)) 0.04 (124.44)
V43 (mean (SD)) 5.00 (130.84)
V44 (mean (SD)) -49.65 (79.13)
V45 (mean (SD)) -1.80 (41.97)
V46 (mean (SD)) -0.80 (42.26)
V47 (mean (SD)) -1.35 (65.66)
V48 (mean (SD)) 102.06 (462.30)
V49 (mean (SD)) 162.39 (378.04)
V50 (mean (SD)) -103.65 (243.78)
V51 (mean (SD)) 6.28 (153.37)
V52 (mean (SD)) 18.93 (101.79)
V53 (mean (SD)) 23.00 (84.71)
V54 (mean (SD)) 7.77 (82.90)
V55 (mean (SD)) -5.04 (85.70)
V56 (mean (SD)) -8.74 (53.28)
V57 (mean (SD)) 89.32 (311.32)
V58 (mean (SD)) 113.03 (362.00)
V59 (mean (SD)) 2.41 (304.36)
V60 (mean (SD)) 34.82 (180.74)
V61 (mean (SD)) -33.51 (138.90)
V62 (mean (SD)) 12.40 (68.25)
V63 (mean (SD)) 3.66 (51.25)
V64 (mean (SD)) -1.87 (41.95)
V65 (mean (SD)) -146.03 (303.20)
V66 (mean (SD)) -29.43 (240.05)
V67 (mean (SD)) -14.48 (135.80)
V68 (mean (SD)) -3.38 (115.37)
V69 (mean (SD)) 35.83 (136.47)
V70 (mean (SD)) 30.69 (106.55)
V71 (mean (SD)) 3.76 (36.11)
V72 (mean (SD)) 6.00 (286.80)
V73 (mean (SD)) 18.07 (258.22)
V74 (mean (SD)) -20.73 (188.16)
V75 (mean (SD)) -7.08 (67.92)

3
V76 (mean (SD)) -26.89 (70.00)
V77 (mean (SD)) -6.69 (29.30)
V78 (mean (SD)) -29.69 (253.43)
V79 (mean (SD)) 7.07 (149.40)
V80 (mean (SD)) -129.89 (253.63)
V81 (mean (SD)) 22.16 (131.18)
V82 (mean (SD)) 13.95 (29.42)
V83 (mean (SD)) -75.67 (169.42)
V84 (mean (SD)) 38.50 (142.36)
V85 (mean (SD)) 51.49 (111.49)
V86 (mean (SD)) -1.27 (16.94)
V87 (mean (SD)) 14.67 (129.02)
V88 (mean (SD)) -20.19 (176.42)
V89 (mean (SD)) 4.56 (16.02)
V90 (mean (SD)) 15.32 (181.82)
V91 (mean (SD)) 1.71 (21.05)

Census Dataset
# Data
# Fix the RNG state so the random 10,000-row subsample below is reproducible.
set.seed(531)

# Read the training file, drop every row that contains an NA, and recode the
# target: 1 where income_level equals "-50000", 0 otherwise.
data <- read.csv("C:/Users/W.C/Downloads/train/train.csv", header = TRUE)
data <- na.omit(data) %>%
  mutate(income_level = ifelse(income_level == "-50000", 1, 0))

# Keep a random subsample of 10,000 rows, drawn without replacement.
data <- data %>%
  sample_n(10000, replace = FALSE)

# Confirm that no missing values remain (only a FALSE count is expected).
table(is.na(data))

FALSE
410000
# Class counts of the 0/1 income_level target in the subsample.
table(data$income_level)

0 1
600 9400
# Label every row 1 or 2 at random with probabilities 0.7 / 0.3;
# rows labelled 1 form the training set, rows labelled 2 the test set.
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[ind == 1, ]
test <- data[ind == 2, ]

# Class counts inside the training partition.
table(train$income_level)

0 1
424 6645
# Data for Developing Predictive Model
# Class counts of the training set (repeats the call made right after the split).
table(train$income_level)

0 1
424 6645
# Class proportions of the training set.
prop.table(table(train$income_level))

4
0 1
0.0599802 0.9400198
# Per-column summary of the training set.
summary(train)
age class_of_worker industry_code occupation_code
Min. : 0.00 Length:7069 Min. : 0.00 Min. : 0.00
1st Qu.:15.00 Class :character 1st Qu.: 0.00 1st Qu.: 0.00
Median :33.00 Mode :character Median : 2.00 Median : 2.00
Mean :34.22 Mean :15.77 Mean :11.82
3rd Qu.:49.00 3rd Qu.:33.00 3rd Qu.:26.00
Max. :90.00 Max. :51.00 Max. :46.00
education wage_per_hour enrolled_in_edu_inst_lastwk
Length:7069 Min. : 0.00 Length:7069
Class :character 1st Qu.: 0.00 Class :character
Mode :character Median : 0.00 Mode :character
Mean : 54.04
3rd Qu.: 0.00
Max. :4807.00
marital_status major_industry_code major_occupation_code
Length:7069 Length:7069 Length:7069
Class :character Class :character Class :character
Mode :character Mode :character Mode :character

race hispanic_origin sex member_of_labor_union


Length:7069 Length:7069 Length:7069 Length:7069
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character

reason_for_unemployment full_parttime_employment_stat capital_gains


Length:7069 Length:7069 Min. : 0.0
Class :character Class :character 1st Qu.: 0.0
Mode :character Mode :character Median : 0.0
Mean : 420.6
3rd Qu.: 0.0
Max. :99999.0
capital_losses dividend_from_Stocks tax_filer_status
Min. : 0.0 Min. : 0.0 Length:7069
1st Qu.: 0.0 1st Qu.: 0.0 Class :character
Median : 0.0 Median : 0.0 Mode :character
Mean : 32.4 Mean : 228.8
3rd Qu.: 0.0 3rd Qu.: 0.0
Max. :4356.0 Max. :99999.0
region_of_previous_residence state_of_previous_residence
Length:7069 Length:7069
Class :character Class :character
Mode :character Mode :character

5
d_household_family_stat d_household_summary migration_msa
Length:7069 Length:7069 Length:7069
Class :character Class :character Class :character
Mode :character Mode :character Mode :character

migration_reg migration_within_reg live_1_year_ago migration_sunbelt


Length:7069 Length:7069 Length:7069 Length:7069
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character

num_person_Worked_employer family_members_under_18 country_father


Min. :0.00 Length:7069 Length:7069
1st Qu.:0.00 Class :character Class :character
Median :1.00 Mode :character Mode :character
Mean :1.98
3rd Qu.:4.00
Max. :6.00
country_mother country_self citizenship
Length:7069 Length:7069 Length:7069
Class :character Class :character Class :character
Mode :character Mode :character Mode :character

business_or_self_employed fill_questionnaire_veteran_admin veterans_benefits


Min. :0.0000 Length:7069 Min. :0.000
1st Qu.:0.0000 Class :character 1st Qu.:2.000
Median :0.0000 Mode :character Median :2.000
Mean :0.1819 Mean :1.521
3rd Qu.:0.0000 3rd Qu.:2.000
Max. :2.0000 Max. :2.000
weeks_worked_in_year year income_level
Min. : 0.0 Min. :94 Min. :0.00
1st Qu.: 0.0 1st Qu.:94 1st Qu.:1.00
Median :10.0 Median :94 Median :1.00
Mean :23.6 Mean :94 Mean :0.94
3rd Qu.:52.0 3rd Qu.:94 3rd Qu.:1.00
Max. :52.0 Max. :94 Max. :1.00
dim(train)

[1] 7069 41
library(ROSE)

# Over-sample the minority class of the training set.
# The target size is written relative to `train` explicitly: the class counts
# reported for this step total 10603 (about 1.5 x the 7069 training rows), so
# the training set is the intended base rather than the full `data` frame.
# `seed` makes the resampling reproducible.
over <- ovun.sample(
  income_level ~ .,
  data = train,
  method = "over",
  N = nrow(train) * 1.5,
  seed = 531
)$data

# Class counts after over-sampling.
table(over$income_level)

0 1
3958 6645

6
# Per-column summary of the over-sampled training set.
summary(over)
age class_of_worker industry_code occupation_code
Min. : 0.00 Length:10603 Min. : 0.00 Min. : 0.00
1st Qu.:25.00 Class :character 1st Qu.: 0.00 1st Qu.: 0.00
Median :39.00 Mode :character Median :23.00 Median : 4.00
Mean :38.14 Mean :19.93 Mean :11.95
3rd Qu.:51.00 3rd Qu.:36.00 3rd Qu.:24.00
Max. :90.00 Max. :51.00 Max. :46.00
education wage_per_hour enrolled_in_edu_inst_lastwk
Length:10603 Min. : 0.00 Length:10603
Class :character 1st Qu.: 0.00 Class :character
Mode :character Median : 0.00 Mode :character
Mean : 62.54
3rd Qu.: 0.00
Max. :4807.00
marital_status major_industry_code major_occupation_code
Length:10603 Length:10603 Length:10603
Class :character Class :character Class :character
Mode :character Mode :character Mode :character

race hispanic_origin sex member_of_labor_union


Length:10603 Length:10603 Length:10603 Length:10603
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character

reason_for_unemployment full_parttime_employment_stat capital_gains


Length:10603 Length:10603 Min. : 0
Class :character Class :character 1st Qu.: 0
Mode :character Mode :character Median : 0
Mean : 1996
3rd Qu.: 0
Max. :99999
capital_losses dividend_from_Stocks tax_filer_status
Min. : 0.00 Min. : 0 Length:10603
1st Qu.: 0.00 1st Qu.: 0 Class :character
Median : 0.00 Median : 0 Mode :character
Mean : 80.38 Mean : 832
3rd Qu.: 0.00 3rd Qu.: 0
Max. :4356.00 Max. :99999
region_of_previous_residence state_of_previous_residence
Length:10603 Length:10603
Class :character Class :character
Mode :character Mode :character

d_household_family_stat d_household_summary migration_msa


Length:10603 Length:10603 Length:10603
Class :character Class :character Class :character

7
Mode :character Mode :character Mode :character

migration_reg migration_within_reg live_1_year_ago migration_sunbelt


Length:10603 Length:10603 Length:10603 Length:10603
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character

num_person_Worked_employer family_members_under_18 country_father


Min. :0.000 Length:10603 Length:10603
1st Qu.:0.000 Class :character Class :character
Median :2.000 Mode :character Mode :character
Mean :2.596
3rd Qu.:6.000
Max. :6.000
country_mother country_self citizenship
Length:10603 Length:10603 Length:10603
Class :character Class :character Class :character
Mode :character Mode :character Mode :character

business_or_self_employed fill_questionnaire_veteran_admin veterans_benefits


Min. :0.000 Length:10603 Min. :0.000
1st Qu.:0.000 Class :character 1st Qu.:2.000
Median :0.000 Mode :character Median :2.000
Mean :0.216 Mean :1.675
3rd Qu.:0.000 3rd Qu.:2.000
Max. :2.000 Max. :2.000
weeks_worked_in_year year income_level
Min. : 0.00 Min. :94 Min. :0.0000
1st Qu.: 0.00 1st Qu.:94 1st Qu.:0.0000
Median :52.00 Median :94 Median :1.0000
Mean :31.86 Mean :94 Mean :0.6267
3rd Qu.:52.00 3rd Qu.:94 3rd Qu.:1.0000
Max. :52.00 Max. :94 Max. :1.0000
# Under-sample the majority class of the training set.
# The target size is 95% of the training rows (the class counts reported for
# this step total 6715, about 0.95 x 7069). `seed` makes the draw reproducible.
under <- ovun.sample(
  income_level ~ .,
  data = train,
  method = "under",
  N = nrow(train) * 0.95,
  seed = 531
)$data

# Class counts after under-sampling.
table(under$income_level)

0 1
424 6291
# Combine over- and under-sampling: the result keeps the training-set size
# and targets a 50% share (p = 0.5) for the rare class.
# The original call passed an empty `seed = ,`; a concrete seed is supplied
# so the resampling is reproducible.
both <- ovun.sample(
  income_level ~ .,
  data = train,
  method = "both",
  p = 0.5,
  seed = 531,
  N = nrow(train)
)$data

# Class counts after combined resampling.
table(both$income_level)

0 1

8
3498 3571
# Predictive Model (Random Forest)
library(randomForest)

# One forest per training variant, each regressing income_level on all
# remaining columns. Fit order is kept as in the original script.
# NOTE(review): income_level is a numeric 0/1 column here, so these look like
# regression forests whose output is thresholded later -- confirm intended.
rftrain <- randomForest(income_level ~ ., data = train)  # unmodified split
rfover <- randomForest(income_level ~ ., data = over)    # over-sampled
rfunder <- randomForest(income_level ~ ., data = under)  # under-sampled
rfboth <- randomForest(income_level ~ ., data = both)    # over + under

###
## TRUE model (forest fitted on the unmodified training split)
# Drop the target by name instead of by the hard-coded position 41, so the
# code keeps working if the column order changes.
pr_t <- predict(rftrain, test[, names(test) != "income_level"])

# Predictions below 0.5 become class 0, all others class 1.
p_t <- ifelse(pr_t < 0.5, 0, 1)

# Share of test rows whose predicted class equals the observed class.
mean(p_t == test$income_level)

[1] 0.9535995
## Over-sampled model
# Drop the target by name instead of by the hard-coded position 41.
pr_o <- predict(rfover, test[, names(test) != "income_level"])

# Predictions below 0.5 become class 0, all others class 1.
p_o <- ifelse(pr_o < 0.5, 0, 1)

# Share of test rows whose predicted class equals the observed class.
mean(p_o == test$income_level)

[1] 0.9477994
## Under-sampled model
# Drop the target by name instead of by the hard-coded position 41.
pr_u <- predict(rfunder, test[, names(test) != "income_level"])

# Predictions below 0.5 become class 0, all others class 1.
p_u <- ifelse(pr_u < 0.5, 0, 1)

# Share of test rows whose predicted class equals the observed class.
mean(p_u == test$income_level)

[1] 0.954623
## Both (combined over- and under-sampling)
# Drop the target by name instead of by the hard-coded position 41.
pr_b <- predict(rfboth, test[, names(test) != "income_level"])

# Predictions below 0.5 become class 0, all others class 1.
p_b <- ifelse(pr_b < 0.5, 0, 1)

# Share of test rows whose predicted class equals the observed class.
mean(p_b == test$income_level)

[1] 0.9321051
# Collect the four test-set accuracies in a one-row data frame.
# The original statement was cut off mid-argument; the remaining columns are
# restored from the printed header (TRUE_model, Over_sample, Under, Both).
compar <- data.frame(
  TRUE_model = mean(p_t == test$income_level),
  Over_sample = mean(p_o == test$income_level),
  Under = mean(p_u == test$income_level),
  Both = mean(p_b == test$income_level)
)

# Round to four decimals, then express as percentages.
compar %>%
  round(digits = 4) * 100
TRUE_model Over_sample Under Both
1 95.36 94.78 95.46 93.21

Question 3
Naive Bayes model
library(readxl)
library(stringr)

# Load the workbook, then strip the first "-" found in each value of the
# Negative column (str_replace only touches the first match per string).
dataset <- read_excel("C:/Users/W.C/Downloads/dataset.xlsx")
dataset <- mutate(dataset, Negative = str_replace(Negative, "-", ""))

library(caret)

9
# Shuffle the rows: a full-size sample drawn without replacement.
newdata <- sample_n(dataset, nrow(dataset), replace = FALSE)

library(caTools)

# Logical split vector over the response with a 0.7 split ratio;
# TRUE rows go to the training part, FALSE rows to the test part.
z <- sample.split(newdata$response, SplitRatio = 0.7)
tr <- newdata[z, ]
tt <- newdata[!z, ]

#### another
# Fit the dummy-variable encoder once, on the training rows only, and apply
# that same encoder to both partitions. Fitting a second encoder on the test
# rows (as before) can yield a different set of columns whenever the two
# partitions do not contain exactly the same values.
# NOTE(review): values that occur only in the test rows have no column in
# this encoding -- confirm that is acceptable for the model below.
td <- dummyVars(~ response + Negative, data = tr)

trn <- predict(td, tr)
train <- as.data.frame(trn)

tet <- predict(td, tt)
test <- as.data.frame(tet)
# s <- createDataPartition(newdata$response, p = .8, list =FALSE)
# tr <- newdata[s,]
# tt <- newdata[-s,]

#### Naive bayes
library(naivebayes)

# Fit on the encoded training frame with `response` treated as a factor and
# every other column as a predictor.
model <- naive_bayes(as.factor(response) ~ ., data = train)

# Predict for the encoded test frame with its first column removed.
nv <- predict(model, test[, -1])

The accuracy using Naive Bayes is 30.0047103%.

1
0

You might also like