
20BCE1205-Lab8.

Shubham Ojha

20BCE1205

EDA LAB (L1+L2)

SVM and Random Forest


2023-03-08
# Support Vector Machine

dataset = read.csv("C:/Users/HP/Desktop/padhai/EDA/Lab/Social_Network_Ads.csv")
dataset = dataset[3:5]
summary(dataset)

## Age EstimatedSalary Purchased
## Min. :18.00 Min. : 15000 Min. :0.0000
## 1st Qu.:29.75 1st Qu.: 43000 1st Qu.:0.0000
## Median :37.00 Median : 70000 Median :0.0000
## Mean :37.66 Mean : 69743 Mean :0.3575
## 3rd Qu.:46.00 3rd Qu.: 88000 3rd Qu.:1.0000
## Max. :60.00 Max. :150000 Max. :1.0000

head(dataset)

## Age EstimatedSalary Purchased
## 1 19 19000 0
## 2 35 20000 0
## 3 26 43000 0
## 4 27 57000 0
## 5 19 76000 0
## 6 27 58000 0
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))
summary(dataset)

## Age EstimatedSalary Purchased
## Min. :18.00 Min. : 15000 0:257
## 1st Qu.:29.75 1st Qu.: 43000 1:143
## Median :37.00 Median : 70000
## Mean :37.66 Mean : 69743
## 3rd Qu.:46.00 3rd Qu.: 88000
## Max. :60.00 Max. :150000

library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])
head(training_set)

## Age EstimatedSalary Purchased
## 1 -1.7655475 -1.4733414 0
## 3 -1.0962966 -0.7883761 0
## 6 -1.0006894 -0.3602727 0
## 7 -1.0006894 0.3817730 0
## 8 -0.5226531 2.2654277 1
## 10 -0.2358313 -0.1604912 0
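
Scaling both predictors matters for SVMs: kernels are distance-based, so EstimatedSalary (tens of thousands) would otherwise swamp Age (tens). A quick sanity check, as a sketch (not part of the original lab): each scaled column should now have mean ~0 and standard deviation 1.

colMeans(training_set[-3])      # both column means should be ~0
apply(training_set[-3], 2, sd)  # both standard deviations should be 1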

library(e1071)
classifierL = svm(formula = Purchased ~ .,
                  data = training_set,
                  type = 'C-classification',
                  kernel = 'linear')

classifierS = svm(formula = Purchased ~ .,
                  data = training_set,
                  type = 'C-classification', # classify the Purchased factor rather than fit a regression
                  kernel = 'sigmoid')

classifierR = svm(formula = Purchased ~ .,
                  data = training_set,
                  type = 'C-classification',
                  kernel = 'radial')

classifierP = svm(formula = Purchased ~ .,
                  data = training_set,
                  type = 'C-classification',
                  kernel = 'polynomial')
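
All four classifiers above use e1071's default cost and gamma. A hedged sketch of hyperparameter tuning with e1071's tune(), which cross-validates over a parameter grid; the grid values here are illustrative choices, not from the lab:

tune_radial = tune(svm, Purchased ~ ., data = training_set,
                   kernel = 'radial', type = 'C-classification',
                   ranges = list(cost = c(0.1, 1, 10), gamma = c(0.1, 0.5, 1)))
summary(tune_radial)    # cross-validated error for each cost/gamma pair
tune_radial$best.model  # svm refit on the best parameters
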
y_predL = predict(classifierL, newdata = test_set[-3])
y_predS = predict(classifierS, newdata = test_set[-3])
y_predR = predict(classifierR, newdata = test_set[-3])
y_predP = predict(classifierP, newdata = test_set[-3])
y_predL

##   2   4   5   9  12  18  19  20  22  29  32  34  35  38  45  46  48  52  66  69
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##  74  75  82  84  85  86  87  89 103 104 107 108 109 117 124 126 127 131 134 139
##   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
## 148 154 156 159 162 163 170 175 176 193 199 200 208 213 224 226 228 229 230 234
##   0   0   0   0   0   0   0   0   0   0   0   0   1   1   1   0   1   0   1   1
## 236 237 239 241 255 264 265 266 273 274 281 286 292 299 302 305 307 310 316 324
##   1   0   1   1   1   0   1   1   1   1   1   0   1   1   1   0   1   0   0   0
## 326 332 339 341 343 347 353 363 364 367 368 369 372 373 380 383 389 392 395 400
##   0   1   0   1   0   1   1   0   1   1   1   0   1   0   1   1   0   0   0   0
## Levels: 0 1

y_predS

##   2   4   5   9  12  18  19  20  22  29  32  34  35  38  45  46  48  52  66  69
##   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0   0   0   0   0   0
##  74  75  82  84  85  86  87  89 103 104 107 108 109 117 124 126 127 131 134 139
##   0   0   0   0   0   1   0   0   0   1   0   0   0   0   0   0   1   0   0   0
## 148 154 156 159 162 163 170 175 176 193 199 200 208 213 224 226 228 229 230 234
##   0   0   0   0   0   0   0   0   0   0   0   0   1   0   1   0   1   1   1   1
## 236 237 239 241 255 264 265 266 273 274 281 286 292 299 302 305 307 310 316 324
##   1   1   1   0   0   0   1   1   0   0   1   0   1   1   1   1   1   0   1   0
## 326 332 339 341 343 347 353 363 364 367 368 369 372 373 380 383 389 392 395 400
##   1   1   0   1   0   1   1   1   1   0   1   0   1   1   0   0   0   0   0   0
## Levels: 0 1

y_predR

##   2   4   5   9  12  18  19  20  22  29  32  34  35  38  45  46  48  52  66  69
##   0   0   0   0   0   1   1   1   0   0   1   0   0   0   0   0   0   0   0   0
##  74  75  82  84  85  86  87  89 103 104 107 108 109 117 124 126 127 131 134 139
##   1   0   0   0   0   1   0   0   0   1   0   0   0   0   0   0   0   0   0   0
## 148 154 156 159 162 163 170 175 176 193 199 200 208 213 224 226 228 229 230 234
##   0   0   0   0   0   0   0   0   0   0   0   0   1   1   1   0   1   0   0   1
## 236 237 239 241 255 264 265 266 273 274 281 286 292 299 302 305 307 310 316 324
##   1   0   1   1   1   0   1   1   1   1   1   1   1   0   1   0   1   0   0   1
## 326 332 339 341 343 347 353 363 364 367 368 369 372 373 380 383 389 392 395 400
##   0   1   0   1   0   1   1   0   0   1   1   0   1   0   1   1   1   1   0   1
## Levels: 0 1

y_predP

##   2   4   5   9  12  18  19  20  22  29  32  34  35  38  45  46  48  52  66  69
##   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
##  74  75  82  84  85  86  87  89 103 104 107 108 109 117 124 126 127 131 134 139
##   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
## 148 154 156 159 162 163 170 175 176 193 199 200 208 213 224 226 228 229 230 234
##   0   0   0   0   0   0   0   0   0   0   0   0   1   1   1   0   1   0   0   0
## 236 237 239 241 255 264 265 266 273 274 281 286 292 299 302 305 307 310 316 324
##   0   0   0   1   1   0   0   1   1   1   1   0   0   0   0   0   1   0   0   1
## 326 332 339 341 343 347 353 363 364 367 368 369 372 373 380 383 389 392 395 400
##   0   1   0   1   0   1   0   0   0   1   0   0   1   0   1   1   0   0   0   1
## Levels: 0 1

cmL = table(test_set[, 3], y_predL)
cmS = table(test_set[, 3], y_predS)
cmR = table(test_set[, 3], y_predR)
cmP = table(test_set[, 3], y_predP)
cmL

## y_predL
## 0 1
## 0 57 7
## 1 13 23

cmS

## y_predS
## 0 1
## 0 53 11
## 1 14 22

cmR

## y_predR
## 0 1
## 0 58 6
## 1 4 32

cmP

## y_predP
## 0 1
## 0 60 4
## 1 18 18
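
The radial kernel misclassifies the fewest test points. To compare all four kernels on one number, a small helper (our addition, not part of the lab) computes accuracy as the sum of each confusion matrix's diagonal over its total:

accuracy = function(cm) sum(diag(cm)) / sum(cm)
sapply(list(linear = cmL, sigmoid = cmS, radial = cmR, polynomial = cmP), accuracy)

With the matrices above this gives 0.80, 0.75, 0.90, and 0.78 on the 100 test rows, so the radial kernel fits this data best.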

# Linear
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifierL, newdata = grid_set)
plot(set[, -3],
     main = 'SVM (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# Sigmoid
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_gridS = predict(classifierS, newdata = grid_set)
plot(set[, -3],
     main = 'SVM Sigmoid Kernel (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_gridS), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_gridS == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# Testing

# Linear
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifierL, newdata = grid_set)
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# Sigmoid
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_gridS = predict(classifierS, newdata = grid_set)
plot(set[, -3], main = 'SVM Sigmoid Kernel (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_gridS), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_gridS == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
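
The four visualisation chunks repeat the same grid-building code with only the classifier, data split, and title changing, so they could be factored into one helper. A sketch (the function name plot_svm_boundary is ours, not from the lab):

plot_svm_boundary = function(classifier, set, title) {
  X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set = expand.grid(Age = X1, EstimatedSalary = X2)
  y_grid = predict(classifier, newdata = grid_set)
  plot(set[, -3], main = title, xlab = 'Age', ylab = 'Estimated Salary',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
}

plot_svm_boundary(classifierR, test_set, 'SVM Radial Kernel (Test set)')  # e.g. the best kernel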

# Random Forest
library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

library(tree)
library(ggplot2)
##
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
##
##     margin

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from
##   +.gg   ggplot2

library(dplyr)

##
## Attaching package: 'dplyr'

## The following object is masked from 'package:randomForest':
##
##     combine

## The following objects are masked from 'package:stats':
##
##     filter, lag

## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union

head(iris)

## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa

summary(iris)

## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##

decision_tree <- tree(Species ~ ., data = iris)


decision_tree

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
##
##  1) root 150 329.600 setosa ( 0.33333 0.33333 0.33333 )
##    2) Petal.Length < 2.45 50   0.000 setosa ( 1.00000 0.00000 0.00000 ) *
##    3) Petal.Length > 2.45 100 138.600 versicolor ( 0.00000 0.50000 0.50000 )
##      6) Petal.Width < 1.75 54  33.320 versicolor ( 0.00000 0.90741 0.09259 )
##       12) Petal.Length < 4.95 48   9.721 versicolor ( 0.00000 0.97917 0.02083 )
##         24) Sepal.Length < 5.15 5   5.004 versicolor ( 0.00000 0.80000 0.20000 ) *
##         25) Sepal.Length > 5.15 43  0.000 versicolor ( 0.00000 1.00000 0.00000 ) *
##       13) Petal.Length > 4.95 6   7.638 virginica ( 0.00000 0.33333 0.66667 ) *
##      7) Petal.Width > 1.75 46   9.635 virginica ( 0.00000 0.02174 0.97826 )
##       14) Petal.Length < 4.95 6   5.407 virginica ( 0.00000 0.16667 0.83333 ) *
##       15) Petal.Length > 4.95 40  0.000 virginica ( 0.00000 0.00000 1.00000 ) *

summary(decision_tree)

##
## Classification tree:
## tree(formula = Species ~ ., data = iris)
## Variables actually used in tree construction:
## [1] "Petal.Length" "Petal.Width" "Sepal.Length"
## Number of terminal nodes: 6
## Residual mean deviance: 0.1253 = 18.05 / 144
## Misclassification error rate: 0.02667 = 4 / 150

plot(decision_tree)
text(decision_tree)
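
The tree above has six terminal nodes, and node 24 isolates just five observations, a hint of overfitting. A hedged sketch of cost-complexity pruning with the tree package's cv.tree() and prune.misclass() (our addition, not in the lab):

set.seed(1)  # cv.tree() resamples, so fix the RNG for a repeatable curve
cv_result = cv.tree(decision_tree, FUN = prune.misclass)
cv_result                                              # $size vs $dev: misclassifications per tree size
pruned_tree = prune.misclass(decision_tree, best = 3)  # keep, say, 3 leaves
plot(pruned_tree); text(pruned_tree)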

ggpairs(iris[,1:5])

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

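One caveat before the split: the sample() call below is not seeded, so the 70/30 partition, and therefore the forest's confusion matrices further down, will change on every knit. A minimal addition for reproducibility (the seed value is our choice, not part of the lab):

set.seed(42)  # our addition: fixes the RNG so the random 70/30 split below is reproducible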
index_row <- sample(2,
                    nrow(iris),
                    replace = T,
                    prob = c(0.7, 0.3))
train_data <- iris[index_row == 1,]
test_data <- iris[index_row == 2,]
iris_classifier <- randomForest(Species ~ .,
                                data = train_data,
                                importance = T)
iris_classifier

##
## Call:
## randomForest(formula = Species ~ ., data = train_data, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 6.67%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 37 0 0 0.00000000
## versicolor 0 32 3 0.08571429
## virginica 0 4 29 0.12121212
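
Because the forest was grown with importance = T, per-variable importance scores are available; a short sketch using randomForest's importance() and varImpPlot():

importance(iris_classifier)  # MeanDecreaseAccuracy and MeanDecreaseGini per feature
varImpPlot(iris_classifier)  # for iris this typically ranks the petal measurements highest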

plot(iris_classifier)
predicted_table <- predict(iris_classifier, test_data[,-5])
table(observed = test_data[,5], predicted = predicted_table)

## predicted
## observed setosa versicolor virginica
## setosa 13 0 0
## versicolor 0 15 0
## virginica 0 2 15
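
From the table above, the forest classifies 43 of the 45 held-out flowers correctly (only two virginica are mislabelled as versicolor), about 95.6% test accuracy. The same figure can be computed directly:

mean(predicted_table == test_data[, 5])  # fraction of correct test predictions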
