
BMEN-589 Lab #13: Artificial Neural Networks

Shrey Patel

11/22/2020

Useful Libraries
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4


## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()

library(ggplot2)
library(caret)

## Loading required package: lattice

##
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':


##
## lift

library(caretEnsemble)

##
## Attaching package: 'caretEnsemble'

## The following object is masked from 'package:ggplot2':


##
## autoplot

library(psych)

##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha

library(Amelia)

## Loading required package: Rcpp

## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(mice)

##
## Attaching package: 'mice'

## The following object is masked from 'package:stats':


##
## filter

## The following objects are masked from 'package:base':


##
## cbind, rbind

library(GGally)

## Registered S3 method overwritten by 'GGally':


## method from
## +.gg ggplot2

library(rpart)
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

##
## Attaching package: 'randomForest'

## The following object is masked from 'package:psych':


##
## outlier

## The following object is masked from 'package:dplyr':


##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin

library(e1071)
library(reshape2)

##
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':


##
## smiths

library(corrplot)

## corrplot 0.84 loaded

library(neuralnet)

##
## Attaching package: 'neuralnet'

## The following object is masked from 'package:dplyr':


##
## compute

User Defined Functions


# Function that adjusts a column for thresh values given
# Shows correlation with Class before and after
adjust.with.discretization <- function(givenDF, thresh.values, col.nr){
  cat("Correlation with Class BEFORE:",
      cor(givenDF[, col.nr],
          ifelse(givenDF$class == 'Benign', 0, 1)), "\n")

  # Adjusts ranges given thresh values
  if(length(thresh.values) == 1){
    givenDF[, col.nr] <- ifelse(givenDF[, col.nr] <= thresh.values, thresh.values,
                                thresh.values + 1)
  } else if(length(thresh.values) == 2){
    givenDF[, col.nr] <- ifelse(givenDF[, col.nr] <= thresh.values[1],
                                thresh.values[1],
                                ifelse(givenDF[, col.nr] <= thresh.values[2],
                                       thresh.values[2], thresh.values[2] + 1))
  }

  # How the results will be displayed
  cat("---------------------------------------\n")
  cat("- AFTER adjustment:\n")
  cat("COLUMN:", names(givenDF)[col.nr])
  print(table(givenDF[, col.nr], givenDF$class))
  cat("---------------------------------------\n")

  cat("Correlation with Class AFTER:",
      cor(givenDF[, col.nr],
          ifelse(givenDF$class == 'Benign', 0, 1)), "\n")

  givenDF[, col.nr] <- as.integer(givenDF[, col.nr])

  return(givenDF)
}

# Normalization Function
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
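As a quick check (not part of the original lab output), normalize() can be exercised on a toy vector chosen here purely for illustration; the expected values follow directly from the min-max formula.

# Sanity check of normalize() on a toy vector (illustrative only)
normalize(c(1, 5, 10))   # expected: 0.0000000 0.4444444 1.0000000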

Import Data Set & Deal w/ Missing Data


breast <- read.table("breastCancer.txt", header = TRUE, stringsAsFactors = FALSE)

str(breast)

## 'data.frame': 699 obs. of 10 variables:


## $ clump_thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ cell_shape_uniformity : int 1 4 1 8 1 10 1 1 1 2 ...
## $ cell_size_uniformity : int 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ bare_nuclei : chr "1" "10" "2" "4" ...
## $ bland_chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class : int 0 0 0 0 0 1 0 0 0 0 ...

summary(breast)

##  clump_thickness  cell_shape_uniformity cell_size_uniformity marginal_adhesion
##  Min.   : 1.000   Min.   : 1.000        Min.   : 1.000       Min.   : 1.000
##  1st Qu.: 2.000   1st Qu.: 1.000        1st Qu.: 1.000       1st Qu.: 1.000
##  Median : 4.000   Median : 1.000        Median : 1.000       Median : 1.000
##  Mean   : 4.418   Mean   : 3.134        Mean   : 3.207       Mean   : 2.807
##  3rd Qu.: 6.000   3rd Qu.: 5.000        3rd Qu.: 5.000       3rd Qu.: 4.000
##  Max.   :10.000   Max.   :10.000        Max.   :10.000       Max.   :10.000
##  single_epithelial_cell_size bare_nuclei        bland_chromatin
##  Min.   : 1.000              Length:699         Min.   : 1.000
##  1st Qu.: 2.000              Class :character   1st Qu.: 2.000
##  Median : 2.000              Mode  :character   Median : 3.000
##  Mean   : 3.216                                 Mean   : 3.438
##  3rd Qu.: 4.000                                 3rd Qu.: 5.000
##  Max.   :10.000                                 Max.   :10.000
##  normal_nucleoli  mitoses          class
##  Min.   : 1.000   Min.   : 1.000   Min.   :0.0000
##  1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.:0.0000
##  Median : 1.000   Median : 1.000   Median :0.0000
##  Mean   : 2.867   Mean   : 1.589   Mean   :0.3448
##  3rd Qu.: 4.000   3rd Qu.: 1.000   3rd Qu.:1.0000
##  Max.   :10.000   Max.   :10.000   Max.   :1.0000

breast$bare_nuclei <- as.integer(breast$bare_nuclei) # NA's will be introduced

## Warning: NAs introduced by coercion

breast$class <- as.factor(breast$class)
levels(breast$class) <- c("Benign", "Malignant") # For clarity
str(breast)

## 'data.frame': 699 obs. of 10 variables:


## $ clump_thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ cell_shape_uniformity : int 1 4 1 8 1 10 1 1 1 2 ...
## $ cell_size_uniformity : int 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ bare_nuclei : int 1 10 2 4 1 10 10 1 1 1 ...
## $ bland_chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class : Factor w/ 2 levels "Benign","Malignant": 1 1 1 1 1 2 1 1 1 1 ...

summary(breast)

##  clump_thickness  cell_shape_uniformity cell_size_uniformity marginal_adhesion
##  Min.   : 1.000   Min.   : 1.000        Min.   : 1.000       Min.   : 1.000
##  1st Qu.: 2.000   1st Qu.: 1.000        1st Qu.: 1.000       1st Qu.: 1.000
##  Median : 4.000   Median : 1.000        Median : 1.000       Median : 1.000
##  Mean   : 4.418   Mean   : 3.134        Mean   : 3.207       Mean   : 2.807
##  3rd Qu.: 6.000   3rd Qu.: 5.000        3rd Qu.: 5.000       3rd Qu.: 4.000
##  Max.   :10.000   Max.   :10.000        Max.   :10.000       Max.   :10.000
##
##  single_epithelial_cell_size bare_nuclei      bland_chromatin  normal_nucleoli
##  Min.   : 1.000              Min.   : 1.000   Min.   : 1.000   Min.   : 1.000
##  1st Qu.: 2.000              1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000
##  Median : 2.000              Median : 1.000   Median : 3.000   Median : 1.000
##  Mean   : 3.216              Mean   : 3.545   Mean   : 3.438   Mean   : 2.867
##  3rd Qu.: 4.000              3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000
##  Max.   :10.000              Max.   :10.000   Max.   :10.000   Max.   :10.000
##                              NA's   :16
##  mitoses          class
##  Min.   : 1.000   Benign   :458
##  1st Qu.: 1.000   Malignant:241
##  Median : 1.000
##  Mean   : 1.589
##  3rd Qu.: 1.000
##  Max.   :10.000
##

# How many NA's are in the breast cancer data set
sum(is.na(breast))

## [1] 16

# Visualize the missing data
missmap(breast)
# Note: we are missing less than 1% of our data cells, but removing the 16
# affected observations would remove over 2% of our rows from the analysis.
# Therefore, to retain more data, imputation is performed via the "mice" package.
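Before imputing, a short base-R check (a sketch added here, not part of the original run) confirms the NA's are confined to bare_nuclei and shows how many rows complete-case deletion would cost:

# Per-column NA counts; only bare_nuclei should be non-zero
colSums(is.na(breast))
# Rows that would be lost if incomplete cases were simply dropped (should be 16)
nrow(breast) - nrow(na.omit(breast))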

# Use rf (the random forest method) to impute the missing values
mice_mod <- mice(breast[, c("bare_nuclei", "class")], method = 'rf')

##
## iter imp variable
## 1 1 bare_nuclei
## 1 2 bare_nuclei
## 1 3 bare_nuclei
## 1 4 bare_nuclei
## 1 5 bare_nuclei
## 2 1 bare_nuclei
## 2 2 bare_nuclei
## 2 3 bare_nuclei
## 2 4 bare_nuclei
## 2 5 bare_nuclei
## 3 1 bare_nuclei
## 3 2 bare_nuclei
## 3 3 bare_nuclei
## 3 4 bare_nuclei
## 3 5 bare_nuclei
## 4 1 bare_nuclei
## 4 2 bare_nuclei
## 4 3 bare_nuclei
## 4 4 bare_nuclei
## 4 5 bare_nuclei
## 5 1 bare_nuclei
## 5 2 bare_nuclei
## 5 3 bare_nuclei
## 5 4 bare_nuclei
## 5 5 bare_nuclei

mice_complete <- mice::complete(mice_mod)

# Transfer the predicted missing values into the main data set
breast$bare_nuclei <- mice_complete$bare_nuclei

# Check for missing values before continuing with a missmap plot:
missmap(breast)
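A one-line programmatic check (added as a sketch) complements the missmap plot:

# Confirm the imputation left no missing values
sum(is.na(breast))   # should now be 0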

Data Visualization
# Compare benign and malignant cases
ggplot(data = breast) +
  geom_bar(aes(x = class))


summary(breast)

##  clump_thickness  cell_shape_uniformity cell_size_uniformity marginal_adhesion
##  Min.   : 1.000   Min.   : 1.000        Min.   : 1.000       Min.   : 1.000
##  1st Qu.: 2.000   1st Qu.: 1.000        1st Qu.: 1.000       1st Qu.: 1.000
##  Median : 4.000   Median : 1.000        Median : 1.000       Median : 1.000
##  Mean   : 4.418   Mean   : 3.134        Mean   : 3.207       Mean   : 2.807
##  3rd Qu.: 6.000   3rd Qu.: 5.000        3rd Qu.: 5.000       3rd Qu.: 4.000
##  Max.   :10.000   Max.   :10.000        Max.   :10.000       Max.   :10.000
##  single_epithelial_cell_size bare_nuclei      bland_chromatin  normal_nucleoli
##  Min.   : 1.000              Min.   : 1.000   Min.   : 1.000   Min.   : 1.000
##  1st Qu.: 2.000              1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000
##  Median : 2.000              Median : 1.000   Median : 3.000   Median : 1.000
##  Mean   : 3.216              Mean   : 3.516   Mean   : 3.438   Mean   : 2.867
##  3rd Qu.: 4.000              3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000
##  Max.   :10.000              Max.   :10.000   Max.   :10.000   Max.   :10.000
##  mitoses          class
##  Min.   : 1.000   Benign   :458
##  1st Qu.: 1.000   Malignant:241
##  Median : 1.000
##  Mean   : 1.589
##  3rd Qu.: 1.000
##  Max.   :10.000

Frequency Distributions
breast %>%
gather(-class, key = "var", value = "value") %>%
ggplot(aes(x = value, y = ..count.. , colour = class)) +
geom_density() +
scale_color_manual(values=c("#008000", "#FF0000"))+
facet_wrap(~var, scales = "free", nrow = 2) +
theme_bw()

Outliers
# View all predictor variables together
ggplot(stack(breast[,1:9]), aes(x = ind, y = values)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust=1)) +
labs(title = "Boxplots of columns") +
labs(x = "", y = "Values") +
scale_y_continuous(breaks = seq(1, 10, by = 1))

# View all predictor variables together and separated by class
breast.m <- melt(breast, id.vars = "class")
ggplot(breast.m) +
  geom_boxplot(mapping = aes(x = variable, y = value, fill = class)) +
  facet_wrap(~class, scales = "free", nrow = 2)
# Note: most outliers exist in the benign class, but due to scaling it is
# possible that the predictor variables, whether benign or malignant, may not
# carry equal weight. To look further, we must examine the correlation of the
# individual variables.

Mitoses
table(breast$mitoses, breast$class)

##
## Benign Malignant
## 1 445 134
## 2 8 27
## 3 2 31
## 4 0 12
## 5 1 5
## 6 0 3
## 7 1 8
## 8 1 7
## 10 0 14

# Note: starting from a mitoses value of 2, the proportion of Benign and
# Malignant tumors remains close. Therefore we can group these values to make
# the information simpler, helping reduce the chance of overfitting and the
# influence of extreme outliers, thus making future models generalize better.
breast <- adjust.with.discretization(breast, c(1), c(9)) # user-defined function

## Correlation with Class BEFORE: 0.4231703


## ---------------------------------------
## - AFTER adjustment:
## COLUMN: mitoses
## Benign Malignant
## 1 445 134
## 2 13 107
## ---------------------------------------
## Correlation with Class AFTER: 0.5238247
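To see how cleanly the two grouped mitoses levels separate the classes, the row-wise proportions can also be inspected (an optional check, not part of the original output):

# Row-wise class proportions for the grouped mitoses values
prop.table(table(breast$mitoses, breast$class), margin = 1)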

# View all predictor variables together


ggplot(stack(breast[,1:9]), aes(x = ind, y = values)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust=1)) +
labs(title = "Boxplots of columns") +
labs(x = "", y = "Values") +
scale_y_continuous(breaks = seq(1, 10, by = 1))

# Note: the correlation of the mitoses column with the class improved
# considerably after the adjustment, confirming the validity of the change.

Single Epithelial Cell Size
table(breast$single_epithelial_cell_size, breast$class)

##
## Benign Malignant
## 1 46 1
## 2 363 23
## 3 29 43
## 4 7 41
## 5 5 34
## 6 2 39
## 7 3 9
## 8 2 19
## 9 0 2
## 10 1 30

# Note: starting from a value of 4, the proportion of Benign and Malignant
# tumors remains close, and values of 1 and 2 have roughly the same proportion.
# Therefore we can group these values to make the information simpler, helping
# reduce the chance of overfitting and the influence of extreme outliers, thus
# making future models generalize better.

breast <- adjust.with.discretization(breast, c(2,3), c(5)) # user-defined function

## Correlation with Class BEFORE: 0.6827845


## ---------------------------------------
## - AFTER adjustment:
## COLUMN: single_epithelial_cell_size
## Benign Malignant
## 2 409 24
## 3 29 43
## 4 20 174
## ---------------------------------------
## Correlation with Class AFTER: 0.7920027

# View all predictor variables together


ggplot(stack(breast[,1:9]), aes(x = ind, y = values)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust=1)) +
labs(title = "Boxplots of columns") +
labs(x = "", y = "Values") +
scale_y_continuous(breaks = seq(1, 10, by = 1))
Normal Nucleoli
table(breast$normal_nucleoli, breast$class)

##
## Benign Malignant
## 1 402 41
## 2 30 6
## 3 12 32
## 4 1 17
## 5 2 17
## 6 4 18
## 7 2 14
## 8 4 20
## 9 1 15
## 10 0 61

# Note: starting from normal_nucleoli values of 4, the proportion of Benign and
# Malignant tumors remains close, and values of 1 and 2 have roughly the same
# proportion. Therefore we can group these values to make the information
# simpler, helping reduce the chance of overfitting and the influence of extreme
# outliers, thus making future models generalize better.

breast <- adjust.with.discretization(breast, c(2,3), c(8)) # user-defined function

## Correlation with Class BEFORE: 0.7122436
## ---------------------------------------
## - AFTER adjustment:
## COLUMN: normal_nucleoli
## Benign Malignant
## 2 432 47
## 3 12 32
## 4 14 162
## ---------------------------------------
## Correlation with Class AFTER: 0.7632114

# View all predictor variables together


ggplot(stack(breast[,1:9]), aes(x = ind, y = values)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust=1)) +
labs(title = "Boxplots of columns") +
labs(x = "", y = "Values") +
scale_y_continuous(breaks = seq(1, 10, by = 1))

Correlations
# remove the class so we can coerce df to a matrix.
# cor() function takes a matrix
df <- breast[,-10]
correlations <- cor(df, method = "pearson")
corrplot(correlations, number.cex = .9, method = "number",
order = "FPC", type = "upper", tl.cex = 0.8, tl.col = "black")
# cell_shape_uniformity vs. cell_size_uniformity has cor = 0.91
ggplot(data = breast, aes(x = cell_shape_uniformity,
                          y = cell_size_uniformity)) +
  geom_point() +
  geom_smooth(method = "lm")

## `geom_smooth()` using formula 'y ~ x'
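Given the 0.91 correlation between cell shape and cell size uniformity, caret's findCorrelation() offers an optional diagnostic for flagging redundant predictors; the 0.90 cutoff below is an arbitrary illustrative choice, and the analysis itself keeps all predictors.

# Flag predictors whose pairwise correlation exceeds the chosen cutoff (diagnostic only)
findCorrelation(correlations, cutoff = 0.90, names = TRUE)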


ggpairs(breast)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Normalize Data
# Normalize the data set
breast.n <- as.data.frame(lapply(breast[,-10], normalize))
breast.n <- cbind(breast.n, class = breast$class)
ggpairs(breast.n)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
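A quick check (added here as a sketch) confirms every predictor now spans 0 to 1 after normalization:

# Min and max of each normalized predictor; every column should read 0 and 1
sapply(breast.n[, -10], range)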
Create Training & Testing Sets
# randomly split the data into a 75% training set and a 25% test set
set.seed(1234)

# randomly extract the row numbers in the breast dataset that will be included
# in the training set
spt <- createDataPartition(y = breast$class, p = 0.75, list = FALSE)

# subset the breast data set to include only the rows found in spt - 75%
train.breast <- breast[spt, ]

# subset the breast data set to include only the rows NOT found in spt - 25%
test.breast <- breast[-spt, ]

# what proportion of Benign/Malignant do we have in our full breast class variable
prop.table(table(breast$class)) * 100

##
## Benign Malignant
## 65.52217 34.47783

# what proportion of Benign/Malignant do we have in our training data set class
# variable
prop.table(table(train.breast$class)) * 100

##
## Benign Malignant
## 65.52381 34.47619

# what proportion of Benign/Malignant do we have in our testing data set class variable
prop.table(table(test.breast$class)) * 100

##
## Benign Malignant
## 65.51724 34.48276

Training The Model


# Note: the formula y ~ . is not accepted by the neuralnet() function. You need
# to write the formula first and then pass it as an argument to the fitting
# function. The argument hidden accepts a vector with the number of neurons for
# each hidden layer. The argument linear.output specifies whether we want to do
# regression (linear.output = TRUE) or classification (linear.output = FALSE).

n <- names(train.breast)
f <- as.formula(paste("class ~", paste(n[!n %in% "class"], collapse = " + ")))
f

## class ~ clump_thickness + cell_shape_uniformity + cell_size_uniformity +
##     marginal_adhesion + single_epithelial_cell_size + bare_nuclei +
##     bland_chromatin + normal_nucleoli + mitoses

# Train the neural network


nn <- neuralnet(f, data = train.breast, hidden = c(7), linear.output = FALSE,
                act.fct = "logistic", lifesign = "minimal")

## hidden: 7    thresh: 0.01    rep: 1/1    steps: 1318    error: 3.00316    time: 0.47 secs

Visualize the Model


# visualize the neural network
plot(nn)
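Besides the network plot, neuralnet stores the convergence diagnostics (error, threshold reached, step count) and the fitted weights in result.matrix; inspecting the first rows is an optional extra step.

# First rows of the training diagnostics stored by neuralnet
head(nn$result.matrix)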

# Test accuracy of model


# Compute predictions with Training set
pr.nn <- predict(nn, train.breast[, -10])

# Accuracy (training set)


x <- table(train.breast$class, apply(pr.nn, 1, which.max))
(x[1,1]+x[2,2]) / sum(x)

## [1] 0.9942857

# Compute predictions with Test set
pr.nn <- predict(nn, test.breast[, -10])

# Accuracy (Test set)


x <- table(test.breast$class, apply(pr.nn, 1, which.max))
(x[1,1]+x[2,2]) / sum(x)

## [1] 0.9425287
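For fuller test-set metrics (sensitivity, specificity, kappa), caret's confusionMatrix() can be applied to the same predictions. The sketch below assumes, as the accuracy tables above do, that the network's first output column corresponds to the "Benign" level.

# Convert the column index of the maximal output to a factor of class labels
pred.class <- factor(ifelse(apply(pr.nn, 1, which.max) == 1, "Benign", "Malignant"),
                     levels = levels(test.breast$class))
# Full confusion matrix with accuracy, sensitivity and specificity
confusionMatrix(pred.class, test.breast$class)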

Cross Validate the Classifier


# Validate the classifier with 10 rounds of repeated random train/test splits
# (75% training, 25% testing), reporting the mean hold-out accuracy
# Set seed for reproducibility purposes
set.seed(1234)

# Number of validation rounds
k <- 10

# Results from the validation rounds
outs <- NULL

# Train/test split proportion
proportion <- 0.75

for(i in 1:k)
{
  spt <- createDataPartition(y = breast$class, p = proportion, list = FALSE)
  train_cv <- breast[spt, ]
  test_cv <- breast[-spt, ]
  nn_cv <- neuralnet(f, data = train_cv, hidden = c(7), act.fct = "logistic",
                     linear.output = FALSE)

  # Compute predictions
  pr.nn <- predict(nn_cv, test_cv[, -10])
  x <- table(test_cv$class, apply(pr.nn, 1, which.max))
  outs[i] <- (x[1,1] + x[2,2]) / sum(x)
}
mean(outs)

## [1] 0.9505747
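The spread of the ten hold-out accuracies is worth reporting alongside the mean; a minimal sketch:

# Variability of the hold-out accuracies across the 10 random splits
sd(outs)
boxplot(outs, horizontal = TRUE, main = "Hold-out accuracy across 10 random splits")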
