
BMEN-589 Lab #13: Artificial Neural Networks

Shrey Patel

11/22/2020

Useful Libraries
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4


## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()

library(ggplot2)
library(caret)

## Loading required package: lattice

##
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':


##
## lift

library(caretEnsemble)

##
## Attaching package: 'caretEnsemble'

## The following object is masked from 'package:ggplot2':


##
## autoplot

library(psych)

##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha

library(Amelia)

## Loading required package: Rcpp

## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(mice)

##
## Attaching package: 'mice'

## The following object is masked from 'package:stats':


##
## filter

## The following objects are masked from 'package:base':


##
## cbind, rbind

library(GGally)

## Registered S3 method overwritten by 'GGally':


## method from
## +.gg ggplot2

library(rpart)
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

##
## Attaching package: 'randomForest'

## The following object is masked from 'package:psych':


##
## outlier

## The following object is masked from 'package:dplyr':


##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin

library(e1071)
library(reshape2)

##
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':


##
## smiths

library(corrplot)

## corrplot 0.84 loaded

library(neuralnet)

##
## Attaching package: 'neuralnet'

## The following object is masked from 'package:dplyr':


##
## compute

User Defined Functions


# Function that adjusts a column for thresh values given
# Shows correlation with Class before and after
adjust.with.discretization <- function(givenDF, thresh.values, col.nr){
  cat("Correlation with Class BEFORE:",
      cor(givenDF[, col.nr],
          ifelse(givenDF$class == 'Benign', 0, 1)), "\n")

  # Adjusts ranges given thresh values
  if(length(thresh.values) == 1){
    givenDF[, col.nr] <- ifelse(givenDF[, col.nr] <= thresh.values, thresh.values,
                                thresh.values + 1)
  } else if(length(thresh.values) == 2){
    givenDF[, col.nr] <- ifelse(givenDF[, col.nr] <= thresh.values[1],
                                thresh.values[1],
                                ifelse(givenDF[, col.nr] <= thresh.values[2],
                                       thresh.values[2], thresh.values[2] + 1))
  }

  # How the results will be displayed
  cat("---------------------------------------\n")
  cat("- AFTER adjustment:\n")
  cat("COLUMN:", names(givenDF)[col.nr])
  print(table(givenDF[, col.nr], givenDF$class))
  cat("---------------------------------------\n")

  cat("Correlation with Class AFTER:",
      cor(givenDF[, col.nr],
          ifelse(givenDF$class == 'Benign', 0, 1)), "\n")

  givenDF[, col.nr] <- as.integer(givenDF[, col.nr])

  return(givenDF)
}

# Normalization Function
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
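As a quick check (not part of the original lab output), normalize() can be exercised on a toy vector chosen here purely for illustration; the expected values follow directly from the min-max formula.

# Sanity check of normalize() on a toy vector (illustrative only)
normalize(c(1, 5, 10))   # expected: 0.0000000 0.4444444 1.0000000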

Import Data Set & Deal w/ Missing Data


breast <- read.table("breastCancer.txt", header = TRUE, stringsAsFactors = FALSE)

str(breast)

## 'data.frame': 699 obs. of 10 variables:


## $ clump_thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ cell_shape_uniformity : int 1 4 1 8 1 10 1 1 1 2 ...
## $ cell_size_uniformity : int 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ bare_nuclei : chr "1" "10" "2" "4" ...
## $ bland_chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class : int 0 0 0 0 0 1 0 0 0 0 ...

summary(breast)

##  clump_thickness  cell_shape_uniformity cell_size_uniformity marginal_adhesion
##  Min.   : 1.000   Min.   : 1.000        Min.   : 1.000       Min.   : 1.000
##  1st Qu.: 2.000   1st Qu.: 1.000        1st Qu.: 1.000       1st Qu.: 1.000
##  Median : 4.000   Median : 1.000        Median : 1.000       Median : 1.000
##  Mean   : 4.418   Mean   : 3.134        Mean   : 3.207       Mean   : 2.807
##  3rd Qu.: 6.000   3rd Qu.: 5.000        3rd Qu.: 5.000       3rd Qu.: 4.000
##  Max.   :10.000   Max.   :10.000        Max.   :10.000       Max.   :10.000
##  single_epithelial_cell_size bare_nuclei        bland_chromatin
##  Min.   : 1.000              Length:699         Min.   : 1.000
##  1st Qu.: 2.000              Class :character   1st Qu.: 2.000
##  Median : 2.000              Mode  :character   Median : 3.000
##  Mean   : 3.216                                 Mean   : 3.438
##  3rd Qu.: 4.000                                 3rd Qu.: 5.000
##  Max.   :10.000                                 Max.   :10.000
##  normal_nucleoli  mitoses          class
##  Min.   : 1.000   Min.   : 1.000   Min.   :0.0000
##  1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.:0.0000
##  Median : 1.000   Median : 1.000   Median :0.0000
##  Mean   : 2.867   Mean   : 1.589   Mean   :0.3448
##  3rd Qu.: 4.000   3rd Qu.: 1.000   3rd Qu.:1.0000
##  Max.   :10.000   Max.   :10.000   Max.   :1.0000

breast$bare_nuclei <- as.integer(breast$bare_nuclei) # NA's will be introduced

## Warning: NAs introduced by coercion

breast$class <- as.factor(breast$class)
levels(breast$class) <- c("Benign", "Malignant") # For clarity
str(breast)

## 'data.frame': 699 obs. of 10 variables:


## $ clump_thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ cell_shape_uniformity : int 1 4 1 8 1 10 1 1 1 2 ...
## $ cell_size_uniformity : int 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ bare_nuclei : int 1 10 2 4 1 10 10 1 1 1 ...
## $ bland_chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class : Factor w/ 2 levels "Benign","Malignant": 1 1 1 1 1 2 1 1 1 1 ...

summary(breast)

##  clump_thickness  cell_shape_uniformity cell_size_uniformity marginal_adhesion
##  Min.   : 1.000   Min.   : 1.000        Min.   : 1.000       Min.   : 1.000
##  1st Qu.: 2.000   1st Qu.: 1.000        1st Qu.: 1.000       1st Qu.: 1.000
##  Median : 4.000   Median : 1.000        Median : 1.000       Median : 1.000
##  Mean   : 4.418   Mean   : 3.134        Mean   : 3.207       Mean   : 2.807
##  3rd Qu.: 6.000   3rd Qu.: 5.000        3rd Qu.: 5.000       3rd Qu.: 4.000
##  Max.   :10.000   Max.   :10.000        Max.   :10.000       Max.   :10.000
##
##  single_epithelial_cell_size bare_nuclei      bland_chromatin  normal_nucleoli
##  Min.   : 1.000              Min.   : 1.000   Min.   : 1.000   Min.   : 1.000
##  1st Qu.: 2.000              1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000
##  Median : 2.000              Median : 1.000   Median : 3.000   Median : 1.000
##  Mean   : 3.216              Mean   : 3.545   Mean   : 3.438   Mean   : 2.867
##  3rd Qu.: 4.000              3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000
##  Max.   :10.000              Max.   :10.000   Max.   :10.000   Max.   :10.000
##                              NA's   :16
##  mitoses          class
##  Min.   : 1.000   Benign   :458
##  1st Qu.: 1.000   Malignant:241
##  Median : 1.000
##  Mean   : 1.589
##  3rd Qu.: 1.000
##  Max.   :10.000
##

# How many NA's are in the breast cancer data set
sum(is.na(breast))

## [1] 16

# Visualize the missing data
missmap(breast)
# Note: we are missing less than 1% of our data cells, but removing the 16
# affected observations would remove over 2% of our rows from the analysis.
# Therefore, to retain more data, imputation is performed via the "mice" package.
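Before imputing, a short base-R check (a sketch added here, not part of the original run) confirms the NA's are confined to bare_nuclei and shows how many rows complete-case deletion would cost:

# Per-column NA counts; only bare_nuclei should be non-zero
colSums(is.na(breast))
# Rows that would be lost if incomplete cases were simply dropped (should be 16)
nrow(breast) - nrow(na.omit(breast))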

# Use rf (the random forest method) to impute the missing values
mice_mod <- mice(breast[, c("bare_nuclei", "class")], method = 'rf')

##
## iter imp variable
## 1 1 bare_nuclei
## 1 2 bare_nuclei
## 1 3 bare_nuclei
## 1 4 bare_nuclei
## 1 5 bare_nuclei
## 2 1 bare_nuclei
## 2 2 bare_nuclei
## 2 3 bare_nuclei
## 2 4 bare_nuclei
## 2 5 bare_nuclei
## 3 1 bare_nuclei
## 3 2 bare_nuclei
## 3 3 bare_nuclei
## 3 4 bare_nuclei
## 3 5 bare_nuclei
## 4 1 bare_nuclei
## 4 2 bare_nuclei
## 4 3 bare_nuclei
## 4 4 bare_nuclei
## 4 5 bare_nuclei
## 5 1 bare_nuclei
## 5 2 bare_nuclei
## 5 3 bare_nuclei
## 5 4 bare_nuclei
## 5 5 bare_nuclei

mice_complete <- mice::complete(mice_mod)

# Transfer the predicted missing values into the main data set
breast$bare_nuclei <- mice_complete$bare_nuclei

# Check for missing values before continuing with a missmap plot:
missmap(breast)
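A one-line programmatic check (added as a sketch) complements the missmap plot:

# Confirm the imputation left no missing values
sum(is.na(breast))   # should now be 0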

Data Visualization
# Compare benign and malignant cases
ggplot(data = breast) +
  geom_bar(aes(x = class))


summary(breast)

##  clump_thickness  cell_shape_uniformity cell_size_uniformity marginal_adhesion
##  Min.   : 1.000   Min.   : 1.000        Min.   : 1.000       Min.   : 1.000
##  1st Qu.: 2.000   1st Qu.: 1.000        1st Qu.: 1.000       1st Qu.: 1.000
##  Median : 4.000   Median : 1.000        Median : 1.000       Median : 1.000
##  Mean   : 4.418   Mean   : 3.134        Mean   : 3.207       Mean   : 2.807
##  3rd Qu.: 6.000   3rd Qu.: 5.000        3rd Qu.: 5.000       3rd Qu.: 4.000
##  Max.   :10.000   Max.   :10.000        Max.   :10.000       Max.   :10.000
##  single_epithelial_cell_size bare_nuclei      bland_chromatin  normal_nucleoli
##  Min.   : 1.000              Min.   : 1.000   Min.   : 1.000   Min.   : 1.000
##  1st Qu.: 2.000              1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000
##  Median : 2.000              Median : 1.000   Median : 3.000   Median : 1.000
##  Mean   : 3.216              Mean   : 3.516   Mean   : 3.438   Mean   : 2.867
##  3rd Qu.: 4.000              3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000
##  Max.   :10.000              Max.   :10.000   Max.   :10.000   Max.   :10.000
##  mitoses          class
##  Min.   : 1.000   Benign   :458
##  1st Qu.: 1.000   Malignant:241
##  Median : 1.000
##  Mean   : 1.589
##  3rd Qu.: 1.000
##  Max.   :10.000

Frequency Distributions
breast %>%
gather(-class, key = "var", value = "value") %>%
ggplot(aes(x = value, y = ..count.. , colour = class)) +
geom_density() +
scale_color_manual(values=c("#008000", "#FF0000"))+
facet_wrap(~var, scales = "free", nrow = 2) +
theme_bw()

Outliers
# View all predictor variables together
ggplot(stack(breast[,1:9]), aes(x = ind, y = values)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust=1)) +
labs(title = "Boxplots of columns") +
labs(x = "", y = "Values") +
scale_y_continuous(breaks = seq(1, 10, by = 1))

# View all predictor variables together and separated by class
breast.m <- melt(breast, id.vars = "class")
ggplot(breast.m) +
  geom_boxplot(mapping = aes(x = variable, y = value, fill = class)) +
  facet_wrap(~class, scales = "free", nrow = 2)
# Note: most outliers exist in the benign class, but due to scaling it is
# possible that the predictor variables, whether benign or malignant, may not
# carry equal weight. To look further, we must examine the correlation of the
# individual variables.

Mitoses
table(breast$mitoses, breast$class)

##
## Benign Malignant
## 1 445 134
## 2 8 27
## 3 2 31
## 4 0 12
## 5 1 5
## 6 0 3
## 7 1 8
## 8 1 7
## 10 0 14

# Note: starting from a mitoses value of 2, the proportion of Benign and
# Malignant tumors remains close. Therefore we can group these values to make
# the information simpler, helping reduce the chance of overfitting and the
# influence of extreme outliers, thus making future models generalize better.
breast <- adjust.with.discretization(breast, c(1), c(9)) # user-defined function

## Correlation with Class BEFORE: 0.4231703


## ---------------------------------------
## - AFTER adjustment:
## COLUMN: mitoses
## Benign Malignant
## 1 445 134
## 2 13 107
## ---------------------------------------
## Correlation with Class AFTER: 0.5238247
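To see how cleanly the two grouped mitoses levels separate the classes, the row-wise proportions can also be inspected (an optional check, not part of the original output):

# Row-wise class proportions for the grouped mitoses values
prop.table(table(breast$mitoses, breast$class), margin = 1)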

# View all predictor variables together


ggplot(stack(breast[,1:9]), aes(x = ind, y = values)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust=1)) +
labs(title = "Boxplots of columns") +
labs(x = "", y = "Values") +
scale_y_continuous(breaks = seq(1, 10, by = 1))

# Note: the correlation of the mitoses column with the class improved
# considerably after the adjustment, confirming the validity of the change.

Single Epithelial Cell Size
table(breast$single_epithelial_cell_size, breast$class)

##
## Benign Malignant
## 1 46 1
## 2 363 23
## 3 29 43
## 4 7 41
## 5 5 34
## 6 2 39
## 7 3 9
## 8 2 19
## 9 0 2
## 10 1 30

# Note: starting from a value of 4, the proportion of Benign and Malignant
# tumors remains close, and values of 1 and 2 have roughly the same proportion.
# Therefore we can group these values to make the information simpler, helping
# reduce the chance of overfitting and the influence of extreme outliers, thus
# making future models generalize better.

breast <- adjust.with.discretization(breast, c(2,3), c(5)) # user-defined function

## Correlation with Class BEFORE: 0.6827845


## ---------------------------------------
## - AFTER adjustment:
## COLUMN: single_epithelial_cell_size
## Benign Malignant
## 2 409 24
## 3 29 43
## 4 20 174
## ---------------------------------------
## Correlation with Class AFTER: 0.7920027

# View all predictor variables together


ggplot(stack(breast[,1:9]), aes(x = ind, y = values)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust=1)) +
labs(title = "Boxplots of columns") +
labs(x = "", y = "Values") +
scale_y_continuous(breaks = seq(1, 10, by = 1))
Normal Nucleoli
table(breast$normal_nucleoli, breast$class)

##
## Benign Malignant
## 1 402 41
## 2 30 6
## 3 12 32
## 4 1 17
## 5 2 17
## 6 4 18
## 7 2 14
## 8 4 20
## 9 1 15
## 10 0 61

# Note: starting from normal_nucleoli values of 4, the proportion of Benign and
# Malignant tumors remains close, and values of 1 and 2 have roughly the same
# proportion. Therefore we can group these values to make the information
# simpler, helping reduce the chance of overfitting and the influence of extreme
# outliers, thus making future models generalize better.

breast <- adjust.with.discretization(breast, c(2,3), c(8)) # user-defined function

## Correlation with Class BEFORE: 0.7122436
## ---------------------------------------
## - AFTER adjustment:
## COLUMN: normal_nucleoli
## Benign Malignant
## 2 432 47
## 3 12 32
## 4 14 162
## ---------------------------------------
## Correlation with Class AFTER: 0.7632114

# View all predictor variables together


ggplot(stack(breast[,1:9]), aes(x = ind, y = values)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust=1)) +
labs(title = "Boxplots of columns") +
labs(x = "", y = "Values") +
scale_y_continuous(breaks = seq(1, 10, by = 1))

Correlations
# remove the class so we can coerce df to a matrix.
# cor() function takes a matrix
df <- breast[,-10]
correlations <- cor(df, method = "pearson")
corrplot(correlations, number.cex = .9, method = "number",
order = "FPC", type = "upper", tl.cex = 0.8, tl.col = "black")
# cell_shape_uniformity vs. cell_size_uniformity has cor = 0.91
ggplot(data = breast, aes(x = cell_shape_uniformity,
                          y = cell_size_uniformity)) +
  geom_point() +
  geom_smooth(method = "lm")

## `geom_smooth()` using formula 'y ~ x'
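Given the 0.91 correlation between cell shape and cell size uniformity, caret's findCorrelation() offers an optional diagnostic for flagging redundant predictors; the 0.90 cutoff below is an arbitrary illustrative choice, and the analysis itself keeps all predictors.

# Flag predictors whose pairwise correlation exceeds the chosen cutoff (diagnostic only)
findCorrelation(correlations, cutoff = 0.90, names = TRUE)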


ggpairs(breast)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Normalize Data
# Normalize the data set
breast.n <- as.data.frame(lapply(breast[,-10], normalize))
breast.n <- cbind(breast.n, class = breast$class)
ggpairs(breast.n)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
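A quick check (added here as a sketch) confirms every predictor now spans 0 to 1 after normalization:

# Min and max of each normalized predictor; every column should read 0 and 1
sapply(breast.n[, -10], range)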
Create Training & Testing Sets
# randomly split the data into a 75% training set and a 25% test set
set.seed(1234)

# randomly extract the row numbers in the breast dataset that will be included
# in the training set
spt <- createDataPartition(y = breast$class, p = 0.75, list = FALSE)

# subset the breast data set to include only the rows found in spt - 75%
train.breast <- breast[spt, ]

# subset the breast data set to include only the rows NOT found in spt - 25%
test.breast <- breast[-spt, ]

# what proportion of Benign/Malignant do we have in our full breast class variable
prop.table(table(breast$class)) * 100

##
## Benign Malignant
## 65.52217 34.47783

# what proportion of Benign/Malignant do we have in our training data set class
# variable
prop.table(table(train.breast$class)) * 100

##
## Benign Malignant
## 65.52381 34.47619

# what proportion of Benign/Malignant do we have in our testing data set class variable
prop.table(table(test.breast$class)) * 100

##
## Benign Malignant
## 65.51724 34.48276

Training The Model


# Note: the formula y ~ . is not accepted by the neuralnet() function. You need
# to write the formula first and then pass it as an argument to the fitting
# function. The argument hidden accepts a vector with the number of neurons for
# each hidden layer. The argument linear.output specifies whether we want to do
# regression (linear.output = TRUE) or classification (linear.output = FALSE).

n <- names(train.breast)
f <- as.formula(paste("class ~", paste(n[!n %in% "class"], collapse = " + ")))
f

## class ~ clump_thickness + cell_shape_uniformity + cell_size_uniformity +
##     marginal_adhesion + single_epithelial_cell_size + bare_nuclei +
##     bland_chromatin + normal_nucleoli + mitoses

# Train the neural network


nn <- neuralnet(f, data = train.breast, hidden = c(7), linear.output = FALSE,
                act.fct = "logistic", lifesign = "minimal")

## hidden: 7    thresh: 0.01    rep: 1/1    steps: 1318    error: 3.00316    time: 0.47 secs

Visualize the Model


# visualize the neural network
plot(nn)
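Besides the network plot, neuralnet stores the convergence diagnostics (error, threshold reached, step count) and the fitted weights in result.matrix; inspecting the first rows is an optional extra step.

# First rows of the training diagnostics stored by neuralnet
head(nn$result.matrix)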

# Test accuracy of model


# Compute predictions with Training set
pr.nn <- predict(nn, train.breast[, -10])

# Accuracy (training set)


x <- table(train.breast$class, apply(pr.nn, 1, which.max))
(x[1,1]+x[2,2]) / sum(x)

## [1] 0.9942857

# Compute predictions with Test set
pr.nn <- predict(nn, test.breast[, -10])

# Accuracy (Test set)


x <- table(test.breast$class, apply(pr.nn, 1, which.max))
(x[1,1]+x[2,2]) / sum(x)

## [1] 0.9425287
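For fuller test-set metrics (sensitivity, specificity, kappa), caret's confusionMatrix() can be applied to the same predictions. The sketch below assumes, as the accuracy tables above do, that the network's first output column corresponds to the "Benign" level.

# Convert the column index of the maximal output to a factor of class labels
pred.class <- factor(ifelse(apply(pr.nn, 1, which.max) == 1, "Benign", "Malignant"),
                     levels = levels(test.breast$class))
# Full confusion matrix with accuracy, sensitivity and specificity
confusionMatrix(pred.class, test.breast$class)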

Cross Validate the Classifier


# Validate the classifier with 10 rounds of repeated random train/test splits
# (75% training, 25% testing), reporting the mean hold-out accuracy
# Set seed for reproducibility purposes
set.seed(1234)

# Number of validation rounds
k <- 10

# Results from the validation rounds
outs <- NULL

# Train/test split proportion
proportion <- 0.75

for(i in 1:k)
{
  spt <- createDataPartition(y = breast$class, p = proportion, list = FALSE)
  train_cv <- breast[spt, ]
  test_cv <- breast[-spt, ]
  nn_cv <- neuralnet(f, data = train_cv, hidden = c(7), act.fct = "logistic",
                     linear.output = FALSE)

  # Compute predictions
  pr.nn <- predict(nn_cv, test_cv[, -10])
  x <- table(test_cv$class, apply(pr.nn, 1, which.max))
  outs[i] <- (x[1,1] + x[2,2]) / sum(x)
}
mean(outs)

## [1] 0.9505747
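The spread of the ten hold-out accuracies is worth reporting alongside the mean; a minimal sketch:

# Variability of the hold-out accuracies across the 10 random splits
sd(outs)
boxplot(outs, horizontal = TRUE, main = "Hold-out accuracy across 10 random splits")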
