Professional Documents
Culture Documents
Bmen-589 Lab 13 Submission
Bmen-589 Lab 13 Submission
Shrey Patel
11/22/2020
Useful Libraries
library(tidyverse)
## ── Conflicts ──────────────────────────────────────────
tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(caret)
##
## Attaching package: 'caret'
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(Amelia)
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
##
## Attaching package: 'mice'
library(GGally)
library(rpart)
library(randomForest)
## randomForest 4.6-14
##
## Attaching package: 'randomForest'
library(e1071)
library(reshape2)
##
## Attaching package: 'reshape2'
library(corrplot)
library(neuralnet)
##
## Attaching package: 'neuralnet'
return(givenDF)
}
# Normalization Function
#
# Rescale a numeric vector to the [0, 1] interval (min-max scaling).
#
# x:     numeric vector.
# na.rm: if TRUE, ignore NA values when finding the minimum and maximum;
#        NA entries stay NA in the result. Defaults to FALSE, which keeps
#        the previous behaviour (any NA makes the whole result NA).
#
# Returns a numeric vector the same length as x. A vector whose values are
# all identical has zero range; it is mapped to 0 instead of NaN (0 / 0).
normalize <- function(x, na.rm = FALSE) {
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo
  # Guard against division by zero for constant input. `span` is NA when
  # x contains NA and na.rm is FALSE, so test for that first.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
# Show the structure of the breast data set (column names and types)
str(breast)
# Per-column summary statistics
summary(breast)
# NOTE(review): summary() is called a second time with no visible change to
# `breast` in between -- presumably an intermediate step is missing; confirm.
summary(breast)
## [1] 16
##
## iter imp variable
## 1 1 bare_nuclei
## 1 2 bare_nuclei
## 1 3 bare_nuclei
## 1 4 bare_nuclei
## 1 5 bare_nuclei
## 2 1 bare_nuclei
## 2 2 bare_nuclei
## 2 3 bare_nuclei
## 2 4 bare_nuclei
## 2 5 bare_nuclei
## 3 1 bare_nuclei
## 3 2 bare_nuclei
## 3 3 bare_nuclei
## 3 4 bare_nuclei
## 3 5 bare_nuclei
## 4 1 bare_nuclei
## 4 2 bare_nuclei
## 4 3 bare_nuclei
## 4 4 bare_nuclei
## 4 5 bare_nuclei
## 5 1 bare_nuclei
## 5 2 bare_nuclei
## 5 3 bare_nuclei
## 5 4 bare_nuclei
## 5 5 bare_nuclei
# Overwrite the bare_nuclei column of the main data set with the completed
# (imputed) version held in mice_complete
breast[["bare_nuclei"]] <- mice_complete[["bare_nuclei"]]
Data Visualization
# Compare benign and malignant cases: bar chart of the row count per class.
# The column is named bare inside aes() so that it is resolved through the
# `data` argument; writing breast$class there bypasses `data` and is
# discouraged by ggplot2.
ggplot(data = breast) +
  geom_bar(aes(x = class))
Frequency Distributions
# Count-scaled density of every non-class column, one facet per column and
# one curve per class.
# pivot_longer() replaces the superseded gather(); after_stat(count)
# replaces the deprecated ..count.. notation.
breast %>%
  pivot_longer(-class, names_to = "var", values_to = "value") %>%
  ggplot(aes(x = value, y = after_stat(count), colour = class)) +
  geom_density() +
  # Colours are assigned to the class levels in order (green, then red)
  scale_color_manual(values = c("#008000", "#FF0000")) +
  facet_wrap(~var, scales = "free", nrow = 2) +
  theme_bw()
Outliers
# Boxplots of the first nine columns (the predictors) side by side.
# stack() reshapes them into two columns: `values` and `ind` (column name).
ggplot(stack(breast[, 1:9]), aes(x = ind, y = values)) +
  geom_boxplot() +
  scale_y_continuous(breaks = 1:10) +
  labs(title = "Boxplots of columns", x = "", y = "Values") +
  # Tilt the column names so they do not overlap
  theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1))
Mitoses
# Cross-tabulate each mitoses value (rows) against the class (columns)
table(breast$mitoses, breast$class)
##
## Benign Malignant
## 1 445 134
## 2 8 27
## 3 2 31
## 4 0 12
## 5 1 5
## 6 0 3
## 7 1 8
## 8 1 7
## 10 0 14
# Note: the correlation of the mitoses column with the class improved
# considerably after adjustment, therefore confirming the validity of the
# changes made.
Single Epithelial Cell Size
# Cross-tabulate each single_epithelial_cell_size value (rows) against the
# class (columns)
table(breast$single_epithelial_cell_size, breast$class)
##
## Benign Malignant
## 1 46 1
## 2 363 23
## 3 29 43
## 4 7 41
## 5 5 34
## 6 2 39
## 7 3 9
## 8 2 19
## 9 0 2
## 10 1 30
##
## Benign Malignant
## 1 402 41
## 2 30 6
## 3 12 32
## 4 1 17
## 5 2 17
## 6 4 18
## 7 2 14
## 8 4 20
## 9 1 15
## 10 0 61
Correlations
# Pairwise Pearson correlations between the predictor columns.
# cor() needs all-numeric input, so the class label is dropped first. It is
# removed by name rather than by position so the code keeps working if the
# column order changes.
df <- breast[, names(breast) != "class"]
correlations <- cor(df, method = "pearson")
# Upper-triangle plot showing the correlation coefficients as numbers
corrplot(correlations,
  method = "number", type = "upper", order = "FPC",
  number.cex = 0.9, tl.cex = 0.8, tl.col = "black"
)
# cell shape uniformity vs. cell size uniformity has cor = 0.91
# Scatter plot with a fitted linear trend line. Columns are named bare inside
# aes() so they are resolved through `data`; using breast$... there bypasses
# `data` and is discouraged by ggplot2.
ggplot(
  data = breast,
  aes(x = cell_shape_uniformity, y = cell_size_uniformity)
) +
  geom_point() +
  geom_smooth(method = "lm")
# Training set: the rows of breast whose indices are listed in spt (70%).
# NOTE(review): spt is created outside this block; the 70/30 ratio is taken
# from the original comment -- confirm against the partition call.
train.breast <- breast[spt, ]
# Testing set: every row of breast NOT listed in spt (30%).
test.breast <- breast[-spt, ]
##
## Benign Malignant
## 65.52217 34.47783
# Percentage of rows belonging to each class in the testing data set
100 * prop.table(table(test.breast$class))
##
## Benign Malignant
## 65.51724 34.48276
# Build the model formula `class ~ <every other column>` from the training
# set's column names. reformulate() assembles the formula directly, which
# avoids pasting a string together (the previous separator literal was split
# across two lines and so contained a line break).
n <- names(train.breast)
f <- reformulate(n[n != "class"], response = "class")
f
## [1] 0.9942857
# Predict on the test set, passing every column except the 10th as new data
pr.nn <- predict(nn, newdata = test.breast[, -10])
## [1] 0.9425287
# Results from cv
# Repeated random hold-out validation: on each of k repetitions, draw a new
# 75/25 split, fit a network with one hidden layer of 7 units on the 75%,
# and record its accuracy on the remaining 25%.
# NOTE(review): `k` (number of repetitions) and `f` (model formula) are
# defined outside this block; no seed is set here, so results vary per run.
outs <- numeric(k) # preallocated: one accuracy value per repetition
for (i in seq_len(k)) { # seq_len() runs zero times when k is 0
  spt <- createDataPartition(y = breast$class, p = 0.75, list = FALSE)
  train_cv <- breast[spt, ]
  test_cv <- breast[-spt, ]
  nn_cv <- neuralnet(f,
    data = train_cv, hidden = c(7), act.fct = "logistic",
    linear.output = FALSE
  )
  # Compute predictions
  pr.nn <- predict(nn_cv, test_cv[, -10])
  # Predicted class = index of the largest output unit. Fixing the factor
  # levels keeps one table column per output unit even when a class is never
  # predicted, so x[2, 2] below cannot go out of bounds.
  x <- table(
    test_cv$class,
    factor(apply(pr.nn, 1, which.max), levels = seq_len(ncol(pr.nn)))
  )
  # Accuracy = correctly classified (diagonal) / all test rows
  outs[i] <- (x[1, 1] + x[2, 2]) / sum(x)
}
# Mean accuracy over the k repetitions
mean(outs)
## [1] 0.9505747