Professional Documents
Culture Documents
BMEN-589 Lab #12: Naive Bayes: Useful Libraries
BMEN-589 Lab #12: Naive Bayes: Useful Libraries
Shrey Patel
11/13/2020
Useful Libraries
library(tidyverse)
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(caret)
##
## Attaching package: 'caret'
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(Amelia)
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
##
## Attaching package: 'mice'
library(GGally)
library(rpart)
library(randomForest)
## randomForest 4.6-14
##
## Attaching package: 'randomForest'
library(e1071)
Import Data
# Load the diabetes data set from the CSV file (path is relative to the
# working directory)
diabetes <- utils::read.csv(file = "diabetes.csv")
# Show a compact summary of the data frame's columns and their types
utils::str(object = diabetes)
Clean Data
# Recode the 0/1 Outcome column as a categorical variable labelled "No"/"Yes"
diabetes[["Outcome"]] <- factor(
  diabetes[["Outcome"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
# Re-inspect the structure to confirm the conversion
str(diabetes)
## [1] 652
##
## iter imp variable
## 1 1 Glucose BloodPressure SkinThickness Insulin BMI
## 1 2 Glucose BloodPressure SkinThickness Insulin BMI
## 1 3 Glucose BloodPressure SkinThickness Insulin BMI
## 1 4 Glucose BloodPressure SkinThickness Insulin BMI
## 1 5 Glucose BloodPressure SkinThickness Insulin BMI
## 2 1 Glucose BloodPressure SkinThickness Insulin BMI
## 2 2 Glucose BloodPressure SkinThickness Insulin BMI
## 2 3 Glucose BloodPressure SkinThickness Insulin BMI
## 2 4 Glucose BloodPressure SkinThickness Insulin BMI
## 2 5 Glucose BloodPressure SkinThickness Insulin BMI
## 3 1 Glucose BloodPressure SkinThickness Insulin BMI
## 3 2 Glucose BloodPressure SkinThickness Insulin BMI
## 3 3 Glucose BloodPressure SkinThickness Insulin BMI
## 3 4 Glucose BloodPressure SkinThickness Insulin BMI
## 3 5 Glucose BloodPressure SkinThickness Insulin BMI
## 4 1 Glucose BloodPressure SkinThickness Insulin BMI
## 4 2 Glucose BloodPressure SkinThickness Insulin BMI
## 4 3 Glucose BloodPressure SkinThickness Insulin BMI
## 4 4 Glucose BloodPressure SkinThickness Insulin BMI
## 4 5 Glucose BloodPressure SkinThickness Insulin BMI
## 5 1 Glucose BloodPressure SkinThickness Insulin BMI
## 5 2 Glucose BloodPressure SkinThickness Insulin BMI
## 5 3 Glucose BloodPressure SkinThickness Insulin BMI
## 5 4 Glucose BloodPressure SkinThickness Insulin BMI
## 5 5 Glucose BloodPressure SkinThickness Insulin BMI
# Copy the imputed columns from mice_complete back into the main data set,
# overwriting the columns of the same name
diabetes[["Glucose"]] <- mice_complete[["Glucose"]]
diabetes[["BloodPressure"]] <- mice_complete[["BloodPressure"]]
diabetes[["SkinThickness"]] <- mice_complete[["SkinThickness"]]
diabetes[["Insulin"]] <- mice_complete[["Insulin"]]
diabetes[["BMI"]] <- mice_complete[["BMI"]]
Data Visualization
# Frequency Distributions
# One count-scaled density curve per Outcome class for every predictor, each
# predictor drawn in its own free-scale panel.
diabetes %>%
  # Reshape to long format: one row per (observation, predictor) pair.
  # pivot_longer() replaces the superseded gather().
  pivot_longer(-Outcome, names_to = "var", values_to = "value") %>%
  # after_stat(count) replaces the deprecated ..count.. notation
  ggplot(aes(x = value, y = after_stat(count), colour = Outcome)) +
  geom_density() +
  # Colours are matched to the Outcome levels in order (green, then red)
  scale_color_manual(values = c("#008000", "#FF0000")) +
  facet_wrap(~var, scales = "free", nrow = 2) +
  theme_bw()
# Subset the diabetes data set to include only the rows found in spt - 70%
train.diabetes <- diabetes[spt, ]
# Subset the diabetes data set to include only the rows NOT found in spt - 30%
# (negative indexing drops the rows listed in spt)
test.diabetes <- diabetes[-spt, ]
# Look for effectiveness of training and testing sets with the prop.table()
# function.
# What proportion of yes/no do we have in our full diabetes Outcome variable?
# (multiplied by 100 to report percentages)
prop.table(table(diabetes$Outcome)) * 100
##
## No Yes
## 65.10417 34.89583
# What proportion of yes/no do we have in our training data set Outcome
# variable? (multiplied by 100 to report percentages)
prop.table(table(train.diabetes$Outcome)) * 100
##
## No Yes
## 67.28625 32.71375
##
## No Yes
## 60 40
# Note: to create uniform training and testing sets you need the same
# proportions of yes/no in the full, training, and testing sets
# Subset the diabetes data set to include only the rows found in spt - 70%
train.diabetes <- diabetes[spt, ]
# Subset the diabetes data set to include only the rows NOT found in spt - 30%
# (negative indexing drops the rows listed in spt)
test.diabetes <- diabetes[-spt, ]
##
## No Yes
## 65.10417 34.89583
##
## No Yes
## 65.10417 34.89583
# What proportion of yes/no do we have in our testing data set Outcome
# variable? (multiplied by 100 to report percentages)
prop.table(table(test.diabetes$Outcome)) * 100
##
## No Yes
## 65.10417 34.89583
# Create objects x, which holds the predictor variables, and y, which holds
# the target variable.
# The target is dropped by name rather than by position (column 9) so the
# split stays correct if columns are added or reordered; drop = FALSE keeps x
# a data frame.
x <- train.diabetes[, setdiff(names(train.diabetes), "Outcome"), drop = FALSE]
y <- train.diabetes$Outcome
# Print the fitted model object
print(model)
## Naive Bayes
##
## 576 samples
## 8 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 518, 518, 519, 518, 519, 519, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.7535693 0.4509402
## TRUE 0.7552934 0.4614451
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
## = 1.
Model Evaluation
# Model Evaluation
# Predict the Outcome class for every row of the testing set
model.predict <- predict(model, newdata = test.diabetes)
# Get the confusion matrix to see accuracy value and other parameter values.
# positive = "Yes" makes sensitivity, specificity, etc. refer to the "Yes"
# class; with two levels caret otherwise treats the first level ("No") as the
# positive result.
confusionMatrix(
  data = model.predict,
  reference = test.diabetes$Outcome,
  positive = "Yes"
)