
library(corrplot)
library(caret)
library(ggplot2)
pima<-read.csv("pima-indians-
diabetes.csv",col.names=c("Pregnant","Plasma_Glucose","Dias_BP","Triceps_Skin","Ser
um_Insulin","BMI","DPF","Age","Diabetes"))
View(pima)
str(pima)

is.na(pima)                               # logical matrix of missing-value flags (verbose)
sapply(pima, function(x) sum(is.na(x)))   # NA count per column
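Note: a known quirk of this dataset is that a 0 in several physiological columns means "not measured" rather than a true zero, so the NA counts above can look deceptively clean. A minimal sketch of how to expose those hidden missing values (done on a copy, pima_clean, so the rest of the script is unaffected):

zero_as_na <- c("Plasma_Glucose", "Dias_BP", "Triceps_Skin", "Serum_Insulin", "BMI")
pima_clean <- pima
# recode physiologically impossible zeros as NA, then re-count per column
pima_clean[zero_as_na] <- lapply(pima_clean[zero_as_na], function(x) replace(x, x == 0, NA))
sapply(pima_clean, function(x) sum(is.na(x)))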

-----------------------
Data graph analysis
-----------------------
pima$Diabetes <- factor(pima$Diabetes)   # must be a factor, otherwise it cannot be mapped to the fill/colour aesthetics below

ggplot(pima, aes(Diabetes, fill = Diabetes)) + geom_bar() +
ggtitle("Distribution of Outcome variable")

or

ggplot() + geom_bar(aes(y = ..count.., x = Diabetes, fill = Diabetes), data = pima) +
ggtitle("Distribution of Outcome variable")
-------

library(gridExtra)
p1 <- ggplot(pima, aes(x = Diabetes, y = Pregnant,fill = Diabetes)) +
geom_boxplot() +
theme(legend.position = "bottom") +
ggtitle("Number of pregnancies Vs Diabetes")

p2 <- ggplot(pima, aes(x = Pregnant, fill = Diabetes)) +
geom_bar(position = "dodge") +
scale_x_continuous(limits = c(0,16)) +
theme(legend.position = "bottom") +
labs(title = "Pregnancies Vs Diabetes")

gridExtra::grid.arrange(p1, p2, ncol = 2)


-------

p2 <- ggplot(pima, aes(x = Plasma_Glucose, color = Diabetes, fill = Diabetes)) +
geom_density(alpha = 0.8) +
theme(legend.position = "bottom") +
labs(x = "Glucose", y = "Density", title = "Density plot of glucose")

p1 <- ggplot(pima, aes(x = Diabetes, y = Plasma_Glucose, fill = Diabetes)) +
geom_boxplot() +
theme(legend.position = "bottom") +
ggtitle("Variation of glucose in women Vs Diabetes")

gridExtra::grid.arrange(p1, p2, ncol = 2)


------------------------------------------------

p2 <- ggplot(pima, aes(Age, fill = Diabetes)) +
geom_histogram(binwidth = 5) +
theme(legend.position = "bottom") +
ggtitle("Variation of Age of women Vs Diabetes")

p1 <- ggplot(pima, aes(x = Diabetes, y = Age, fill = Diabetes)) +
geom_boxplot() +
theme(legend.position = "bottom") +
ggtitle("Variation of Age of women Vs Diabetes")

gridExtra::grid.arrange(p1, p2, ncol = 2)


-------------------------------------------
p1 <- ggplot(pima, aes(x = Age, y = Pregnant)) +
geom_point(aes(color=Diabetes)) +
theme(legend.position = "bottom") +
ggtitle("Relationship of Pregnancies with Age Vs Diabetes")

p2 <- ggplot(pima,aes(x=Serum_Insulin,y=Plasma_Glucose))+
geom_point(aes(color=Diabetes))+
theme(legend.position = "bottom") +
ggtitle("Relationship of Insulin with Glucose Vs Diabetes")

gridExtra::grid.arrange(p1, p2, ncol = 2)


-------------------------------------------------

p1 <- ggplot(pima,aes(x=BMI,y=Dias_BP))+
geom_point(aes(color=Diabetes))+
theme(legend.position = "bottom") +
ggtitle("Relationship of BMI with BP Vs Diabetes")

p2 <- ggplot(pima,aes(x=BMI,y=Triceps_Skin))+
geom_point(aes(color=Diabetes))+
theme(legend.position = "bottom") +
ggtitle("Relationship of BMI with Skin Thickness Vs Diabetes")

gridExtra::grid.arrange(p1, p2, ncol = 2)


------------------------------------------------------------
pairs(pima, panel = panel.smooth)
corrplot(cor(pima[, -9]), type = "lower", method = "number")   # column 9 (Diabetes) is dropped: cor() needs numeric input
or
corrplot(cor(pima[, -9]), method = "number")
or
corrplot(cor(pima[, -9]))
-------------------
Model 1: Logistic regression
-----------------------
# Preparing the DataSet
library(caret)
library(e1071)
set.seed(123)
pima$Diabetes <- as.factor(pima$Diabetes)
n <- nrow(pima)
train <- sample(n, trunc(0.70 * n))   # row indices for a 70/30 train-test split
pima_training <- pima[train, ]
pima_testing <- pima[-train, ]
nrow(pima_training)
nrow(pima_testing)
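An alternative split worth knowing: caret is already loaded, and its createDataPartition() samples within each outcome class, so training and test sets keep the same 0/1 proportions (plain sample() above gives no such guarantee). A sketch using the same variable names, so run one split or the other:

set.seed(123)
train_idx <- createDataPartition(pima$Diabetes, p = 0.70, list = FALSE)
pima_training <- pima[train_idx, ]
pima_testing  <- pima[-train_idx, ]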
# Training The Model (Logistic regression)
glm_fm1 <- glm(Diabetes ~., data = pima_training, family = binomial)
summary(glm_fm1)

glm_fm2 <- update(glm_fm1, ~ . - Triceps_Skin - Serum_Insulin - Age)   # drop the predictors that were not significant in glm_fm1
summary(glm_fm2)
----
or (if you would rather write the reduced formula directly than use update()):
---
glm_fm2 <- glm(Diabetes ~ Pregnant + Plasma_Glucose + Dias_BP + BMI + DPF,
               family = binomial, data = pima_training)

# Testing the Model

glm_probs <- predict(glm_fm2, newdata = pima_testing, type = "response")

glm_pred <- ifelse(glm_probs > 0.5, 1, 0)

table(Predicted = glm_pred, Actual = pima_testing$Diabetes)

mean(glm_pred != pima_testing$Diabetes) * 100

# both arguments must be factors for confusionMatrix(), and glm_pred is numeric
confusionMatrix(factor(glm_pred), factor(pima_testing$Diabetes))
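The 0.5 cut-off used above is a choice, not a given. A threshold-free view of the same fitted probabilities, sketched with the pROC package (an extra dependency not used elsewhere in this script):

library(pROC)
roc_obj <- roc(pima_testing$Diabetes, glm_probs)   # actual classes vs predicted probabilities
auc(roc_obj)                                       # area under the ROC curve
plot(roc_obj, main = "ROC - logistic regression")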

-------------------------------------------------------------------------
Model 2: Decision tree
-----------------------------
library(party)   # provides ctree()

treemod <- ctree(Diabetes ~ ., data = pima_training)


summary(treemod)
plot(treemod)

train_predict <- predict(treemod, pima_training, type = "response")
table(train_predict, pima_training$Diabetes)
mean(train_predict != pima_training$Diabetes)* 100

test_predict <- predict(treemod, newdata = pima_testing, type = "response")
table(test_predict, pima_testing$Diabetes)
mean(test_predict != pima_testing$Diabetes) * 100

confusionMatrix(test_predict, pima_testing$Diabetes)
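If the plotted tree is too bushy to read, party can cap its depth. A sketch (maxdepth is a real ctree_control() argument; the value 3 is only an example):

treemod_small <- ctree(Diabetes ~ ., data = pima_training,
                       controls = ctree_control(maxdepth = 3))
plot(treemod_small)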
---------------------------

Model 3: Random forest
-----------------------

set.seed(123)
library(randomForest)
rf_pima <- randomForest(Diabetes ~ ., data = pima_training, mtry = 8, ntree = 50,
                        importance = TRUE)
rf_probs <- predict(rf_pima, newdata = pima_testing)
mean(rf_probs != pima_testing$Diabetes) * 100
confusionMatrix(rf_probs, pima_testing$Diabetes)
or
confusionMatrix(data = factor(rf_probs),reference = factor(pima_testing$Diabetes))
importance(rf_pima)

par(mfrow = c(1, 2))
varImpPlot(rf_pima, type = 2, main = "Variable Importance", col = "black")
plot(rf_pima, main = "Error vs no. of trees grown")
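mtry = 8 considers every predictor at each split, which turns the forest into plain bagging. randomForest ships tuneRF() to search for an mtry with lower out-of-bag error; a sketch (the stepFactor and improve values are illustrative):

set.seed(123)
tuneRF(x = pima_training[, -9], y = pima_training$Diabetes,   # column 9 is Diabetes
       ntreeTry = 50, stepFactor = 1.5, improve = 0.01)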

-----------------------------------
Model 4: SVM
---------------------------------

pima$Diabetes <- as.factor(pima$Diabetes)

library(e1071)
tuned <- tune.svm(Diabetes ~ ., data = pima_training, gamma = seq(0.01, 0.1, by = 0.01),
                  cost = seq(0.1, 1, by = 0.1))
summary(tuned) # to show the results
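Rather than retyping the winning gamma and cost into svm() below, they can be pulled straight from the tune object:

tuned$best.parameters          # gamma/cost pair with the lowest cross-validation error
best_svm <- tuned$best.model   # an SVM already refitted with those parameters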
svm_model <- svm(Diabetes ~ ., data = pima_training, kernel = "radial", gamma = 0.01,
                 cost = 0.1)
summary(svm_model)
svm_pred <- predict(svm_model,pima_testing)
mean(svm_pred != pima_testing$Diabetes) * 100
confusionMatrix(svm_pred, pima_testing$Diabetes)

---------------------
Model 5: Naive Bayes
--------------------
library(e1071)
nv_model <- naiveBayes(Diabetes ~ ., data = pima_training)   # fit on the training split only; training on the full pima frame would leak the test rows
summary(nv_model)
nv_pred <- predict(nv_model, pima_testing)
mean(nv_pred != pima_testing$Diabetes) * 100
confusionMatrix(nv_pred, pima_testing$Diabetes)
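naiveBayes can also return per-class posterior probabilities instead of hard labels, useful if you want a threshold other than the default:

nv_probs <- predict(nv_model, pima_testing, type = "raw")   # matrix of per-class posteriors
head(nv_probs)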
------------------
Model 6: Neural network

library(neuralnet)

NN = neuralnet(Diabetes ~.,data=pima_training)

or

NN = neuralnet(Diabetes ~ ., data = pima_training, hidden = 3, act.fct = "logistic",
               linear.output = FALSE)   # linear.output = FALSE for classification (TRUE is meant for regression)

plot(NN,rep="best")

prediction <- neuralnet::compute(NN, pima_testing[,1:8])

cm <- table(pima_testing$Diabetes, round(prediction$net.result[, 1]))   # actual class vs thresholded network output

library(psych)
a1 <- tr(cm) / sum(cm)   # accuracy = trace (correct predictions) / total cases; tr() is from psych
----------
NN = neuralnet(Diabetes ~.,data=pima_training)

Predict = compute(NN, pima_testing[, 1:8])   # pass only the 8 predictor columns
probNN <- Predict$net.result

# with a factor outcome, recent neuralnet versions return one probability column per
# class; keep the column for class "1" before applying the 0.5 threshold
if (is.matrix(probNN) && ncol(probNN) > 1) probNN <- probNN[, 2]
predNN <- ifelse(probNN > 0.5, 1, 0)
predNN

mean(predNN != pima_testing$Diabetes) * 100

table(Predicted = predNN, Actual = pima_testing$Diabetes)
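neuralnet is sensitive to input scale, and the raw columns here range from single digits (DPF) to hundreds (Serum_Insulin). A sketch of the usual remedy, min-max scaling the predictors before training (it reuses the train index from the sample() split above; rng, pima_scaled and NN_s are illustrative names):

rng <- function(x) (x - min(x)) / (max(x) - min(x))    # min-max scale to [0, 1]
pima_scaled <- as.data.frame(lapply(pima[, 1:8], rng))
pima_scaled$Diabetes <- pima$Diabetes
NN_s <- neuralnet(Diabetes ~ ., data = pima_scaled[train, ], hidden = 3,
                  linear.output = FALSE)
pred_s <- neuralnet::compute(NN_s, pima_scaled[-train, 1:8])$net.result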

-----------------------
Comparison of model accuracy
-----------------------
library(ggplot2)
accuracy <- data.frame(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest",
            "Support Vector Machine (SVM)", "Naive Bayes"),
  Accuracy = c(mean(glm_pred == pima_testing$Diabetes),   # accuracy = share of correct test predictions
               mean(test_predict == pima_testing$Diabetes),
               mean(rf_probs == pima_testing$Diabetes),
               mean(svm_pred == pima_testing$Diabetes),
               mean(nv_pred == pima_testing$Diabetes)))
ggplot(accuracy, aes(x = Model, y = Accuracy)) + geom_bar(stat = "identity") + theme_bw() +
ggtitle("Comparison of Model Accuracy")
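The neural network can join the same chart; a sketch appending it with predNN from the previous section:

accuracy <- rbind(accuracy,
                  data.frame(Model = "Neural Network",
                             Accuracy = mean(predNN == pima_testing$Diabetes)))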

-----------------------
