# You are on page 1 of 3

# Author: Shovan Chowdhury

# Reading The data set


# Load the RG case data; the first row of the CSV holds the column names.
d <- read.csv(
  "D:/business analytics/eMDP_BA/EDA_session/RG case/RG.csv",
  header = TRUE
)

# NOTE(review): attach() puts the columns of `d` on the search path so that
# they can be referenced by bare name (pageviews, treatment, ...). It is kept
# because the remaining sections of this script use the columns that way.
attach(d)
names(d)
## Check the type of all the columns of the data frame
## (the original inspected `appcab`, which is never defined in this script)
str(d)

## --------------- Data Cleaning and Data Validation ------------------- ##

## Look for duplicate values in the business_id column, as it is the
## primary key.
sum(duplicated(d)) # number of fully duplicated rows in the data frame
sum(duplicated(business_id)) # repeated key values (author's note: none found)

## Look for NA values and missing/blank values in all the columns.
is.na(d) # logical matrix flagging each NA cell
sum(is.na(d)) # total number of NA cells
na.omit(d) # displays the rows without NAs; `d` itself is not modified

## Count blank ("") entries in every column of the data frame
## (author's note: there are none). vapply() guarantees one integer per column.
vapply(d, function(x) sum(x == "", na.rm = TRUE), integer(1))

# ------------ Exporting results to a text file ----------

# Divert console output to output.txt while the summaries are produced.
sink("D:/business analytics/eMDP_BA/EDA_session3/RG case/output.txt")
tryCatch(
  {
    # Explicit print(): auto-printing does not happen when the script is run
    # through source() with default settings, which would leave the file empty.
    print(summary(pageviews[treatment == 0]))
    print(summary(pageviews[restaurant_type == "chain"]))
    print(summary(pageviews[treatment == 2 & restaurant_type == "chain"]))
  },
  # Always restore console output, even if one of the summaries fails.
  finally = sink()
)

# Save the page-view histograms of the three treatment groups to graph.jpg.
jpeg("D:/business analytics/eMDP_BA/EDA_session3/RG case/graph.jpg")
tryCatch(
  {
    par(mfrow = c(1, 3)) # one row, three panels
    hist(pageviews[treatment == 0])
    hist(pageviews[treatment == 1])
    hist(pageviews[treatment == 2])
  },
  # Close only the JPEG device opened above (graphics.off() would close every
  # open device), and do so even if a histogram call fails.
  finally = dev.off()
)

# ---------------- Bar plots with respect to treatments -----------------

# One colour and one label per treatment level, shared by all plots below.
trt_cols <- c("red", "blue", "green")
trt_labels <- c("Control", "Treatment 1", "Treatment 2")

# Mean of each outcome per treatment level.
tab_1 <- tapply(pageviews, treatment, "mean")
tab_2 <- tapply(calls, treatment, "mean")
tab_3 <- tapply(reservations, treatment, "mean")

# barplot(tab_1, col = c("red", "blue", "green"), xlab = "Page Views")
# The labels are written as single-line strings: a line break inside a string
# literal becomes a newline in the plotted text.
barplot(tab_1, col = trt_cols, xlab = "Page Views", names.arg = trt_labels)
barplot(tab_2, col = trt_cols, xlab = "Calls", names.arg = trt_labels)
barplot(tab_3, col = trt_cols, xlab = "Reservations", names.arg = trt_labels)

# Bar plots with respect to treatments and restaurant type
# Rows of each table are treatment levels, columns are restaurant types.
tab_4 <- tapply(pageviews, list(treatment, restaurant_type), "mean")
tab_5 <- tapply(calls, list(treatment, restaurant_type), "mean")
tab_6 <- tapply(reservations, list(treatment, restaurant_type), "mean")

# beside = TRUE draws the treatment bars side by side within each restaurant
# type; the three colours are recycled across the groups.
barplot(tab_4, beside = TRUE, col = trt_cols, xlab = "Page Views")
barplot(tab_5, beside = TRUE, col = trt_cols, xlab = "Calls")
barplot(tab_6, beside = TRUE, col = trt_cols, xlab = "Reservations")

# ------- Confirmatory analysis ---------------------

# ANOVA: treat both grouping variables as factors.
TRT <- as.factor(treatment)
RT <- as.factor(restaurant_type)

# One-way ANOVA models: each outcome explained by treatment alone.
mod_1 <- aov(pageviews ~ TRT)
summary(mod_1)
# TukeyHSD(mod_1)

mod_2 <- aov(calls ~ TRT)
summary(mod_2)
# TukeyHSD(mod_2)

mod_3 <- aov(reservations ~ TRT)
summary(mod_3)
# TukeyHSD(mod_3)

# Two-way ANOVA models: treatment, restaurant type and their interaction.
mod_4 <- aov(pageviews ~ TRT * RT)
summary(mod_4)
# TukeyHSD(mod_4)

mod_5 <- aov(calls ~ TRT * RT)
summary(mod_5)
# TukeyHSD(mod_5)

mod_6 <- aov(reservations ~ TRT * RT)
summary(mod_6)
# TukeyHSD(mod_6)

# ----- Binomial distribution ----------

dbinom(4, size = 4, prob = 0.2) # P(X = 4)

# P(X >= 2): dbinom() is vectorised, so add up the probabilities of 2, 3 and 4
# in one call instead of accumulating them in a loop.
s <- sum(dbinom(2:4, size = 4, prob = 0.2))

# The same probability from the CDF: 1 - P(X <= 1)
1 - pbinom(1, 4, 0.2)

# -------- Poisson distribution ---------

dpois(5, lambda = 3) # P(X = 5)

ppois(10, 3) # P(X <= 10)

# ------- Normal/Gaussian distribution -----

pnorm(20, mean = 12, sd = 3.2, lower.tail = FALSE) # right tail area

pnorm(16, mean = 12, sd = 3.2, lower.tail = TRUE) # left tail area
qnorm(0.9, 12, 3.2) # inverse CDF (to obtain the quantile / 90th percentile)

# ----- Normality check ------------------

# Q-Q plot of reservations with a thick red reference line.
qqnorm(reservations)
qqline(reservations, lwd = 3, col = "red")

# Formal tests
# Shapiro-Wilk normality test (sample size must not exceed 5000); the
# original author considers it the best of the tests used here.
shapiro.test(reservations)

# Anderson-Darling test, provided by the nortest package.
library(nortest)
ad.test(reservations)
ad.test(calls[treatment == 0 & restaurant_type == "chain"])

# ------------ One-sample tests of the mean ------------

# One-sided: is the mean of calls greater than 35?
t.test(calls, mu = 35, alternative = "greater")

# Two-sided (the default alternative): does the mean of calls differ from 35?
t.test(calls, mu = 35)

# --------------- Two-sample test ----------------

# Compare the means of calls and reservations (null difference 0, 95% CI).
t.test(calls, reservations, conf.level = 0.95, mu = 0)

# You might also like