You are on page 1of 9

homework 3

Andalib Shams

9/29/2020

library(ggplot2)
library(GGally)
library(DescTools)
library(mvShapiroTest)
library(biotools)

## ---
## biotools version 3.1
library(car)
library(reshape2)

# Load the rabbit insulin data; the insulin code identifies the group, so
# store it as a factor rather than a number.
rabbits_data <- read.csv("rabbits2.csv")
rabbits_data$insulin <- factor(rabbits_data$insulin)

Problem 1
(a) Examine differences in mean response for the two manufacturers by constructing a profile plot of the
percent changes from the baseline concentration versus time, with one profile for each manufacturer.
Display the plot in your solutions to this assignment. Do the profiles appear to be the same for the two
manufacturers?
# Express each response column (3:7) as a percent deviation from that
# column's overall mean, then draw boxplots by insulin group.
# NOTE(review): the assignment asks for a profile plot (group means versus
# time, one line per manufacturer); this block keeps the boxplot display and
# only corrects how the plotted values are computed -- confirm which plot is
# wanted.
col_name <- colnames(rabbits_data[, 3:7])
responses <- as.matrix(rabbits_data[, 3:7])
means_col <- colMeans(responses)

# sweep() pairs every column with its own mean. Writing
# `data_frame - vector` instead recycles the vector down the rows, which
# matched most cells with the mean of a different column.
centered <- sweep(responses, 2, means_col, FUN = "-")
rabbits_data_new <- as.data.frame(
  sweep(centered, 2, means_col, FUN = "/") * 100
)
rabbits_data_new$insulin <- rabbits_data$insulin
rabbits_data_new <- melt(
  rabbits_data_new,
  measure.vars = col_name,
  id.vars = "insulin"
)

# One free-scaled panel per variable; `scales` is spelled out rather than
# relying on partial argument matching of `scale`.
ggplot(rabbits_data_new, aes(x = variable, y = value, color = insulin)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free")

1
X1 X2 X3
20 80
25
0 40
0

−20 0
−25

−40
−50 −40

−60 insulin
X1 X2 X3
value

1
X4 X5
120 2
100

80
50
40

0
0

−50
−40
X4 X5
variable
From the boxplot, the mean difference in blood sugar concentration % change between the two
manufacturers is negligible after the 1st and 4th hours. However, there are considerable differences after
the 2nd and 5th hours: after the 2nd hour manufacturer 2 seems to have experienced the higher
concentration, but after the 5th hour manufacturer 1 experienced the higher concentration.

(b) Test the null hypothesis that profiles (mean vectors) are the same for the insulins made by the two
manufacturers. Report the value of the two-sample Hotelling $T^2$ statistic, the corresponding F-value,
degrees of freedom, and the p-value. State your conclusion.
# Two-sample Hotelling T^2 test comparing the mean vectors of the five
# response columns (3:7) between insulin 1 and insulin 2.
# The second argument was cut off in this copy of the source; it is restored
# from the `data:` line of the printed result.
T2result <- HotellingsT2Test(
  rabbits_data[rabbits_data$insulin == 1, 3:7],
  rabbits_data[rabbits_data$insulin == 2, 3:7]
)
T2result

##
## Hotelling's two sample T2-test
##
## data: rabbits_data[rabbits_data$insulin == 1, 3:7] and rabbits_data[rabbits_data$insulin == 2, 3:7]
## T.2 = 0.39966, df1 = 5, df2 = 48, p-value = 0.8466
## alternative hypothesis: true location difference is not equal to c(0,0,0,0,0)
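The value that `HotellingsT2Test` prints as T.2 is already scaled to the F distribution, so:
F = 0.39966
df1 = 5
df2 = 48
$T^2 = F \cdot \dfrac{p\,(n_1 + n_2 - 2)}{n_1 + n_2 - p - 1} = 0.39966 \cdot \dfrac{5 \cdot 52}{48} \approx 2.165$
p-value = 0.8466 > 0.05, so we fail to reject the null hypothesis.

There is no evidence that the mean vectors differ between the two populations.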


(c) Use the Bonferroni method to construct simultaneous 95% confidence intervals for the difference in the
mean responses (percent change in blood sugar concentration from baseline) to the insulins made by

2
the two manufacturers at each of the 5 time points. State your conclusions
# Bonferroni simultaneous confidence intervals for the difference in mean
# response (insulin 1 minus insulin 2) in each of the five response columns.

# Response columns for each insulin group
insulin1 <- rabbits_data[rabbits_data$insulin == 1, 3:7]
insulin2 <- rabbits_data[rabbits_data$insulin == 2, 3:7]

# Per-group mean vector, covariance matrix and correlation matrix
xbar1 <- vapply(insulin1, mean, numeric(1))
xvar1 <- var(insulin1)
xcor1 <- cor(insulin1)
xbar2 <- vapply(insulin2, mean, numeric(1))
xvar2 <- var(insulin2)
xcor2 <- cor(insulin2)

# Group sizes and number of response variables
n1 <- nrow(insulin1)
n2 <- nrow(insulin2)
p <- ncol(insulin2)

# Confidence level
level <- 0.95

# Degrees of freedom and multipliers. c_T2 (the T^2-based multiplier) and
# levelt are computed here but not used in the limits below.
df1 <- p
df2 <- n1 + n2 - p - 1
df3 <- n1 + n2 - 2
c_T2 <- sqrt(df3 * p * qf(level, df1, df2) / df2)
levelt <- 1 - (1 - level) / 2
levelb <- 1 - (1 - level) / (2 * p)
c_bon <- qt(levelb, df3)

# Pooled covariance matrix
vpool <- ((n1 - 1) * xvar1 + (n2 - 1) * xvar2) / df3

# Bonferroni limits: mean difference plus/minus the half-width
mean_diff <- xbar1 - xbar2
half_width <- c_bon * sqrt(diag(vpool) * (1 / n1 + 1 / n2))
lower_limit <- mean_diff - half_width
upper_limit <- mean_diff + half_width
rbind(lower_limit, upper_limit)

## X1 X2 X3 X4 X5
## lower_limit -7.174761 -11.517462 -14.514729 -14.91099 -10.801066
## upper_limit 7.322909 6.406351 8.440655 11.79987 9.319585
All of the intervals include zero, so none of the mean differences is significant.
(d) The Hotelling $T^2$ test was performed using the model assumption that the distribution of responses
among rabbits at the five inspection times is a five-dimensional normal distribution for each insulin.
Do the data conform to that condition? Perform an appropriate test and report the p-value and your
conclusion.
# Generalized Shapiro-Wilk test of multivariate normality on the five
# response columns (3:7) for insulin 1.
mvShapiro.Test(as.matrix(rabbits_data[rabbits_data$insulin == 1, 3:7]))

##
## Generalized Shapiro-Wilk test for Multivariate Normality by
## Villasenor-Alva and Gonzalez-Estrada
##
## data: as.matrix(rabbits_data[rabbits_data$insulin == 1, 3:7])
## MVW = 0.93359, p-value = 0.005768
The p-value is less than 0.05, so we reject the null hypothesis of multivariate normality for insulin 1.
The univariate tests below check which individual variables depart from normality.
# Univariate Shapiro-Wilk test applied to each response column (margin 2)
# for insulin 1; returns one test result per column.
apply(rabbits_data[rabbits_data$insulin == 1, 3:7], 2, shapiro.test)

## $X1
##

3
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.97318, p-value = 0.6872
##
##
## $X2
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.95709, p-value = 0.3166
##
##
## $X3
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.91003, p-value = 0.02286
##
##
## $X4
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.97547, p-value = 0.749
##
##
## $X5
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.94976, p-value = 0.2115
Here X3 has a p-value lower than 0.05, so we reject the null hypothesis and conclude that this variable is
not normally distributed. The other variables have p-values over 0.05, so we fail to reject the null
hypothesis; these variables are consistent with a normal distribution.
# Generalized Shapiro-Wilk test of multivariate normality on the five
# response columns (3:7) for insulin 2.
mvShapiro.Test(as.matrix(rabbits_data[rabbits_data$insulin == 2, 3:7]))

##
## Generalized Shapiro-Wilk test for Multivariate Normality by
## Villasenor-Alva and Gonzalez-Estrada
##
## data: as.matrix(rabbits_data[rabbits_data$insulin == 2, 3:7])
## MVW = 0.93725, p-value = 0.01136
The p-value is less than 0.05, so we reject the null hypothesis; hence the insulin 2 data do not follow a
multivariate normal distribution.
# Univariate Shapiro-Wilk test applied to each response column (margin 2)
# for insulin 2; returns one test result per column.
apply(rabbits_data[rabbits_data$insulin == 2, 3:7], 2, shapiro.test)

## $X1

4
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.96305, p-value = 0.4324
##
##
## $X2
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.96109, p-value = 0.3912
##
##
## $X3
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.97051, p-value = 0.6154
##
##
## $X4
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.89305, p-value = 0.009307
##
##
## $X5
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.70989, p-value = 5.337e-06
Here, variables X4 and X5 have p-values lower than 0.01, so the univariate null hypothesis of normality is
rejected in these two cases. Also, the sample size is small. So the normality assumption does not hold, even
though X1, X2 and X3 are consistent with a normal distribution.

Problem 2
(a) Perform a one-way MANOVA to test the null hypothesis of no differences among the three sandstone
zones with respect to the mean vectors for the amounts of the five trace elements. Report values
for Wilks Lambda and the corresponding F-value, degrees of freedom, and the p-value. State your
conclusion.
# Load the crude-oil data and convert Zone to a factor so it enters the
# model as a categorical predictor.
crudeoil_data <- read.csv('crudeoil.csv')
crudeoil_data$Zone <- as.factor(crudeoil_data$Zone)

# Multivariate linear model: the five responses X1..X5 regressed on Zone.
# summary() prints one univariate regression summary per response.
fit.lm <- lm(cbind(X1, X2, X3, X4, X5)~ Zone , data = crudeoil_data)
summary(fit.lm)

## Response X1 :
##
## Call:

5
## lm(formula = X1 ~ Zone, data = crudeoil_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5263 -1.0263 0.0737 1.1737 3.9545
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2286 0.7111 4.541 3.27e-05 ***
## Zone2 1.2169 0.9096 1.338 0.187
## Zone3 3.9977 0.7738 5.167 3.68e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.881 on 53 degrees of freedom
## Multiple R-squared: 0.4197, Adjusted R-squared: 0.3978
## F-statistic: 19.17 on 2 and 53 DF, p-value: 5.451e-07
##
##
## Response X2 :
##
## Call:
## lm(formula = X2 ~ Zone, data = crudeoil_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0909 -5.2526 -0.8312 4.7474 29.7474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.571 3.373 12.917 < 2e-16 ***
## Zone2 -10.481 4.315 -2.429 0.0186 *
## Zone3 -21.319 3.671 -5.808 3.65e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.924 on 53 degrees of freedom
## Multiple R-squared: 0.4302, Adjusted R-squared: 0.4087
## F-statistic: 20.01 on 2 and 53 DF, p-value: 3.366e-07
##
##
## Response X3 :
##
## Call:
## lm(formula = X3 ~ Zone, data = crudeoil_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43211 -0.17091 -0.04214 0.07164 1.06789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11714 0.10934 1.071 0.2889
## Zone2 0.05377 0.13987 0.384 0.7022

6
## Zone3 0.31496 0.11899 2.647 0.0107 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2893 on 53 degrees of freedom
## Multiple R-squared: 0.1816, Adjusted R-squared: 0.1507
## F-statistic: 5.881 on 2 and 53 DF, p-value: 0.004935
##
##
## Response X4 :
##
## Call:
## lm(formula = X4 ~ Zone, data = crudeoil_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6857 -0.7007 -0.2595 0.4062 2.9018
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.7957 0.3921 17.331 < 2e-16 ***
## Zone2 -0.2348 0.5016 -0.468 0.642
## Zone3 -2.1376 0.4267 -5.010 6.42e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.037 on 53 degrees of freedom
## Multiple R-squared: 0.4611, Adjusted R-squared: 0.4408
## F-statistic: 22.67 on 2 and 53 DF, p-value: 7.677e-08
##
##
## Response X5 :
##
## Call:
## lm(formula = X5 ~ Zone, data = crudeoil_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4932 -2.2883 0.0668 1.5042 7.4364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.5400 0.9546 12.089 < 2e-16 ***
## Zone2 -6.0564 1.2211 -4.960 7.65e-06 ***
## Zone3 -5.7768 1.0388 -5.561 8.95e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.526 on 53 degrees of freedom
## Multiple R-squared: 0.3826, Adjusted R-squared: 0.3593
## F-statistic: 16.43 on 2 and 53 DF, p-value: 2.813e-06
# MANOVA for the Zone effect on the fitted multivariate model. summary()
# prints the error and hypothesis sum-of-squares-and-products matrices and
# the Pillai, Wilks, Hotelling-Lawley and Roy test statistics.
fit.manova <- Manova(fit.lm)
summary(fit.manova)

7
##
## Type II MANOVA Tests:
##
## Sum of squares and products for error:
## X1 X2 X3 X4 X5
## X1 187.575243 -34.81237 -6.847988 -21.133755 79.569024
## X2 -34.812372 4221.15811 20.112309 83.721918 -288.725952
## X3 -6.847988 20.11231 4.435665 8.637653 -0.583789
## X4 -21.133755 83.72192 8.637653 57.040433 32.843885
## X5 79.569024 -288.72595 -0.583789 32.843885 338.060876
##
## ------------------------------------------
##
## Term: Zone
##
## Sum of squares and products for the hypothesis:
## X1 X2 X3 X4 X5
## X1 135.67315 -647.33656 11.4925598 -80.479227 -114.02963
## X2 -647.33656 3186.68117 -53.8000232 373.774403 649.65102
## X3 11.49256 -53.80002 0.9844204 -6.924981 -8.54534
## X4 -80.47923 373.77440 -6.9249811 48.803422 56.63993
## X5 -114.02963 649.65102 -8.5453396 56.639933 209.53412
##
## Multivariate Tests: Zone
## Df test stat approx F num Df den Df Pr(>F)
## Pillai 2 1.206447 15.20310 10 100 3.7318e-16 ***
## Wilks 2 0.116048 18.96779 10 98 < 2.22e-16 ***
## Hotelling-Lawley 2 4.838122 23.22298 10 96 < 2.22e-16 ***
## Roy 2 4.172025 41.72025 5 50 < 2.22e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Hypothesis DF (Zone) = 2
Wilks' Lambda = 0.116
F-statistic (approximate) = 18.968
Numerator DF = 10
Denominator DF = 98
p-value < 2.22e-16

Here, the p-value is lower than 0.05, so the null hypothesis is rejected: the three zones do not all have the
same mean vector.
(b) Use the Bonferroni approach to determine which trace elements have significantly different means
across the different sandstone zones, using an experiment wise type I error level of 0.05. Note that you
must construct three confidence intervals (or do three t-tests) to compare the three pairs of sandstone
zones for each trace element, a total of fifteen confidence intervals. For each trace element, report your
conclusions about which sandstone zones have significantly higher or lower mean levels. (For example,
crude oil extracted from the Wilhelm zone has a significantly lower mean level of vanadium than crude
oil extracted either the Sub-Milina or the Upper zones. There is no difference in mean vanadium levels
for crude oil extracted from the Sub-Milina and Upper zones.)
# Bonferroni simultaneous confidence intervals for every pairwise difference
# in zone means: one interval per response column (X1..X5) and pair of zones.

# Response columns (2:6) for each zone
zone1 <- crudeoil_data[crudeoil_data$Zone == 1, 2:6]
zone2 <- crudeoil_data[crudeoil_data$Zone == 2, 2:6]
zone3 <- crudeoil_data[crudeoil_data$Zone == 3, 2:6]

# Per-zone mean vector, covariance matrix and correlation matrix
xbar1 <- vapply(zone1, mean, numeric(1))
xvar1 <- var(zone1)
xcor1 <- cor(zone1)
xbar2 <- vapply(zone2, mean, numeric(1))
xvar2 <- var(zone2)
xcor2 <- cor(zone2)
xbar3 <- vapply(zone3, mean, numeric(1))
xvar3 <- var(zone3)
xcor3 <- cor(zone3)

# Zone sample sizes
n1 <- nrow(zone1)
n2 <- nrow(zone2)
n3 <- nrow(zone3)

# Number of response variables, taken from the crude-oil data itself (it was
# previously read off the unrelated rabbits data set).
p <- ncol(crudeoil_data[, 2:6])

# Number of groups (zones)
g <- 3

# Confidence level
level <- 0.95

# Bonferroni multiplier: m = p * g * (g - 1) / 2 intervals in total
# (every pair of zones for every response variable).
m <- p * g * (g - 1) / 2
levelb <- 1 - ((1 - level) / (2 * m))
df <- nrow(crudeoil_data) - g
c_bon <- qt(levelb, df)

# Pooled covariance matrix from the MANOVA error SSP matrix
vpool <- fit.manova$SSPE / df

# Half-widths for each pair of zones
half_width_1v2 <- c_bon * sqrt(diag(vpool) * (1 / n1 + 1 / n2))
half_width_1v3 <- c_bon * sqrt(diag(vpool) * (1 / n1 + 1 / n3))
half_width_2v3 <- c_bon * sqrt(diag(vpool) * (1 / n2 + 1 / n3))

# Bonferroni limits
lower_limit_1v2 <- (xbar1 - xbar2) - half_width_1v2
upper_limit_1v2 <- (xbar1 - xbar2) + half_width_1v2
lower_limit_1v3 <- (xbar1 - xbar3) - half_width_1v3
upper_limit_1v3 <- (xbar1 - xbar3) + half_width_1v3
lower_limit_2v3 <- (xbar2 - xbar3) - half_width_2v3
upper_limit_2v3 <- (xbar2 - xbar3) + half_width_2v3
rbind(
  lower_limit_1v2, upper_limit_1v2,
  lower_limit_1v3, upper_limit_1v3,
  lower_limit_2v3, upper_limit_2v3
)

## X1 X2 X3 X4 X5
## lower_limit_1v2 -4.0130052 -2.783776 -0.48374575 -1.3071075 2.302609
## upper_limit_1v2 1.5792390 23.744815 0.37621328 1.7767179 9.810118
## lower_limit_1v3 -6.3763951 10.034910 -0.68074439 0.8258569 2.583537
## upper_limit_1v3 -1.6190936 32.602684 0.05081958 3.4492558 8.970147
## lower_limit_2v3 -4.7609066 1.445302 -0.56568178 0.8108614 -2.937704
## upper_limit_2v3 -0.8008159 20.231253 0.04328944 2.9946410 2.378661
Between zone 1 and zone 2, only X5 differs significantly (zone 1 has the larger mean); the other mean
differences are not significant because their intervals include zero.
Between zone 1 and zone 3, zone 3 has a significantly larger mean for X1, and zone 1 has a significantly
larger mean for X2, X4 and X5. The X3 difference is not significant.
Between zone 2 and zone 3, zone 3 has a significantly larger mean for X1, and zone 2 has a significantly
larger mean for X2 and X4. The X3 and X5 differences are not significant.

You might also like