STA2050 Assignment 2

STA2050 Assignment 2
Mark Bilahi M’rabu
2023-11-28
Import the Necessary Packages

library(imputeTS)
## Warning: package 'imputeTS' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':

## method from
## as.zoo.data.frame zoo
library(VIM)
## Warning: package 'VIM' was built under R version 4.3.2
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at:

https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':

##
## sleep
library(nnet)
library(readxl)
library(csv)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':

##
## filter, lag
## The following objects are masked from 'package:base':

##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse

2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ──────────────────────────────────────────
tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all
conflicts to become errors
library(finalfit)
## Warning: package 'finalfit' was built under R version 4.3.2
library(lubridate)
library(tidyr)
library(ggplot2)
library(psych)
## Warning: package 'psych' was built under R version 4.3.2
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(lme4)
## Loading required package: Matrix

##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tibble)
Question 1
# Create the DataSet
Tree_no = c(1:20)
Diameter_X = c(12, 11.4, 7.9, 10.5, 7.9, 9, 7.3, 10.2, 11.7, 11.3, 5.7, 8,
10.3, 12, 9.2, 8.5, 7, 10.7, 9.3, 8.2)
Age_Y = c(125, 119, 83, 85, 99, 117, 69, 133, 154, 168, 61, 80, 114, 147,
122, 106, 82, 88, 97, 99)
Treedata = data.frame(cbind(Tree_no, Diameter_X, Age_Y))
head(Treedata)
## Tree_no Diameter_X Age_Y

## 1 1 12.0 125
## 2 2 11.4 119
## 3 3 7.9 83
## 4 4 10.5 85
## 5 5 7.9 99
## 6 6 9.0 117
#View(Treedata)
str(Treedata)
## 'data.frame': 20 obs. of 3 variables:

## $ Tree_no : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...
a) Draw a scatterplot of y vs x
Treeplot = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y)) + geom_point() +
labs(Treedata, title = 'Scatterplot of Y (Age) against X (Diameter)', x =
'Tree Diameter', y = 'Tree Age')
Treeplot
"\n"
## [1] "\n"
b) Estimate the Population Mean age of trees in the stand using ratio estimation and
give an approximate standard error for your estimate
D ia m e t e r =A u x il li a r y=x
A g e=D e p e n d e nt= y
n
t x =∑ x i
i=1
n
t y =∑ y i
i=1
t x μx
=
t y μy
μ x∗t y
P o p ul a t i o n M e a n A g e μ y =
tx
S tan De v
S t a n d a r d E r r o r=
√n o f a g e
#Estimating Population Mean Age
meanx = 10.3
totalx = sum(Diameter_X)
cat("Diameter Sample Total =", totalx, "\n")
## Diameter Sample Total = 188.1
totaly = sum(Age_Y)
cat("Age Sample Total =", totaly, "\n")
## Age Sample Total = 2148
meany = meanx * (totaly / totalx)

cat("Population Mean Age =", meany, "\n")
## Population Mean Age = 117.6204
#Approximating Standard Error for above Estimation

StanError = sd(Age_Y) / (sqrt(length(Age_Y)))
cat("Standard Error=", StanError, "\n")
## Standard Error= 6.40904
cat("Therefore, the estimated population mean age of trees is", meany, "while
the standard error for said estimation is", StanError, "\n")
## Therefore, the estimated population mean age of trees is 117.6204 while

the standard error for said estimation is 6.40904
c) Repeat b) using Regression Estimation

n∑ x∗y − ∑ x∗∑ y
b i= 2 2
n∗∑ x − ( ∑ x )
y=b0 +b 1 x
#Creating a Subset of the Treedata
Regdata = Treedata
Regdata[4] = Regdata[2] * Regdata[3]
Regdata[5] = Regdata[2]^2
Regdata[6] = Regdata[3]^2
colnames(Regdata) = c("Tree_no", 'Diameter_X', "Age_Y", "XY", "Xsqr", "Ysqr")
Regdata = Regdata %>%
mutate_at(vars(1), as.character)
Totals = Regdata %>%
summarize(Tree_no = "Totals",
Diameter_X = sum(Regdata$Diameter_X),
Age_Y = sum(Regdata$Age_Y),
XY = sum(Regdata$XY),
Xsqr = sum(Regdata$Xsqr),
Ysqr = sum(Regdata$Ysqr))
Regdata = bind_rows(Regdata, Totals)
#Regdata = Regdata %>%
# mutate(Regdata$Tree_no[21], "Totals")
str(Regdata)
## 'data.frame': 21 obs. of 6 variables:

## $ Tree_no : chr "1" "2" "3" "4" ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...
## $ XY : num 1500 1357 656 892 782 ...
## $ Xsqr : num 144 130 62.4 110.2 62.4 ...
## $ Ysqr : num 15625 14161 6889 7225 9801 ...
head(Regdata)
## Tree_no Diameter_X Age_Y XY Xsqr Ysqr

## 1 1 12.0 125 1500.0 144.00 15625
## 2 2 11.4 119 1356.6 129.96 14161
## 3 3 7.9 83 655.7 62.41 6889
## 4 4 10.5 85 892.5 110.25 7225
## 5 5 7.9 99 782.1 62.41 9801
## 6 6 9.0 117 1053.0 81.00 13689
#Fitting the Linear Regression Model

Treemodel = lm(Age_Y ~ Diameter_X, data = Regdata)
summary(Treemodel)
##
## Call:
## lm(formula = Age_Y ~ Diameter_X, data = Regdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.892 -11.171 -0.288 9.977 38.971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03031 4.33635 -0.007 0.994
## Diameter_X 11.42115 0.10301 110.874 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.98 on 19 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9984
## F-statistic: 1.229e+04 on 1 and 19 DF, p-value: < 2.2e-16
Intercept = coef(Treemodel)[1]
Slope = coef(Treemodel)[2]
cat("The slope of the model is", Slope, "while the intercept is", Intercept,
"\n")
## The slope of the model is 11.42115 while the intercept is -0.03030842

Estimate = Intercept + (Slope *meanx)
The Regression is given by:

a g e=−7.63+ 12.23 d i a m e t e r
#Approximating Standard Error for the above Estimation
Residuals = resid(Treemodel)
ResStanDev = sd(Residuals)
SE = ResStanDev * sqrt(1 + 1/length(Age_Y) + ((meanx - mean(Diameter_X))^2 /
sum((Diameter_X - mean(Diameter_X))^2)))
cat("The Regression Estimate of the Population Mean Age is", Estimate, "while
the Standard Error for the above estimation is", SE, "\n")
## The Regression Estimate of the Population Mean Age is 117.6075 while the
Standard Error for the above estimation is 18.0675
d) Label your Estimates on your graph. How do they compare?

Ratio = c(10.3, meany)
Regression = c(10.3, Estimate)
Est = data.frame(rbind(Ratio, Regression))
Est
## V1 X.Intercept.
## Ratio 10.3 117.6204
## Regression 10.3 117.6075
colnames(Est) = c("X", "Y")

X = Est$X
Y = Est$Y
EstGraph = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y), color = "blue",
shape = 15) + geom_point() + labs(Treedata, title = 'Scatterplot of Y (Age)
against X (Diameter)\n with Estimates & Regression Line', x = 'Tree
Diameter', y = 'Tree Age') + geom_abline(intercept = Intercept, slope =
Slope, color = "red") + geom_point(data = Est, aes(x = X, y = Y), color =
"red", shape = 15)
EstGraph
"\n"
## [1] "\n"
From the above graph it is clear to see that both the Ratio Estimation & The Regression
Estimation of the Population Mean Age are very close in their value however the Standard
Error for the methods vary greatly
Question 2
The new candy Green Goobules is being test-marketed in an area of upstate NY. The market
research firm decided to sample 6 cities from the 45 in the area & then to sample
supermarkets within the cities, wanting to know the no. of Green Gobules cases sold
a) Obtain summary statistics for each cluster
# Create the Data Sets
cluster_1 = c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)
cluster_2 = c(99, 101, 52, 121)
cluster_3 = c(199, 179, 98, 63, 126, 87, 62)
cluster_4 = c(226, 129, 57, 46, 86, 43, 85, 165)
cluster_5 = c(12, 23)
cluster_6 = c(87, 43, 59)
#Cluster 1
summary(cluster_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 72.0 147.5 175.5 177.3 184.8 361.0
sd1 = sd(cluster_1)
var1 = var(cluster_1)
cat("The Standard Deviation of Cluster 1 is", sd1, "while the Variance is",
var1, "\n", "\n")
## The Standard Deviation of Cluster 1 is 83.59964 while the Variance is

6988.9
##
#Cluster 2
summary(cluster_2)

## 52.00 87.25 100.00 93.25 106.00 121.00
sd2 = sd(cluster_2)
var2, "\n", "\n")

854.9167
##
#Cluster 3
summary(cluster_3)

## 62.0 75.0 98.0 116.3 152.5 199.0
sd3 = sd(cluster_3)
var3, "\n", "\n")

2974.571
##
#Cluster 4
summary(cluster_4)

## 43.00 54.25 85.50 104.62 138.00 226.00
sd4 = sd(cluster_4)
var4, "\n", "\n")
4172.268
##
#Cluster 5
summary(cluster_5)

## 12.00 14.75 17.50 17.50 20.25 23.00
sd5 = sd(cluster_5)
var5, "\n", "\n")
## The Standard Deviation of Cluster 5 is 7.778175 while the Variance is 60.5
##
#Cluster 6
summary(cluster_6)

## 43 51 59 63 73 87
sd6 = sd(cluster_6)
var6, "\n", "\n")
## The Standard Deviation of Cluster 1 is 22.27106 while the Variance is 496

##
b) Estimate the total number of cases sold, and the average number sold per
supermarket, along with the standard errors of your Estimates
# Create the Data Set
City = c(1, 2, 3, 4, 5, 6)
Supermarket_No = c(52, 19, 37, 39, 8, 14)
"cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)))
cluster_2 = I(list(c(99, 101, 52, 121)))
cluster_3 = I(list(c(199, 179, 98, 63, 126, 87, 62)))
cluster_4 = I(list(c(226, 129, 57, 46, 86, 43, 85, 165)))
cluster_5 = I(list(c(12, 23)))
cluster_6 = I(list(c(87, 43, 59)))"
## [1] "cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73,
186)))\ncluster_2 = I(list(c(99, 101, 52, 121)))\ncluster_3 = I(list(c(199,
179, 98, 63, 126, 87, 62)))\ncluster_4 = I(list(c(226, 129, 57, 46, 86, 43,
85, 165)))\ncluster_5 = I(list(c(12, 23)))\ncluster_6 = I(list(c(87, 43,
59)))"
Cases_Sold = data.frame(Clusters = c("Cluster_1", "Cluster_2", "Cluster_3",
"Cluster_4", "Cluster_5", "Cluster_6"), Values = I(list(c(146, 180, 251, 152,
72, 181, 171, 361, 73, 186), c(99, 101, 52, 121), c(199, 179, 98, 63, 126,
87, 62), c(226, 129, 57, 46, 86, 43, 85, 165), c(12, 23), c(87, 43, 59))))
#Cases_Sold = bind_rows(
# tibble(Cluster = "cluster_1", Value = cluster_1),
# tibble(Cluster = "cluster_6", Value = cluster_6)
#)
#Cases_Sold
#Cases_Sold = tidyr::gather(data.frame(cluster_1, cluster_2, cluster_3,
cluster_4, cluster_5, cluster_6), key = "Cluster", value = "value")
#Cases_Sold = data.frame(var1 = rbind(cluster_1, cluster_2, cluster_3,
cluster_4, cluster_5, cluster_6))
View(Cases_Sold)
Green_Gobules = cbind(City, Supermarket_No, Cases_Sold[2])
View(Green_Gobules)
#Green_Gobules = Green_Gobules %>%
# mutate_at(vars(3), as.numeric)
mean(Green_Gobules[1, 3])
## Warning in mean.default(Green_Gobules[1, 3]): argument is not numeric or

## logical: returning NA
## [1] NA

STA2050 Assignment 2

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

STA2050 Assignment 2

Uploaded by

Copyright:

Available Formats

STA2050 Assignment 2

Mark Bilahi M’rabu

Import the Necessary Packages

## Warning: package 'imputeTS' was built under R version 4.3.2

## Registered S3 method overwritten by 'quantmod':

## Warning: package 'VIM' was built under R version 4.3.2

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at:

## The following object is masked from 'package:datasets':

## The following objects are masked from 'package:stats':

## The following objects are masked from 'package:base':

## Warning: package 'tidyverse' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse

## Warning: package 'finalfit' was built under R version 4.3.2

## Warning: package 'psych' was built under R version 4.3.2

## Loading required package: Matrix

## Tree_no Diameter_X Age_Y

## 'data.frame': 20 obs. of 3 variables:

## Diameter Sample Total = 188.1

## Age Sample Total = 2148

meany = meanx * (totaly / totalx)

## Population Mean Age = 117.6204

#Approximating Standard Error for above Estimation

## Standard Error= 6.40904

## Therefore, the estimated population mean age of trees is 117.6204 while

c) Repeat b) using Regression Estimation

## 'data.frame': 21 obs. of 6 variables:

## Tree_no Diameter_X Age_Y XY Xsqr Ysqr

#Fitting the Linear Regression Model

## The slope of the model is 11.42115 while the intercept is -0.03030842

The Regression is given by:

d) Label your Estimates on your graph. How do they compare?

colnames(Est) = c("X", "Y")

## The Standard Deviation of Cluster 1 is 83.59964 while the Variance is

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 2 is 29.23896 while the Variance is

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 3 is 54.53963 while the Variance is

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 5 is 7.778175 while the Variance is 60.5

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 1 is 22.27106 while the Variance is 496

## Warning in mean.default(Green_Gobules[1, 3]): argument is not numeric or

You might also like