You are on page 1of 11

STA2050 Assignment 2

Mark Bilahi M’rabu

2023-11-28

Import the Necessary Packages


library(imputeTS)

## Warning: package 'imputeTS' was built under R version 4.3.2

## Registered S3 method overwritten by 'quantmod':


## method from
## as.zoo.data.frame zoo

library(VIM)

## Warning: package 'VIM' was built under R version 4.3.2

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at:


https://github.com/statistikat/VIM/issues

##
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':


##
## sleep

library(nnet)
library(readxl)
library(csv)
library(dplyr)

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':


##
## filter, lag

## The following objects are masked from 'package:base':


##
## intersect, setdiff, setequal, union
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse


2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0

## ── Conflicts ──────────────────────────────────────────
tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all
conflicts to become errors

library(finalfit)

## Warning: package 'finalfit' was built under R version 4.3.2

library(lubridate)
library(tidyr)
library(ggplot2)
library(psych)

## Warning: package 'psych' was built under R version 4.3.2

##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha

library(lme4)

## Loading required package: Matrix


##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack

library(mice)

##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind

library(tibble)

Question 1
# Create the DataSet
Tree_no = c(1:20)
Diameter_X = c(12, 11.4, 7.9, 10.5, 7.9, 9, 7.3, 10.2, 11.7, 11.3, 5.7, 8,
10.3, 12, 9.2, 8.5, 7, 10.7, 9.3, 8.2)
Age_Y = c(125, 119, 83, 85, 99, 117, 69, 133, 154, 168, 61, 80, 114, 147,
122, 106, 82, 88, 97, 99)
Treedata = data.frame(cbind(Tree_no, Diameter_X, Age_Y))
head(Treedata)

## Tree_no Diameter_X Age_Y


## 1 1 12.0 125
## 2 2 11.4 119
## 3 3 7.9 83
## 4 4 10.5 85
## 5 5 7.9 99
## 6 6 9.0 117

#View(Treedata)
str(Treedata)

## 'data.frame': 20 obs. of 3 variables:


## $ Tree_no : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...

a) Draw a scatterplot of y vs x
Treeplot = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y)) + geom_point() +
labs(Treedata, title = 'Scatterplot of Y (Age) against X (Diameter)', x =
'Tree Diameter', y = 'Tree Age')
Treeplot
"\n"

## [1] "\n"

b) Estimate the Population Mean age of trees in the stand using ratio estimation and
give an approximate standard error for your estimate
D ia m e t e r =A u x il li a r y=x
A g e=D e p e n d e nt= y
n
t x =∑ x i
i=1

n
t y =∑ y i
i=1

t x μx
=
t y μy
μ x∗t y
P o p ul a t i o n M e a n A g e μ y =
tx
S tan De v
S t a n d a r d E r r o r=
√n o f a g e
#Estimating Population Mean Age
meanx = 10.3
totalx = sum(Diameter_X)
cat("Diameter Sample Total =", totalx, "\n")

## Diameter Sample Total = 188.1

totaly = sum(Age_Y)
cat("Age Sample Total =", totaly, "\n")

## Age Sample Total = 2148

meany = meanx * (totaly / totalx)


cat("Population Mean Age =", meany, "\n")

## Population Mean Age = 117.6204

#Approximating Standard Error for above Estimation


StanError = sd(Age_Y) / (sqrt(length(Age_Y)))
cat("Standard Error=", StanError, "\n")

## Standard Error= 6.40904

cat("Therefore, the estimated population mean age of trees is", meany, "while
the standard error for said estimation is", StanError, "\n")

## Therefore, the estimated population mean age of trees is 117.6204 while


the standard error for said estimation is 6.40904

c) Repeat b) using Regression Estimation


n∑ x∗y − ∑ x∗∑ y
b i= 2 2
n∗∑ x − ( ∑ x )
y=b0 +b 1 x
#Creating a Subset of the Treedata
Regdata = Treedata
Regdata[4] = Regdata[2] * Regdata[3]
Regdata[5] = Regdata[2]^2
Regdata[6] = Regdata[3]^2
colnames(Regdata) = c("Tree_no", 'Diameter_X', "Age_Y", "XY", "Xsqr", "Ysqr")
Regdata = Regdata %>%
mutate_at(vars(1), as.character)
Totals = Regdata %>%
summarize(Tree_no = "Totals",
Diameter_X = sum(Regdata$Diameter_X),
Age_Y = sum(Regdata$Age_Y),
XY = sum(Regdata$XY),
Xsqr = sum(Regdata$Xsqr),
Ysqr = sum(Regdata$Ysqr))
Regdata = bind_rows(Regdata, Totals)
#Regdata = Regdata %>%
# mutate(Regdata$Tree_no[21], "Totals")
str(Regdata)

## 'data.frame': 21 obs. of 6 variables:


## $ Tree_no : chr "1" "2" "3" "4" ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...
## $ XY : num 1500 1357 656 892 782 ...
## $ Xsqr : num 144 130 62.4 110.2 62.4 ...
## $ Ysqr : num 15625 14161 6889 7225 9801 ...

head(Regdata)

## Tree_no Diameter_X Age_Y XY Xsqr Ysqr


## 1 1 12.0 125 1500.0 144.00 15625
## 2 2 11.4 119 1356.6 129.96 14161
## 3 3 7.9 83 655.7 62.41 6889
## 4 4 10.5 85 892.5 110.25 7225
## 5 5 7.9 99 782.1 62.41 9801
## 6 6 9.0 117 1053.0 81.00 13689

#Fitting the Linear Regression Model


Treemodel = lm(Age_Y ~ Diameter_X, data = Regdata)
summary(Treemodel)

##
## Call:
## lm(formula = Age_Y ~ Diameter_X, data = Regdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.892 -11.171 -0.288 9.977 38.971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03031 4.33635 -0.007 0.994
## Diameter_X 11.42115 0.10301 110.874 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.98 on 19 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9984
## F-statistic: 1.229e+04 on 1 and 19 DF, p-value: < 2.2e-16

Intercept = coef(Treemodel)[1]
Slope = coef(Treemodel)[2]
cat("The slope of the model is", Slope, "while the intercept is", Intercept,
"\n")

## The slope of the model is 11.42115 while the intercept is -0.03030842


Estimate = Intercept + (Slope *meanx)

The Regression is given by:


a g e=−7.63+ 12.23 d i a m e t e r
#Approximating Standard Error for the above Estimation
Residuals = resid(Treemodel)
ResStanDev = sd(Residuals)
SE = ResStanDev * sqrt(1 + 1/length(Age_Y) + ((meanx - mean(Diameter_X))^2 /
sum((Diameter_X - mean(Diameter_X))^2)))
cat("The Regression Estimate of the Population Mean Age is", Estimate, "while
the Standard Error for the above estimation is", SE, "\n")

## The Regression Estimate of the Population Mean Age is 117.6075 while the
Standard Error for the above estimation is 18.0675

d) Label your Estimates on your graph. How do they compare?


Ratio = c(10.3, meany)
Regression = c(10.3, Estimate)
Est = data.frame(rbind(Ratio, Regression))
Est

## V1 X.Intercept.
## Ratio 10.3 117.6204
## Regression 10.3 117.6075

colnames(Est) = c("X", "Y")


X = Est$X
Y = Est$Y
EstGraph = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y), color = "blue",
shape = 15) + geom_point() + labs(Treedata, title = 'Scatterplot of Y (Age)
against X (Diameter)\n with Estimates & Regression Line', x = 'Tree
Diameter', y = 'Tree Age') + geom_abline(intercept = Intercept, slope =
Slope, color = "red") + geom_point(data = Est, aes(x = X, y = Y), color =
"red", shape = 15)
EstGraph
"\n"

## [1] "\n"

From the above graph it is clear to see that both the Ratio Estimation & The Regression
Estimation of the Population Mean Age are very close in their value however the Standard
Error for the methods vary greatly

Question 2
The new candy Green Goobules is being test-marketed in an area of upstate NY. The market
research firm decided to sample 6 cities from the 45 in the area & then to sample
supermarkets within the cities, wanting to know the no. of Green Gobules cases sold
a) Obtain summary statistics for each cluster
# Create the Data Sets
cluster_1 = c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)
cluster_2 = c(99, 101, 52, 121)
cluster_3 = c(199, 179, 98, 63, 126, 87, 62)
cluster_4 = c(226, 129, 57, 46, 86, 43, 85, 165)
cluster_5 = c(12, 23)
cluster_6 = c(87, 43, 59)

#Cluster 1
summary(cluster_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 72.0 147.5 175.5 177.3 184.8 361.0

sd1 = sd(cluster_1)
var1 = var(cluster_1)
cat("The Standard Deviation of Cluster 1 is", sd1, "while the Variance is",
var1, "\n", "\n")

## The Standard Deviation of Cluster 1 is 83.59964 while the Variance is


6988.9
##

#Cluster 2
summary(cluster_2)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 52.00 87.25 100.00 93.25 106.00 121.00

sd2 = sd(cluster_2)
var2 = var(cluster_2)
cat("The Standard Deviation of Cluster 2 is", sd2, "while the Variance is",
var2, "\n", "\n")

## The Standard Deviation of Cluster 2 is 29.23896 while the Variance is


854.9167
##

#Cluster 3
summary(cluster_3)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 62.0 75.0 98.0 116.3 152.5 199.0

sd3 = sd(cluster_3)
var3 = var(cluster_3)
cat("The Standard Deviation of Cluster 3 is", sd3, "while the Variance is",
var3, "\n", "\n")

## The Standard Deviation of Cluster 3 is 54.53963 while the Variance is


2974.571
##

#Cluster 4
summary(cluster_4)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 43.00 54.25 85.50 104.62 138.00 226.00

sd4 = sd(cluster_4)
var4 = var(cluster_4)
cat("The Standard Deviation of Cluster 4 is", sd4, "while the Variance is",
var4, "\n", "\n")
## The Standard Deviation of Cluster 4 is 64.59309 while the Variance is
4172.268
##

#Cluster 5
summary(cluster_5)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 12.00 14.75 17.50 17.50 20.25 23.00

sd5 = sd(cluster_5)
var5 = var(cluster_5)
cat("The Standard Deviation of Cluster 5 is", sd5, "while the Variance is",
var5, "\n", "\n")

## The Standard Deviation of Cluster 5 is 7.778175 while the Variance is 60.5

##

#Cluster 6
summary(cluster_6)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 43 51 59 63 73 87

sd6 = sd(cluster_6)
var6 = var(cluster_6)
cat("The Standard Deviation of Cluster 1 is", sd6, "while the Variance is",
var6, "\n", "\n")

## The Standard Deviation of Cluster 1 is 22.27106 while the Variance is 496


##

b) Estimate the total number of cases sold, and the average number sold per
supermarket, along with the standard errors of your Estimates
# Create the Data Set
City = c(1, 2, 3, 4, 5, 6)
Supermarket_No = c(52, 19, 37, 39, 8, 14)
"cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)))
cluster_2 = I(list(c(99, 101, 52, 121)))
cluster_3 = I(list(c(199, 179, 98, 63, 126, 87, 62)))
cluster_4 = I(list(c(226, 129, 57, 46, 86, 43, 85, 165)))
cluster_5 = I(list(c(12, 23)))
cluster_6 = I(list(c(87, 43, 59)))"

## [1] "cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73,
186)))\ncluster_2 = I(list(c(99, 101, 52, 121)))\ncluster_3 = I(list(c(199,
179, 98, 63, 126, 87, 62)))\ncluster_4 = I(list(c(226, 129, 57, 46, 86, 43,
85, 165)))\ncluster_5 = I(list(c(12, 23)))\ncluster_6 = I(list(c(87, 43,
59)))"
Cases_Sold = data.frame(Clusters = c("Cluster_1", "Cluster_2", "Cluster_3",
"Cluster_4", "Cluster_5", "Cluster_6"), Values = I(list(c(146, 180, 251, 152,
72, 181, 171, 361, 73, 186), c(99, 101, 52, 121), c(199, 179, 98, 63, 126,
87, 62), c(226, 129, 57, 46, 86, 43, 85, 165), c(12, 23), c(87, 43, 59))))
#Cases_Sold = bind_rows(
# tibble(Cluster = "cluster_1", Value = cluster_1),
# tibble(Cluster = "cluster_2", Value = cluster_2),
# tibble(Cluster = "cluster_3", Value = cluster_3),
# tibble(Cluster = "cluster_4", Value = cluster_4),
# tibble(Cluster = "cluster_5", Value = cluster_5),
# tibble(Cluster = "cluster_6", Value = cluster_6)
#)
#Cases_Sold
#Cases_Sold = tidyr::gather(data.frame(cluster_1, cluster_2, cluster_3,
cluster_4, cluster_5, cluster_6), key = "Cluster", value = "value")
#Cases_Sold = data.frame(var1 = rbind(cluster_1, cluster_2, cluster_3,
cluster_4, cluster_5, cluster_6))
View(Cases_Sold)
Green_Gobules = cbind(City, Supermarket_No, Cases_Sold[2])
View(Green_Gobules)
#Green_Gobules = Green_Gobules %>%
# mutate_at(vars(3), as.numeric)
mean(Green_Gobules[1, 3])

## Warning in mean.default(Green_Gobules[1, 3]): argument is not numeric or


## logical: returning NA

## [1] NA

You might also like