Professional Documents
Culture Documents
STA2050 Assignment 2
STA2050 Assignment 2
2023-11-28
library(VIM)
##
## Attaching package: 'VIM'
library(nnet)
library(readxl)
library(csv)
library(dplyr)
##
## Attaching package: 'dplyr'
## ── Conflicts ──────────────────────────────────────────
tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all
conflicts to become errors
library(finalfit)
library(lubridate)
library(tidyr)
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(lme4)
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tibble)
Question 1
# Build the tree data set: one row per sampled tree (20 trees).
Tree_no <- 1:20
Diameter_X <- c(
  12, 11.4, 7.9, 10.5, 7.9, 9, 7.3, 10.2, 11.7, 11.3,
  5.7, 8, 10.3, 12, 9.2, 8.5, 7, 10.7, 9.3, 8.2
)
Age_Y <- c(
  125, 119, 83, 85, 99, 117, 69, 133, 154, 168,
  61, 80, 114, 147, 122, 106, 82, 88, 97, 99
)
# cbind() first builds a numeric matrix, so every column (including Tree_no)
# is stored as double in the resulting data frame.
Treedata <- data.frame(cbind(Tree_no, Diameter_X, Age_Y))
# Quick look at the first rows and the column structure.
head(Treedata)
#View(Treedata)
str(Treedata)
a) Draw a scatterplot of y vs x
# Scatterplot of tree age (y) against tree diameter (x).
# labs() only takes name-value label pairs, so the stray positional
# `Treedata` argument that used to be passed to it has been removed.
Treeplot <- ggplot(Treedata, aes(x = Diameter_X, y = Age_Y)) +
  geom_point() +
  labs(
    title = 'Scatterplot of Y (Age) against X (Diameter)',
    x = 'Tree Diameter',
    y = 'Tree Age'
  )
# Print the plot.
Treeplot
"\n"
## [1] "\n"
b) Estimate the Population Mean age of trees in the stand using ratio estimation and
give an approximate standard error for your estimate
Diameter = Auxiliary variable = $x$
Age = Dependent variable = $y$
$$t_x = \sum_{i=1}^{n} x_i$$
$$t_y = \sum_{i=1}^{n} y_i$$
$$\frac{t_x}{t_y} = \frac{\mu_x}{\mu_y}$$
Population Mean Age:
$$\mu_y = \frac{\mu_x \, t_y}{t_x}$$
Standard Error:
$$\text{Standard Error} = \frac{\text{StanDev of age}}{\sqrt{n}}$$
# Estimating Population Mean Age by ratio estimation.
# Population mean of the auxiliary variable x (tree diameter), given as a
# constant.
meanx <- 10.3
# Sample totals of the auxiliary (x) and dependent (y) variables.
totalx <- sum(Diameter_X)
cat("Diameter Sample Total =", totalx, "\n")
totaly <- sum(Age_Y)
cat("Age Sample Total =", totaly, "\n")
# Ratio estimate of the population mean age: mu_y = mu_x * t_y / t_x.
# This value was previously used below without ever being assigned.
meany <- meanx * totaly / totalx
# Standard error, following the formula stated in the write-up:
# standard deviation of age divided by sqrt(n).
# NOTE(review): this is the plain standard error of the sample mean of age;
# confirm whether a ratio-residual-based standard error was intended instead.
StanError <- sd(Age_Y) / sqrt(length(Age_Y))
cat("Therefore, the estimated population mean age of trees is", meany, "while
the standard error for said estimation is", StanError, "\n")
# Preview the first rows of Regdata, the data frame named in the lm() call
# shown in the model output below.
# NOTE(review): Regdata is not created anywhere in the visible script --
# confirm where it is defined.
head(Regdata)
##
## Call:
## lm(formula = Age_Y ~ Diameter_X, data = Regdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.892 -11.171 -0.288 9.977 38.971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03031 4.33635 -0.007 0.994
## Diameter_X 11.42115 0.10301 110.874 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.98 on 19 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9984
## F-statistic: 1.229e+04 on 1 and 19 DF, p-value: < 2.2e-16
# Pull the two fitted coefficients out of Treemodel (first = intercept,
# second = slope) and report them.
Intercept <- coef(Treemodel)[1]
Slope <- coef(Treemodel)[2]
cat(
  "The slope of the model is", Slope,
  "while the intercept is", Intercept,
  "\n"
)
## The Regression Estimate of the Population Mean Age is 117.6075 while the
Standard Error for the above estimation is 18.0675
## V1 X.Intercept.
## Ratio 10.3 117.6204
## Regression 10.3 117.6075
## [1] "\n"
From the above graph, it is clear that the Ratio Estimation and the Regression
Estimation of the Population Mean Age are very close in value; however, the
Standard Errors of the two methods vary greatly.
Question 2
The new candy Green Gobules is being test-marketed in an area of upstate NY. The market
research firm decided to sample 6 cities from the 45 in the area and then to sample
supermarkets within the cities, wanting to know the number of Green Gobules cases sold.
a) Obtain summary statistics for each cluster
# Sampled values for each of the six clusters.
cluster_1 <- c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)
cluster_2 <- c(99, 101, 52, 121)
cluster_3 <- c(199, 179, 98, 63, 126, 87, 62)
cluster_4 <- c(226, 129, 57, 46, 86, 43, 85, 165)
cluster_5 <- c(12, 23)
cluster_6 <- c(87, 43, 59)
# Cluster 1: five-number summary with mean, then spread.
summary(cluster_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 72.0 147.5 175.5 177.3 184.8 361.0
# The standard deviation is the square root of the sample variance.
var1 <- var(cluster_1)
sd1 <- sqrt(var1)
cat(
  "The Standard Deviation of Cluster 1 is", sd1,
  "while the Variance is", var1,
  "\n", "\n"
)
# Cluster 2: summary statistics and spread.
summary(cluster_2)
var2 <- var(cluster_2)
sd2 <- sqrt(var2)
cat(
  "The Standard Deviation of Cluster 2 is", sd2,
  "while the Variance is", var2,
  "\n", "\n"
)
# Cluster 3: summary statistics and spread.
summary(cluster_3)
var3 <- var(cluster_3)
sd3 <- sqrt(var3)
cat(
  "The Standard Deviation of Cluster 3 is", sd3,
  "while the Variance is", var3,
  "\n", "\n"
)
# Cluster 4: summary statistics and spread.
summary(cluster_4)
var4 <- var(cluster_4)
sd4 <- sqrt(var4)
cat(
  "The Standard Deviation of Cluster 4 is", sd4,
  "while the Variance is", var4,
  "\n", "\n"
)
## The Standard Deviation of Cluster 4 is 64.59309 while the Variance is
4172.268
##
# Cluster 5: summary statistics and spread.
summary(cluster_5)
var5 <- var(cluster_5)
sd5 <- sqrt(var5)
cat(
  "The Standard Deviation of Cluster 5 is", sd5,
  "while the Variance is", var5,
  "\n", "\n"
)
##
# Cluster 6: summary statistics and spread.
summary(cluster_6)
sd6 <- sd(cluster_6)
var6 <- var(cluster_6)
# The message used to say "Cluster 1" (copy-paste slip) while printing the
# cluster 6 values; the label now matches the data being reported.
cat("The Standard Deviation of Cluster 6 is", sd6, "while the Variance is",
    var6, "\n", "\n")
b) Estimate the total number of cases sold, and the average number sold per
supermarket, along with the standard errors of your Estimates
# Per-city vectors: six cities, each with its Supermarket_No value.
City <- c(1, 2, 3, 4, 5, 6)
Supermarket_No <- c(52, 19, 37, 39, 8, 14)
# The block below is a bare string literal, not executed code: it evaluates to
# itself (and is echoed when run at top level) and assigns nothing.
"cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)))
cluster_2 = I(list(c(99, 101, 52, 121)))
cluster_3 = I(list(c(199, 179, 98, 63, 126, 87, 62)))
cluster_4 = I(list(c(226, 129, 57, 46, 86, 43, 85, 165)))
cluster_5 = I(list(c(12, 23)))
cluster_6 = I(list(c(87, 43, 59)))"
## [1] "cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73,
186)))\ncluster_2 = I(list(c(99, 101, 52, 121)))\ncluster_3 = I(list(c(199,
179, 98, 63, 126, 87, 62)))\ncluster_4 = I(list(c(226, 129, 57, 46, 86, 43,
85, 165)))\ncluster_5 = I(list(c(12, 23)))\ncluster_6 = I(list(c(87, 43,
59)))"
# One row per cluster; `Values` is a list column (wrapped in I() so that
# data.frame() keeps each numeric vector intact as a single cell).
Cases_Sold <- data.frame(
  Clusters = paste0("Cluster_", 1:6),
  Values = I(list(
    c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186),
    c(99, 101, 52, 121),
    c(199, 179, 98, 63, 126, 87, 62),
    c(226, 129, 57, 46, 86, 43, 85, 165),
    c(12, 23),
    c(87, 43, 59)
  ))
)
#Cases_Sold = bind_rows(
# tibble(Cluster = "cluster_1", Value = cluster_1),
# tibble(Cluster = "cluster_2", Value = cluster_2),
# tibble(Cluster = "cluster_3", Value = cluster_3),
# tibble(Cluster = "cluster_4", Value = cluster_4),
# tibble(Cluster = "cluster_5", Value = cluster_5),
# tibble(Cluster = "cluster_6", Value = cluster_6)
#)
#Cases_Sold
#Cases_Sold = tidyr::gather(data.frame(cluster_1, cluster_2, cluster_3,
# cluster_4, cluster_5, cluster_6), key = "Cluster", value = "value")
#Cases_Sold = data.frame(var1 = rbind(cluster_1, cluster_2, cluster_3,
# cluster_4, cluster_5, cluster_6))
# View() opens a GUI data viewer, so only call it in interactive sessions.
if (interactive()) {
  View(Cases_Sold)
}
# Combine City, Supermarket_No and the list column Cases_Sold[2] into one
# data frame (one row per city).
Green_Gobules <- cbind(City, Supermarket_No, Cases_Sold[2])
if (interactive()) {
  View(Green_Gobules)
}
#Green_Gobules = Green_Gobules %>%
# mutate_at(vars(3), as.numeric)
# Column 3 is a list column, so Green_Gobules[1, 3] is a one-element list and
# mean() of a list returns NA. Extract the numeric vector for the first row
# with [[ ]] before averaging.
mean(Green_Gobules[[3]][[1]])
## [1] 177.3