Professional Documents
Culture Documents
Victor Omondi
7/27/2021
## -- Conflicts ------------------------------------------
tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the STEM majors counts (Major, College, Count) from Excel.
# The original had the path string hard-wrapped across two lines, which is a
# syntax error (unterminated string constant).
stem_majors <- read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/STEM_Majors.xlsx")
stem_majors
## # A tibble: 16 x 3
## Major College Count
## <chr> <chr> <dbl>
## 1 BIOCHEM H 10
## 2 BIOCHEM WS 11
## 3 BIOL H 17
## 4 BIOL WS 59
## 5 CHEM H 8
## 6 CHEM WS 13
## 7 CS H 13
## 8 CS WS 4
## 9 GEO H 12
## 10 GEO WS 19
## 11 MATH H 2
## 12 MATH WS 1
## 13 PHYS H 7
## 14 PHYS WS 3
## 15 PSYCH H 6
## 16 PSYCH WS 19
# Build the Major x College contingency table and test for independence.
# (Fixed the misspelled "contigency" and avoided a dot in the variable name.)
stem_majors_contingency <-
  pivot_wider(stem_majors, names_from = College, values_from = Count)
# Contingency table using the tidyr library.
# Recreate as a matrix so chisq.test() treats it as an r x c table.
mat1 <- as.matrix(stem_majors_contingency[-1])
rownames(mat1) <-
  c("BIOCHEM", "BIOL", "CHEM", "CS", "GEO", "MATH", "PHYS", "PSYCH")
colnames(mat1) <- c("H", "WS")
chisqtest <- chisq.test(mat1)  # Pearson's chi-squared test of independence
chisqtest
##
## Pearson's Chi-squared test
##
## data: mat1
## X-squared = 27.091, df = 7, p-value = 0.0003209
## H WS
## BIOCHEM 7.720588 13.279412
## BIOL 27.941176 48.058824
## CHEM 7.720588 13.279412
## CS 6.250000 10.750000
## GEO 11.397059 19.602941
## MATH 1.102941 1.897059
## PHYS 3.676471 6.323529
## PSYCH 9.191176 15.808824
# Flip a fair coin n times for increasing n and count the heads.
# The original repeated the same sample() call four times (with the identifier
# "experiment2" broken across two lines — a syntax error) and used `rep = T`
# (partial argument matching plus the reassignable T). Generalized to a single
# vectorized loop over the trial sizes.
n.trials <- c(2, 4, 8, 16)
freq.heads <- vapply(
  n.trials,
  function(n) sum(sample(c("Heads", "Tails"), n, replace = TRUE) == "Heads"),
  numeric(1)
)
plot(n.trials, freq.heads)
# In the original run, 16 trials produced 8 heads: the observed frequency of
# heads only approaches the expected 50% as the number of trials grows.
library(tidyverse)
iris.d <- read.csv("C:/Users/Adm/Downloads/Re_ RSTUDIO/Data integrity.csv")
library(pwr)
# Effect size (Cohen's d) for a one-sample t-test against mu0 = 5:
# d = |mu0 - xbar| / s. The original divided by sd(...)^2 (the variance),
# which is not Cohen's d and distorts the resulting sample-size estimate.
d <- abs(5 - mean(iris.d$Length)) / sd(iris.d$Length)  # effect size
# Solve for the sample size n needed for 80% power at alpha = 0.01.
power.test <- pwr.t.test(n = NULL, d = d, sig.level = 0.01, power = 0.8,
                         type = "one.sample")
power.test
##
## One-sample t test power calculation
##
## n = 10.1516
## d = 1.316928
## sig.level = 0.01
## power = 0.8
## alternative = two.sided
## 1 2 3 4 5 6 7 8
## 168.9733 177.8012 167.9347 161.1840 176.7626 167.4154 155.4718 188.1869
## 9 10 11
## 180.9169 169.4926 157.8605
This is interpolation because interpolation is the process of using the line of best fit to
estimate the value of one variable from the value of another, provided that the value you
are using is within the range of your data.
# Regression analysis of right-foot length vs height, continued.
# NOTE(review): mod1 is fitted elsewhere in the document; per the Call shown in
# the summary output below it is lm(foot$`height (cm)` ~ foot$`Rt foot (cm)`).
confint(mod1)#95% confidence intervals for the regression coefficients
## 2.5 % 97.5 %
## (Intercept) -20.074344 98.377015
## foot$`Rt foot (cm)` 2.851919 7.533838
summary(mod1) # coefficient estimates, R-squared and overall F-test for the fit
##
## Call:
## lm(formula = foot$`height (cm)` ~ foot$`Rt foot (cm)`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.917 -3.368 -1.184 4.632 8.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.151 26.181 1.495 0.169022
## foot$`Rt foot (cm)` 5.193 1.035 5.018 0.000721 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.316 on 9 degrees of freedom
## Multiple R-squared: 0.7367, Adjusted R-squared: 0.7074
## F-statistic: 25.18 on 1 and 9 DF, p-value: 0.000721
library(forecast)
library(tidyquant)
##
## Attaching package: 'lubridate'
##
## Attaching package: 'zoo'
##
## Attaching package: 'xts'
##
## Attaching package: 'PerformanceAnalytics'
library(dplyr)
# Load Geneva daily temperature records.
geneva <- read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/GENEVATEMPS.xlsx")
colSums(is.na(geneva))  # check for missing data per column
# Replace missing average temperatures with the median (median imputation).
# (The original's trailing comment was hard-wrapped onto an uncommented line,
# which was a syntax error.)
geneva$AVE[is.na(geneva$AVE)] <- median(geneva$AVE, na.rm = TRUE)
colSums(is.na(geneva))  # confirm the imputation removed all missing values
geneva$AVE <- ts(geneva$AVE)
# Time series start and end dates.
ndx <- seq(as.Date("2010-01-01"), as.Date("2019-11-08"), by = "day")
# Create a daily time series starting 2010-01-01; assumes column 4 of geneva
# holds the average temperature — TODO confirm against the spreadsheet layout.
myts <- ts(geneva[4], start = c(2010, as.numeric(format(ndx[1], "%j"))),
           frequency = 365)
plot(myts, main = "Time Series Plot")
# Decompose into seasonal / trend / remainder components.
stl.ts <- stl(ts(geneva$AVE, frequency = 365), s.window = "periodic")
autoplot(stl.ts)  # the trend component shows declining temperatures
# Time series 2
library(readxl)
library(tidyverse)
library(timeSeries)
library(forecast)
library(TTR)
geneva <- read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/GENEVATEMPS.xlsx")
geneva <- na.omit(geneva)  # this time drop rows with missing values entirely
geneva$AVE <- ts(geneva$AVE)  # transform daily average temp to a ts() object
# The plot of the smoothed temperatures shows a constant trend (no change in
# trend). (The original comment was hard-wrapped onto an uncommented line —
# a syntax error — now repaired.)
# 2: Double exponential smoothing — gamma = FALSE disables the seasonal term.
# NOTE(review): this smooths `myts`, which was built in the previous section,
# not the freshly reloaded `geneva` — confirm that is intended.
exp.smooth <- HoltWinters(myts, gamma = FALSE)
plot(exp.smooth$fitted)
## 1-Jan-2010 to 31-Dec-2018
ndx <- seq(as.Date("2010-01-01"), as.Date("2018-12-31"), by = "day")
myts <- ts(geneva[4], start = c(2010, as.numeric(format(ndx[1], "%j"))),
           frequency = 365)  # create the training time series
# NOTE(review): arima() with no order argument fits ARIMA(0,0,0), i.e. a
# constant-mean model — confirm this is intended rather than auto.arima(myts).
fit <- arima(myts)  # create ARIMA model
# 3: Ljung-Box test for autocorrelation in the residuals.
Box.test(fit$residuals, type = "Ljung-Box")  # may include lag = 30
##
## Box-Ljung test
##
## data: fit$residuals
## X-squared = 3150.2, df = 1, p-value < 2.2e-16
# Forecast comparison: actual January 2019 measurements.
# (The original had the end-date string hard-wrapped across two lines — an
# unterminated string constant — now rejoined.)
actual.geneva <- subset(geneva, DATE >= "2019-01-01 UTC" & DATE <= "2019-01-31 UTC")
actual.geneva$AVE
## [1] 42.0 30.0 29.0 30.5 38.0 34.0 24.5 24.0 37.0 27.5 16.5 12.5 13.5 12.0 18.5
## [16] 27.5 20.5 18.0 22.5 13.5 2.0 1.5 16.0 35.0 30.5 15.5 16.5 17.0 14.5 13.5