
BS at HSW

Victor Omondi

7/27/2021

# STEM Majors Chi-square


library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

stem_majors <- read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/STEM_Majors.xlsx")
stem_majors

## # A tibble: 16 x 3
##    Major   College Count
##    <chr>   <chr>   <dbl>
##  1 BIOCHEM H          10
##  2 BIOCHEM WS         11
##  3 BIOL    H          17
##  4 BIOL    WS         59
##  5 CHEM    H           8
##  6 CHEM    WS         13
##  7 CS      H          13
##  8 CS      WS          4
##  9 GEO     H          12
## 10 GEO     WS         19
## 11 MATH    H           2
## 12 MATH    WS          1
## 13 PHYS    H           7
## 14 PHYS    WS          3
## 15 PSYCH   H           6
## 16 PSYCH   WS         19
stem_majors.contingency <- pivot_wider(stem_majors, names_from = College,
                                       values_from = Count)  # contingency table via tidyr
# recreate the matrix for chisq.test
mat1 <- as.matrix(stem_majors.contingency[-1])
rownames(mat1) <- c('BIOCHEM', 'BIOL', 'CHEM', 'CS', 'GEO', 'MATH', 'PHYS', 'PSYCH')
colnames(mat1) <- c('H', 'WS')
chisqtest <- chisq.test(mat1)  # chi-squared test of independence

## Warning in chisq.test(mat1): Chi-squared approximation may be incorrect

chisqtest

##
## Pearson's Chi-squared test
##
## data: mat1
## X-squared = 27.091, df = 7, p-value = 0.0003209

chisqtest$expected  # expected values table

## H WS
## BIOCHEM 7.720588 13.279412
## BIOL 27.941176 48.058824
## CHEM 7.720588 13.279412
## CS 6.250000 10.750000
## GEO 11.397059 19.602941
## MATH 1.102941 1.897059
## PHYS 3.676471 6.323529
## PSYCH 9.191176 15.808824
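The warning raised by chisq.test() stems from the expected table itself: the MATH and PHYS rows have expected counts below 5, which is where the chi-squared approximation becomes unreliable. A minimal sketch of one standard remedy, a Monte Carlo p-value (the replicate count B = 10000 is an illustrative choice, not from the original analysis):

chisq.test(mat1, simulate.p.value = TRUE, B = 10000)  # simulated p-value; avoids the small-count approximation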

# Flip a coin
n <- 2
experiment  <- sample(c("Heads", "Tails"), n, replace = TRUE)
n <- 4
experiment1 <- sample(c("Heads", "Tails"), n, replace = TRUE)
n <- 8
experiment2 <- sample(c("Heads", "Tails"), n, replace = TRUE)
n <- 16
experiment3 <- sample(c("Heads", "Tails"), n, replace = TRUE)
n.trials <- c(2, 4, 8, 16)
freq.heads <- c(sum(experiment == "Heads"), sum(experiment1 == "Heads"),
                sum(experiment2 == "Heads"), sum(experiment3 == "Heads"))
plot(n.trials, freq.heads)
# The 16-trial run produced 8 heads, i.e. exactly 50%; the larger the number of
# trials, the closer the observed frequency tends to sit to the expected 50%.
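Since the four experiments differ only in the number of flips, the same simulation can be expressed as a single loop over the trial sizes; a compact sketch (the seed is an addition for reproducibility, not in the original):

set.seed(1)  # illustrative seed so the run is reproducible
n.trials <- c(2, 4, 8, 16)
freq.heads <- sapply(n.trials, function(n)
  sum(sample(c("Heads", "Tails"), n, replace = TRUE) == "Heads"))
plot(n.trials, freq.heads / n.trials, ylab = "Proportion of heads")
abline(h = 0.5, lty = 2)  # the expected long-run proportion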

## Part 2: Sample size in Villosa iris

library(tidyverse)
iris.d<-read.csv("C:/Users/Adm/Downloads/Re_ RSTUDIO/Data integrity.csv")
library('pwr')
d <- abs(5 - mean(iris.d$Length)) / (sd(iris.d$Length)**2)  # effect size
power.test <- pwr.t.test(n = NULL, d = d, sig.level = 0.01, power = 0.8,
                         type = "one.sample")
power.test

##
## One-sample t test power calculation
##
## n = 10.1516
## d = 1.316928
## sig.level = 0.01
## power = 0.8
## alternative = two.sided
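One caveat about the effect size above: Cohen's d conventionally divides the mean difference by the standard deviation, not the variance. A minimal sketch of the corrected computation, reusing the same objects (the resulting d and n will differ from the output shown above):

d.corrected <- abs(5 - mean(iris.d$Length)) / sd(iris.d$Length)  # |mu0 - xbar| / s
pwr.t.test(n = NULL, d = d.corrected, sig.level = 0.01, power = 0.8,
           type = "one.sample")
ceiling(power.test$n)  # round the required sample size up to a whole subject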

## Predict/reconstruct the height of an ancient human who left a footprint


foot<-read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/Human height.xlsx")
mod1 <- lm(foot$`height (cm)` ~ foot$`Rt foot (cm)`)  # height regressed on right-foot length
left.foot <- runif(length(foot$`Rt foot (cm)`), 22, 26)  # simulated foot lengths between 22 and 26 cm
predict(mod1, data.frame(left.foot))

## 1 2 3 4 5 6 7 8
## 168.9733 177.8012 167.9347 161.1840 176.7626 167.4154 155.4718 188.1869
## 9 10 11
## 180.9169 169.4926 157.8605

This is interpolation: the line of best fit is used to estimate the value of one variable from the value of another, provided that the value you are using lies within the range of your data.
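A caveat on the predict() call above: because mod1 was fit with foot$ inside the formula, predict() cannot match any column of newdata to the predictor, so it falls back to the original data and returns the 11 fitted values rather than predictions at the simulated foot lengths. A minimal sketch of the idiomatic form that does use them (mod2 and newdat are illustrative names):

mod2 <- lm(`height (cm)` ~ `Rt foot (cm)`, data = foot)  # fit with a data argument
newdat <- data.frame(`Rt foot (cm)` = left.foot, check.names = FALSE)  # keep the backticked name
predict(mod2, newdata = newdat)  # heights predicted at the simulated foot lengths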
#Regression analysis of foot and height continued
confint(mod1)#confidence interval of regression coefficient

## 2.5 % 97.5 %
## (Intercept) -20.074344 98.377015
## foot$`Rt foot (cm)` 2.851919 7.533838

summary(mod1)

##
## Call:
## lm(formula = foot$`height (cm)` ~ foot$`Rt foot (cm)`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.917 -3.368 -1.184 4.632 8.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.151 26.181 1.495 0.169022
## foot$`Rt foot (cm)` 5.193 1.035 5.018 0.000721 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.316 on 9 degrees of freedom
## Multiple R-squared: 0.7367, Adjusted R-squared: 0.7074
## F-statistic: 25.18 on 1 and 9 DF, p-value: 0.000721

# The slope for right-foot length is statistically significant (p = 0.000721 < 0.05).
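From the coefficient table, the fitted line is height ≈ 39.151 + 5.193 × foot length (both in cm), so a footprint of, say, 24 cm reconstructs a height of roughly 39.151 + 5.193 × 24 ≈ 163.8 cm.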


plot(mod1)  # regression diagnostic plots

# Time Series Analysis (1)
#install.packages('timeSeries')
#install.packages("forecast")
#install.packages("tidyquant")
library(tidyverse)
library(timeSeries)

## Loading required package: timeDate

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(tidyquant)

## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:timeSeries':
## 
##     time<-
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following objects are masked from 'package:timeDate':
## 
##     kurtosis, skewness
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## 
## == Need to Learn tidyquant? ====================================================
## Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>

library(dplyr)
geneva<-read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/GENEVATEMPS.xlsx")
colSums(is.na(geneva)) #check for missing data per column

## DATE TMAX TMIN  AVE
##    0   10   13   13

geneva$AVE[is.na(geneva$AVE)] <- median(geneva$AVE, na.rm = TRUE)  # replace missing average temperatures with the median (median imputation)
colSums(is.na(geneva))

## DATE TMAX TMIN  AVE
##    0   10   13    0

geneva$AVE <- ts(geneva$AVE)
# Time series start and end
ndx <- seq(as.Date("2010-01-01"), as.Date("2019-11-08"), by = "day")
myts <- ts(geneva[4], start = c(2010, as.numeric(format(ndx[1], "%j"))),
           frequency = 365)  # create the daily time series
plot(myts, main = "Time Series Plot")
stl.ts <- stl(ts(geneva$AVE, frequency = 365), s.window = "periodic")
autoplot(stl.ts)  # the trend component shows declining temperatures
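stl() stores the seasonal, trend, and remainder components in its time.series matrix, so the trend can also be inspected on its own; a brief sketch:

head(stl.ts$time.series)  # columns: seasonal, trend, remainder
plot(stl.ts$time.series[, "trend"], ylab = "Trend",
     main = "Trend component of average temperature")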

# Time Series Analysis (2)
library(readxl)
library(tidyverse)
library(timeSeries)
library(forecast)
library(TTR)
geneva<-read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/GENEVATEMPS.xlsx")

geneva<-na.omit(geneva)
geneva$AVE <- ts(geneva$AVE)  # convert daily average temperature to a ts() object

# 1. Moving-average smoothing
ma.smooth <- SMA(myts)  # simple moving average (TTR default: 10-observation window)
plot.ts(ma.smooth)

# The plot of the smoothed temperatures shows a constant trend (no change in trend).
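SMA() defaults to a 10-observation window; a wider window smooths more aggressively. A sketch with an illustrative 30-day window (the window length is an assumption, not from the original):

ma.smooth30 <- SMA(myts, n = 30)  # 30-day moving average (illustrative window)
plot.ts(ma.smooth30)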

# 2. Double exponential smoothing (gamma = FALSE disables the seasonal component)
exp.smooth <- HoltWinters(myts, gamma = FALSE)
plot(exp.smooth$fitted)
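With gamma = FALSE, HoltWinters() estimates only a level and a trend (Holt's double exponential smoothing); the fitted smoothing parameters can be read directly off the returned object:

exp.smooth$alpha  # level smoothing parameter
exp.smooth$beta   # trend smoothing parameter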
# 1-Jan-2010 to 31-Dec-2018
ndx <- seq(as.Date("2010-01-01"), as.Date("2018-12-31"), by = "day")
myts <- ts(geneva[4], start = c(2010, as.numeric(format(ndx[1], "%j"))),
           frequency = 365)  # recreate the time series over the training window
fit <- arima(myts)  # create the ARIMA model (no order specified)

# 3. Test the residuals for autocorrelation
Box.test(fit$residuals, type = "Ljung-Box")  # may include lag = 30

##
## Box-Ljung test
##
## data: fit$residuals
## X-squared = 3150.2, df = 1, p-value < 2.2e-16
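The vanishingly small p-value is expected here: with no order argument, arima() fits order c(0, 0, 0), so the residuals are just deviations from the series mean and remain strongly autocorrelated. A sketch of one way to let the order be chosen from the data, using forecast::auto.arima (this extension is an assumption, not part of the original analysis):

fit2 <- auto.arima(myts)  # searches over p, d, q (and seasonal terms)
Box.test(fit2$residuals, type = "Ljung-Box", lag = 30)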

# 4. Predict the first 30 days of 2019

library(forecast)
prediction <- forecast(fit, h = 30)  # forecast() generic; forecast.Arima was not working directly

# Compare the forecast values with the actual measurements
actual.geneva <- subset(geneva, DATE >= "2019-01-01 UTC" & DATE <= "2019-01-31 UTC")  # actual measurements
actual.geneva$AVE

##  [1] 42.0 30.0 29.0 30.5 38.0 34.0 24.5 24.0 37.0 27.5 16.5 12.5 13.5 12.0 18.5
## [16] 27.5 20.5 18.0 22.5 13.5  2.0  1.5 16.0 35.0 30.5 15.5 16.5 17.0 14.5 13.5
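With the forecasts and the observed January values side by side, the forecast error can be quantified; a minimal sketch using forecast::accuracy:

accuracy(prediction, actual.geneva$AVE)  # ME, RMSE, MAE, etc. on the test set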
