
BS at HSW

Victor Omondi

7/27/2021

# STEM Majors Chi-square


library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

stem_majors <- read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/STEM_Majors.xlsx")
stem_majors

## # A tibble: 16 x 3
##    Major   College Count
##    <chr>   <chr>   <dbl>
##  1 BIOCHEM H          10
##  2 BIOCHEM WS         11
##  3 BIOL    H          17
##  4 BIOL    WS         59
##  5 CHEM    H           8
##  6 CHEM    WS         13
##  7 CS      H          13
##  8 CS      WS          4
##  9 GEO     H          12
## 10 GEO     WS         19
## 11 MATH    H           2
## 12 MATH    WS          1
## 13 PHYS    H           7
## 14 PHYS    WS          3
## 15 PSYCH   H           6
## 16 PSYCH   WS         19
stem_majors.contingency <- pivot_wider(stem_majors, names_from = College,
                                       values_from = Count)  # contingency table via tidyr
# recreate the matrix for chisq.test
mat1 <- as.matrix(stem_majors.contingency[-1])
rownames(mat1) <- c('BIOCHEM', 'BIOL', 'CHEM', 'CS', 'GEO', 'MATH', 'PHYS', 'PSYCH')
colnames(mat1) <- c('H', 'WS')
chisqtest <- chisq.test(mat1)  # chi-squared test of independence

## Warning in chisq.test(mat1): Chi-squared approximation may be incorrect

chisqtest

##
## Pearson's Chi-squared test
##
## data: mat1
## X-squared = 27.091, df = 7, p-value = 0.0003209

chisqtest$expected  # expected values table

## H WS
## BIOCHEM 7.720588 13.279412
## BIOL 27.941176 48.058824
## CHEM 7.720588 13.279412
## CS 6.250000 10.750000
## GEO 11.397059 19.602941
## MATH 1.102941 1.897059
## PHYS 3.676471 6.323529
## PSYCH 9.191176 15.808824
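The warning raised by chisq.test() stems from the expected table itself: the MATH and PHYS rows have expected counts below 5, which is where the chi-squared approximation becomes unreliable. A minimal sketch of one standard remedy, a Monte Carlo p-value (the replicate count B = 10000 is an illustrative choice, not from the original analysis):

chisq.test(mat1, simulate.p.value = TRUE, B = 10000)  # simulated p-value; avoids the small-count approximation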

# Flip a coin
n <- 2
experiment  <- sample(c("Heads", "Tails"), n, replace = TRUE)
n <- 4
experiment1 <- sample(c("Heads", "Tails"), n, replace = TRUE)
n <- 8
experiment2 <- sample(c("Heads", "Tails"), n, replace = TRUE)
n <- 16
experiment3 <- sample(c("Heads", "Tails"), n, replace = TRUE)
n.trials <- c(2, 4, 8, 16)
freq.heads <- c(sum(experiment == "Heads"), sum(experiment1 == "Heads"),
                sum(experiment2 == "Heads"), sum(experiment3 == "Heads"))
plot(n.trials, freq.heads)
# The 16-trial run produced 8 heads, i.e. exactly 50%; the larger the number of
# trials, the closer the observed frequency tends to sit to the expected 50%.
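Since the four experiments differ only in the number of flips, the same simulation can be expressed as a single loop over the trial sizes; a compact sketch (the seed is an addition for reproducibility, not in the original):

set.seed(1)  # illustrative seed so the run is reproducible
n.trials <- c(2, 4, 8, 16)
freq.heads <- sapply(n.trials, function(n)
  sum(sample(c("Heads", "Tails"), n, replace = TRUE) == "Heads"))
plot(n.trials, freq.heads / n.trials, ylab = "Proportion of heads")
abline(h = 0.5, lty = 2)  # the expected long-run proportion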

## Part 2: Sample size in Villosa iris

library(tidyverse)
iris.d<-read.csv("C:/Users/Adm/Downloads/Re_ RSTUDIO/Data integrity.csv")
library('pwr')
d <- abs(5 - mean(iris.d$Length)) / (sd(iris.d$Length)**2)  # effect size
power.test <- pwr.t.test(n = NULL, d = d, sig.level = 0.01, power = 0.8,
                         type = "one.sample")
power.test

##
## One-sample t test power calculation
##
## n = 10.1516
## d = 1.316928
## sig.level = 0.01
## power = 0.8
## alternative = two.sided
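One caveat about the effect size above: Cohen's d conventionally divides the mean difference by the standard deviation, not the variance. A minimal sketch of the corrected computation, reusing the same objects (the resulting d and n will differ from the output shown above):

d.corrected <- abs(5 - mean(iris.d$Length)) / sd(iris.d$Length)  # |mu0 - xbar| / s
pwr.t.test(n = NULL, d = d.corrected, sig.level = 0.01, power = 0.8,
           type = "one.sample")
ceiling(power.test$n)  # round the required sample size up to a whole subject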

## Predict/reconstruct the height of an ancient human who left a footprint


foot<-read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/Human height.xlsx")
mod1 <- lm(foot$`height (cm)` ~ foot$`Rt foot (cm)`)  # height regressed on right-foot length
left.foot <- runif(length(foot$`Rt foot (cm)`), 22, 26)  # simulated foot lengths between 22 and 26 cm
predict(mod1, data.frame(left.foot))

## 1 2 3 4 5 6 7 8
## 168.9733 177.8012 167.9347 161.1840 176.7626 167.4154 155.4718 188.1869
## 9 10 11
## 180.9169 169.4926 157.8605

This is interpolation: the line of best fit is used to estimate the value of one variable from the value of another, provided that the value you are using lies within the range of your data.
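A caveat on the predict() call above: because mod1 was fit with foot$ inside the formula, predict() cannot match any column of newdata to the predictor, so it falls back to the original data and returns the 11 fitted values rather than predictions at the simulated foot lengths. A minimal sketch of the idiomatic form that does use them (mod2 and newdat are illustrative names):

mod2 <- lm(`height (cm)` ~ `Rt foot (cm)`, data = foot)  # fit with a data argument
newdat <- data.frame(`Rt foot (cm)` = left.foot, check.names = FALSE)  # keep the backticked name
predict(mod2, newdata = newdat)  # heights predicted at the simulated foot lengths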
#Regression analysis of foot and height continued
confint(mod1)#confidence interval of regression coefficient

## 2.5 % 97.5 %
## (Intercept) -20.074344 98.377015
## foot$`Rt foot (cm)` 2.851919 7.533838

summary(mod1)

##
## Call:
## lm(formula = foot$`height (cm)` ~ foot$`Rt foot (cm)`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.917 -3.368 -1.184 4.632 8.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.151 26.181 1.495 0.169022
## foot$`Rt foot (cm)` 5.193 1.035 5.018 0.000721 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.316 on 9 degrees of freedom
## Multiple R-squared: 0.7367, Adjusted R-squared: 0.7074
## F-statistic: 25.18 on 1 and 9 DF, p-value: 0.000721

# The slope for right-foot length is statistically significant (p = 0.000721 < 0.05).
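From the coefficient table, the fitted line is height ≈ 39.151 + 5.193 × foot length (both in cm), so a footprint of, say, 24 cm reconstructs a height of roughly 39.151 + 5.193 × 24 ≈ 163.8 cm.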


plot(mod1)  # regression diagnostic plots

# Time Series Analysis (1)
#install.packages('timeSeries')
#install.packages("forecast")
#install.packages("tidyquant")
library(tidyverse)
library(timeSeries)

## Loading required package: timeDate

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(tidyquant)

## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:timeSeries':
## 
##     time<-
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following objects are masked from 'package:timeDate':
## 
##     kurtosis, skewness
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## 
## == Need to Learn tidyquant? ====================================================
## Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>

library(dplyr)
geneva<-read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/GENEVATEMPS.xlsx")
colSums(is.na(geneva)) #check for missing data per column

## DATE TMAX TMIN  AVE
##    0   10   13   13

geneva$AVE[is.na(geneva$AVE)] <- median(geneva$AVE, na.rm = TRUE)  # replace missing average temperatures with the median (median imputation)
colSums(is.na(geneva))

## DATE TMAX TMIN  AVE
##    0   10   13    0

geneva$AVE <- ts(geneva$AVE)
# Time series start and end
ndx <- seq(as.Date("2010-01-01"), as.Date("2019-11-08"), by = "day")
myts <- ts(geneva[4], start = c(2010, as.numeric(format(ndx[1], "%j"))),
           frequency = 365)  # create the daily time series
plot(myts, main = "Time Series Plot")
stl.ts <- stl(ts(geneva$AVE, frequency = 365), s.window = "periodic")
autoplot(stl.ts)  # the trend component shows declining temperatures
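stl() stores the seasonal, trend, and remainder components in its time.series matrix, so the trend can also be inspected on its own; a brief sketch:

head(stl.ts$time.series)  # columns: seasonal, trend, remainder
plot(stl.ts$time.series[, "trend"], ylab = "Trend",
     main = "Trend component of average temperature")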

# Time Series Analysis (2)
library(readxl)
library(tidyverse)
library(timeSeries)
library(forecast)
library(TTR)
geneva<-read_excel("C:/Users/Adm/Downloads/Re_ RSTUDIO/GENEVATEMPS.xlsx")

geneva<-na.omit(geneva)
geneva$AVE <- ts(geneva$AVE)  # convert daily average temperature to a ts() object

# 1. Moving-average smoothing
ma.smooth <- SMA(myts)  # simple moving average (TTR default: 10-observation window)
plot.ts(ma.smooth)

# The plot of the smoothed temperatures shows a constant trend (no change in trend).
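SMA() defaults to a 10-observation window; a wider window smooths more aggressively. A sketch with an illustrative 30-day window (the window length is an assumption, not from the original):

ma.smooth30 <- SMA(myts, n = 30)  # 30-day moving average (illustrative window)
plot.ts(ma.smooth30)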

# 2. Double exponential smoothing (gamma = FALSE disables the seasonal component)
exp.smooth <- HoltWinters(myts, gamma = FALSE)
plot(exp.smooth$fitted)
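With gamma = FALSE, HoltWinters() estimates only a level and a trend (Holt's double exponential smoothing); the fitted smoothing parameters can be read directly off the returned object:

exp.smooth$alpha  # level smoothing parameter
exp.smooth$beta   # trend smoothing parameter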
# 1-Jan-2010 to 31-Dec-2018
ndx <- seq(as.Date("2010-01-01"), as.Date("2018-12-31"), by = "day")
myts <- ts(geneva[4], start = c(2010, as.numeric(format(ndx[1], "%j"))),
           frequency = 365)  # recreate the time series over the training window
fit <- arima(myts)  # create the ARIMA model (no order specified)

# 3. Test the residuals for autocorrelation
Box.test(fit$residuals, type = "Ljung-Box")  # may include lag = 30

##
## Box-Ljung test
##
## data: fit$residuals
## X-squared = 3150.2, df = 1, p-value < 2.2e-16
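The vanishingly small p-value is expected here: with no order argument, arima() fits order c(0, 0, 0), so the residuals are just deviations from the series mean and remain strongly autocorrelated. A sketch of one way to let the order be chosen from the data, using forecast::auto.arima (this extension is an assumption, not part of the original analysis):

fit2 <- auto.arima(myts)  # searches over p, d, q (and seasonal terms)
Box.test(fit2$residuals, type = "Ljung-Box", lag = 30)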

# 4. Predict the first 30 days of 2019

library(forecast)
prediction <- forecast(fit, h = 30)  # forecast() generic; forecast.Arima was not working directly

# Compare the forecast values with the actual measurements
actual.geneva <- subset(geneva, DATE >= "2019-01-01 UTC" & DATE <= "2019-01-31 UTC")  # actual measurements
actual.geneva$AVE

##  [1] 42.0 30.0 29.0 30.5 38.0 34.0 24.5 24.0 37.0 27.5 16.5 12.5 13.5 12.0 18.5
## [16] 27.5 20.5 18.0 22.5 13.5  2.0  1.5 16.0 35.0 30.5 15.5 16.5 17.0 14.5 13.5
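With the forecasts and the observed January values side by side, the forecast error can be quantified; a minimal sketch using forecast::accuracy:

accuracy(prediction, actual.geneva$AVE)  # ME, RMSE, MAE, etc. on the test set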
