קודיםR

R
‫ ככה מעלים קובץ‬#

 mydata<-read.csv(header=TRUE,file.choose())
 summary(mydata)
 $
# ‫מאפשר לגשת למשהו ספציפי (עמודה) במידע‬
Plots- ‫גרפים‬
#basic plot function
 plot(data$Y,data$X)
#density plot
 plot(density(mydata$loudness))
# ggplot2
 install.packages("ggplot2")
 library(ggplot2)
 x<-ggplot(data=mydata,aes(x=danceability,y=liveness))+ geom_point()
#plot the data- to see if the relationship of Y and X is linear.

 Plot (y~ x)
functions- ‫פונקציות‬
#sum of squares SS
 sum.of.squares <- function(x,y) {
 x^2 + y^2
 }
 sum.of.squares(12,1)
# ‫פונקציה נוספת למציאת סכום ריבועים‬

 sum.of.squares <- function(v) {
 meanv <- mean (v) (V (‫=ווקטור‬
 sum (v-meanv)^2)
 }
#calculate the mean, median and mode

 meanH<-mean(Heights)
 meanH
 meadianH<-median(Heights)
 meadianH
 modeH<-getmode(Heights)
 modeH
#loss function
 sum(meanHvector==Heights) ‫שמות של וקטורים שאני משווה בניהם‬#
 sum.of.squares <- function(v) { # ‫פונקציית ההפסד עצמה‬
 meanv <- mean(v)
 sum((v-meanv)^2)
 }
 sum.of.squares(Heights)
#sum of the squared errors ‫סכום ריבועי הסטיות‬

 sum((X-y)^2)
‫רגרסיה פשוטה‬:
Y ‫ חישוב ממוצע של‬#

 y_mean<-mean(depression_scores)
 y_mean
 y_mean_vector<-rep(y_mean,length(depression_scores))
 y_mean_vector
#sum of the squared errors SStotal

 sum((depression_scores-y_mean_vector)^2)
#build a simple regression model

 my_model<-lm(y~x)
y=bx+a+e(( ‫בהרצת המודל יש פלט שנותן נתונים לנוסחה של רגרסיה פשוטה‬#
.b ‫ כל שאר הדברים הם‬,a= intercept‫כאשר הפלט נותן את ה‬
 my_model
:‫ נותן את הפלט‬#
summary‫ ב‬,‫ אך בנוסף‬.‫ים‬b-‫ וה‬a‫ ולראות את ה‬summary ‫ ניתן לעשות גם‬-‫הערה‬
.‫ניתן לראות גם מובהקות‬
‫ הוא‬b ‫ וכל‬0,0 ‫ זה הממוצע של קבוצה‬a-‫ אז ה‬dummy model‫ אם מדובר ב‬-‫ הערה‬#
. 0,0 ‫הפרש של קבוצה אחרת ביחס לקבוצת‬
:‫פתרון נומרי לרגרסיה פשוטה‬
 y<-df$Y
 x<-df$x
 mult_Reg<-function(b)
 {pred_y=b[1]+b[2]*x
 res_y<-sum((y-pred_y)^2)
 return(res_y)}
a = b1 #
:‫פתרון נומרי לרגרסיה מרובה‬
 y<-df$Y
 x1<-df$x1
 x2<-df$X2
 mult_Reg<-function(b)
 {pred_y=b[1]+b[2]*x1+ b[3] *x2
 res_y<-sum((y-pred_y)^2)
 return(res_y)}
a = b1 #
 output<-nlminb(c(100,100),mult_Reg)
 output$par
‫) ואני נותנת לאלגוריתם לרוץ‬x,y( ‫ פיתרון נומרי שאני נותנת לו שני ערכי התחלה‬#
.‫ שלי עבור אותם ערכים‬b-‫ ואת ה‬a -‫עליהם ומקבלת את ה‬
#full model-Predicting something using all of the other variables in the data
 fullmodel<-lm(satov~.,data=fulldata)
 summary(fullmodel)
‫ והיא מייצגת את כל‬data,. -‫ מודל שמנבא על פי כל המשתנים לכן כותבים נקודה‬#
.)‫המשתנים (במקום לכתוב את כל השמות‬
 model1<-lm(data$Y~data$X1+ data$X2)
 summary(model1)
# ‫בניית מודל‬
:‫רגרסיה היררכית‬
#Hierarchical regression
 install.packages("olsrr")
 library(olsrr)
:‫גישה שכיחותנית‬
# Null hypothesis testing:
# building a model (block 1):
 model1<-lm(y~x1+x2, data=data)
 summary(model1)
# building another model, in which we test the added variables effect (block 2):
 model2<-lm(y~x1+x2+x3+x4, data=data)
 summary(model2)
#comparing between the two models:
 anova(model1,model2)
:‫וזה נותן לנו את הפלט הבא‬
)‫= התוספת לשונות המובהקת (שנתנו לנו המשתנים שהוספנו בבלוק השני‬5.5228
‫ במקרה הזה היא‬,‫" אנחנו רואים את המובהקות של התוספת‬F Pr(>F)" ‫בעמודה שהכותרת שלה היא‬
.‫לא מובהקת‬
:‫גישה בייסיאנית‬
#Bayesian lm:
 install.packages("BayesFactor")
 library(BayesFactor)
#‫במודל בייסיאני מכניסים את כל המשתנים למודל‬

‫‪‬‬ ‫‪model<-regressionBF(y~x1+x2+x3+x4+x5,data=data) #gives us every possible model‬‬
‫‪‬‬ ‫)‪summary(model‬‬
‫מזה אנחנו מקבלים פלט עם על ה‪ BF-‬של כל המודלים האפשריים (עם מנבא אחד‪ ,‬עם שני מנבאים‪,‬‬
‫עם שלושה‪)....‬‬
‫אם אנחנו רוצים לצמצם את האופציות‪:‬‬
‫‪‬‬ ‫‪head(model) #gives us the top 6 best models‬‬
‫זה נותן לנו את הפלט הבא‪:‬‬
‫(מהפלט הזה אנחנו רואים שבמקרה הזה דיכאון לבד מנבא הכי טוב)‬
‫‪#comparing BF between two models‬‬
‫‪‬‬ ‫]‪model[1]/model[22‬‬
‫(שמים את המספרים של שני המודלים שביניהם אני רוצה להשוות‪ ,‬המספרים שמייצגים כל מודל הם‬
‫לפי הפלט שמקבלים מה‪ summery-‬ולא מפקודת "‪)"head‬‬
‫מזה מקבלים את הפלט הבא‪:‬‬
‫בפלט הזה השוו בין מודל [‪( ]1‬עם מנבא דיכאון) לבין המודל האחרון (עם כל המנבאים)‪.‬‬
‫‪ =1650.463‬פי כמה מודל [‪ ]1‬יותר סביר ממודל [‪( .]22‬זה מראה לנו שממש עדיף לנבא רק עם‬
‫דיכאון)‬
‫ניתוח שונות חד גורמי‪:‬‬

‫‪‬‬ ‫)"‪install.packages("dplyr‬‬
‫‪‬‬ ‫)‪library(dplyr‬‬
‫‪‬‬ ‫‪group_by(data, drug,therapy) %>%‬‬
 summarise(
 count = n(),
 mean = mean(mood.gain, na.rm = TRUE),
 sd = sd(mood.gain, na.rm = TRUE)
 )
# Research question: mood-gain as a function of drug (Anxifree/Joyzepam/Placebo)

# One-way ANOVA:
res.aov <- aov(mood.gain ~ drug, data = data)
summary(res.aov)
# Since we got significant results in our one-way ANOVA, we perform contrasts:

# tell R which groups to compare:
#c1= joyzepam vs. anxifree + placebo
#c2= anxifree vs. placebo
# compare via contrasts
 data$c1[data$drug=="placebo"]<- -0.5
 data$c1[data$drug=="anxifree"]<- -0.5
 data$c1[data$drug=="joyzepam"]<- 1
 data$c2[data$drug=="placebo"]<- -1
 data$c2[data$drug=="anxifree"]<- 1
# make sure that the contrasts are orthogonal.
#Now we will check which contrast is statistically significant:
 model1 <- aov(mood.gain ~ c1+c2, data = data)

 summary(model1)
# Where does the effect of drug come from? - To answer that question, we will
# need to know the actual means of each group. We can get the means via normal
# R syntax, or by use of 'dplyr' syntax:
 View(group_by(data, drug) %>%

summarise(
mean = mean(mood.gain)))
#We can see that the 'joyzepam' mean is larger than the rest of the means.
# And since the contrasts told us that there is a significant effect, we can conclude
# that the mean differences between 'joyzepam' and the rest of the drugs are
# statistically significant.
‫הסבר קריאת פלט בניתוח שונות‬:

:‫ בודקים קודם האם יש מובהקות בממוצע כלשהו בין הקבוצות‬#
,‫מובהקות‬
, ‫כמה סתיות לא הצלחנו להסביר במערך‬
‫ זה‬,‫סכום הסתיות שכן הצלחנו להסביר (אשר מורכבות מסך כל הקונטרסטים הבלתי תלויים‬
‫ ואני אסכום את כל מה‬,‫ אם אני אקח את סך כל הקונטרסים הב"ת ואני אפרק אותם‬.)F‫ה‬
.drug ‫שהם מסבירים לי אני אקבל את ההסבר הכללי של‬
.‫ הכללי‬ss‫יחד הם שוות ל‬
‫ לאחר שאני רואה שיש מובהקות אני בודקת את הקונטרסטים כדי לראות מאיפה‬#
:‫המובהקות באה‬
 c1= joyzepam vs. anxifree + placebo
 c2= anxifree vs. placebo
 data$c1[data$drug=="anxifree"]<- -0.5
 data$c1[data$drug=="placebo"]<- -0.5
 data$c2[data$drug=="anxifree"]<- -1
 data$c2[data$drug=="placebo"]<- 1
 data$c1<-as.numeric(data$c1)
 data$c2<-as.numeric(data$c2)
 model1 <- aov(mood.gain ~ c1+c2, data = data)
 summary(model1)
.‫ נסכם להסבר הכללי שמצאנו למעלה‬c2 ‫ ועוד‬C1

.‫ אינו מובהק‬c2 ‫ מובהק וכי הקונטרסט‬c1 ‫ניתן לראות כי הקונטרסט‬
:‫ניתוח שונות דו גורמי‬

 data2<-read.csv(header=TRUE,file.choose())
 colnames(data2)
# Research question: math score as a function of gender (male/female) and lunch
# (free/reduced vs. standard)
# getting some summary statistics.
 group_by(data2, gender,lunch) %>%

summarise(
count = n(),
mean = mean(math.score, na.rm = TRUE),
sd = sd(math.score, na.rm = TRUE)
)
# Performing two-way anova:
 res.aov2 <- aov(math.score ~ gender+lunch, data = data2)

 summary(res.aov2)
# Since the result is significant, we perform contrasts:
#tell R which groups to compare
#c1= gender main effect
#c2= lunch main effect
#c3=interaction
 data2$c1[data2$gender=="male" & data2$lunch=="free/reduced"]<- 1

 data2$c1[data2$gender=="male" & data2$lunch=="standard"]<- 1
 data2$c1[data2$gender=="female" & data2$lunch=="free/reduced"]<- -1
 data2$c1[data2$gender=="female" & data2$lunch=="standard"]<- -1

 data2$c2[data2$gender=="male" & data2$lunch=="standard"]<- -1
 data2$c2[data2$gender=="female" & data2$lunch=="free/reduced"]<- 1
 data2$c2[data2$gender=="female" & data2$lunch=="standard"]<- -1

 data2$c3[data2$gender=="male" & data2$lunch=="standard"]<- -1
 data2$c3[data2$gender=="female" & data2$lunch=="free/reduced"]<- -1
 data2$c3[data2$gender=="female" & data2$lunch=="standard"]<- 1
# Checking which contrast is significant
 model2 <- aov(math.score ~ c1+c2+c3, data = data2)

 summary(model2)
#4) Are the contrasts orthogonal? - Yes they are.
# Which group does the effect come from?
 group_by(data2, lunch) %>%

summarise(
mean = mean(math.score, na.rm = TRUE)
)
#We could avoid all the contrast coding simply by using this code:
 modelsame <- aov(math.score ~ gender*lunch, data = data2)

 summary(modelsame)
#The above code means: "math.score as a function of gender, lunch AND gender and
# lunch combined (their interaction)".
 res.aov2 <- aov(y ~ x1+x2, data = data2)

 summary(res.aov2)
:‫קריאת פלט ניתוח שונות דו גורמי‬

.‫נראה בפלט זה כי שני המנבאים מובהקים‬
.‫סכום הסתיות שלא הצלחנו להסביר‬
‫ אפקט עיקרי שני‬,‫ סטים של קונטרסטים יענו לי על האם יש אפקט עיקרי אחד‬3
‫והאינטראקציה‬
‫ במידה‬.‫ השני והאינטראקציה‬,‫ כשמכפילים בין משנים מקבלים האפקט העיקרי הראשון‬#
‫ורוצים לבדוק גם אינטראקציה משולשת וכו' אני אכפיל בגורם השלישי ואקבל אפקטים‬
.‫ את כל האינטראקציות הזוגיות ואת המשולשות‬,‫עיקריים‬
 modelsame <- aov(y ~ x1*x2, data = data2)
 summary(modelsame)
:dummy coding
#When we perform regression with categorical variables in R,

#lm function performs dummy coding for us.
#the "all zeros" dummy variable is arbitrarily determined
colnames(data2)
# 1) Research question: math score as function of completing test preparation course
group_by(data2, test.preparation.course) %>%

summarise(
count = n(),
mean = mean(math.score, na.rm = TRUE),
sd = sd(math.score, na.rm = TRUE)
)
# 2) The Regression Model:

regmodel<- lm(math.score~ test.preparation.course,data=data2)
summary(regmodel)
#3 ) Which one is the base level and which one is not? -- answer: the base
# level is the one that is not presented in the summary(). In our case, the
# 'completed' level is the base level since it's not in our summary().
# this returns the coding that R have used to create the dummy variables:
contrasts(factor(data2$test.preparation.course)) # Notice the factr()! the contrasts()
# function only deals with factor variables, so we have to convert our variable
# to a factor.
# 4) what is the meaning of the results?
#Now we check if the contrasts is significent:

model2 <- lm(mood.gain ~ c1, data = data)
summary(model2) # It's significent.

:‫חישוב ביטאות‬
‫ ולהכניס‬,‫ צריך לבנות את המודל הרגיל‬.‫ בשורה הראשונה הספרייה שעושה את הפקודה‬#
.‫ – ומקבלים את הבטאות‬lm.beta ‫אותו לתוך הפונקציה‬
 library(QuantPsyc)
 model<- lm (Y~.,data=data)
 model2<- lm.beta(model)
 model2
:Overfitting
:Adjusted R^2
 summary (model)$r.squared
R2 ‫ חישוב‬#
 summary (model)$adj.r.squared
adjusted R2 ‫ חישוב‬#
 R<-
 K<-
 N<-
 Adjusted<-(R^2)-((K*(1-(R^2)))/(N-K-1))
 Adjusted
‫תיקוף צולב‬:
 set.seed(101)
 dgima <- sample.int(n = nrow(fulldata), size = floor(.8*nrow(fulldata)),
replace = F)
# 0.8 is for the size of the train set
 train <- fulldata[dgima, ]
 test <- fulldata[-dgima, ]
#creating train and test sets:

 train_model<-lm(satov~.,data=train)
 summary(train_model)
 pred<-predict(object=train_model,test)
# checking the squared correlation between the predicted values and the
# real values:
 (cor(pred,test$satov,method="pearson"))^2
k-fold cv:
# Define training control
 install.packages("caret")
 library(caret)
 set.seed(123)
 train.control <- trainControl(method = "cv", number = 10)
. control-‫ ומה ה‬training -‫ פונקציה המגדירה מה ה‬-trainControl #
)‫ (מספר קבוצות‬number=k
# Train model
 model <- train(satov~., data = fulldata, method = "lm",
 trControl = train.control)
# Summarize the results
 print(model)
:‫ליניאריות‬
.‫ נראה אם יש מגמתיות‬:‫ עבור כל אחד מהמשתנים שאני רוצה לבדוק‬#
#Testing the linearity assumption
plot(data$X~data$Y)
:‫ אם הליניאריות לא יצאה מובהקת אפשר לתקן את המשתנים‬#
#if not linear
plot(data$X~log(data$Y))
‫ נבנה את המודל‬#
#Build a model
mymodel<-lm(Y~X1+X2,data=data)
summary(mymodel)
:‫נורמאליות‬
QQplot‫ נבדוק את הנורמאליות ב‬#

#Testing the normality assumption
#Q-Q plot
 plot(mymodel, 2)
)‫ (בטעיות‬:‫הומוסקדסטיות‬
#Testing the Homoscedasticity Assumption
 plot(mymodel, 1)
#or - another way to get the same plot.
#install.packages("olsrr")
 library(olsrr)
 ols_plot_resid_fit(mymodel)
:‫מולטיקולינאריות‬
 install.packages("olsrr")
 library(olsrr)
#Collinearity Diagnostics - get the Tolerance
 ols_vif_tol(mymodel)
‫ (טולרנס חלקי אחד) (ההפוך) אנו‬VIF ‫ בעמודת‬,‫ רוצים תוצאה גבוהה‬toleranc ‫ בעמות‬#
‫ הפלט מימין הוא‬.‫ זה עונה על השאלה כמה אחד מתואם על השאר‬.‫רוצים תוצאה נמוכה‬
67% ‫ עמידה‬catholic ,‫ למשל‬.‫השוואה בין שלושה משתנים והפלט משמאל שני משתנים‬
.‫ביחס לאחרים‬
‫ בבחירת מנבאים נרצה את המנבא עם הטולרנס הגבוהה ביותר הוא בשורה האחרונה של‬#
.)‫הפלט (המנבאים מופיעים בסדר עולה מהטולרנס הנמוך לגבוה‬
‫מתאם חלקי‪:‬‬
‫‪short way to get partial correlation #‬‬
‫)])"‪ppcor<- pcor(data[,c("y", "x1", "x2", "x3‬‬
‫‪this shows only partial correlation between each variables #‬‬
‫‪ppcor$estimate‬‬
‫מזה נקבל את הפלט הבא‪:‬‬
‫כל המתאמים פה זה מתאמים חלקיים‪ .‬האלכסון זה מתאמים בין כל משתנה עם עצמו‪.‬‬

‫כל מתאם לא באלכסון הוא המתאם בין שני המשתנים שאליהם הוא משתייך בשורה ובטור‪,‬‬
‫כשהמשתנה השלישי מוחזק קבוע‪.‬‬
‫(למשל‪ 0.7548613 :‬הוא המתאם החלקי בין ‪ happiness.score‬לבין ‪ ,social_support‬כאשר‬

‫‪ generosity‬מוחזק קבוע)‬
‫‪:‬מתאם חלק‬
‫‪short way to get semi-partial correlation #‬‬
‫)])"‪spcorr <- spcor(data[,c("y","x1","x2‬‬
‫מזה נקבל את הפלט הבא‪:‬‬
‫כל המתאמים פה הם מתאמי חלק‪ .‬האלכסון זה מתאמים בין כל משתנה עם עצמו‪.‬‬

‫כל מתאם לא באלכסון הוא המתאם בין שני המשתנים שאליהם הוא משתייך בשורה ובטור‪,‬‬
‫כשהמשתנה השלישי מוחזק קבוע עבור אחד מהם‪.‬‬
‫(כפי שניתן לראות‪ ,‬בין ‪ happiness.score‬לבין ‪ social_support‬יש מתאמים שונים בנקודות החיתוך‬
‫ביניהם‪ .‬לפי מה שנאמר בתרגול‪ ,‬לא ניתן לדעת באיזה מהם המשתנה ‪ generosity‬הוחזק קבוע‬
‫לאיזה משתנה‪ ,‬נאמר שתינתן תשובה בנושא בהמשך)‬
:‫החזקת קבוע‬
# the long way to get partial / semi-partial correlation
# calculate e from Y
e.y <- lm(Health..Life.Expectancy.~Economy..GDP.per.Capita.+Happiness.Score, data = data)
# if you want partial correlation, also calculate e from X
e.x <- lm(Generosity~Economy..GDP.per.Capita.+Happiness.Score, data = data)
)e( ‫כדי לקבל את ערכי השארית‬#
res.y <- summary(e.y)$residuals
res.x <- summary(e.x)$residuals
# correlation
y.x1x2x3 <- lm(res.y~res.x)
)‫ שמים את המשתנה הרגיל‬res-‫ במקום אחד מציוני ה‬,‫(אם רוצים לעשות מתאם חלק‬#
:‫מיתון‬
#1. mean-centering the independent variables: ‫מרכוז המשתנים הבלתי תלויים‬
 x1 <- scale(data$x1,scale = F)
 x2 <- scale(data$x2,scale = F)
#2. calculating a third independent variable which represents the moderation: ‫חישוב משתנה‬
‫ מכפלת המשתנים הממורכזים‬-‫בלתי תלוי נוסף שמבטא את המיתון‬
 moderation <- x1*x2
# 3. ‫בניית מודל רגרסיה‬
 model <- lm(Y~x1+x2+moderation,data)

 summary(model1)
 sum_mod<-summary(model1)
# ‫דרך נוספת לפי בייס פקטור‬

 regressionBF(Y~ x1+x2+moderation,data)
 summary(model1)
 sum_mod<-summary(model1)
‫ זה אומר שקיים‬,‫ אם כן‬.‫(נקבל פלט שבו נראה האם תרומתו של משתנה המיתון מובהקת‬
):‫ אבל על מנת לדעת את כיוונו נבנה גרף‬,‫מיתון‬
# moderation plot:
# extracting the a and b's:
 a<-sum_mod$coefficients[1]
 b1<-sum_mod$coefficients[2]
# defining which levels of moderator we want to plot (sd+, sd-, and mean):
 inhibitionHI<-sd(data$inhibition_centered)
 inhibitionLO<-sd(data$inhibition_centered)*-1
# calculating the slope and intercept of the mediator regression lines
 slopeHI=b1+b3*inhibitionHI
 slopeZero=b1
 slopeLO=b1+b3*inhibitionLO
 interHI=a+b2*inhibitionHI
 interZero=a
 interLO=a+b2*inhibitionLO
# plotting
 install.packages("ggplot2")
 library(ggplot2)
 p<-ggplot(data=data,aes(hindrance_centered,negative.affect))
 pp<-p+geom_point()
 pp
 fun_HI<-function(hindrance_centered) interHI + hindrance_centered * slopeHI
 fun_M<-function(hindrance_centered) interZero + hindrance_centered * slopeZero
 fun_LO<-function(hindrance_centered) interLO + hindrance_centered * slopeLO
 pp+stat_function(fun=fun_HI,aes(color="+sd")) + stat_function(fun=fun_M) +
stat_function(fun=fun_LO,aes(color="-sd"))
‫תיווך‬:
 install.packages("medmod")
 library(medmod)
# building a model:
 medmodel<-med(data=data, dep=y, med=m, pred=x, estMethod = "bootstrap",
bootstrap = 1000)
 medmodel
#to compare the result
 data$x<-data$x
 data$y<-data$y
 data$m<-data$m
 modalc<-lm(y~x,data=data)
 summary(modalc)
 modela<-lm(m~x,data=data)
 summary(modela)
 modelb<-lm(y~x+m,data=data)
 summary(modelb)
:‫פלטים‬
:‫בחירת מנבאים‬
# Building a model:
 model<-lm(y~.,data=data)
 summary(model)
# 1 Forward selection
 step_forward_model<-ols_step_forward_p(model)
 step_forward_model$model #gives us the selected variables and their beta
 step_forward_model$rsquared #can give us the r-squared in every phase
# 2 Backward selection
step_backward_model<-ols_step_backward_p(model)
step_backward_model$model
# 3 Stepwise selection
Stepwise_selection_model<-ols_step_both_p(model)
Stepwise_selection_model$model
# 4 All possible subsets:
ols_step_all_possible(model)
# 5 Best subset:
ols_step_best_subset(model)
.overfitting ‫ מעידה על‬Adjusted R ‫ ירידה‬#
:anova ‫ באמצעות היררכית אשווה בין המודלים באמצעות מבחן‬#
‫ נראה כי המודל העליון (במקרה זה המודל ללא מנבאים ) הוא‬:‫ באמצעות בייסיאנית‬#
.)‫ (יצא ערך מאוד גבוה‬.‫העדיף מפני שהוא מסביר הרבה יותר ביחס למודל בתחתון‬
.‫ כדי לבנות מודל רגרסיה‬R-‫ מתייחס לפונקציה שאנו משתמשים בה ב‬:call -‫מתחת ל‬ -
.b ‫ כל מה שמתחתיו הוא‬,a ‫ הוא‬intercept -‫ ה‬:estimates ‫בעמודת‬ -
‫כל מה שבכוכביות הוא מובהק‬ -
.adjusted R , R2 -‫למטה ניתן לראות את ה‬
2
-
‫ליתר ביטחון‬:
)‫ ( נתונים חסרים‬na -‫ להוציא את ה‬#
 mydata<-na.omit(mydata)
 sum(is.na(mydata))
# prepare a vector with the values we got ‫ שכיח‬,‫אפשר לעשות גם עם חציון‬...

 meanH<-mean(Heights)
 meanH
 meanHvector<-rep(meanH,7)
 meanHvector
#plot the data- to see if the relationship of Y and X is linear, ‫לבדיקת לינאריות‬
 plot (y~ x)
R2-‫דרך נוספת להגיע ל‬

#predict the depression scores using the regression model
 depression_scores2<-as.data.frame(depression_scores)
 pred<-predict(my_model,depression_scores2)
 pred
#sum of the squared errors - for the regression model!  SSres

 sum((y-pred)^2)
#now what is the r-squeared?

#‫פה מחשבים שני שלבים‬:
 #SSreg= SStotal - SSres
#)‫חישוב השונות המוסברת (אר בריבוע‬:
 #SSreg/SStotal= proportion.of.explained.variance
#‫ממש לכתוב את המספרים שיצאו לי‬
 (SStotal-SSres)/SStotal
#46.504/)46.504-9.18571(:‫דוגמא‬
‫דרך נוספת לעשות מודל רגרסיה פשוטה‬:
 Reg_fun <- function (parameters)

 {pred_y <- parameters [1]+ parameters [2] * x
 Res <- sum ((y-pred_y)^2)
 Return (res)}
.)‫ (בגלל שלא קיבלנו אותם במודל זה‬b-‫ ו‬a ‫ הצבת ערכי‬#
 Nision <- c (150,800)

 Reg_fun (nision)
# ‫ לוקחת ערכים התחלתיים שאני בוחרת להתחיל מהם‬-‫פונקציית אופטימיליזציה‬
‫ולנסות עד שתגיע לאופציה הכי טובה‬
 Est_par <- nlminb (nision, reg_fun)
 Est_par$Par
:a ‫ בעזרת זה מוצאים גם את‬,b ‫פיתרון אנליטי לאיך מוצאים את‬

#.....pearson correlation
 correlation<-cor(depression_scores,income,method = "pearson")
 correlation^2
#how to calculate b?
 sd_y<-sd(depression_scores)
 sd_x<-sd(income)
 sds<-sd_y/sd_x
 b<-correlation*sds
 b
‫ אחוז מסוים מכל התצפיות‬,‫לקיחת דגימה‬:
#0.9 ‫בדוגמא הזאת האחוז הוא‬

 dgima <- sample.int(n = nrow(data), size = floor(.9*nrow(data)), replace = F)
 sub_data <- data[dgima, ]
 lmNOTfull<-lm(sub_data$Total~sub_data$Quantity)
 lmNOTfull
 lmfull
:Correlation matrix -‫יצירת טבלת קורלציות‬
 data<-df
 data$Country<-NULL
 data<-cor(df2)
 corrplot(data,method = "number",tl.cex = 0.7)
 #corrplot(data,method = "number")
‫ המתאם ה"אמיתי" לפי שיטה זו‬.‫שיטה לתיקון מתאם שחושב על שני מדדים לא מהימנים‬
.‫ שזה למעשה רו‬rxy / sqrt(rx1x2 * ry1y2) :‫שווה ל‬
‫ חלוקה בשבר‬.‫ זה המתאם האמיתי‬,‫המתאם הנצפה חלקי מכלפת שורשי המהימנויות‬
‫ אנחנו יכולים להניח שהמתאם‬100%‫ ואז כל זמן שהמדדים שלנו לא מהימנים ב‬.‫מגדילה‬
.‫ ככל שהמדדים יותר נקיים ומהינים כך הם יעשו לי פחות נזק‬.‫שלנו טיפה יותר גדול‬
# Corrected correlation
 Corrected_correlation<-rxy/sqrt(rx1x2 * ry1y2)
 Corrected_correlation
‫ לא יכול להיות שבגלל מהימנות גבוהה מאוד ומתאם גבוהה‬-‫מהימנות היא חסם עליון לתוקף‬
.1 ‫מאוד יצא לי בתיקון מעל‬
 data<-read.csv(file.choose(),header=T)
‫ בדיקת מהימנות‬#
 rxy<-cor(data$X,data$Y,method="pearson")
 rxy
 rx<-
 ry<-
‫ תיקון למהימנות‬#
 Corrected<-rxy/sqrt(rx * ry)
 Corrected
‫קיצוץ תחום‬:
#lets remove some rows
 x_c$V1more<- x_c$V1> ____

 x_c<-x_c[x_c$V1more==TRUE,]
 range_restriction<-cor(data2$X,data2$Y,method="pearson")
 range_restriction
:b ‫חישוב‬
#Calculating the squared r with pearson correlation:
 correlation<-cor(y,x,method = "pearson")
 correlation^2
#Calculating b:
 sd_y<-sd(y)
 sd_x<-sd(x)
 sds<-sd_y/sd_x
 b<-correlation*sds
 b
#plot a function:
 mydata<-data.frame(x,y)
 p<-ggplot(data=mydata,aes(x=x,y=y))
 pp<-p+geom_point()
 pp
#‫על מנת ליצור גרף‬
 a<- ‫להגדיר‬
 b<- ‫להגדיר‬
 fun_HI<-function(x) a + b * x
 fun_HI(x)
 pp+stat_function(fun=fun_HI,aes(color="red"))

קודיםR

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

קודיםR

Uploaded by

Copyright:

Available Formats

R

‫ ככה מעלים קובץ‬#

#plot the data- to see if the relationship of Y and X is linear.

# ‫פונקציה נוספת למציאת סכום ריבועים‬

#calculate the mean, median and mode

#sum of the squared errors ‫סכום ריבועי הסטיות‬

Y ‫ חישוב ממוצע של‬#

#sum of the squared errors SStotal

#build a simple regression model

:‫פתרון נומרי לרגרסיה פשוטה‬

:‫פתרון נומרי לרגרסיה מרובה‬

# Null hypothesis testing:

# building a model (block 1):

#comparing between the two models:

:‫וזה נותן לנו את הפלט הבא‬

#‫במודל בייסיאני מכניסים את כל המשתנים למודל‬

‫‪‬‬ ‫‪head(model) #gives us the top 6 best models‬‬

‫זה נותן לנו את הפלט הבא‪:‬‬

‫‪#comparing BF between two models‬‬

‫מזה מקבלים את הפלט הבא‪:‬‬

‫ניתוח שונות חד גורמי‪:‬‬

# Research question: mood-gain as a function of drug (Anxifree/Joyzepam/Placebo)

# Since we got significent results in our one-way ANOVA, we perform contrasts:

 model1 <- aov(mood.gain ~ c1+c2, data = data)

 View(group_by(data, drug) %>%

‫הסבר קריאת פלט בניתוח שונות‬:

.‫ נסכם להסבר הכללי שמצאנו למעלה‬c2 ‫ ועוד‬C1

:‫ניתוח שונות דו גורמי‬

# Research question: math score as function of gender (male/female) and lunch

# getting some summry statistics.

 group_by(data2, gender,lunch) %>%

# Performing two-way anova:

 res.aov2 <- aov(math.score ~ gender+lunch, data = data2)

 data2$c1[data2$gender=="male" & data2$lunch=="free/reduced"]<- 1

 data2$c2[data2$gender=="male" & data2$lunch=="free/reduced"]<- 1

 data2$c3[data2$gender=="male" & data2$lunch=="free/reduced"]<- 1

# Checkign which contrast is significent

 model2 <- aov(math.score ~ c1+c2+c3, data = data2)

# from which group does the effect comes from?

 group_by(data2, lunch) %>%

 modelsame <- aov(math.score ~ gender*lunch, data = data2)

 res.aov2 <- aov(y ~ x1+x2, data = data2)

:‫קריאת פלט ניתוח שונות דו גורמי‬

#When we perform regression with categorical variables in R,

group_by(data2, test.preparation.course) %>%

# 2) The Regression Model:

# 4) what is the meaning of the results?

#Now we check if the contrasts is significent:

summary(model2) # It's significent.

#creating train and test sets:

# checking the squared correlation between the predicted values to the

QQplot‫ נבדוק את הנורמאליות ב‬#

‫‪short way to get partial correlation #‬‬

‫)])"‪ppcor<- pcor(data[,c("y", "x1", "x2", "x3‬‬

‫‪this shows only partial correlation between each variables #‬‬

‫מזה נקבל את הפלט הבא‪:‬‬

‫כל המתאמים פה זה מתאמים חלקיים‪ .‬האלכסון זה מתאמים בין כל משתנה עם עצמו‪.‬‬

‫(למשל‪ 0.7548613 :‬הוא המתאם החלקי בין ‪ happiness.score‬לבין ‪ ,social_support‬כאשר‬

‫‪short way to get semi-partial correlation #‬‬

‫)])"‪spcorr <- spcor(data[,c("y","x1","x2‬‬

‫מזה נקבל את הפלט הבא‪:‬‬

‫כל המתאמים פה הם מתאמי חלק‪ .‬האלכסון זה מתאמים בין כל משתנה עם עצמו‪.‬‬

the long way to get partial\semi-partial correlation #

e.y <- lm(Health..Life.Expectancy.~Economy..GDP.per.Capita.+Happiness.Score, data = data)