# Professional Documents
# Culture Documents
# Blank out the rows identified as outliers in the diagnostic plots of "reg1"
# (see the outlier section further down). Each listed row is overwritten with
# NA in every column. The row numbers are given once each, in ascending order.
data1[c(153, 442, 495, 565, 574, 575, 631, 659, 674, 728, 779, 805, 839,
        854, 959, 967, 1224), ] <- NA
# Multiple linear regression of the current yearly gross salary (without bonus
# and stocks) on experience, current bonus, last year's salary and bonus,
# vacation days, contract duration and age, fitted on the training split.
# All predictors are referenced by bare column name and resolved through
# `data = datatrain`; terms written as `datatrain$col` inside a formula are
# not substituted by `predict(..., newdata = )`, so they are avoided here.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Yearly.bonus...stocks.in.EUR +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Years.of.experience.in.Germany +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country +
    Number.of.vacation.days +
    Contract.duration +
    Age,
  data = datatrain
)
library(tidyverse)
# Interactive preview of the columns of interest. Missing values in `Position`
# are replaced by the literal "none" before the table is opened in the viewer.
# The result is not assigned, so `data1` itself is left untouched.
data1 %>%
  select(
    Age,
    Gender,
    City,
    Position,
    Total.years.of.experience,
    Years.of.experience.in.Germany,
    Seniority.level,
    Your.main.technology...programming.language,
    Other.technologies.programming.languages.you.use.often,
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR,
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
    Number.of.vacation.days,
    Employment.status,
    Contract.duration,
    Main.language.at.work,
    Company.size,
    Company.type
  ) %>%
  mutate(Position = replace_na(Position, "none")) %>%
  view()
# -------------------------------------------------------
# Print a narrower selection of columns (up to Employment.status). The
# separator above is kept as a comment: left as bare dashes it would parse as
# a chain of unary minus operators applied to the expression that follows.
# `Total.years.of.experience` is listed once; naming it twice in select()
# yields the same single column.
data1 %>%
  select(
    Age,
    Gender,
    City,
    Position,
    Total.years.of.experience,
    Seniority.level,
    Your.main.technology...programming.language,
    Other.technologies.programming.languages.you.use.often,
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR,
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
    Number.of.vacation.days,
    Employment.status
  )
# Data loading step followed by removal of incomplete rows.
library(readxl)
# NOTE(review): read.table() is called without a `file` argument and its
# result is not assigned; the actual import that creates `data1` is not
# visible here -- supply the file path and assign the result.
read.table()
# NOTE(review): creates an empty data frame that is not assigned to anything;
# looks like a leftover placeholder -- confirm and remove.
data.frame()
# Drop every row of data1 that contains at least one NA.
data1<-na.omit(data1)
library(caTools)
# Fix the RNG seed so the split below is reproducible.
set.seed(100)
# Logical split vector built from the `Course` column with SplitRatio = 0.7.
split1 <- sample.split(data1$Course, SplitRatio = 0.7)
summary(split1)
# Rows where split1 is TRUE form the training set; the rest form the test set.
datatrain <- subset(data1, split1)
datatest <- subset(data1, !split1)
summary(datatrain)
str(datatrain)
library(ggplot2)
# Correlation (and its significance test) between student engagement and
# goal orientation in the training data.
cor(datatrain$Student_engagement, datatrain$goal_orientation)
cor.test(datatrain$Student_engagement, datatrain$goal_orientation)
#looks linear
# Simple regression with a single predictor. The call is closed explicitly:
# a trailing `+` with no closing parenthesis would pull the following
# summary() call into the formula and stop the script from parsing.
reg1 <- lm(Student_engagement ~ goal_orientation, data = datatrain)
summary(reg1)
#OR
# Two-predictor model; this overwrites the simple model stored in reg1.
reg1 <- lm(
  Student_engagement ~ goal_orientation + academic_self_eficacy,
  data = datatrain
)
summary(reg1)
# Use adjusted R-squared when there is more than one independent variable
# (it adjusts for the degrees of freedom).
#beta2: 73%
#Multicollinearity
library(car)
# Assumptions ----
# Residuals against their observation index; seq_along() is safe even when
# the residual vector is empty, unlike 1:length(x).
plot(reg1$residuals, seq_along(reg1$residuals))
hist(reg1$residuals) # not normal
# Namespace-qualified because library(descr) is only attached further down.
descr::descr(datatrain)
summary(datatrain)
library(moments)
# Keep the residuals next to the training rows for later inspection.
datatrain$residual1 <- reg1$residuals
moments::skewness(datatrain$residual1)
moments::kurtosis(datatrain$residual1)
#OR
library(e1071)
# Explicit namespace: once e1071 is attached its kurtosis() masks the one
# from moments, so the bare name already resolved to e1071.
e1071::kurtosis(reg1$residuals)
# Outliers: residuals against fitted values.
plot(reg1$residuals, reg1$fitted.values)
library(lmtest)
# Outliers ----
# Built-in diagnostic plots for the fitted model.
plot(reg1)
# OR
library(car)
influenceIndexPlot(reg1) # 261
# Rows identified as outliers are overwritten with NA in data1; they are meant
# to be removed from the main data set in a following step.
data1[c(96, 248, 261, 426, 799), ] <- NA
# Full emission model with all six predictors, followed by its variance
# inflation factors to check for multicollinearity.
reg1 <- lm(
  emission ~ mass + wheelbase + axlesteering + axleother +
    enginecapacity + emisionreduction,
  data = data1
)
summary(reg1)
library(car)
vif(reg1)
# Variant 1: the full model without `axlesteering`.
regm1 <- lm(
  emission ~ mass + wheelbase + axleother + enginecapacity + emisionreduction,
  data = data1
)
summary(regm1)
# Variant 2: the full model without `axleother`.
regm2 <- lm(
  emission ~ mass + wheelbase + axlesteering + enginecapacity +
    emisionreduction,
  data = data1
)
summary(regm2)
vif(regm2)
# RMSE / ANOVA ----
# In-sample predictions of both reduced models on data1.
prem1 <- predict(regm1, data1)
prem2 <- predict(regm2, data1)
library(caret)
library(Metrics)
# caret::RMSE takes (pred, obs). Rows of data1 that were overwritten with NA
# produce NA predictions and NA observations, so na.rm = TRUE is required to
# get a number instead of NA.
caret::RMSE(prem1, data1$emission, na.rm = TRUE)
caret::RMSE(prem2, data1$emission, na.rm = TRUE)
# The coefficients and R-squared values are similar, so both models can be
# used for prediction of "emission".
# Load the required libraries and import the "IT salary survey EU" data set.
library(readxl)
library(descr)
# NOTE(review): read.table() is called without a `file` argument and its
# result is not assigned; the import that actually creates `data1` is not
# visible here -- supply the file path and assign the result.
read.table()
# NOTE(review): creates an empty data frame that is not assigned to anything;
# looks like a leftover placeholder -- confirm and remove.
data.frame()
# Total number of missing cells, then the structure of the data set.
sum(is.na(data1))
str(data1)
# Rows identified as outliers in the diagnostic plots of "reg1" (outlier
# section at the bottom) are overwritten with NA so that na.omit() below
# removes them together with the other incomplete rows.
data1[c(153, 442, 495, 565, 574, 575, 631, 659, 674, 728, 779, 805, 839,
        854, 855, 959, 967, 1224), ] <- NA
# Drop the columns that are unrelated to the problem: positions 1, 21, 22
# and 23. One negative index vector removes the same columns as dropping
# 23, 22, 21 and 1 one after another.
data1 <- data1[-c(1, 21, 22, 23)]
view(data1)
# Count the missing values and remove incomplete rows so that the dependent
# variable (yearly salary) is complete for modelling.
sum(is.na(data1))
data1 <- na.omit(data1)
summary(data1)
library(mice)
# Missing-data pattern after na.omit().
md.pattern(data1)
unique(data1$Position)
unique(data1$Seniority.level)
# Opens the interactive data editor; any manual edits are written back to
# data1 and are not recorded in this script.
fix(data1)
summary(data1)
library(caTools)
# Fix the RNG seed so the split below is reproducible.
set.seed(100)
# Logical split vector built from the `City` column with SplitRatio = 0.7.
split1 <- sample.split(data1$City, SplitRatio = 0.7)
summary(split1)
# Rows where split1 is TRUE form the training set; the rest form the test set.
datatrain <- subset(data1, split1)
datatest <- subset(data1, !split1)
summary(datatrain)
str(datatrain)
# Plotting library.
library(ggplot2)
# Smoothed trend of total years of experience against yearly salary. Columns
# are named bare inside aes() so they are looked up in the `data` argument;
# `datatrain$col` inside aes() bypasses that lookup and ggplot2 warns about it.
ggplot(
  datatrain,
  aes(
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Total.years.of.experience
  )
) +
  geom_smooth()
# Pairwise correlations of yearly salary with three candidate predictors.
cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Total.years.of.experience
)
cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Years.of.experience.in.Germany
)
cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Yearly.bonus...stocks.in.EUR
)
# Smoothed trend of yearly bonus against yearly salary.
ggplot(
  datatrain,
  aes(
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR
  )
) +
  geom_smooth()
# Linear regression ----
# Yearly gross salary modelled on experience, current bonus, last year's
# salary and bonus, vacation days and age, using the training split. Each
# column name is written on one line: a name broken across lines is read as
# two separate symbols and stops the script from parsing.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Yearly.bonus...stocks.in.EUR +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Years.of.experience.in.Germany +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country +
    Number.of.vacation.days +
    Age,
  data = datatrain
)
summary(reg1)
plot(reg1)
# Reduced salary model with four predictors, fitted on the training split.
# NOTE(review): the opening `reg1 <- lm(<response> ~` of this call was missing;
# the response is assumed to be the same yearly-salary column used by the
# other salary models in this script -- confirm.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Yearly.bonus...stocks.in.EUR +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
  data = datatrain
)
summary(reg1)
# Outliers were detected through the regression diagnostic plots and removed.
plot(reg1)
# Residuals against their observation index; seq_along() is safe even when
# the residual vector is empty, unlike 1:length(x).
plot(reg1$residuals, seq_along(reg1$residuals))
# Shapiro-Wilk normality test on the residuals.
shapiro.test(reg1$residuals)