
# outlier rows, taken from the outlier section at the bottom after plotting "reg1", are set to NA

data1[c(631,659,805,442,854,805,565,575,779,153,839,728,1224,495,574,967,959,674),]<-NA

# outlier rows, taken from the outlier section at the bottom after plotting "reg1", are set to NA

data1[c(631,659,805,442,854),]<-NA

reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+Yearly.bonus...stocks.in.EUR+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Years.of.experience.in.Germany+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country+
           Number.of.vacation.days+Contract.duration+Age, data = datatrain)

# Replacing blank (NA) values in a categorical column with "none"

library(tidyverse)

data1 %>%
  select(Age, Gender, City, Position, Total.years.of.experience, Years.of.experience.in.Germany,
         Seniority.level, Your.main.technology...programming.language,
         Other.technologies.programming.languages.you.use.often,
         Yearly.brutto.salary..without.bonus.and.stocks..in.EUR, Yearly.bonus...stocks.in.EUR,
         Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
         Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
         Number.of.vacation.days, Employment.status, Contract.duration, Main.language.at.work,
         Company.size, Company.type) %>%
  mutate(Position = replace_na(Position, "none")) %>%
  view()
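Note: the pipeline above only prints the result; to keep the replacement, assign it back. A minimal sketch (assuming Position was read as a factor because of stringsAsFactors = TRUE, so it is converted to character first):

data1 <- data1 %>%
  mutate(Position = replace_na(as.character(Position), "none"))  # persist the "none" replacement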

-------------------------------------------------------

data1 %>%
  select(Age, Gender, City, Position, Total.years.of.experience, Seniority.level,
         Your.main.technology...programming.language,
         Other.technologies.programming.languages.you.use.often,
         Yearly.brutto.salary..without.bonus.and.stocks..in.EUR, Yearly.bonus...stocks.in.EUR,
         Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
         Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
         Number.of.vacation.days, Employment.status)

library(readxl)

# (read.table() and data.frame() are alternative ways of creating a data frame; read.csv() is used here)

data1<-read.csv("C:/Users/hp/Downloads/Sem4/satya 1.csv", stringsAsFactors = TRUE)

data1<-na.omit(data1) # drops the rows set to NA, i.e. the outliers (6) identified in the outlier section at the bottom

data1<-na.omit(data1)

library(caTools)

set.seed(100)

split1<-sample.split(data1$Course,SplitRatio =0.7)

summary(split1)

datatrain<-subset(data1, split1==TRUE)

datatest<-subset(data1, split1==FALSE)

summary(datatrain)

str(datatrain)

library(ggplot2)

ggplot(datatrain, aes(Student_engagement, goal_orientation)) + geom_point()

cor(datatrain$Student_engagement, datatrain$goal_orientation)

# a relationship exists (coefficient = 0.68)

cor.test(datatrain$Student_engagement, datatrain$goal_orientation)
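To keep the correlation coefficient and the p-value apart, a minimal sketch showing where each lives in the cor.test() result (same variables as above):

ct <- cor.test(datatrain$Student_engagement, datatrain$goal_orientation)
ct$estimate # the correlation coefficient r
ct$p.value  # the p-value of the test (a small p means the correlation differs from 0)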

ggplot(datatrain, aes(Student_engagement, goal_orientation)) + geom_smooth()

#looks linear

reg1<-lm(datatrain$Student_engagement~
           datatrain$goal_orientation+
           datatrain$academic_self_eficacy, data = datatrain)

summary(reg1)

#OR

reg1<-lm(Student_engagement~
           goal_orientation+
           academic_self_eficacy,
         data = datatrain)

summary(reg1)

# F-statistic: p<0.05, reject the null; the model is a good fit

# How good the model is: answered by R-squared, which is 66%

# Adjusted R-squared is used with more than one independent variable (it adjusts for degrees of freedom)

# intercept: does not significantly affect the dependent variable (if p>0.05)

# beta1: 22%; no significant effect of the constant on 'self-efficacy'

# beta2: 73%
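A minimal sketch showing where the quantities referred to in the comments above live inside the summary object (base R, same model as above):

s1 <- summary(reg1)
s1$r.squared     # R-squared: share of variance explained
s1$adj.r.squared # adjusted R-squared: penalised for degrees of freedom
s1$fstatistic    # F statistic and its degrees of freedom
s1$coefficients  # estimates, standard errors, t values and p-values per term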

#Multicollinearity

library(car)

vif(reg1) # variance inflation factor <4 (small model, no multicollinearity)

#Assumptions

# Checking randomness: the error terms appear randomly distributed

plot(reg1$residuals, c(1:length(reg1$residuals)))

# normality of error terms (boxplot, qqnorm, Shapiro test, skewness)

boxplot(reg1$residuals) # the median line is not in the middle; deviations exist

shapiro.test(reg1$residuals) # p<0.05, reject the null: not normally distributed

hist(reg1$residuals) # not normal
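The comment above also lists a Q-Q plot among the normality checks; a minimal sketch of that check (base R, no extra packages):

qqnorm(reg1$residuals)
qqline(reg1$residuals) # points far from this line indicate non-normal residuals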

library(descr)

descr(datatrain)

summary(datatrain)
library(moments)

# skewness and kurtosis - normality

datatrain$residual1<-reg1$residuals

moments::skewness(datatrain$residual1)

moments::kurtosis(datatrain$residual1)

#OR

library(e1071)

kurtosis(reg1$residuals)
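Note that the two packages report kurtosis on different scales: moments::kurtosis() gives ordinary kurtosis (about 3 for a normal sample), while e1071::kurtosis() gives excess kurtosis (about 0). A quick check on simulated normal data (illustrative only, not part of the original analysis):

set.seed(1)
z <- rnorm(10000)
moments::kurtosis(z) # expected to be close to 3
e1071::kurtosis(z)   # expected to be close to 0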

#outliers

# heteroscedasticity: look for a cone shape; the residuals should be equally distributed on both sides

plot(reg1$residuals, reg1$fitted.values)

library(lmtest)

bptest(reg1) # no equal distribution of variance; the model is not a good fit

#outliers---

plot(reg1)

#OR

library(car)

influenceIndexPlot(reg1)#261

# converted outliers to missing (NA) and removed them from the main data set in the next step

data1[c(261,799,96,426,248),]<-NA

reg1<-lm(emission~mass+
           wheelbase+
           axlesteering+
           axleother+
           enginecapacity+
           emisionreduction, data = data1)

summary(reg1)

library(car)

vif(reg1)

# model notes: more variables need more data; two models are built (axlesteering vs. axleother)

# created two models based on the variables with the higher VIF values

regm1<-lm(emission~mass+
            wheelbase+
            axleother+
            enginecapacity+
            emisionreduction, data = data1)

summary(regm1)

vif(regm1) # no multicollinearity: no high values

regm2<-lm(emission~mass+
            wheelbase+
            axlesteering+
            enginecapacity+
            emisionreduction, data = data1)

summary(regm2)

vif(regm2)
# RMSE, ANOVA

prem1<-predict(regm1, data1)

prem2<-predict(regm2, data1)

library(caret)

library(Metrics)

RMSE(data1$emission,prem1)

RMSE(data1$emission,prem2)
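A minimal sketch of what RMSE() computes, plus an information-criterion comparison: anova() only compares nested models, and regm1/regm2 use different predictors, so AIC is shown instead (an addition, not part of the original script):

sqrt(mean((data1$emission - prem1)^2, na.rm = TRUE)) # manual RMSE for regm1
sqrt(mean((data1$emission - prem2)^2, na.rm = TRUE)) # manual RMSE for regm2
AIC(regm1, regm2) # the lower AIC suggests the better-fitting model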

# the coefficients and R-squared values are similar, so both models can be used to predict "emission"
#loading the required library and importing the "IT salary survey EU" data set

library(readxl)

library(descr)

# (read.table() and data.frame() are alternative ways of creating a data frame; read.csv() is used here)

data1<-read.csv("C:/Users/hp/Downloads/IT Salary Survey EU 2020...csv", stringsAsFactors = TRUE)

sum(is.na(data1))

str(data1)

# outliers taken from the outlier section at the bottom after plotting "reg1"

# multiple runs of the code gave the following outliers

data1[c(631,659,805,442,854,855,805,565,575,779,153,839,728,1224,495,574,967,959,674),]<-NA

# the unwanted columns, which have no relation to the given problem, are removed

data1<-data1[-23]

data1<-data1[-22]

data1<-data1[-21]

data1<-data1[-1]
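An equivalent single deletion (a sketch; use it instead of the four separate lines above - they delete the highest column index first so the remaining positions do not shift):

data1 <- data1[-c(1, 21, 22, 23)] # drop the same four columns in one step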

View(data1)

# the sum of null values is found and the affected rows are removed to get a clean dependent variable (yearly salary)

sum(is.na(data1))

data1<-na.omit(data1)

summary(data1)

# mice is used to inspect the missing-data pattern; unique() lists the distinct values in a column

library(mice)

md.pattern(data1)

unique(data1$Position)

unique(data1$Seniority.level)

fix(data1)

summary(data1)

# the data set is split

library(caTools)
set.seed(100)

split1<-sample.split(data1$City,SplitRatio =0.7)

summary(split1)

# The data set is split into training and test sets in the ratio of 70:30

datatrain<-subset(data1, split1==TRUE)

datatest<-subset(data1, split1==FALSE)

summary(datatrain)

str(datatrain)

#library is loaded

library(ggplot2)

ggplot(datatrain, aes(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
                      Total.years.of.experience)) + geom_smooth()

# The graph of yearly salary and years of experience of IT professionals

cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
datatrain$Total.years.of.experience)

# There exists a moderate positive correlation (r = 0.43)

cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
datatrain$Years.of.experience.in.Germany)

# There exists no correlation between the salary and the experience in Germany

cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
datatrain$Yearly.bonus...stocks.in.EUR)

ggplot(datatrain, aes(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
                      Yearly.bonus...stocks.in.EUR)) + geom_smooth()

#linear regression

reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+Yearly.bonus...stocks.in.EUR+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Years.of.experience.in.Germany+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country+
           Number.of.vacation.days+Age, data = datatrain)

summary(reg1)

# Outliers are marked as NA and omitted in the section above

plot(reg1)

#Based on the above results the new model is developed here


reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Yearly.bonus...stocks.in.EUR+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
         data = datatrain)

summary(reg1)

# F-statistic: p<0.05, reject the null; the model is a good fit

# R-squared = 0.41, which means the model explains 41% of the variance in the dependent variable

# Outliers have been detected through the regression plots and deleted

plot(reg1)

# variance inflation factor <4 (small model, no multicollinearity)
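The comment above describes the VIF check, but the call itself is not shown in this section; a minimal sketch (library(car) provides vif()):

library(car)
vif(reg1) # values below about 4 would support the "no multicollinearity" note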

# Checking randomness: the error terms appear randomly distributed

plot(reg1$residuals, c(1:length(reg1$residuals)))

#normality of error terms(shapiro test)

shapiro.test(reg1$residuals)

#P<0.05 reject null, not normally distributed

hist(reg1$residuals) # the residuals are close to normal
