
# outlier rows, taken from the outlier section at the bottom after plotting "reg1", are set to NA

data1[c(631,659,805,442,854,805,565,575,779,153,839,728,1224,495,574,967,959,674),]<-NA

# outlier rows, taken from the outlier section at the bottom after plotting "reg1", are set to NA

data1[c(631,659,805,442,854),]<-NA

reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+Yearly.bonus...stocks.in.EUR+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Years.of.experience.in.Germany+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country+
           Number.of.vacation.days+Contract.duration+Age, data = datatrain)

# Replacing blank (NA) values in a categorical column with "none"

library(tidyverse)

data1 %>%
  select(Age, Gender, City, Position, Total.years.of.experience, Years.of.experience.in.Germany,
         Seniority.level, Your.main.technology...programming.language,
         Other.technologies.programming.languages.you.use.often,
         Yearly.brutto.salary..without.bonus.and.stocks..in.EUR, Yearly.bonus...stocks.in.EUR,
         Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
         Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
         Number.of.vacation.days, Employment.status, Contract.duration, Main.language.at.work,
         Company.size, Company.type) %>%
  mutate(Position = replace_na(Position, "none")) %>%
  view()
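Note: the pipeline above only prints the result; to keep the replacement, assign it back. A minimal sketch (assuming Position was read as a factor because of stringsAsFactors = TRUE, so it is converted to character first):

data1 <- data1 %>%
  mutate(Position = replace_na(as.character(Position), "none"))  # persist the "none" replacement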

-------------------------------------------------------

data1 %>%
  select(Age, Gender, City, Position, Total.years.of.experience, Seniority.level,
         Your.main.technology...programming.language,
         Other.technologies.programming.languages.you.use.often,
         Yearly.brutto.salary..without.bonus.and.stocks..in.EUR, Yearly.bonus...stocks.in.EUR,
         Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
         Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
         Number.of.vacation.days, Employment.status)

library(readxl)

# (read.table() and data.frame() are alternative ways of creating a data frame; read.csv() is used here)

data1<-read.csv("C:/Users/hp/Downloads/Sem4/satya 1.csv", stringsAsFactors = TRUE)

data1<-na.omit(data1) # drops the rows set to NA, i.e. the outliers (6) identified in the outlier section at the bottom

data1<-na.omit(data1)

library(caTools)

set.seed(100)

split1<-sample.split(data1$Course,SplitRatio =0.7)

summary(split1)

datatrain<-subset(data1, split1==TRUE)

datatest<-subset(data1, split1==FALSE)

summary(datatrain)

str(datatrain)

library(ggplot2)

ggplot(datatrain, aes(Student_engagement, goal_orientation)) + geom_point()

cor(datatrain$Student_engagement, datatrain$goal_orientation)

# a relationship exists (coefficient = 0.68)

cor.test(datatrain$Student_engagement, datatrain$goal_orientation)
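To keep the correlation coefficient and the p-value apart, a minimal sketch showing where each lives in the cor.test() result (same variables as above):

ct <- cor.test(datatrain$Student_engagement, datatrain$goal_orientation)
ct$estimate # the correlation coefficient r
ct$p.value  # the p-value of the test (a small p means the correlation differs from 0)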

ggplot(datatrain, aes(Student_engagement, goal_orientation)) + geom_smooth()

#looks linear

reg1<-lm(datatrain$Student_engagement~
           datatrain$goal_orientation+
           datatrain$academic_self_eficacy, data = datatrain)

summary(reg1)

#OR

reg1<-lm(Student_engagement~
           goal_orientation+
           academic_self_eficacy,
         data = datatrain)

summary(reg1)

# F-statistic: p<0.05, reject the null; the model is a good fit

# How good the model is: answered by R-squared, which is 66%

# Adjusted R-squared is used with more than one independent variable (it adjusts for degrees of freedom)

# intercept: does not significantly affect the dependent variable (if p>0.05)

# beta1: 22%; no significant effect of the constant on 'self-efficacy'

# beta2: 73%
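A minimal sketch showing where the quantities referred to in the comments above live inside the summary object (base R, same model as above):

s1 <- summary(reg1)
s1$r.squared     # R-squared: share of variance explained
s1$adj.r.squared # adjusted R-squared: penalised for degrees of freedom
s1$fstatistic    # F statistic and its degrees of freedom
s1$coefficients  # estimates, standard errors, t values and p-values per term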

#Multicollinearity

library(car)

vif(reg1) # variance inflation factor <4 (small model, no multicollinearity)

#Assumptions

# Checking randomness: the error terms appear randomly distributed

plot(reg1$residuals, c(1:length(reg1$residuals)))

# normality of error terms (boxplot, qqnorm, Shapiro test, skewness)

boxplot(reg1$residuals) # the median line is not in the middle; deviations exist

shapiro.test(reg1$residuals) # p<0.05, reject the null: not normally distributed

hist(reg1$residuals) # not normal
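The comment above also lists a Q-Q plot among the normality checks; a minimal sketch of that check (base R, no extra packages):

qqnorm(reg1$residuals)
qqline(reg1$residuals) # points far from this line indicate non-normal residuals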

library(descr)

descr(datatrain)

summary(datatrain)
library(moments)

# skewness and kurtosis - normality

datatrain$residual1<-reg1$residuals

moments::skewness(datatrain$residual1)

moments::kurtosis(datatrain$residual1)

#OR

library(e1071)

kurtosis(reg1$residuals)
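Note that the two packages report kurtosis on different scales: moments::kurtosis() gives ordinary kurtosis (about 3 for a normal sample), while e1071::kurtosis() gives excess kurtosis (about 0). A quick check on simulated normal data (illustrative only, not part of the original analysis):

set.seed(1)
z <- rnorm(10000)
moments::kurtosis(z) # expected to be close to 3
e1071::kurtosis(z)   # expected to be close to 0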

#outliers

# heteroscedasticity: look for a cone shape; the residuals should be equally distributed on both sides

plot(reg1$residuals, reg1$fitted.values)

library(lmtest)

bptest(reg1) # no equal distribution of variance; the model is not a good fit

#outliers---

plot(reg1)

#OR

library(car)

influenceIndexPlot(reg1)#261

# converted outliers to missing (NA) and removed them from the main data set in the next step

data1[c(261,799,96,426,248),]<-NA

reg1<-lm(emission~mass+
           wheelbase+
           axlesteering+
           axleother+
           enginecapacity+
           emisionreduction, data = data1)

summary(reg1)

library(car)

vif(reg1)

# model notes: more variables need more data; two models are built (axlesteering vs. axleother)

# created two models based on the variables with the higher VIF values

regm1<-lm(emission~mass+
            wheelbase+
            axleother+
            enginecapacity+
            emisionreduction, data = data1)

summary(regm1)

vif(regm1) # no multicollinearity: no high values

regm2<-lm(emission~mass+
            wheelbase+
            axlesteering+
            enginecapacity+
            emisionreduction, data = data1)

summary(regm2)

vif(regm2)
# RMSE, ANOVA

prem1<-predict(regm1, data1)

prem2<-predict(regm2, data1)

library(caret)

library(Metrics)

RMSE(data1$emission,prem1)

RMSE(data1$emission,prem2)
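A minimal sketch of what RMSE() computes, plus an information-criterion comparison: anova() only compares nested models, and regm1/regm2 use different predictors, so AIC is shown instead (an addition, not part of the original script):

sqrt(mean((data1$emission - prem1)^2, na.rm = TRUE)) # manual RMSE for regm1
sqrt(mean((data1$emission - prem2)^2, na.rm = TRUE)) # manual RMSE for regm2
AIC(regm1, regm2) # the lower AIC suggests the better-fitting model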

# the coefficients and R-squared values are similar, so both models can be used to predict "emission"
#loading the required library and importing the "IT salary survey EU" data set

library(readxl)

library(descr)

# (read.table() and data.frame() are alternative ways of creating a data frame; read.csv() is used here)

data1<-read.csv("C:/Users/hp/Downloads/IT Salary Survey EU 2020...csv", stringsAsFactors = TRUE)

sum(is.na(data1))

str(data1)

# outliers taken from the outlier section at the bottom after plotting "reg1"

# multiple runs of the code gave the following outliers

data1[c(631,659,805,442,854,855,805,565,575,779,153,839,728,1224,495,574,967,959,674),]<-NA

# the unwanted columns, which have no relation to the given problem, are removed

data1<-data1[-23]

data1<-data1[-22]

data1<-data1[-21]

data1<-data1[-1]
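An equivalent single deletion (a sketch; use it instead of the four separate lines above - they delete the highest column index first so the remaining positions do not shift):

data1 <- data1[-c(1, 21, 22, 23)] # drop the same four columns in one step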

View(data1)

# the sum of null values is found and the affected rows are removed to get a clean dependent variable (yearly salary)

sum(is.na(data1))

data1<-na.omit(data1)

summary(data1)

# mice is used to inspect the missing-data pattern; unique() lists the distinct values in a column

library(mice)

md.pattern(data1)

unique(data1$Position)

unique(data1$Seniority.level)

fix(data1)

summary(data1)

# the data set is split

library(caTools)
set.seed(100)

split1<-sample.split(data1$City,SplitRatio =0.7)

summary(split1)

# The data set is split into training and test sets in the ratio of 70:30

datatrain<-subset(data1, split1==TRUE)

datatest<-subset(data1, split1==FALSE)

summary(datatrain)

str(datatrain)

#library is loaded

library(ggplot2)

ggplot(datatrain, aes(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
                      Total.years.of.experience)) + geom_smooth()

# The graph of yearly salary and years of experience of IT professionals

cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
datatrain$Total.years.of.experience)

# There exists a moderate positive correlation (r = 0.43)

cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
datatrain$Years.of.experience.in.Germany)

# There exists no correlation between the salary and the experience in Germany

cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
datatrain$Yearly.bonus...stocks.in.EUR)

ggplot(datatrain, aes(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
                      Yearly.bonus...stocks.in.EUR)) + geom_smooth()

#linear regression

reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+Yearly.bonus...stocks.in.EUR+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Years.of.experience.in.Germany+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country+
           Number.of.vacation.days+Age, data = datatrain)

summary(reg1)

# Outliers are marked as NA and omitted in the section above

plot(reg1)

#Based on the above results the new model is developed here


reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Yearly.bonus...stocks.in.EUR+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
         data = datatrain)

summary(reg1)

# F-statistic: p<0.05, reject the null; the model is a good fit

# R-squared = 0.41, which means the model explains 41% of the variance in the dependent variable

# Outliers have been detected through the regression plots and deleted

plot(reg1)

# variance inflation factor <4 (small model, no multicollinearity)
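The comment above describes the VIF check, but the call itself is not shown in this section; a minimal sketch (library(car) provides vif()):

library(car)
vif(reg1) # values below about 4 would support the "no multicollinearity" note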

# Checking randomness: the error terms appear randomly distributed

plot(reg1$residuals, c(1:length(reg1$residuals)))

#normality of error terms(shapiro test)

shapiro.test(reg1$residuals)

#P<0.05 reject null, not normally distributed

hist(reg1$residuals) # the residuals are close to normal
