You are on page 1of 9

20BCE1205

SHUBHAM OJHA

EDA

LAB3
2023-01-30

1. Compute the relationship that exists between each variable (x1, x2, x3) with
target(y) of the given dataset by finding the Correlation. Out of the given three
variables which one would you choose for modelling. Simulate the above task
using R code.
# Question 1 data: three candidate predictors (x1, x2, x3) and the target y.
df <- data.frame(
  x1 = c(1, 3, 4, 5, 6, 7, 8),
  x2 = c(8, 7, 5, 6, 4, 3, 2),
  x3 = c(8, 2, 4, 6, 3, 7, 5),
  y = c(2, 5, 7, 11, 12, 15, 17)
)
df

## x1 x2 x3 y
## 1 1 8 8 2
## 2 3 7 2 5
## 3 4 5 4 7
## 4 5 6 6 11
## 5 6 4 3 12
## 6 7 3 7 15
## 7 8 2 5 17

x1 <- df$x1
x2 <- df$x2
x3 <- df$x3
y <- df$y

# without inbuilt function

#' Pearson correlation coefficient computed from its definition.
#'
#' r = sum((a - mean(a)) * (b - mean(b))) /
#'     sqrt(sum((a - mean(a))^2) * sum((b - mean(b))^2))
#'
#' This is cov(a, b) / (sd(a) * sd(b)) with
#' cov(a, b) = sum((a - mean(a)) * (b - mean(b))) / (n - 1);
#' the (n - 1) factors cancel, so they do not appear in the formula.
#'
#' @param a,b Numeric vectors of equal length.
#' @return A single numeric value: the correlation between `a` and `b`.
pearson_r <- function(a, b) {
  stopifnot(is.numeric(a), is.numeric(b), length(a) == length(b))
  dev_a <- a - mean(a)
  dev_b <- b - mean(b)
  sum(dev_a * dev_b) / sqrt(sum(dev_a^2) * sum(dev_b^2))
}

correlation1 <- pearson_r(x1, y) # x1 and y
correlation2 <- pearson_r(x2, y) # x2 and y
correlation3 <- pearson_r(x3, y) # x3 and y

# Report the hand-computed coefficients. Each label names the predictor whose
# coefficient is printed next to it (x1, x2, x3 in turn).
print("Without inbuilt method: ")

## [1] "Without inbuilt method: "

paste(
  "The value of correlation without inbuilt function between x1 and y is",
  correlation1
)

## [1] "The value of correlation without inbuilt function between x1 and y is 0.991610842427341"

paste(
  "The value of correlation without inbuilt function between x2 and y is",
  correlation2
)

## [1] "The value of correlation without inbuilt function between x2 and y is -0.937893697625288"

paste(
  "The value of correlation without inbuilt function between x3 and y is",
  correlation3
)

## [1] "The value of correlation without inbuilt function between x3 and y is 0.0142105105700801"

cat("\n")

# with inbuilt function

# Same three coefficients from stats::cor(); each label names the predictor
# that is actually passed to cor().
print("With inbuilt method: ")

## [1] "With inbuilt method: "

paste(
  "The value of correlation with inbuilt function between x1 and y is",
  cor(df$x1, df$y)
)

## [1] "The value of correlation with inbuilt function between x1 and y is 0.991610842427341"

paste(
  "The value of correlation with inbuilt function between x2 and y is",
  cor(df$x2, df$y)
)

## [1] "The value of correlation with inbuilt function between x2 and y is -0.937893697625288"

paste(
  "The value of correlation with inbuilt function between x3 and y is",
  cor(df$x3, df$y)
)

## [1] "The value of correlation with inbuilt function between x3 and y is 0.0142105105700801"

# Conclusion for Question 1 (the message is assembled from two pieces only to
# keep the source line short; the printed text is a single sentence).
print(paste(
  "x1 and y has strong correlation among them so we will be chossing x1",
  "for modelling"
))

## [1] "x1 and y has strong correlation among them so we will be chossing x1 for modelling"

# Attach ggplot2 for the plots below. library() attaches one package per call:
# its second positional argument is `help`, not another package, so every
# additional package needs its own library() call.
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.2

# Simple linear regression of y on x1 (the predictor with the strongest
# correlation to y), computed from the closed-form least-squares formulas.
# The column is spelled out in full: `df$x` matches no single column of df
# (x1, x2 and x3 all start with "x"), so it evaluates to NULL.
ux <- mean(df$x1)
uy <- mean(df$y)
sxx <- sum((df$x1 - ux)^2) # sum of squared deviations of the predictor
b1 <- sum((df$x1 - ux) * (df$y - uy)) / sxx
b0 <- uy - b1 * ux
print(paste("y = ", b0, " + ", b1, " x"))

# Expected: intercept approx. -0.9918, slope approx. 2.2336

#13
tss <- sum((df$y - uy)^2)
tss

## [1] 176.8571

pred <- b0 + b1 * df$x1
pred

# Expected (approx.): 1.2418 5.7090 7.9426 10.1762 12.4098 14.6434 16.8770

rss <- sum((pred - df$y)^2)
r2 <- 1 - (rss / tss)
print(paste("R Square = ", r2))

# Expected: R Square approx. 0.9833

#14
e <- pred - df$y
e

# Residual standard error. Two parameters (b0, b1) are estimated from the
# data, so the residual sum of squares is divided by n - 2.
sde <- sqrt(sum(e^2) / (nrow(df) - 2))
sde

# Expected: approx. 0.7688

seb0 <- sde * sqrt((1 / nrow(df)) + (ux^2 / sxx))
seb1 <- sde * sqrt(1 / sxx)
print(paste("Standard Error for B0 = ", seb0, " Standard Error for B1 = ", seb1))

# Expected: approx. 0.6960 for B0 and approx. 0.1302 for B1

Question 2. Consider the following five training examples:
X = [2 3 4 5 6]
Y = [12 17 23 28 32]
Write the R script for the following and give your inference.
#(a) Find the best linear fit (Y = aX + b)
# Training data for Question 2: five (x, y) pairs.
x <- c(2, 3, 4, 5, 6)
y <- c(12, 17, 23, 28, 32)

# Least-squares fit of y on x; auto-printing the fit shows the call and the
# estimated intercept and slope.
model <- lm(formula = y ~ x)
model

##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 2.0 5.1

#(b) Plot the graph of the model

# Gather the training pairs into a data frame with columns X and Y.
dt <- data.frame(X = x, Y = y)
dt

## X Y
## 1 2 12
## 2 3 17
## 3 4 23
## 4 5 28
## 5 6 32

# Scatter plot of the observations with a linear-model smoothing line on top.
ggplot(data = dt, mapping = aes(x = X, y = Y)) +
  geom_point() +
  geom_smooth(method = "lm")

## `geom_smooth()` using formula = 'y ~ x'


#(c) Determine the minimum RSS
# Residual sum of squares from the fitted model's own residuals. The model was
# fitted on the variable `x`, so passing predict() a data frame whose columns
# are named X and Y would not supply the predictor at all.
print(sum(residuals(model)^2))

## [1] 1.1

#OR
# deviance() of an lm fit is its residual sum of squares.
rss <- deviance(model)
rss

## [1] 1.1

#(d) Draw the residual plot for the best linear fit and comment on the
# suitability of the linear model to this training data.

# library() attaches one package per call; only ggplot2 is needed here.
library(ggplot2)
# `which = 1` selects the "Residuals vs Fitted" diagnostic of plot.lm().
plot(model, which = 1)
# Horizontal reference line at residual = 0.
abline(h = 0)
#(e) Evaluate the standard errors associated with a and b.

# Closed-form least-squares estimates for Y = b0 + b1 * X.
ux <- mean(dt$X)
uy <- mean(dt$Y)
sxx <- sum((dt$X - ux)^2) # sum of squared deviations of the predictor
b1 <- sum((dt$X - ux) * (dt$Y - uy)) / sxx

b0 <- uy - b1 * ux
b0

## [1] 2

pred <- b0 + b1 * dt$X
pred

## [1] 12.2 17.3 22.4 27.5 32.6

e <- dt$Y - pred
e

## [1] -0.2 -0.3 0.6 0.5 -0.6

# Residual standard error. Two parameters (b0, b1) are estimated from the
# data, so the residual sum of squares is divided by n - 2, not n.
sde <- sqrt(sum(e^2) / (nrow(dt) - 2))

sde
# Expected: approx. 0.6055, i.e. sqrt(1.1 / 3)

seb0 <- sde * sqrt((1 / nrow(dt)) + (ux^2 / sxx))
seb1 <- sde * sqrt(1 / sxx)

print(paste("Standard Error for B0 = ", seb0, " Standard Error for B1 = ", seb1))

# Expected: approx. 0.8124 for B0 and approx. 0.1915 for B1. These are the
# standard errors implied by the 95% confidence interval from confint(model).

#(f) Determine the 95% confidence interval for a and b

# cat() turns "\n" into a line break; print() would show it escaped.
cat("Confidence Interval- \n")

## Confidence Interval-

# 95% confidence intervals for the intercept and the slope of the fit.
confint(model, level = 0.95)

## 2.5 % 97.5 %
## (Intercept) -0.5854316 4.585432
## x 4.4906079 5.709392

#(g) Compute R2 statistic

# R^2 = 1 - RSS / TSS, where TSS is the sum of squared deviations of y about
# its mean and RSS is the residual sum of squares held in `rss`.
tss_y <- sum((y - mean(y))^2)
RSQUARE <- 1 - (rss / tss_y)
cat("RSQUARE - ", RSQUARE, "\n")

## RSQUARE - 0.9957887

#or
#summary(model)$r.squared

#(h) Predict the value of a test instance from the dataset

# The model was fitted as y ~ x, so the new data must provide a column named
# `x`: predict() looks predictors up by name.
p <- data.frame(x = x)

y2 <- predict(model, newdata = p)
y2

## 1 2 3 4 5
## 12.2 17.3 22.4 27.5 32.6

Question 3. Apply linear regression analysis on any prominent dataset and state the inferences.
#mtcars
# Load the built-in mtcars data set and preview its first rows.
data("mtcars")
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1

# Work on a copy named `input`, then regress mpg on wt and print the summary.
input <- mtcars
fit <- lm(formula = mpg ~ wt, data = input)
summary(fit)

##
## Call:
## lm(formula = mpg ~ wt, data = input)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10

# Plot the data with the predictor (wt) on the x axis and the response (mpg)
# on the y axis, so the linear smoothing line is the same regression as
# lm(mpg ~ wt) rather than the reverse regression of wt on mpg.
ggplot(input, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'


# Inference for Question 3, emitted as program output with print().
# NOTE(review): the statement is generic; it could cite the fitted slope for
# wt (-5.3445) and the Multiple R-squared (0.7528) reported by summary(fit).
# NOTE(review): the line breaks inside the string literal may be artifacts of
# how this listing was extracted -- confirm against the original script.
print("The relationship between mpg (miles per gallon) and wt (weight of the
car) in the mtcars dataset is typically negative, meaning that as the weight
of the car increases, the miles per gallon decreases. This makes sense as
heavier cars generally require more fuel to operate and therefore have lower
fuel efficiency")

## [1] "The relationship between mpg (miles per gallon) and wt (weight of the
car) in the mtcars dataset is typically negative, meaning that as the weight
of the car increases, the miles per gallon decreases. This makes sense as
heavier cars generally require more fuel to operate and therefore have lower
fuel efficiency"

You might also like