# You are on page 1 of 5

# Load the insurance data set and take a first look at it ----
#
# NOTE: the original script began with rm(list = ls(all = T)). That call wipes
# the caller's whole workspace as a side effect of running this script, so it
# has been dropped; start a fresh R session if a clean environment is needed.

mydata <- read.csv("Insurance_factor_identification.csv", header = TRUE)
mydata

# Data exploration ----
# The given data is from the insurance industry.

head(mydata) # the first 6 records of the data

dim(mydata)

# The data has 2182 observations and 7 variables.

summary(mydata) # descriptive statistics for every column

str(mydata) # displays the data type of each variable

# In the given data set all the variables are numeric.

# The committee is interested to know each field of the data collected, through
# descriptive analysis, to gain basic insights into the data set and to prepare
# for further analysis.

summary(mydata)

# Splitting the data into a training data set and a testing data set ----
# Split ratio is 70:30.

library(caTools)

# Split on the basis of the dependent variable (Payment): 70% of the rows go
# to training and 30% to testing. The seed is fixed so the split is
# reproducible from run to run.
set.seed(125)
# Named split_mask rather than `sample` so base::sample() is not shadowed.
split_mask <- sample.split(mydata$Payment, SplitRatio = 0.70)
split_mask

# Training data set: the rows where the split mask is TRUE.
train_data <- subset(mydata, split_mask)
train_data

# Testing data set: the rows where the split mask is FALSE.
# (The original printed train_data a second time here instead of test_data.)
test_data <- subset(mydata, !split_mask)
test_data

# Building a multiple linear regression model using train_data ----

# Payment regressed on every other column of train_data.
model_1 <- lm(Payment ~ ., data = train_data)

summary(model_1)

# The p-values for model_1 are in column 4 under "Coefficients".

# Except for Bonus and Make, the p-values of all the independent variables are
# less than 0.05, so we can say that, except Bonus and Make, all other
# independent variables are significant.
# Insured and Claims are the most significant variables, as their p-values are
# very small.
# The slopes are Kilometres = 4.671e+03, Zone = 2.825e+03,
# Insured = 2.916e+01, Claims = 4.319e+03.

# Correlation matrix of mydata with columns 3 and 4 left out
# (presumably Bonus and Make -- confirm against names(mydata)).
cor(mydata[, -c(3, 4)])

# Insured and Claims show a strong positive correlation with the dependent
# variable, whereas Kilometres and Zone show a weak negative correlation with
# Payment.

# Both multiple R-squared and adjusted R-squared are the same for model_1:
# 0.9947.

# Since we have more than one significant variable, we fit a new model with
# only the significant variables as independent variables.

model_1a <- lm(Payment ~ Kilometres + Zone + Insured + Claims,
               data = train_data)

summary(model_1a)

# The p-values for model_1a are in column 4 under "Coefficients".

# The p-values of all the independent variables are less than 0.05, so we can
# say that all the independent variables are significant.
# Insured and Claims are the most significant variables, as their p-values are
# very small.
# The slopes are Kilometres = 4.625e+03, Zone = 2.782e+03,
# Insured = 2.948e+01, Claims = 4.310e+03.

# Same correlation matrix as above (it is computed on mydata, so it does not
# depend on which model was fitted).
cor(mydata[, -c(3, 4)])

# Insured and Claims show a strong positive correlation with the dependent
# variable, whereas Kilometres and Zone show a weak negative correlation with
# Payment.

# Both multiple R-squared and adjusted R-squared are reported as 0.9947.
# NOTE(review): the original note named model_1 here and looks copied from the
# model_1 section -- confirm the figure against summary(model_1a).

# So model_1a is the final model, as all the independent variables are
# significant, i.e. Kilometres, Zone, Insured and Claims all have a
# significant effect on Payment.

# Visualize the results for better understanding ----

# Two scatter plots side by side. par() returns the previous settings, which
# are put back afterwards so the 1 x 2 layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))

# Payment against the number of claims.
plot(mydata$Claims, mydata$Payment,
     xlab = "Number of Claims", ylab = "Payments", main = "")

# Payment against the number of insured policy-years.
plot(mydata$Insured, mydata$Payment,
     xlab = "The number of insured in policy-years",
     ylab = "Payments", main = "")

par(old_par)

# Prediction on the testing data set ----

# Score the held-out rows with the final model and peek at the first values.
predtest <- predict(model_1a, newdata = test_data)
head(predtest)

# Wrap the predictions in a data frame; its single column is named "predtest".
pred_payment <- data.frame(predtest)

# Put the predictions next to the original test rows and show the result.
final_mydata <- cbind(test_data, pred_payment)
final_mydata

# Export the test rows together with their predicted values.
write.csv(final_mydata, file = "InsuranceFinal.csv")

# The insurance company is planning to establish a new branch office, so they
# are interested to find at what location, kilometre, and bonus level their
# insured amount, claims, and payment get increased.

library(dplyr)

# Mean of selected columns of `data` within each level of a grouping column.
#
# data       : data frame holding both the grouping and the value columns.
# group_col  : name of the column whose distinct values define the groups.
# value_cols : positions of the columns to average. The default c(5, 6, 7) is
#              what the original script used (presumably Insured, Claims and
#              Payment -- confirm against names(mydata)).
#
# Returns what apply() builds from the per-column tapply() results: one row
# per group and one column per value column.
group_means <- function(data, group_col, value_cols = c(5, 6, 7)) {
  apply(data[, value_cols], 2, function(x) tapply(x, data[[group_col]], mean))
}

# Grouping the data by Zone and comparing the means of the different zones.

grupzone <- group_means(mydata, "Zone")
grupzone
# Zone 4 has the highest number of claims, and thus payment as well.
# Zones 1-4 have more insured years, claims, and payments.

# Grouping the data by Kilometres and comparing the means of the different
# kilometre groups.

grupkil <- group_means(mydata, "Kilometres")
grupkil

# Kilometre group 2 has the maximum payments.
# Though the insured number of years is lesser than for kilometre group 1, the
# claims and payments are higher for group 2.

# Grouping the data by Bonus and comparing the means of the different bonus
# groups.

grupbon <- group_means(mydata, "Bonus")
grupbon

# Bonus group 7 has the maximum number of claims and Payment.

# The committee wants to understand what affects their claim rates so as to
# decide the right premiums for a certain set of situations. Hence, they need
# to find whether the insured amount, zone, kilometre, bonus, or make affects
# the claim rates and to what extent.

# Claims regressed on every column of mydata except Payment. The original
# formula was `Claims ~ .`, which also used Payment as a predictor even though
# the analysis below lists only Kilometres, Zone, Bonus, Make and Insured as
# independent variables.
model_2 <- lm(Claims ~ . - Payment, data = mydata)
summary(model_2)

# Dependent variable: Claims. Independent variables: Kilometres, Zone, Bonus,
# Make, and Insured.
# The results provide the intercept and the estimated values, and this in turn
# shows that all the p-values of the independent variables, such as
# Kilometres, Zone, Bonus, Make, and Insured, are highly significant and are
# making an impact on the claims.
# NOTE(review): this finding was recorded by the original author -- re-check it
# against summary(model_2) now that Payment is excluded from the predictors.

# You might also like