# R Codes

# ---- Load the data ----
# Reads the insurance data set from the current working directory.
# NOTE: the original script started with rm(list = ls(all = T)); that call has
# been dropped because it silently wipes the caller's workspace and nothing in
# this analysis depends on an empty environment.
mydata <- read.csv("Insurance_factor_identification.csv", header = TRUE)

# ---- Data exploration ----
# The given data is from the insurance industry.
# (Printing the whole data frame was replaced by head(), which shows the
# structure without flooding the console.)
head(mydata)  # first 6 records of the data
dim(mydata)   # original notes report 2182 observations and 7 variables
str(mydata)   # data type of each variable
# Original notes: in the given data set all the variables are numeric.

# The committee is interested to know each field of the data collected,
# through descriptive analysis, to gain basic insights into the data set and
# to prepare for further analysis.
summary(mydata)  # descriptive statistics for every column
# ---- Train / test split (70:30) ----
library(caTools)

# Split the data on the basis of the dependent variable (Payment):
# 70% for training and 30% for testing. The seed is fixed so the split is
# reproducible.
set.seed(125)
sample <- sample.split(mydata$Payment, SplitRatio = 0.70)
table(sample)  # number of rows assigned to each partition

# Training data set: rows flagged TRUE by the split.
train_data <- subset(mydata, sample == TRUE)
dim(train_data)
head(train_data)

# Testing data set: rows flagged FALSE by the split.
# (The original printed train_data again here instead of test_data.)
test_data <- subset(mydata, sample == FALSE)
dim(test_data)
head(test_data)
# ---- Model 1: Payment regressed on every other variable ----
# Multiple linear regression fitted on the training data only.
model_1 <- lm(Payment ~ ., data = train_data)

summary(model_1)
# The p-values for model_1 are in column 4 of the coefficient table.
#
# Observations recorded in the original notes (re-check against the summary
# output if the data or the seed changes):
# - Except for Bonus and Make, the p-values of all independent variables are
#   less than 0.05, i.e. all variables other than Bonus and Make are
#   significant.
# - Insured and Claims are the most significant variables, as their p-values
#   are very small.
# - Slopes: Kilometres = 4.671e+03, Zone = 2.825e+03, Insured = 2.916e+01,
#   Claims = 4.319e+03.

# Correlation matrix with columns 3 and 4 dropped
# (presumably Bonus and Make -- confirm against str(mydata)).
cor(mydata[, -c(3, 4)])
# Insured and Claims show a strong positive correlation with the dependent
# variable, whereas Kilometres and Zone show a weak negative correlation with
# Payment.
# Both multiple R-squared and adjusted R-squared are the same for model_1:
# 0.9947.
# Since more than one variable is significant, a new model is fitted next
# with only the significant variables as independent variables.
# ---- Model 1a: Payment regressed on the significant variables only ----
model_1a <- lm(Payment ~ Kilometres + Zone + Insured + Claims,
               data = train_data)

summary(model_1a)
# The p-values for model_1a are in column 4 of the coefficient table.
#
# Observations recorded in the original notes (re-check against the summary
# output if the data or the seed changes):
# - The p-values of all independent variables are less than 0.05, so all the
#   independent variables are significant.
# - Insured and Claims are the most significant variables, as their p-values
#   are very small.
# - Slopes: Kilometres = 4.625e+03, Zone = 2.782e+03, Insured = 2.948e+01,
#   Claims = 4.310e+03.

# Correlation matrix with columns 3 and 4 dropped
# (presumably Bonus and Make -- confirm against str(mydata)).
cor(mydata[, -c(3, 4)])
# Insured and Claims show a strong positive correlation with the dependent
# variable, whereas Kilometres and Zone show a weak negative correlation with
# Payment.
# The original notes report multiple and adjusted R-squared as both 0.9947
# here; that line named model_1, so verify the figure for model_1a in the
# summary above.
# model_1a is taken as the final model because all of its independent
# variables are significant, i.e. Kilometres, Zone, Insured and Claims all
# have a significant effect on Payment.
# ---- Visualise the results for better understanding ----
# Two scatter plots side by side: Payment against Claims and against Insured.
# par() returns the previous settings, which are restored afterwards so the
# 1x2 layout does not leak into any later plotting.
old_par <- par(mfrow = c(1, 2))
plot(mydata$Claims, mydata$Payment,
     xlab = "Number of Claims",
     ylab = "Payments",
     main = "")
plot(mydata$Insured, mydata$Payment,
     xlab = "The number of insured in policy-years",
     ylab = "Payments",
     main = "")
par(old_par)
# ---- Score the held-out test set with the final model ----
predtest <- predict(model_1a, newdata = test_data)
head(predtest)

# Wrap the predictions in a one-column data frame (column name: predtest).
pred_payment <- data.frame(predtest = predtest)

# Attach the predicted payments to the test records, column-wise.
final_mydata <- cbind(test_data, pred_payment)
print(final_mydata)

# Write the test records together with their predictions to disk.
write.csv(final_mydata, file = "InsuranceFinal.csv")
# ---- Group means by Zone, Kilometres and Bonus ----
# The insurance company is planning to establish a new branch office, so they
# are interested to find at what location, kilometre and bonus level their
# insured amount, claims and payment get increased.
library(dplyr)

# Each table below has one row per group level and one column for each of
# data columns 5, 6 and 7 (presumably Insured, Claims and Payment -- confirm
# against str(mydata)); the cells are group means.

# Grouping the data by Zone and comparing the means of the different zones.
grupzone <- apply(mydata[, c(5, 6, 7)], 2,
                  function(x) tapply(x, mydata$Zone, mean))
grupzone
# Original notes: Zone 4 has the highest number of claims, and thus payment
# as well. Zones 1-4 have more insured years, claims and payments.

# Grouping the data by Kilometres and comparing the means of the different
# kilometre groups.
grupkil <- apply(mydata[, c(5, 6, 7)], 2,
                 function(x) tapply(x, mydata$Kilometres, mean))
grupkil
# Original notes: Kilometre group 2 has the maximum payments. Though the
# insured number of years is lower than for kilometre group 1, the claims and
# payments are higher for group 2.

# Grouping the data by Bonus and comparing the means of the different bonus
# groups.
grupbon <- apply(mydata[, c(5, 6, 7)], 2,
                 function(x) tapply(x, mydata$Bonus, mean))
grupbon
# Original notes: Bonus group 7 has the maximum number of claims and Payment.
# ---- Model 2: what drives the number of claims? ----
# The committee wants to understand what affects their claim rates so as to
# decide the right premiums for a certain set of situations. Hence, they need
# to find whether the insured amount, zone, kilometre, bonus or make affects
# the claim rates, and to what extent.
#
# Dependent variable: Claims.
# Independent variables: Kilometres, Zone, Bonus, Make and Insured.
# The original formula was `Claims ~ .`, which also used Payment as a
# predictor. Payment is not among the stated independent variables, so it is
# excluded here with `- Payment`.
model_2 <- lm(Claims ~ . - Payment, data = mydata)
summary(model_2)
# The summary provides the intercept and the estimated coefficients.
# The original notes concluded that the p-values of all the independent
# variables (Kilometres, Zone, Bonus, Make and Insured) are highly
# significant and that they all have an impact on Claims. That conclusion was
# drawn from the fit that included Payment, so re-check it against the
# summary above.

R Codes

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

R Codes

Uploaded by

Copyright:

Available Formats

rm(list=ls(all=T))

head(mydata) # give the first 6 records of the data

# the data has 2182 no. of observations and 7 variables

summary(mydata) # give the descriptive analysis of the data

str(mydata) #displays the data type of each variable

# In the given data set all the variables are numeric.

#Training Data set

#Testing data set

#Building multiple linear regression model by using train_data

model_1 = lm(Payment ~., data=train_data)

#The p-values for the model_1 are in column 4 under coefficients

model_1a=lm(Payment~ Kilometres+Zone+Insured+Claims, data=train_data)

#The p-values for the model_1a are in column 4 under coefficients

#To visualize the results for better understanding.

plot(mydata$Insured,mydata$Payment,xlab="The number of insured in policy-years",

#prediction on the on testing data set

#transform into data frame

final_mydata = cbind(test_data,pred_payment); final_mydata

#export the final file with predicted values

grupzone= apply(mydata[,c(5,6,7)], 2, function(x) tapply(x, mydata$Zone, mean))

# Kilometer group 2 has the maximum payments.

Bonus group 7 has the maximum number of claims and Payment

You might also like