# Professional Documents
# Culture Documents
# Load the insurance data set; the first row of the CSV holds the column names.
mydata <- read.csv("Insurance_factor_identification.csv", header = TRUE)
mydata

# Data exploration ----
# The given data is from the insurance industry.
dim(mydata)

# The committee is interested to know each field of the data collected through
# descriptive analysis, to gain basic insights into the data set and to prepare
# for further analysis.
summary(mydata)
# Splitting the data into training and testing data sets ----
# The split ratio is 70:30.
library(caTools)

# Split the data, 70% for training and 30% for testing, on the basis of the
# dependent variable (Payment). The fixed seed makes the split reproducible.
set.seed(125)
sample <- sample.split(mydata$Payment, SplitRatio = 0.70)
sample
train_data <- subset(mydata, sample == TRUE)
train_data
# The rows not selected for training form the test set used for prediction.
test_data <- subset(mydata, sample == FALSE)

# Correlation matrix of the data with columns 3 and 4 left out
# (presumably Bonus and Make -- confirm against the CSV header).
cor(mydata[, -c(3, 4)])
# Insured and Claims show a strong positive correlation with the dependent
# variable, whereas Kilometres and Zone show a weak negative correlation with
# Payment.

# model_1: Payment regressed on every other column of the training data.
model_1 <- lm(Payment ~ ., data = train_data)
summary(model_1)
# The original write-up reports that multiple R-square and adjusted R-square
# are the same for model_1: 0.9947 (NOTE(review): re-check against the summary
# output above). Since more than one variable is significant, a new model is
# fitted with only the significant variables as independent variables.

# model_1a: Payment regressed on the variables the write-up names as
# significant.
model_1a <- lm(Payment ~ Kilometres + Zone + Insured + Claims,
               data = train_data)
summary(model_1a)
# The write-up takes model_1a as the final model because all of its independent
# variables are significant, i.e. Kilometres, Zone, Insured and Claims all have
# a significant effect on Payment.

# Predict Payment for the held-out test rows.
predtest <- predict(model_1a, test_data)
head(predtest)

# Bind the predictions to the rows they were predicted for with cbind(), then
# export the combined table.
final_mydata <- cbind(test_data, predtest)
write.csv(final_mydata, "InsuranceFinal.csv")
# The insurance company is planning to establish a new branch office, so they
# are interested to find at what location, kilometer, and bonus level their
# insured amount, claims, and payment get increased.
library(dplyr)

# Each table below has one row per level of the grouping variable and one
# column per summarised field (columns 5, 6 and 7 of mydata -- presumably
# Insured, Claims and Payment; confirm against the CSV header).

# Group the data by Zone and compare the column means across zones.
grupzone <- apply(
  mydata[, c(5, 6, 7)], 2,
  function(x) tapply(x, mydata$Zone, mean)
)
grupzone

# Group the data by Kilometres and compare the column means across
# kilometre levels.
grupkil <- apply(
  mydata[, c(5, 6, 7)], 2,
  function(x) tapply(x, mydata$Kilometres, mean)
)
grupkil

# Group the data by Bonus and compare the column means across bonus levels.
grupbon <- apply(
  mydata[, c(5, 6, 7)], 2,
  function(x) tapply(x, mydata$Bonus, mean)
)
grupbon
# The committee wants to understand what affects their claim rates so as to
# decide the right premiums for a certain set of situations. Hence, they need
# to find whether the insured amount, zone, kilometer, bonus, or make affects
# the claim rates and to what extent.

# Dependent variable: Claims.
# Independent variables: kilometres, zone, bonus, make, and insured.
# Payment is removed from the right-hand side: it is not among the stated
# independent variables, and `Claims ~ .` alone would silently include it.
model_2 <- lm(Claims ~ . - Payment, data = mydata)
summary(model_2)
# The summary provides the intercept and the estimated coefficients. The
# original write-up reports that the p-values of all independent variables
# (kilometres, zone, bonus, make, and insured) are highly significant and are
# making an impact on the claims.
# NOTE(review): re-check this statement against the summary output above.