## You are on page 1 of 4

## Exploratory Data Analysis - CardioFitness

## *******************************************
## 1. ***ENVIRONMENT SETUP & DATA IMPORT***
## Importing the dataset in R
## NOTE: this working directory is machine-specific; adjust the path before
## running the script on another machine.
setwd("C:/Users/SIVA/Documents/R/BACP-Mini Project")
getwd()
##
##
## Reading the Raw Data
## stringsAsFactors = TRUE keeps the text columns (Product, Gender,
## MaritalStatus) as factors, which is what the str() and summary() output
## recorded in this script shows. From R 4.0.0 onwards read.csv() would
## otherwise read them as character vectors.
rawdata <- read.csv("CardioGoodFitness.csv", stringsAsFactors = TRUE)
##
##
## Exploring Raw Data - Dimensions
dim(rawdata)
## [1] 180 9
##
##
## Exploring Raw Data - Columns
names(rawdata)
## [1] "Product" "Age" "Gender" "Education" "MaritalStatus"
## [6] "Usage" "Fitness" "Income" "Miles"
##
##
## Exploring Raw Data - Structure
## (recorded console output is kept entirely inside comments so that the
## script still parses)
str(rawdata)
## 'data.frame': 180 obs. of 9 variables:
## $ Product : Factor w/ 3 levels "TM195","TM498",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 18 19 19 19 20 20 21 21 21 21 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 2 2 1 1 2 2 1 ...
## $ Education : int 14 15 14 12 13 14 14 13 15 15 ...
## $ MaritalStatus: Factor w/ 2 levels "Partnered","Single": 2 2 1 2 1 1 1 2 2 1 ...
## $ Usage : int 3 2 4 3 4 3 3 3 5 2 ...
## $ Fitness : int 4 3 3 3 2 3 3 3 4 3 ...
## $ Income : int 29562 31836 30699 32973 35247 32973 35247 32973 35247 37521 ...
## $ Miles : int 112 75 66 85 47 66 75 85 141 85 ...
##
## Exploring Raw Data - Head
head(rawdata)
##
## Exploring Raw Data - Tail
tail(rawdata)
##
## Summary of Raw Data
## The output recorded below was captured after the variable-transformation
## step of this script had been run, which is why it also lists the derived
## columns Incomelevel, Mileslevel and Agegroup.
summary(rawdata)
##
##   Product        Age           Gender      Education      MaritalStatus      Usage
##  TM195:80   Min.   :18.00   Female: 76   Min.   :12.00   Partnered:107   Min.   :2.000
##  TM498:60   1st Qu.:24.00   Male  :104   1st Qu.:14.00   Single   : 73   1st Qu.:3.000
##  TM798:40   Median :26.00                Median :16.00                   Median :3.000
##             Mean   :28.79                Mean   :15.57                   Mean   :3.456
##             3rd Qu.:33.00                3rd Qu.:16.00                   3rd Qu.:4.000
##             Max.   :50.00                Max.   :21.00                   Max.   :7.000
##
##     Fitness          Income           Miles         Incomelevel    Mileslevel     Agegroup
##  Min.   :1.000   Min.   : 29562   Min.   : 21.0   <15000  : 0   <60    : 30   21-25  :69
##  1st Qu.:3.000   1st Qu.: 44059   1st Qu.: 66.0   15K-30K : 1   60-120 :108   26-30  :41
##  Median :3.000   Median : 50597   Median : 94.0   30K-45K :48   120-180: 29   31-35  :32
##  Mean   :3.311   Mean   : 53720   Mean   :103.2   45K-60K :89   180-240:  9   36-40  :16
##  3rd Qu.:4.000   3rd Qu.: 58668   3rd Qu.:114.8   60K-75K :21   240-300:  3   16-20  :10
##  Max.   :5.000   Max.   :104581   Max.   :360.0   75K-90K : 9   300-360:  1   41-45  : 6
##                                                   90K-105K:12                 (Other): 6
##
## 2. ***UNIVARIATE ANALYSIS***
## Product
## Bar chart of the number of rows per Product level.
## The x-axis label is written on a single line: splitting the string literal
## across lines would embed a newline in the label.
barplot(
  table(rawdata$Product),
  main = "Product Sales",
  col = "red",
  xlab = "Product Type",
  ylab = "Total Sales"
)
##
## Health Conscious Agegroup Distribution
## Histogram of the Age column, followed by a horizontal box plot of the
## same values.
hist(
  rawdata[["Age"]],
  col = "blue",
  main = "AGE",
  xlab = "AGEGROUP",
  ylab = "FREQUENCY"
)
##
boxplot(
  rawdata[["Age"]],
  horizontal = TRUE,
  col = "blue",
  main = "AGE",
  xlab = "AGEGROUP",
  ylab = "FREQUENCY"
)
##
##
## Gender
## Bar chart of the number of rows per Gender level.
barplot(
  table(rawdata[["Gender"]]),
  col = "magenta",
  main = "Product usage by Gender",
  xlab = "Gender",
  ylab = "Frequency"
)
##
## Effects of Education on Health Consciousness
## Histogram of the Education column, followed by a horizontal box plot of
## the same values.
hist(
  rawdata[["Education"]],
  col = "red",
  main = "EDUCATION LEVEL",
  xlab = "EDUCATION LEVEL",
  ylab = "FREQUENCY"
)
##
boxplot(
  rawdata[["Education"]],
  horizontal = TRUE,
  col = "red",
  main = "EDUCATION LEVEL",
  xlab = "EDUCATION LEVEL",
  ylab = "FREQUENCY"
)
##
## Marital Status
## Bar chart of the number of rows per MaritalStatus level.
barplot(
  table(rawdata[["MaritalStatus"]]),
  col = "cyan",
  main = "MARITAL STATUS",
  xlab = "Relationship",
  ylab = "Frequency"
)
##
## Usage Frequency of all Cardio Products
## Histogram of the Usage column, followed by a horizontal box plot of the
## same values.
hist(
  rawdata[["Usage"]],
  col = "green",
  main = "USAGE OF ALL CARDIO PRODUCTS",
  xlab = "USAGE",
  ylab = "FREQUENCY"
)
##
boxplot(
  rawdata[["Usage"]],
  horizontal = TRUE,
  col = "green",
  main = "USAGE OF ALL CARDIO PRODUCTS",
  xlab = "USAGE",
  ylab = "FREQUENCY"
)
##
## Fitness Level
## Histogram of the Fitness column, followed by a bar chart of the number of
## rows per Fitness value.
## NOTE(review): in both plots the y-axis holds counts but is labelled
## "Level" - confirm whether that label is intended.
hist(
  rawdata[["Fitness"]],
  col = "grey",
  main = "FITNESS",
  xlab = "Fitness",
  ylab = "Level"
)
##
barplot(
  table(rawdata[["Fitness"]]),
  col = "grey",
  main = "FITNESS",
  xlab = "Fitness",
  ylab = "Level"
)
##
##
## Income
## Histogram of the Income column. The Income values run along the x-axis and
## the counts along the y-axis, so the axis labels are set accordingly
## (they were previously swapped).
hist(
  rawdata$Income,
  main = "Income",
  xlab = "Income",
  ylab = "Population",
  col = "orange"
)
##
## Horizontal box plot of the same values: with horizontal = TRUE the Income
## values are again drawn along the x-axis.
boxplot(
  rawdata$Income,
  main = "INCOME",
  xlab = "Income",
  ylab = "Population",
  col = "orange",
  horizontal = TRUE
)
##
##
## Miles
## Histogram of the Miles column. The Miles values run along the x-axis and
## the counts along the y-axis, so the axis labels are set accordingly
## (they were previously swapped).
hist(
  rawdata$Miles,
  main = "MILES",
  xlab = "Miles",
  ylab = "Population",
  col = "yellow"
)
##
## Horizontal box plot of the same values: with horizontal = TRUE the Miles
## values are again drawn along the x-axis.
boxplot(
  rawdata$Miles,
  main = "MILES",
  xlab = "Miles",
  ylab = "Population",
  col = "yellow",
  horizontal = TRUE
)
##
##
## 3. ***VARIABLE TRANSFORMATION***
##
## Assigning Income levels
## cut() bins Income into seven intervals of width 15000; with the default
## right = TRUE each interval includes its upper bound, and any value outside
## (0, 105000] becomes NA.
## Every label is kept on one line: splitting "30K-45K" across lines would
## embed a newline in that factor level.
rawdata$Incomelevel <- cut(
  rawdata$Income,
  breaks = c(0, 15000, 30000, 45000, 60000, 75000, 90000, 105000),
  labels = c(
    "<15000", "15K-30K", "30K-45K", "45K-60K",
    "60K-75K", "75K-90K", "90K-105K"
  )
)
##
## Bar chart of the number of rows per income level.
barplot(
  table(rawdata$Incomelevel),
  col = "orange",
  xlab = "Income levels",
  ylab = "Population",
  main = "INCOME LEVEL"
)
##
##
## Assigning Miles levels
## Bin Miles into six bands of width 60 (break points 0, 60, ..., 360) and
## store the result as the factor column Mileslevel.
rawdata[["Mileslevel"]] <- cut(
  rawdata[["Miles"]],
  breaks = seq(0, 360, by = 60),
  labels = c("<60", "60-120", "120-180", "180-240", "240-300", "300-360")
)
##
## Bar chart of the number of rows per miles band.
barplot(
  table(rawdata[["Mileslevel"]]),
  main = "MILES LEVEL",
  xlab = "Miles",
  ylab = "Population",
  col = "yellow"
)
##
##
## Assigning Age Group
## Bin Age at the break points 0, 15, 20, ..., 50, 100 and store the result
## as the factor column Agegroup.
rawdata[["Agegroup"]] <- cut(
  rawdata[["Age"]],
  breaks = c(0, seq(15, 50, by = 5), 100),
  labels = c(
    "<15", "16-20", "21-25", "26-30", "31-35",
    "36-40", "41-45", "46-50", ">50"
  )
)
##
## Bar chart of the number of rows per age group.
barplot(
  table(rawdata[["Agegroup"]]),
  main = "AGE GROUP",
  xlab = "Age Group",
  ylab = "Population",
  col = "blue"
)
##
##
## 4. ***MISSING VALUE IDENTIFICATION***
## The dataset doesn't have any missing values such as 'NA', '-', 'NIL',
## 'Blank Space', etc. If the dataset had missing values, we should delete or
## ignore the observations that are missing and build the predictive model on
## the remaining data.
##
##
## 5. ***BIVARIATE ANALYSIS***
##
## Two-way contingency tables between pairs of columns. Only the tables that
## are passed to prop.table() are printed; prop.table() without a margin
## shows each cell as a share of the grand total.
##
## Marital Status Vs Gender
MstatusVsGender <- table(rawdata[["MaritalStatus"]], rawdata[["Gender"]])
## Product Type Vs Income Level
ProductVsIncomelevel <- table(rawdata[["Product"]], rawdata[["Incomelevel"]])
prop.table(ProductVsIncomelevel)
## Marital Status Vs Usage
MstatusVsUsage <- table(rawdata[["MaritalStatus"]], rawdata[["Usage"]])
## Fitness Level Vs Age Group
FitnessVsAge <- table(rawdata[["Fitness"]], rawdata[["Agegroup"]])
## Age Group Vs Miles completed
AgegroupVsMiles <- table(rawdata[["Agegroup"]], rawdata[["Mileslevel"]])
## Product Type Vs Age Group (products in rows)
ProductVsAgegroup <- table(rawdata[["Product"]], rawdata[["Agegroup"]])
prop.table(ProductVsAgegroup)
## Age Group Vs Product Type (age groups in rows)
AgegroupVsProduct <- table(rawdata[["Agegroup"]], rawdata[["Product"]])
prop.table(AgegroupVsProduct)
##
## Box plot of Income against Age, coloured by Product, with outliers drawn
## in red.
## - The `+` must end the first line: a line that starts with `+` is parsed
##   as a separate statement, so the boxplot layer would never be added.
## - Columns are referenced by bare name inside aes() so that they are looked
##   up in `data`, instead of via rawdata$...
## - Functions are namespace-qualified so the call does not depend on
##   ggplot2 having been attached earlier in the session.
ggplot2::ggplot(data = rawdata, mapping = ggplot2::aes(x = Age, y = Income)) +
  ggplot2::geom_boxplot(ggplot2::aes(colour = Product), outlier.colour = "red")

## You might also like