1.1 Loading The Data: Survival by Sex

1.1 Loading the data

We load the data into a single data structure so that we can pre-process in one go.
# Read the training and test CSVs, keeping text columns as character vectors
# (TRUE/FALSE spelled out: T and F are ordinary, reassignable variables).
train_data <- read.csv("../input/train.csv", stringsAsFactors = FALSE)
test_data <- read.csv("../input/test.csv", stringsAsFactors = FALSE)
# Stack both sets so that feature engineering is applied once to every row.
# bind_rows() fills any column missing from one input with NA. It is
# namespace-qualified because no library(dplyr) call is visible in this script.
combined_data <- dplyr::bind_rows(train_data, test_data)
# Quick structural overview of the training set.
str(train_data)
1.2 Having a look around

Before we get too deep into the data, let’s have a look at what we’re dealing with.
# Show sex and survival as a bar plot
# The table has Survived in rows and Sex in columns, so each bar is one sex
# and the stacked segments are the survival outcomes.
barplot(table(train_data$Survived, train_data$Sex), sub = "Survival by Sex",
        ylab = "number of passengers", col = c("steelblue4", "steelblue2"))
# Legend colours must match the bar colours above, in the same order.
legend("topleft", legend = c("Died", "Survived"),
       fill = c("steelblue4", "steelblue2"), inset = .05)
Survival by Sex
So there is a clear trend: female passengers were much more likely to survive than male passengers.
It’s a well-known fact that women and children were favoured for the lifeboats, and our data certainly
supports the hypothesis that a female passenger is more likely to survive. However, I want to see what
information lurks behind the Age variable.
# Show child/adult status and survival as a proportional bar plot
# Only rows with a known Age can be classified, so drop the rest first.
temp_data <- train_data[!is.na(train_data$Age), ]
# Child is TRUE for passengers younger than 16.
temp_data$Child <- temp_data$Age < 16
# prop.table(..., 2) normalises each column, so every bar sums to 1.
barplot(prop.table(table(temp_data$Survived, temp_data$Child), 2),
        sub = "Survival by Children", ylab = "proportion",
        col = c("steelblue4", "steelblue2"))
legend("topleft", legend = c("Died", "Survived"),
       fill = c("steelblue4", "steelblue2"), inset = .05)
Survival by Children
Another factor that we would expect to have a big bearing on our predictions is the passenger class.
It’s been said that the lifeboats were favoured for the richer passengers and, as the film depicts, 3rd
class passengers were often placed at the very bottom of the ship. So let’s see if the data matches up
to the movies.
# Mosaic plot of survival against passenger class.
# The table's first dimension (Survived) runs along the x axis and the
# second (Pclass) along the y axis, matching the axis labels below.
mosaicplot(table(train_data$Survived, train_data$Pclass),
           main = "Passenger Survival by Class",
           ylab = "Passenger class", xlab = "Survived",
           col = hcl(c(240, 120, 80)))  # one hue per passenger class
2 Constructing new features

2.1 Do nobles survive?
As many kagglers have reported, creating the new feature ‘Title’ based on the name field yields
promising results.
# Extract the title from Name by deleting everything up to and including
# ", " and everything from the first "." onwards.
combined_data$Title <- gsub('.*, |\\..*', '', combined_data$Name)
common_title <- c('Miss', 'Mrs', 'Mr', 'Master')
noble_title <- c('Don', 'Dona', 'Sir', 'the Countess', 'Lady', 'Jonkheer')
`%!in%` <- function(x, y) !(x %in% y)  # element-wise "not in"
# Assign names to similar ones to avoid outliers
combined_data$Title[combined_data$Title == 'Mlle'] <- 'Miss'
combined_data$Title[combined_data$Title == 'Ms'] <- 'Miss'
combined_data$Title[combined_data$Title == 'Mme'] <- 'Mrs'

# Anything that is neither common nor noble becomes 'Rare Title'.
# The mask must use element-wise `&`: scalar `&&` only looks at the first
# element on older R and is an error for length > 1 inputs on R >= 4.3.
is_rare_title <- combined_data$Title %!in% common_title &
  combined_data$Title %!in% noble_title
combined_data$Title[is_rare_title] <- 'Rare Title'
# Collapse noble titles after the rare check, which still needs the raw values.
combined_data$Title[combined_data$Title %in% noble_title] <- 'Noble'
combined_data$Title <- as.factor(combined_data$Title)
2.2 Travelling alone?

Many kagglers have pointed out that the variables Parch and SibSp contain hidden information about
a passenger’s family. SibSp tells us how many siblings or spouses are travelling onboard and Parch
tells us how many parents or children are onboard. We can use this information to tell how many
people the passenger was travelling with, which might have an impact on whether they survived or
not.
# Family size: siblings/spouses plus parents/children, plus the passenger.
combined_data$Fsize <- with(combined_data, SibSp + Parch + 1)
2.3 Others
Another piece of information is hidden in the Cabin variable: the first character of the Cabin
value indicates the Deck. The data is largely incomplete, but it could prove to be useful for those
passengers that have the cabin data, so let’s add it.
# Deck is the first character of the Cabin string.
# substr() is vectorised, so it yields a plain character vector directly,
# whereas sapply() over strsplit() returns a vector named by the Cabin values.
deck <- substr(combined_data$Cabin, 1, 1)
# An empty or missing Cabin has no first character: label it 'Unknown'.
deck[is.na(deck) | deck == ""] <- 'Unknown'
combined_data$Deck <- as.factor(deck)
3 Fill Missing Data

We want to fill the missing data on the test and training sets independently, so first let’s split the
combined data back into its original sets. We also want to do some performance checking, so we need to
split off a small part of the training set for cross-validation.
train_ind <- nrow(train_data)  # find original train data length
# The first train_ind rows of combined_data are the training rows. A logical
# mask avoids the (a + 1):b pitfall of counting backwards when b < a + 1.
is_train_row <- seq_len(nrow(combined_data)) <= train_ind
train_data <- combined_data[is_train_row, ]  # replace original train data
test_data <- combined_data[!is_train_row, ]  # replace original test data
test_data$Survived <- NULL  # remove the Survived field from the test data set
# Now let's split the data one step further for our cross-validation set:
# PassengerId <= 570 stays in train_data, the rest becomes cross_validate_data.
temp_data <- split(train_data, train_data$PassengerId > 570)
train_data <- temp_data$`FALSE`
cross_validate_data <- temp_data$`TRUE`
3.1 Missing embarkment and fare

Some fields, such as Deck, are missing a great deal of data, so we can’t do much about that. But for
some we can make an intelligent guess at what the value should be.
Megan Risdal explains extremely well the rationale behind the following fills:
# Passengers 62 and 830 both embarked at Cherbourg according to Megan.
# After the PassengerId > 570 split, 62 is in train_data and 830 is in
# cross_validate_data.
train_data$Embarked[which(train_data$PassengerId == 62)] <- 'C'
cross_validate_data$Embarked[which(cross_validate_data$PassengerId == 830)] <- 'C'
# To guess the fare of passenger 1044 we use the median value of their class
# and embarkment location. Thanks again to Megan.
# which() drops any NA comparisons so they cannot select all-NA rows.
same_class_and_port <- which(train_data$Pclass == 3 & train_data$Embarked == 'S')
test_data$Fare[which(test_data$PassengerId == 1044)] <-
  median(train_data$Fare[same_class_and_port], na.rm = TRUE)
3.2 Predict missing ages

When I first started the script, I just wanted to get something working, so I went for the easy option of
just putting in the mean. However, after getting some improved results after adding the “Person” field, I
decided it would be wise to improve upon the “Age” field.
Let’s use the valid data in our train_data set to predict the missing values in all of the sets.
# Linear model for Age, fitted only on training rows where Age is known.
age_pred <- lm(Age ~ Pclass + Sex + Fare,
               data = train_data[!is.na(train_data$Age), ])

# Replace missing Age values in `df` with predictions from `model`.
# Known ages are left untouched; returns the updated data frame.
fill_missing_age <- function(df, model) {
  age_missing <- is.na(df$Age)
  # Predict for every row, then keep only the predictions for missing ages.
  df$Age[age_missing] <- predict(model, df)[age_missing]
  df
}

# The same training-set model fills all three sets.
train_data <- fill_missing_age(train_data, age_pred)
cross_validate_data <- fill_missing_age(cross_validate_data, age_pred)
test_data <- fill_missing_age(test_data, age_pred)
Now that we have a filled-in Age field, we can add the field Person, based on the sex and age patterns
we investigated earlier, to record whether a passenger is a man/woman/boy/girl.
# Classify passengers as boy/girl (Age < 18) or man/woman (otherwise).
# `age` is a numeric vector and `sex` a character vector of the same length.
# Levels are fixed explicitly so that all three sets share one level set even
# if a category is absent from one of them. The order matches what as.factor()
# produces when all four categories are present.
classify_person <- function(age, sex) {
  is_male <- sex == 'male'
  person <- ifelse(age < 18,
                   ifelse(is_male, 'boy', 'girl'),
                   ifelse(is_male, 'man', 'woman'))
  factor(person, levels = c('boy', 'girl', 'man', 'woman'))
}

train_data$Person <- classify_person(train_data$Age, train_data$Sex)
cross_validate_data$Person <- classify_person(cross_validate_data$Age,
                                              cross_validate_data$Sex)
test_data$Person <- classify_person(test_data$Age, test_data$Sex)
4 Prediction and Validation
4.1 Create the model
We are going to use a random Forest predictor which seems to be quite a popular choice for a
problem of this nature.
# Set a random seed so the forest is reproducible.
set.seed(754)
# Build the model (note: not all possible variables are used).
# Survived is read from the CSV as a number, so it is wrapped in factor() to
# request a classification forest. A regression forest cannot serve the later
# predict(..., type = "prob") call or the levels() relabelling of predictions.
# NOTE(review): Sex and Embarked are character columns here (the CSVs are read
# with stringsAsFactors = FALSE); confirm randomForest treats them as intended
# or convert them to factors upstream.
rf_model <- randomForest::randomForest(
  factor(Survived) ~ Pclass + Sex + Age + Fsize + Fare +
    Title + Person + Embarked,
  data = train_data
)
Using the variable importance function, I removed the unimportant variables, leaving us with only the 8
most important. You can see clearly that Title takes the top spot, so we’re glad we made the new
field.
4.2 Predict on cross-validation set

We can use the cross-validation set to do some initial predictions and use some performance measures
so that we can pick our best model.
# Predict class probabilities on the cross-validation set
prediction <- predict(rf_model, cross_validate_data, type = "prob")
# Build the ROCR prediction object that performance() needs: predicted scores
# paired with the true labels.
# NOTE(review): assumes column 2 of the probability matrix is the "survived"
# class; confirm against colnames(prediction).
pred <- ROCR::prediction(prediction[, 2], cross_validate_data$Survived)
# ROC curve: true positive rate against false positive rate
tpr <- ROCR::performance(pred, "tpr", "fpr")
plot(tpr)
The accuracy from the ROCR package gives us a good measure for our model.
## accuracy cutoff.254
## 0.8380062 0.6520000
4.3 Predict on test data set and submit

Finally when we have done some fine tuning using the cross-validation set to give us some guidance
we can make our final predictions on our test data.
# Predict the survival class for every passenger in the test set.
prediction <- predict(rf_model, test_data)

# Relabel the predicted classes as '0' / '1' for the submission file.
levels(prediction) <- c('0', '1')
# Save the solution to a dataframe with two columns: PassengerId and Survived
# (prediction).
# NOTE(review): the output column is spelled "PassengerID" while the source
# field is "PassengerId"; confirm which header the submission expects.
solution <- data.frame(PassengerID = test_data$PassengerId,
                       Survived = prediction)
# Write the solution to file, without the row-name column.
write.csv(solution, file = 'my_solutions.csv', row.names = FALSE)
5 Conclusion
At the time of writing I’m currently at 961/4401, which I don’t think is too bad for a first effort. I could
spend much more time improving the age prediction or fine-tuning the model for further refinement, but
it’s time to move on to more advanced competitions.

1.1 Loading The Data: Survival by Sex

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

1.1 Loading The Data: Survival by Sex

Uploaded by

Copyright:

Available Formats

1.

1 Loading the data

train_data <- read.csv("../input/train.csv", stringsAsFactors = F)

test_data <- read.csv("../input/test.csv", stringsAsFactors = F)

combined_data <- bind_rows(train_data, test_data)

1.2 Having a look around

# Show sex and survival as a bar plot

# Show sex and survival as a bar plot

temp_data <- train_data[! is.na(train_data$Age),]

barplot(prop.table(table(temp_data$Survived, temp_data$Child), 2), sub="Sur

2 Constructing new features

combined_data$Title <- gsub('.*, |\\..*', '', combined_data$Name)

common_title <- c('Miss', 'Mrs', 'Mr','Master')

noble_title <- c('Don', 'Dona','Sir','the Countess', 'Lady', 'Jonkheer')

'%!in%' <- function(x,y)!('%in%'(x,y)) #define not in function

#Assign names to similar ones to avoid outliers

combined_data$Title[combined_data$Title == 'Mlle'] <- 'Miss'

combined_data$Title[combined_data$Title == 'Ms'] <- 'Miss'

combined_data$Title[combined_data$Title == 'Mme'] <- 'Mrs'

combined_data$Title[combined_data$Title %in% noble_title] <- 'Noble'

combined_data$Title <- as.factor(combined_data$Title)

2.2 Travelling alone?

combined_data$Deck <- sapply(combined_data$Cabin, function(x) strsplit(x, N

combined_data$Deck[is.na(combined_data$Deck)] <- 'Unknown'

combined_data$Deck <- as.factor(combined_data$Deck)

3 Fill Missing Data

train_ind = nrow(train_data) # find original train data length

train_data <- combined_data[1:train_ind, ] # replace original train data

test_data <- combined_data[(train_ind+1):nrow(combined_data), ] # replace

train_data <- temp_data$`FALSE`

cross_validate_data <- temp_data$`TRUE`

3.1 Missing embarkment and fare

test_data$Fare[which(test_data$PassengerId == 1044)] <- median(train_data[t

3.2 Predict missing ages

age_pred <- lm(Age ~ Pclass + Sex + Fare, data = train_data[!is.na(train_da

train_data$Age[is.na(train_data$Age)] <- predict(age_pred, train_data)[is.n

cross_validate_data$Age[is.na(cross_validate_data$Age)] <- predict(age_pred

test_data$Age[is.na(test_data$Age)] <- predict(age_pred, test_data)[is.na(t

train_data$Person <- as.factor(ifelse(train_data$Age< 18, ifelse(train_data

cross_validate_data$Person <- as.factor(ifelse(cross_validate_data$Age< 18,

test_data$Person <- as.factor(ifelse(test_data$Age< 18, ifelse(test_data$Se

# Set a random seed

rf_model <- randomForest(Survived ~ Pclass + Sex + Age + Fsize + Fare + Tit

4.2 Predict on cross-validation set

Predict using the test set

prediction <- predict(rf_model, cross_validate_data, type = "prob")

tpr = performance(pred, "tpr", "fpr" )

4.3 Predict on test data set and submit

prediction <- predict(rf_model, test_data)

write.csv(solution, file = 'my_solutions.csv', row.names = F)

You might also like

combined_data$Title <- gsub('., |\\..', '', combined_data$Name)