You are on page 1of 13

11/21/21, 10:23 PM titanic

Who survived the titanic?


Chris Flores
Amber Beebe
Sean Wilson
Agustin Garcia
In [1]: # using the Kaggle Titanic data set found here

# https://www.kaggle.com/c/titanic/data

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import zscore

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.preprocessing import LabelEncoder


from sklearn.neighbors import KNeighborsRegressor

from sklearn.tree import DecisionTreeClassifier, export_graphviz ,plot_tree

# no need to split data since kaggle gave us split

test = pd.read_csv("https://raw.githubusercontent.com/flores58c/CST_383_Titanic_
train = pd.read_csv("https://raw.githubusercontent.com/flores58c/CST_383_Titanic

In [2]: train.info()

train.describe

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 891 entries, 0 to 890

Data columns (total 12 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 PassengerId 891 non-null int64

1 Survived 891 non-null int64

2 Pclass 891 non-null int64

3 Name 891 non-null object

4 Sex 891 non-null object

5 Age 714 non-null float64

6 SibSp 891 non-null int64

7 Parch 891 non-null int64

8 Ticket 891 non-null object

9 Fare 891 non-null float64

10 Cabin 204 non-null object

11 Embarked 889 non-null object

dtypes: float64(2), int64(5), object(5)

memory usage: 83.7+ KB

Out[2]: <bound method NDFrame.describe of PassengerId Survived Pclass \

0 1 0 3

1 2 1 1

2 3 1 3

3 4 1 1

4 5 0 3

.. ... ... ...

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 1/13
11/21/21, 10:23 PM titanic
886 887 0 2

887 888 1 1

888 889 0 3

889 890 1 1

890 891 0 3

Name Sex Age SibSp \

0 Braund, Mr. Owen Harris male 22.0 1

1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1

2 Heikkinen, Miss. Laina female 26.0 0

3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1

4 Allen, Mr. William Henry male 35.0 0

.. ... ... ... ...

886 Montvila, Rev. Juozas male 27.0 0

887 Graham, Miss. Margaret Edith female 19.0 0

888 Johnston, Miss. Catherine Helen "Carrie" female NaN 1

889 Behr, Mr. Karl Howell male 26.0 0

890 Dooley, Mr. Patrick male 32.0 0

Parch Ticket Fare Cabin Embarked

0 0 A/5 21171 7.2500 NaN S

1 0 PC 17599 71.2833 C85 C

2 0 STON/O2. 3101282 7.9250 NaN S

3 0 113803 53.1000 C123 S

4 0 373450 8.0500 NaN S

.. ... ... ... ... ...

886 0 211536 13.0000 NaN S

887 0 112053 30.0000 B42 S

888 2 W./C. 6607 23.4500 NaN S

889 0 111369 30.0000 C148 C

890 0 370376 7.7500 NaN Q

[891 rows x 12 columns]>

Predictions
We will be predicting survivability based on categorical data of Pclass, Sex, Age. We will use
machine learning methods of knn and or linear regression.
Data Wrangling
In [3]: #find null/na values

train.isna().sum()

Out[3]: PassengerId 0

Survived 0

Pclass 0

Name 0

Sex 0

Age 177

SibSp 0

Parch 0

Ticket 0

Fare 0

Cabin 687

Embarked 2

dtype: int64

In [4]: #Initial preprocessing and cleaning

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 2/13
11/21/21, 10:23 PM titanic
# we can remove cabin column too much NaN values

train.drop("Cabin",axis=1)

#not sure if we should replace some nan columns in age with mean values

Out[4]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare E
Braund,
0 1 0 3 Mr. Owen male 22.0 1 0 A/5 21171 7.2500
Harris
Cumings,
Mrs. John
1 2 1 1 Bradley female 38.0 1 0 PC 17599 71.2833
(Florence
Briggs
Th...
Heikkinen,
2 3 1 3 Miss. female 26.0 0 0 STON/O2.
3101282 7.9250
Laina
Futrelle,
Mrs.
3 4 1 1 Jacques female 35.0 1 0 113803 53.1000
Heath
(Lily May
Peel)
Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.0500
Henry
... ... ... ... ... ... ... ... ... ... ...
Montvila,
886 887 0 2 Rev. male 27.0 0 0 211536 13.0000
Juozas
Graham,
887 888 1 1 Miss. female 19.0 0 0 112053 30.0000
Margaret
Edith
Johnston,
Miss. W./C. 23.4500
888 889 0 3 Catherine female NaN 1 2 6607
Helen
"Carrie"
Behr, Mr.
889 890 1 1 Karl male 26.0 0 0 111369 30.0000
Howell
Dooley,
890 891 0 3 Mr. male 32.0 0 0 370376 7.7500
Patrick
891 rows × 11 columns
In [5]: #changing Sex column to binary 1=male 0=female

le = LabelEncoder()

train["Sex"]=le.fit_transform(train["Sex"])

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 3/13
11/21/21, 10:23 PM titanic
In [6]: #Taking average, rounded age of only males in data. Will possibly import to N/A
male_age = train['Sex'] == 1

mean_for_male_age = np.round(train.loc[male_age, 'Age'].mean())

mean_for_male_age

Out[6]: 31.0

In [7]: #Taking average, rounded age of only females in data. Will possibly import to N/
female_age = train['Sex'] == 0

mean_for_female_age = np.round(train.loc[female_age, 'Age'].mean())

mean_for_female_age

Out[7]: 28.0

In [8]: train.loc[(train.Age.isnull()) & (train.Sex==0),'Age']= mean_for_female_age

train.loc[(train.Age.isnull()) & (train.Sex==1),'Age']= mean_for_male_age

#check if columns filled

train["Age"].isnull().sum()

Out[8]: 0

Data Exploration
In [9]: # data discovery
# Survived: 0 = No, 1 = Yes

test.size == train.size

train.columns

Out[9]: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',

'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],

dtype='object')

In [10]: #boxplot of men and women who survived

sns.boxplot('Sex', 'Survived', data=train, color="red")

/Users/abeebe/opt/anaconda3/lib/python3.8/site-packages/seaborn/_decorators.py:3
6: FutureWarning: Pass the following variables as keyword args: x, y. From versi
on 0.12, the only valid positional argument will be `data`, and passing other ar
guments without an explicit keyword will result in an error or misinterpretatio
n.

warnings.warn(

Out[10]: <AxesSubplot:xlabel='Sex', ylabel='Survived'>

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 4/13
11/21/21, 10:23 PM titanic

In [11]: # The dimensions we will be using are Survived, Sex, Pclass, & Age

# the largest number of passengers were in third class

train.groupby('Pclass').size().plot.bar()

Out[11]: <AxesSubplot:xlabel='Pclass'>

In [12]: # More people died than survived

# what was the percentage of people who died vs. survived

# train[(train.Survived == 0)]

# train['Survived'].size OR len(train)

perc_died = round(train[(train.Survived == 0)]['Survived'].size / len(train), 2)


perc_survived = round(train[(train.Survived == 1)]['Survived'].size / len(train)
print("Percentage Survived: " + str(perc_survived) + "%\nPercentage Died: " + st
# train.groupby('Survived').size().plot.bar()

Percentage Survived: 0.38%

Percentage Died: 0.62%

In [13]: pd.crosstab(train.Survived, train.Sex)

Out[13]: Sex 0 1
Survived
0 81 468
1 233 109
localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 5/13
11/21/21, 10:23 PM titanic

In [14]: pd.crosstab(train.Survived, train.Pclass)

Out[14]: Pclass 1 2 3
Survived
0 80 97 372
1 136 87 119

In [15]: # What is the distribution of ages

import seaborn as sns

sns.distplot(train.Age)

/Users/abeebe/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.p
y:2551: FutureWarning: `distplot` is a deprecated function and will be removed i
n a future version. Please adapt your code to use either `displot` (a figure-lev
el function with similar flexibility) or `histplot` (an axes-level function for
histograms).

warnings.warn(msg, FutureWarning)

Out[15]: <AxesSubplot:xlabel='Age', ylabel='Density'>

In [16]: # Find conditional probability of people under and over 30 who survived

# Under 30: Yes OR No


# train["over_30"] = None

# del train["over_30"]

# train.head

train["Over30"] = None

train.loc[(train.Age <= 30), 'over_30'] = 'yes'

train.loc[(train.Age > 30), 'over_30'] = 'no'

pd.crosstab(train.Survived, train.over_30)

#maybe change yes/no to 1/0

Out[16]: over_30 no yes


Survived
0 289 260
1 140 202

In [17]: # Did men pay more for their fare than women, did older passengers pay less than
localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 6/13
11/21/21, 10:23 PM titanic
#sns.scatterplot(data = train, x = "Age", y = "Fare", hue = "Pclass")

sns.scatterplot(data = train, x = "Age", y = "Fare", hue = "Sex")

Out[17]: <AxesSubplot:xlabel='Age', ylabel='Fare'>

In [18]: corM = train.corr()

sns.heatmap(corM, annot = True)

plt.show()

In [19]: sns.regplot(x="Fare",y="Survived",data=train,logistic=True)

Out[19]: <AxesSubplot:xlabel='Fare', ylabel='Survived'>

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 7/13
11/21/21, 10:23 PM titanic

Paying more had a somewhat positive correlation. Not sure if that's the only reason.
In [20]: #grid of scatterplots
sns.pairplot(train)

Out[20]: <seaborn.axisgrid.PairGrid at 0x7faa75f182e0>

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 8/13
11/21/21, 10:23 PM titanic

Machine Learning
In [21]: predictors = ['Age']

target = 'Survived'

X = train[predictors].values

y = train[target].values

#test.csv unsusable and has no Survived data. Needed to split train.csv

X_train ,X_test, y_train,y_test = train_test_split(X,y,test_size =.8,random_stat


reg = LinearRegression()

reg.fit(X_train,y_train)

Out[21]: LinearRegression()

In [22]: plt.scatter(X_train,y_train)

plt.plot(X_test,reg.predict(X_test),linestyle='dashed',color = 'black')

Out[22]: [<matplotlib.lines.Line2D at 0x7faa59fd3130>]

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 9/13
11/21/21, 10:23 PM titanic

Answering the question which age range is most likely to survive. The answer is unclear and
can't be explicitly solved in this plot.
In [23]: def rmse(predicted, actual):

return np.sqrt(((predicted - actual)**2).mean())

rmse(reg.predict(X_test),y_test)

Out[23]: 0.48659958319426705

Looking at our linear model the error is about half for fitting the data. So this would not be a
good representation for our prediction.
In [24]: # Predicting odds of survival based on age, class, and sex, with KNN Regression.

predictors = ['Age','Sex','Pclass']

target = 'Survived'

X = train[predictors].values

y = train[target].values

from sklearn.neighbors import KNeighborsRegressor

reg = KNeighborsRegressor(n_neighbors=20)

reg.fit(X,y)

i = np.linspace(start=.5, stop = 80, num=3000)

plot_f_3 = pd.DataFrame(i, columns=['Age'])

plot_f_3['Sex'] = 0

plot_f_3['Pclass'] = 3

plot_f_2 = pd.DataFrame(i, columns=['Age'])

plot_f_2['Sex'] = 0

plot_f_2['Pclass'] = 2

plot_f_1 = pd.DataFrame(i, columns=['Age'])

plot_f_1['Sex'] = 0

plot_f_1['Pclass'] = 1

plot_m_3 = pd.DataFrame(i, columns=['Age'])

plot_m_3['Sex'] = 1

plot_m_3['Pclass'] = 3

plot_m_2 = pd.DataFrame(i, columns=['Age'])

plot_m_2['Sex'] = 1

plot_m_2['Pclass'] = 2

plot_m_1 = pd.DataFrame(i, columns=['Age'])

plot_m_1['Sex'] = 1

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 10/13
11/21/21, 10:23 PM titanic
plot_m_1['Pclass'] = 1

fig = plt.figure(figsize=(20, 5))

plt1= fig.add_subplot(121)

plt2= fig.add_subplot(122)

fig.suptitle('Odds of Survival Based on Age, Sex, and Class')

plt1.plot(i, reg.predict(plot_f_3), label='Third class female')

plt1.plot(i, reg.predict(plot_f_2) , color='orange', label='Second class female'


plt1.plot(i, reg.predict(plot_f_1) , color='red', label='First class female')

plt2.plot(i, reg.predict(plot_m_3), label='Third class male', color='black')

plt2.plot(i, reg.predict(plot_m_2) , color='purple', label='Second class male')

plt2.plot(i, reg.predict(plot_m_1) , color='green', label='First class male')

plt1.legend()

plt2.legend()

plt1.set_xlabel('Age')

plt1.set_ylabel('Odds of survival')

plt2.set_xlabel('Age')

plt2.set_ylabel('Odds of survival')

plt1.set_title('Females')

plt2.set_title('Males')

Out[24]: Text(0.5, 1.0, 'Males')

In [25]: #using amount paid to predict Age

predictors = ['Fare']

target = 'Age'

X = train[predictors].values

y = train[target].values

reg2 = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=


reg2.fit(X,y)

plt.scatter(X,y)

plt.plot(X,reg2.predict(X),linestyle='dashed',color = 'black')

Out[25]: [<matplotlib.lines.Line2D at 0x7faa5a9fb760>]

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 11/13
11/21/21, 10:23 PM titanic

The older passengers were more frugal whereas passengers who were in the middle age ranges
paid a bit more.
In [26]: #using amount paid to predict Survivability

predictors = ['Fare']

target = 'Survived'

X = train[predictors].values

y = train[target].values

reg3 = LinearRegression()

reg3.fit(X,y)

plt.scatter(X,y)

plt.plot(X,reg3.predict(X),linestyle='dashed',color = 'black')

Out[26]: [<matplotlib.lines.Line2D at 0x7faa5aacd880>]

This regression plot shows there is a positive correlation with paying more and surviving. This is
because first class had a closer access to the life boats on the titanic.
In [27]: predictors = ['Age']

target = 'Survived'

X = train[predictors].values

y = train[target].values

X_train ,X_test, y_train,y_test = train_test_split(X,y,test_size =.8,random_stat


reg = DecisionTreeClassifier(max_depth=4)

reg.fit(X_train, y_train)

plot_tree(reg)

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 12/13
11/21/21, 10:23 PM titanic

Out[27]: [Text(111.60000000000001, 195.696, 'X[0] <= 1.415\ngini = 0.464\nsamples = 178\n


value = [113, 65]'),

Text(74.4, 152.208, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'),

Text(148.8, 152.208, 'X[0] <= 30.5\ngini = 0.46\nsamples = 176\nvalue = [113, 6


3]'),
Text(74.4, 108.72, 'X[0] <= 4.5\ngini = 0.485\nsamples = 87\nvalue = [51, 3
6]'),
Text(37.2, 65.232, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'),

Text(111.60000000000001, 65.232, 'X[0] <= 13.5\ngini = 0.49\nsamples = 84\nvalu


e = [48, 36]'),

Text(74.4, 21.744, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'),

Text(148.8, 21.744, 'gini = 0.483\nsamples = 81\nvalue = [48, 33]'),

Text(223.20000000000002, 108.72, 'X[0] <= 31.5\ngini = 0.423\nsamples = 89\nval


ue = [62, 27]'),

Text(186.0, 65.232, 'gini = 0.147\nsamples = 25\nvalue = [23, 2]'),

Text(260.40000000000003, 65.232, 'X[0] <= 60.5\ngini = 0.476\nsamples = 64\nval


ue = [39, 25]'),

Text(223.20000000000002, 21.744, 'gini = 0.484\nsamples = 61\nvalue = [36, 2


5]'),
Text(297.6, 21.744, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]')]

In [ ]:

localhost:8888/nbconvert/html/CST_383_Titanic_Project/titanic.ipynb?download=false 13/13

You might also like