You are on page 1 of 5

# Preprocessing

# Import the libraries


import pandas as pd
import numpy as np

# Load the dataset from "train.csv" (path relative to the working directory)
# into a pandas DataFrame


df = pd.read_csv("train.csv")

# Display the first five rows of the DataFrame


df.head()

PassengerId Survived Pclass \


0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3

Name Sex Age


SibSp \
0 Braund, Mr. Owen Harris male 22.0
1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0
1
2 Heikkinen, Miss. Laina female 26.0
0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0
1
4 Allen, Mr. William Henry male 35.0
0

Parch Ticket Fare Cabin Embarked


0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S

# Summarise the dataset: per-column non-null counts and dtypes, plus memory usage


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

# Drop columns that are unlikely to contribute much to our machine
# learning model
cols = ['Name', 'Ticket', 'Cabin']
df = df.drop(cols, axis=1)

# We dropped 3 columns (Name, Ticket, Cabin)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB

# Check how many rows would remain if every row with a missing value were
# dropped. The result is only inspected, not assigned back to df: the later
# steps still operate on all 891 rows and fill the missing ages instead.


df.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 712 non-null int64
Survived 712 non-null int64
Pclass 712 non-null int64
Sex 712 non-null object
Age 712 non-null float64
SibSp 712 non-null int64
Parch 712 non-null int64
Fare 712 non-null float64
Embarked 712 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 55.6+ KB
# Dropping incomplete rows shrinks the dataset from 891 to 712 rows, which
# wastes data.
# So we keep all the rows and make use of as much of the data as we can.
# Convert Pclass, Sex and Embarked into indicator (dummy) columns;
# the original columns are dropped after the conversion.
dummies = []
cols = ['Pclass', 'Sex', 'Embarked']
for col in cols:
    # One indicator column per distinct value of the source column
    dummies.append(pd.get_dummies(df[col]))

# Place the indicator frames side by side, aligned on the row index
titanic_dummies = pd.concat(dummies, axis=1)


titanic_dummies.head()

1 2 3 female male C Q S
0 0 0 1 0 1 0 0 1
1 1 0 0 1 0 1 0 0
2 0 0 1 1 0 0 0 1
3 1 0 0 1 0 0 0 1
4 0 0 1 0 1 0 0 1

# Append the indicator columns to the right of the existing columns
df = pd.concat([df, titanic_dummies], axis=1)
df.head()

PassengerId Survived Pclass Sex Age SibSp Parch Fare


\
0 1 0 3 male 22.0 1 0 7.2500

1 2 1 1 female 38.0 1 0 71.2833

2 3 1 3 female 26.0 0 0 7.9250

3 4 1 1 female 35.0 1 0 53.1000

4 5 0 3 male 35.0 0 0 8.0500

Embarked 1 2 3 female male C Q S


0 S 0 0 1 0 1 0 0 1
1 C 1 0 0 1 0 1 0 0
2 S 0 0 1 1 0 0 0 1
3 S 1 0 0 1 0 0 0 1
4 S 0 0 1 0 1 0 0 1

# Pclass, Sex and Embarked are now represented by indicator columns,
# so the original columns are redundant and are removed from the dataframe.
df = df.drop(labels=['Pclass', 'Sex', 'Embarked'], axis=1)
df.head()

PassengerId Survived Age SibSp Parch Fare 1 2 3 female


male \
0 1 0 22.0 1 0 7.2500 0 0 1 0
1
1 2 1 38.0 1 0 71.2833 1 0 0 1
0
2 3 1 26.0 0 0 7.9250 0 0 1 1
0
3 4 1 35.0 1 0 53.1000 1 0 0 1
0
4 5 0 35.0 0 0 8.0500 0 0 1 0
1

C Q S
0 0 0 1
1 1 0 0
2 0 0 1
3 0 0 1
4 0 0 1

df.info()
# Here we find that Age has many missing values (only 714 of 891 are non-null)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
1 891 non-null uint8
2 891 non-null uint8
3 891 non-null uint8
female 891 non-null uint8
male 891 non-null uint8
C 891 non-null uint8
Q 891 non-null uint8
S 891 non-null uint8
dtypes: float64(2), int64(4), uint8(8)
memory usage: 48.8 KB

# interpolate() fills each missing Age (NaN) by linear interpolation
# between the neighbouring known ages in row order (the pandas default
# method). Note that this is NOT the median of all ages.
df['Age'] = df['Age'].interpolate()

# Age now has no missing values left.


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
1 891 non-null uint8
2 891 non-null uint8
3 891 non-null uint8
female 891 non-null uint8
male 891 non-null uint8
C 891 non-null uint8
Q 891 non-null uint8
S 891 non-null uint8
dtypes: float64(2), int64(4), uint8(8)
memory usage: 48.8 KB

# Converting our dataframe from pandas to numpy
# and assigning input (X) and output (y).
# Survived is the target, so it must not appear among the inputs; it is
# removed by name rather than by column position, so the result does not
# depend on where the column happens to sit in the dataframe.
X = df.drop('Survived', axis=1).values
y = df['Survived'].values

# Now that we are ready with X and y, split the dataset into a 70%
# training set and a 30% test set using scikit-learn's model_selection.
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

You might also like