Predicting Survival on the Titanic#

The RMS Titanic was billed as unsinkable and was the largest, most luxurious passenger ship of its time. Sadly, the British ocean liner sank on April 15, 1912, killing 1,502 of the 2,224 people on board.

By analysing the probability of survival based on a few attributes such as gender, age, and social status, we can make reasonably accurate predictions about which passengers would survive. Some groups of people were more likely to survive than others, such as women, children, and the upper class. The data therefore also tells us something about the priorities and privileges of society at the time.

For more on the Titanic, see https://www.encyclopedia-titanica.org/

We are using the original dataset: https://www.openml.org/search?type=data&status=active&id=40945

The data consists of demographic and travel information for 1,309 Titanic passengers. The goal is to build a machine learning pipeline that engineers the features in the data set and predicts who was more likely to survive the catastrophe.

import re

# Mathematical operations and data manipulation
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import StandardScaler

# to build the models
from sklearn.linear_model import LogisticRegression

# to evaluate the models
from sklearn.metrics import accuracy_score, roc_auc_score

# to persist the model and the scaler
import joblib

# to visualise all the columns in the dataframe
pd.set_option("display.max_columns", None)

Paths#

# Path to datasets directory
data_path = "./datasets"
# Path to assets directory (for saving results to)
assets_path = "./assets"

Prepare the data set#

# Load the data - it is available open source and online
data = pd.read_csv("https://www.openml.org/data/get_csv/16826755/phpMYEkMl")

data.head()
   pclass  survived  name                                             sex     age     sibsp  parch  ticket  fare      cabin    embarked  boat  body  home.dest
0  1       1         Allen, Miss. Elisabeth Walton                    female  29      0      0      24160   211.3375  B5       S         2     ?     St Louis, MO
1  1       1         Allison, Master. Hudson Trevor                   male    0.9167  1      2      113781  151.55    C22 C26  S         11    ?     Montreal, PQ / Chesterville, ON
2  1       0         Allison, Miss. Helen Loraine                     female  2       1      2      113781  151.55    C22 C26  S         ?     ?     Montreal, PQ / Chesterville, ON
3  1       0         Allison, Mr. Hudson Joshua Creighton             male    30      1      2      113781  151.55    C22 C26  S         ?     135   Montreal, PQ / Chesterville, ON
4  1       0         Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25      1      2      113781  151.55    C22 C26  S         ?     ?     Montreal, PQ / Chesterville, ON
# Replace interrogation marks by NaN values
data = data.replace("?", np.nan)
# retain only the first cabin if more than 1 are available per passenger
def get_first_cabin(row):
    try:
        return row.split()[0]
    except AttributeError:  # missing values are floats (NaN) and have no split()
        return np.nan


data["cabin"] = data["cabin"].apply(get_first_cabin)
# extracts the title (Mr, Mrs, etc.) from the name variable
def get_title(passenger):
    # check "Mrs" before "Mr": the pattern "Mr" also matches inside "Mrs"
    line = passenger
    if re.search("Mrs", line):
        return "Mrs"
    elif re.search("Mr", line):
        return "Mr"
    elif re.search("Miss", line):
        return "Miss"
    elif re.search("Master", line):
        return "Master"
    else:
        return "Other"


data["title"] = data["name"].apply(get_title)
# cast numerical variables as floats
data["fare"] = data["fare"].astype("float")
data["age"] = data["age"].astype("float")
# drop unnecessary variables
data.drop(
    labels=["name", "ticket", "boat", "body", "home.dest"],
    axis=1,
    inplace=True,
)

# display data
data.head()
   pclass  survived  sex     age      sibsp  parch  fare      cabin  embarked  title
0  1       1         female  29.0000  0      0      211.3375  B5     S         Miss
1  1       1         male    0.9167   1      2      151.5500  C22    S         Master
2  1       0         female  2.0000   1      2      151.5500  C22    S         Miss
3  1       0         male    30.0000  1      2      151.5500  C22    S         Mr
4  1       0         female  25.0000  1      2      151.5500  C22    S         Mrs
# save the data set
data.to_csv(f"{data_path}/titanic.csv", index=False)

Data Exploration#

Find numerical and categorical variables#

target = "survived"
vars_cat = [var for var in data.columns if data[var].dtype == "O"]

vars_num = [
    var for var in data.columns if var not in vars_cat and var != "survived"
]

print("Number of numerical variables: {}".format(len(vars_num)))
print("Number of categorical variables: {}".format(len(vars_cat)))
Number of numerical variables: 5
Number of categorical variables: 4

Find missing values in variables#

vars_with_na = [var for var in data.columns if data[var].isnull().sum() > 0]
# first in numerical variables
num_na = [var for var in vars_num if var in vars_with_na]
print("Number of numerical variables with na: ", len(num_na))
Number of numerical variables with na:  2
# now in categorical variables
cat_na = [var for var in vars_cat if var in vars_with_na]
print("Number of categorical variables with na: ", len(cat_na))
Number of categorical variables with na:  2

Determine cardinality of categorical variables#

data[vars_cat].nunique().sort_values(ascending=False).plot.bar(figsize=(12, 5))
[Bar plot: number of unique categories per categorical variable]

Determine the distribution of numerical variables#

data[vars_num].hist(bins=30, figsize=(15, 15))
plt.show()
[Histograms: distribution of each numerical variable]

Separate data into train and test#

Use the code below for reproducibility. Don’t change it.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("survived", axis=1),  # predictors
    data["survived"],  # target
    test_size=0.2,  # percentage of obs in test set
    random_state=0,  # seed to ensure reproducibility
)

X_train.shape, X_test.shape
((1047, 9), (262, 9))

Feature Engineering#

Extract only the letter (and drop the number) from the variable Cabin#

1X_train["cabin"] = X_train["cabin"].apply(
2    lambda s: s[0] if pd.notnull(s) else "M"
3)
4X_test["cabin"] = X_test["cabin"].apply(
5    lambda s: s[0] if pd.notnull(s) else "M"
6)
7print(X_train["cabin"].unique())
['M' 'E' 'F' 'A' 'C' 'D' 'B' 'T' 'G']

Fill in Missing data in numerical variables:#

  • Add a binary missing indicator

  • Fill NA in the original variable with the median

X_train[vars_num].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1047 entries, 1118 to 684
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  1047 non-null   int64  
 1   age     841 non-null    float64
 2   sibsp   1047 non-null   int64  
 3   parch   1047 non-null   int64  
 4   fare    1046 non-null   float64
dtypes: float64(2), int64(3)
memory usage: 49.1 KB
for var in vars_num:

    X_train[var + "_NA"] = np.where(X_train[var].isnull(), 1, 0)
    X_test[var + "_NA"] = np.where(X_test[var].isnull(), 1, 0)

    # note: in production you would reuse the train median on the test
    # set; here each split is filled with its own median
    med_val_train = X_train[var].median()
    med_val_test = X_test[var].median()

    X_train[var] = X_train[var].fillna(med_val_train)
    X_test[var] = X_test[var].fillna(med_val_test)

X_train[vars_num].isna().sum()
pclass    0
age       0
sibsp     0
parch     0
fare      0
dtype: int64
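As an aside, scikit-learn bundles this add-an-indicator-then-impute pattern into a single transformer. Below is a minimal sketch using SimpleImputer (an alternative, not what this notebook uses; assumes scikit-learn ≥ 0.21 for add_indicator). Because it is fitted on the train split only, the train medians are reused on the test split, avoiding the slight leakage of computing a separate test median:

from sklearn.impute import SimpleImputer

# median imputation plus binary missing indicators, learned from the
# train split only and applied unchanged to the test split
imputer = SimpleImputer(strategy="median", add_indicator=True)
X_train_num = imputer.fit_transform(X_train[vars_num])
X_test_num = imputer.transform(X_test[vars_num])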

Replace Missing data in categorical variables with the string Missing#

X_train[vars_cat] = X_train[vars_cat].fillna("Missing")
X_test[vars_cat] = X_test[vars_cat].fillna("Missing")
X_train.isna().mean()
pclass       0.0
sex          0.0
age          0.0
sibsp        0.0
parch        0.0
fare         0.0
cabin        0.0
embarked     0.0
title        0.0
pclass_NA    0.0
age_NA       0.0
sibsp_NA     0.0
parch_NA     0.0
fare_NA      0.0
dtype: float64
X_test.isna().mean()
pclass       0.0
sex          0.0
age          0.0
sibsp        0.0
parch        0.0
fare         0.0
cabin        0.0
embarked     0.0
title        0.0
pclass_NA    0.0
age_NA       0.0
sibsp_NA     0.0
parch_NA     0.0
fare_NA      0.0
dtype: float64

Remove rare labels in categorical variables#

  • remove labels present in less than 5 % of the passengers

X_train[vars_cat].nunique()
sex         2
cabin       9
embarked    4
title       5
dtype: int64
# returns the labels that appear in more than rare_percentage of the rows
def find_frequent_labels(df, var, rare_percentage):
    df = df.copy()
    tmp = df.groupby(var)[var].count() / len(df)
    return tmp[tmp > rare_percentage].index


# note: frequent labels are determined separately per split, so the two
# splits can end up with different categories (handled after encoding)
for var in vars_cat:
    freq_values_train = find_frequent_labels(X_train, var, 0.05)
    freq_values_test = find_frequent_labels(X_test, var, 0.05)
    X_train[var] = np.where(
        X_train[var].isin(freq_values_train), X_train[var], "Rare"
    )
    X_test[var] = np.where(
        X_test[var].isin(freq_values_test), X_test[var], "Rare"
    )
X_train[vars_cat].nunique()
sex         2
cabin       3
embarked    4
title       4
dtype: int64
X_test[vars_cat].nunique()
sex         2
cabin       3
embarked    3
title       5
dtype: int64

Perform one hot encoding of categorical variables into k-1 binary variables#

  • k-1 means that if the variable contains 9 different categories, we create 8 different binary variables

  • Remember to drop the original categorical variable (the one with the strings) after the encoding

for var in vars_cat:
    X_train = pd.concat(
        [X_train, pd.get_dummies(X_train[var], prefix=var, drop_first=True)],
        axis=1,
    )
    X_test = pd.concat(
        [X_test, pd.get_dummies(X_test[var], prefix=var, drop_first=True)],
        axis=1,
    )

X_train.drop(labels=vars_cat, axis=1, inplace=True)
X_test.drop(labels=vars_cat, axis=1, inplace=True)
# align the test set with the train set: the per-split encoding produced
# an extra title_Miss dummy in the test set and no embarked_Rare column
X_test = X_test.drop(["title_Miss"], axis=1)
X_test["embarked_Rare"] = 0
variables = list(X_train.columns)

variables
['pclass',
 'age',
 'sibsp',
 'parch',
 'fare',
 'pclass_NA',
 'age_NA',
 'sibsp_NA',
 'parch_NA',
 'fare_NA',
 'sex_male',
 'cabin_M',
 'cabin_Rare',
 'embarked_Q',
 'embarked_Rare',
 'embarked_S',
 'title_Mr',
 'title_Mrs',
 'title_Rare']

Scale the variables#

  • Use the StandardScaler from scikit-learn

scaler = StandardScaler()

scaler.fit(X_train[variables])

X_train = scaler.transform(X_train[variables])
X_test = scaler.transform(X_test[variables])

Train the Logistic Regression model#

  • Set the regularization parameter C to 0.0005 (in scikit-learn, C is the inverse of the regularization strength, so this is strong regularization)

  • Set the seed to 0

model = LogisticRegression(C=0.0005, random_state=0)

model.fit(X_train, y_train)
LogisticRegression(C=0.0005, random_state=0)

Make predictions and evaluate model performance#

Determine:

  • roc-auc - requires the probability of survival.

  • accuracy - requires the predicted class, 1 or 0, referring to survived or not

class_ = model.predict(X_train)
pred = model.predict_proba(X_train)[:, 1]

print("train roc-auc: {}".format(roc_auc_score(y_train, pred)))
print("train accuracy: {}".format(accuracy_score(y_train, class_)))
print()

class_ = model.predict(X_test)
pred = model.predict_proba(X_test)[:, 1]

print("test roc-auc: {}".format(roc_auc_score(y_test, pred)))
print("test accuracy: {}".format(accuracy_score(y_test, class_)))
train roc-auc: 0.8431723338485316
train accuracy: 0.7125119388729704

test roc-auc: 0.8355864197530863
test accuracy: 0.7022900763358778
joblib.dump(model, f"{assets_path}/logistic_regression.joblib")
['./assets/logistic_regression.joblib']
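The imports at the top mention persisting the scaler as well, but only the model has been saved. Below is a minimal sketch of saving the fitted scaler too and reloading both for inference; the file name scaler.joblib and the new_data frame are illustrative placeholders:

# persist the fitted scaler next to the model
joblib.dump(scaler, f"{assets_path}/scaler.joblib")

# at inference time, reload both and apply them in the same order
scaler = joblib.load(f"{assets_path}/scaler.joblib")
model = joblib.load(f"{assets_path}/logistic_regression.joblib")

# new_data is a placeholder for incoming data engineered into the same
# columns as `variables`
# predictions = model.predict(scaler.transform(new_data[variables]))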