Churn#


Importing libraries and packages#

# System
import os

# Suppress TensorFlow C++ logging
# (must be set before TensorFlow/Keras is imported to take full effect)
# 0 = all messages are logged (default behavior)
# 1 = INFO messages are not printed
# 2 = INFO and WARNING messages are not printed
# 3 = INFO, WARNING, and ERROR messages are not printed
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Mathematical operations and data manipulation
import pandas as pd

# Modelling
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

Set paths#

# Path to datasets directory
data_path = "./datasets"
# Path to assets directory (for saving results to)
assets_path = "./assets"

Loading dataset#

churn_data = pd.read_csv(
    f"{data_path}/Churn_Modelling.csv", index_col="RowNumber"
)
# Print first 5 rows
churn_data.head()
CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
RowNumber
1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1
2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0
3 15619304 Onio 502 France Female 42 8 159660.80 3 1 0 113931.57 1
4 15701354 Boni 699 France Female 39 1 0.00 2 0 0 93826.63 0
5 15737888 Mitchell 850 Spain Female 43 2 125510.82 1 1 1 79084.10 0

Data wrangling#

# Print information about the dataframe including the index
# dtype and columns, non-null values and memory usage
churn_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB
# Summarize the central tendency, dispersion and shape of the
# dataset's distribution, excluding NaN values. For numeric columns:
# count = number of non-NA/null observations.
# mean = mean of the values.
# std = standard deviation of the observations.
# min = minimum of the values in the object.
# 25%, 50% and 75% = lower, median and upper percentiles
#                    (the 50th percentile is the same as the median).
# max = maximum of the values in the object.

churn_data.describe()
CustomerId CreditScore Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
count 1.000000e+04 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.00000 10000.000000 10000.000000 10000.000000
mean 1.569094e+07 650.528800 38.921800 5.012800 76485.889288 1.530200 0.70550 0.515100 100090.239881 0.203700
std 7.193619e+04 96.653299 10.487806 2.892174 62397.405202 0.581654 0.45584 0.499797 57510.492818 0.402769
min 1.556570e+07 350.000000 18.000000 0.000000 0.000000 1.000000 0.00000 0.000000 11.580000 0.000000
25% 1.562853e+07 584.000000 32.000000 3.000000 0.000000 1.000000 0.00000 0.000000 51002.110000 0.000000
50% 1.569074e+07 652.000000 37.000000 5.000000 97198.540000 1.000000 1.00000 1.000000 100193.915000 0.000000
75% 1.575323e+07 718.000000 44.000000 7.000000 127644.240000 2.000000 1.00000 1.000000 149388.247500 0.000000
max 1.581569e+07 850.000000 92.000000 10.000000 250898.090000 4.000000 1.00000 1.000000 199992.480000 1.000000
# Drop columns that are not useful for predicting churn
churn_data.drop(["CustomerId", "Surname"], axis=1, inplace=True)
# Two columns are left which contain text data: Geography and Gender.
# Many machine learning algorithms cannot operate on text data directly;
# they require all input and output variables to be numeric.
# This means that categorical data must be converted to numbers:
# 1. Assign each unique category value an integer, called
#    Ordinal Encoding (or Label Encoding or Integer Encoding).
# 2. For categorical variables where no ordinal relationship exists,
#    such as Geography, use One-Hot Encoding.
# https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html

Geography_dummies = pd.get_dummies(
    prefix="Geo", data=churn_data, columns=["Geography"]
)
Geography_dummies.head()
CreditScore Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited Geo_France Geo_Germany Geo_Spain
RowNumber
1 619 Female 42 2 0.00 1 1 1 101348.88 1 1 0 0
2 608 Female 41 1 83807.86 1 0 1 112542.58 0 0 0 1
3 502 Female 42 8 159660.80 3 1 0 113931.57 1 1 0 0
4 699 Female 39 1 0.00 2 0 0 93826.63 0 1 0 0
5 850 Female 43 2 125510.82 1 1 1 79084.10 0 0 0 1
# For the binary Gender column, ordinal encoding can be used
Gender_dummies = Geography_dummies.replace(
    to_replace={"Gender": {"Female": 1, "Male": 0}}
)
Gender_dummies.head()
CreditScore Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited Geo_France Geo_Germany Geo_Spain
RowNumber
1 619 1 42 2 0.00 1 1 1 101348.88 1 1 0 0
2 608 1 41 1 83807.86 1 0 1 112542.58 0 0 0 1
3 502 1 42 8 159660.80 3 1 0 113931.57 1 1 0 0
4 699 1 39 1 0.00 2 0 0 93826.63 0 1 0 0
5 850 1 43 2 125510.82 1 1 1 79084.10 0 0 0 1
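The same encodings can also be produced with scikit-learn's transformers, which is convenient when the preprocessing has to be reapplied to new data later. This is a minimal sketch of that alternative, not the approach used above; it assumes the raw churn_data columns and that scikit-learn is available.

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# One-hot encode Geography (no ordinal relationship between countries)
geo_encoder = OneHotEncoder()
geo_encoded = geo_encoder.fit_transform(churn_data[["Geography"]]).toarray()

# Ordinal-encode the binary Gender column. Note: categories are assigned
# in alphabetical order (Female -> 0, Male -> 1), the reverse of the
# mapping used above.
gender_encoder = OrdinalEncoder()
gender_encoded = gender_encoder.fit_transform(churn_data[["Gender"]])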
# Put the results in a new churn_data_encoded dataframe, and check the results
churn_data_encoded = Gender_dummies
churn_data_encoded
CreditScore Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited Geo_France Geo_Germany Geo_Spain
RowNumber
1 619 1 42 2 0.00 1 1 1 101348.88 1 1 0 0
2 608 1 41 1 83807.86 1 0 1 112542.58 0 0 0 1
3 502 1 42 8 159660.80 3 1 0 113931.57 1 1 0 0
4 699 1 39 1 0.00 2 0 0 93826.63 0 1 0 0
5 850 1 43 2 125510.82 1 1 1 79084.10 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
9996 771 0 39 5 0.00 2 1 0 96270.64 0 1 0 0
9997 516 0 35 10 57369.61 1 1 1 101699.77 0 1 0 0
9998 709 1 36 7 0.00 1 0 1 42085.58 1 1 0 0
9999 772 0 42 3 75075.31 2 1 0 92888.52 1 0 1 0
10000 792 1 28 4 130142.79 1 1 0 38190.78 0 1 0 0

10000 rows × 13 columns

# Show the counts of observations in each categorical bin
# (column Exited) using bars.
# https://seaborn.pydata.org/generated/seaborn.countplot.html
sns.countplot(y=churn_data_encoded.Exited, data=churn_data_encoded)
plt.xlabel("Count of each Target class")
plt.ylabel("Target classes")
plt.show()
[Figure: bar plot of the counts of each target class (Exited)]
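The bar plot points to a class imbalance: only about 20% of customers exited, consistent with the mean of Exited in the describe() output above. A quick numeric check of the class proportions with pandas:

# Fraction of customers in each target class
churn_data_encoded["Exited"].value_counts(normalize=True)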
# Show distributions of the dataset
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.hist.html
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.title.html
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.show.html
churn_data_encoded.hist(figsize=(15, 12), bins=15)
plt.title("Features Distribution")
plt.show()
[Figure: histograms of the feature distributions]
# Plot rectangular data as a color-encoded matrix.
# https://seaborn.pydata.org/generated/seaborn.heatmap.html#seaborn.heatmap
# https://matplotlib.org/stable/api/figure_api.html
plt.figure(figsize=(15, 15))
p = sns.heatmap(churn_data_encoded.corr(), annot=True, cmap="RdYlGn", center=0)
[Figure: correlation heatmap of the encoded features]
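To read the heatmap more directly with respect to churn, the correlation of each feature with the target can be listed explicitly. A minimal sketch using pandas:

# Correlation of each feature with the target, strongest first
churn_data_encoded.corr()["Exited"].drop("Exited").sort_values(ascending=False)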

Training of the network#

# Exited is the target class
X = churn_data_encoded.drop(["Exited"], axis=1)
y = churn_data_encoded.Exited
# Check
X.head(10)
CreditScore Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Geo_France Geo_Germany Geo_Spain
RowNumber
1 619 1 42 2 0.00 1 1 1 101348.88 1 0 0
2 608 1 41 1 83807.86 1 0 1 112542.58 0 0 1
3 502 1 42 8 159660.80 3 1 0 113931.57 1 0 0
4 699 1 39 1 0.00 2 0 0 93826.63 1 0 0
5 850 1 43 2 125510.82 1 1 1 79084.10 0 0 1
6 645 0 44 8 113755.78 2 1 0 149756.71 0 0 1
7 822 0 50 7 0.00 2 1 1 10062.80 1 0 0
8 376 1 29 4 115046.74 4 1 0 119346.88 0 1 0
9 501 0 44 4 142051.07 2 0 1 74940.50 1 0 0
10 684 0 27 2 134603.88 1 1 1 71725.73 1 0 0
# Split dataset into the Training set and Test set
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

# The dataset contains attributes with a mixture of scales for
# various quantities.
# Many machine learning methods expect, or are more effective if,
# the data attributes have the same scale.
# https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
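As a sanity check, after standardization each training feature should have a mean close to 0 and a standard deviation close to 1. A minimal sketch, assuming X_train is the NumPy array returned by the scaler:

import numpy as np

# Per-feature mean and standard deviation after standardization
print(np.round(X_train.mean(axis=0), 3))
print(np.round(X_train.std(axis=0), 3))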
# The Sequential model is used to initialise the ANN and the Dense layer to build the layers.
# https://keras.io/guides/sequential_model/
# https://keras.io/api/layers/core_layers/dense/
# Create a Sequential model with 3 layers incrementally
# https://keras.io/guides/

classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(
    Dense(
        units=6, kernel_initializer="uniform", activation="relu", input_dim=12
    )
)
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer="uniform", activation="relu"))
# Adding the output layer
classifier.add(
    Dense(units=1, kernel_initializer="uniform", activation="sigmoid")
)

# Compiling the ANN: configure the optimizer, loss function and metrics
classifier.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size=10, epochs=100, verbose=0)
<tensorflow.python.keras.callbacks.History at 0x7f41e8445040>
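classifier.fit returns a History object (the output above); if it is assigned to a variable, the per-epoch training loss and accuracy can be plotted. A minimal sketch, assuming the fit call is rewritten as history = classifier.fit(...); in older Keras versions the accuracy key is "acc" rather than "accuracy".

# Assumed: history = classifier.fit(X_train, y_train, batch_size=10, epochs=100, verbose=0)
plt.plot(history.history["loss"], label="loss")
plt.plot(history.history["accuracy"], label="accuracy")
plt.xlabel("Epoch")
plt.legend()
plt.show()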

Evaluations#

score, acc = classifier.evaluate(X_train, y_train, batch_size=10)
print("Train score:", score)
print("Train accuracy:", acc)

# Making predictions and evaluating the model
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = y_pred > 0.5

print("*" * 20)
score, acc = classifier.evaluate(X_test, y_test, batch_size=10)
print("Test score:", score)
print("Test accuracy:", acc)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
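Beyond accuracy and the raw confusion matrix, per-class precision, recall and F1 give a better picture on an imbalanced target. A minimal sketch using scikit-learn's classification_report (not imported above):

from sklearn.metrics import classification_report

# Precision, recall and F1 for the retained (0) and churned (1) classes
print(classification_report(y_test, y_pred, target_names=["Retained", "Exited"]))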
# Plot the confusion matrix as a color-encoded heatmap
p = sns.heatmap(pd.DataFrame(cm), annot=True, cmap="YlGnBu", fmt="g")
plt.title("Confusion matrix", y=1.1)
plt.ylabel("Actual label")
plt.xlabel("Predicted label")
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
y_pred = classifier.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot([0, 1], [0, 1], "k--")
plt.plot(fpr, tpr, label="ANN")
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC curve")
plt.show()
# Area under the ROC curve
roc_auc_score(y_test, y_pred)
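assets_path was defined at the top for saving results but is not used above. A minimal sketch of persisting the trained network and the fitted scaler there, assuming the directory exists; the filenames are hypothetical.

import joblib

# Save the trained network in Keras HDF5 format (hypothetical filename)
classifier.save(f"{assets_path}/churn_ann.h5")
# Persist the fitted scaler so new data can be preprocessed identically
joblib.dump(sc, f"{assets_path}/standard_scaler.joblib")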