Initial data analysis#

In this section we load the data and perform an initial exploration to gain basic knowledge about it, in particular how the various features are distributed.

Importing libraries and packages#

# Filesystem helpers
import os

# Warnings
import warnings

# Mathematical operations and data manipulation
import numpy as np
import pandas as pd

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
11
12warnings.filterwarnings("ignore")
13
14%matplotlib inline

Set paths#

# Path to datasets directory (must already contain the input CSV files)
data_path = "./datasets"
# Path to assets directory (for saving results to)
assets_path = "./assets"
# The plotting cells write figures into `assets_path` with `plt.savefig`, which
# raises FileNotFoundError when the target directory is missing. Create it up
# front so the notebook survives "Restart Kernel -> Run All" on a fresh checkout.
os.makedirs(assets_path, exist_ok=True)

Loading dataset#

# Read the semicolon-delimited CSV from the datasets directory into a DataFrame
dataset = pd.read_csv(
    filepath_or_buffer=f"{data_path}/bank-additional-full.csv",
    delimiter=";",
)
# Preview the first five rows, transposed so each feature is shown on its own line
dataset.head(5).transpose()
0 1 2 3 4
age 56 57 37 40 56
job housemaid services services admin. services
marital married married married married married
education basic.4y high.school high.school basic.6y high.school
default no unknown no no no
housing no no yes no no
loan no no no no yes
contact telephone telephone telephone telephone telephone
month may may may may may
day_of_week mon mon mon mon mon
duration 261 149 226 151 307
campaign 1 1 1 1 1
pdays 999 999 999 999 999
previous 0 0 0 0 0
poutcome nonexistent nonexistent nonexistent nonexistent nonexistent
emp.var.rate 1.1 1.1 1.1 1.1 1.1
cons.price.idx 93.994 93.994 93.994 93.994 93.994
cons.conf.idx -36.4 -36.4 -36.4 -36.4 -36.4
euribor3m 4.857 4.857 4.857 4.857 4.857
nr.employed 5191.0 5191.0 5191.0 5191.0 5191.0
y no no no no no

Exploring dataset#

# Report the overall shape, then one line per column with its dtype and the
# number of missing (NaN) entries.
print(f"Data dimension: {dataset.shape}")
# Count missing values for all columns in one pass instead of once per column
missing_per_column = dataset.isna().sum()
for column_name, column_dtype in dataset.dtypes.items():
    n_missing = missing_per_column[column_name]
    print(
        f"Column: {column_name:35} | "
        f"type: {str(column_dtype):7} | "
        f"missing values: {n_missing:3d}"
    )
Data dimension: (41188, 21)
Column: age                                 | type: int64   | missing values:   0
Column: job                                 | type: object  | missing values:   0
Column: marital                             | type: object  | missing values:   0
Column: education                           | type: object  | missing values:   0
Column: default                             | type: object  | missing values:   0
Column: housing                             | type: object  | missing values:   0
Column: loan                                | type: object  | missing values:   0
Column: contact                             | type: object  | missing values:   0
Column: month                               | type: object  | missing values:   0
Column: day_of_week                         | type: object  | missing values:   0
Column: duration                            | type: int64   | missing values:   0
Column: campaign                            | type: int64   | missing values:   0
Column: pdays                               | type: int64   | missing values:   0
Column: previous                            | type: int64   | missing values:   0
Column: poutcome                            | type: object  | missing values:   0
Column: emp.var.rate                        | type: float64 | missing values:   0
Column: cons.price.idx                      | type: float64 | missing values:   0
Column: cons.conf.idx                       | type: float64 | missing values:   0
Column: euribor3m                           | type: float64 | missing values:   0
Column: nr.employed                         | type: float64 | missing values:   0
Column: y                                   | type: object  | missing values:   0
# Names of all columns whose dtype is a NumPy numeric type (ints and floats)
numerical_features = [
    column_name
    for column_name, column_dtype in dataset.dtypes.items()
    if np.issubdtype(column_dtype, np.number)
]
print(numerical_features)
['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical
# features, transposed so that each feature occupies one row
dataset.loc[:, numerical_features].describe().transpose()
count mean std min 25% 50% 75% max
age 41188.0 40.024060 10.421250 17.000 32.000 38.000 47.000 98.000
duration 41188.0 258.285010 259.279249 0.000 102.000 180.000 319.000 4918.000
campaign 41188.0 2.567593 2.770014 1.000 1.000 2.000 3.000 56.000
pdays 41188.0 962.475454 186.910907 0.000 999.000 999.000 999.000 999.000
previous 41188.0 0.172963 0.494901 0.000 0.000 0.000 0.000 7.000
emp.var.rate 41188.0 0.081886 1.570960 -3.400 -1.800 1.100 1.400 1.400
cons.price.idx 41188.0 93.575664 0.578840 92.201 93.075 93.749 93.994 94.767
cons.conf.idx 41188.0 -40.502600 4.628198 -50.800 -42.700 -41.800 -36.400 -26.900
euribor3m 41188.0 3.621291 1.734447 0.634 1.344 4.857 4.961 5.045
nr.employed 41188.0 5167.035911 72.251528 4963.600 5099.100 5191.000 5228.100 5228.100
# Distributions of numerical features.
# `sns.distplot` is deprecated since seaborn 0.11 and scheduled for removal;
# its documented replacement is `histplot` with a density-normalised histogram
# and a KDE overlay.
n_cols = 2
# Ceiling division: derive the grid height from the number of features so the
# layout does not break if the feature list changes (10 features -> 5 rows).
n_rows = (len(numerical_features) + n_cols - 1) // n_cols
fig = plt.figure(figsize=(10, 18))
for index, col in enumerate(numerical_features):
    # Subplot positions are 1-based
    ax = fig.add_subplot(n_rows, n_cols, index + 1)
    sns.histplot(data=dataset, x=col, kde=True, stat="density", ax=ax)
# Keep axis labels of neighbouring panels from overlapping
fig.tight_layout()
fig.savefig(
    f"{assets_path}/numerical_distributions.png", format="png", dpi=500
)
../../_images/b20400adbd5b3d63545579dfd36cf6326cbd517aa1c7225ecb54329db0c93fa8.png
# Names of all columns that pandas recognises as holding string data
categorical_features = list(
    filter(
        lambda column_name: pd.api.types.is_string_dtype(dataset[column_name]),
        dataset.columns,
    )
)
 1# Distributions of categorical features
 2plt.figure(figsize=(25, 35))
 3for index, col in enumerate(categorical_features):
 4    plt.subplot(6, 2, index + 1)
 5    ax = sns.countplot(y=col, data=dataset)
 6    ax.set_xlabel("count", fontsize=20)
 7    ax.set_ylabel(col, fontsize=20)
 8    ax.tick_params(labelsize=20)
 9
10plt.savefig(f"{assets_path}/categorical_counts.png", format="png", dpi=500)
../../_images/48ceed440e2250aea7e9942e4a035da6b5dee4d701ce6faa877baf463ecc04cf.png
# Class balance of the target column `y`: absolute counts first, then the
# same breakdown as percentages (both sorted from rarest to most frequent)
target = dataset["y"]
absolute_counts = target.value_counts(ascending=True)
percentages = target.value_counts(normalize=True, ascending=True) * 100

print("Total number of entries:")
print(absolute_counts)
print()
print("Percentages:")
print(percentages)
Total number of entries:
yes     4640
no     36548
Name: y, dtype: int64

Percentages:
yes    11.265417
no     88.734583
Name: y, dtype: float64