Evaluating the correlation between columns using a heatmap#

Uncovering underlying data structures.

Importing libraries and packages#

 1# Mathematical operations and data manipulation
 2import pandas as pd
 3
 4# Plotting
 5import seaborn as sns
 6
 7# Warnings
 8import warnings
 9
10warnings.filterwarnings("ignore")
11%matplotlib inline

Set paths#

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset#

1# load data
2dataset = pd.read_csv(f"{data_path}/preprocessed_default_credit.csv")
3dataset.head().T
0 1 2 3 4
ID 1 2 3 4 5
LIMIT_BAL 20000 120000 90000 50000 50000
SEX 2 2 2 2 1
EDUCATION 2 2 2 2 2
MARRIAGE 1 2 2 1 1
AGE 24 26 34 37 57
PAY_1 2 -1 0 0 -1
PAY_2 2 2 0 0 0
PAY_3 -1 0 0 0 -1
PAY_4 -1 0 0 0 0
PAY_5 -2 0 0 0 0
PAY_6 -2 2 0 0 0
BILL_AMT1 3913 2682 29239 46990 8617
BILL_AMT2 3102 1725 14027 48233 5670
BILL_AMT3 689 2682 13559 49291 35835
BILL_AMT4 0 3272 14331 28314 20940
BILL_AMT5 0 3455 14948 28959 19146
BILL_AMT6 0 3261 15549 29547 19131
PAY_AMT1 0 0 1518 2000 2000
PAY_AMT2 689 1000 1500 2019 36681
PAY_AMT3 0 1000 1000 1200 10000
PAY_AMT4 0 1000 1000 1100 9000
PAY_AMT5 0 0 1000 1069 689
PAY_AMT6 0 2000 5000 1000 679
DEFAULT 1 1 0 0 0

Exploring dataset#

1# Printing dimensionality of the data, columns, types and missing values
2print(f"Data dimension: {dataset.shape}")
3for col in dataset.columns:
4    print(
5        f"Column: {col:35} | "
6        f"type: {str(dataset[col].dtype):7} | "
7        f"missing values: {dataset[col].isna().sum():3d}"
8    )
Data dimension: (30000, 25)
Column: ID                                  | type: int64   | missing values:   0
Column: LIMIT_BAL                           | type: int64   | missing values:   0
Column: SEX                                 | type: int64   | missing values:   0
Column: EDUCATION                           | type: int64   | missing values:   0
Column: MARRIAGE                            | type: int64   | missing values:   0
Column: AGE                                 | type: int64   | missing values:   0
Column: PAY_1                               | type: int64   | missing values:   0
Column: PAY_2                               | type: int64   | missing values:   0
Column: PAY_3                               | type: int64   | missing values:   0
Column: PAY_4                               | type: int64   | missing values:   0
Column: PAY_5                               | type: int64   | missing values:   0
Column: PAY_6                               | type: int64   | missing values:   0
Column: BILL_AMT1                           | type: int64   | missing values:   0
Column: BILL_AMT2                           | type: int64   | missing values:   0
Column: BILL_AMT3                           | type: int64   | missing values:   0
Column: BILL_AMT4                           | type: int64   | missing values:   0
Column: BILL_AMT5                           | type: int64   | missing values:   0
Column: BILL_AMT6                           | type: int64   | missing values:   0
Column: PAY_AMT1                            | type: int64   | missing values:   0
Column: PAY_AMT2                            | type: int64   | missing values:   0
Column: PAY_AMT3                            | type: int64   | missing values:   0
Column: PAY_AMT4                            | type: int64   | missing values:   0
Column: PAY_AMT5                            | type: int64   | missing values:   0
Column: PAY_AMT6                            | type: int64   | missing values:   0
Column: DEFAULT                             | type: int64   | missing values:   0
1# Computing statistics on numerical features
2dataset.describe().T
count mean std min 25% 50% 75% max
ID 30000.0 15000.500000 8660.398374 1.0 7500.75 15000.5 22500.25 30000.0
LIMIT_BAL 30000.0 167484.322667 129747.661567 10000.0 50000.00 140000.0 240000.00 1000000.0
SEX 30000.0 1.603733 0.489129 1.0 1.00 2.0 2.00 2.0
EDUCATION 30000.0 1.842267 0.744494 1.0 1.00 2.0 2.00 4.0
MARRIAGE 30000.0 1.555467 0.518137 1.0 1.00 2.0 2.00 3.0
AGE 30000.0 35.485500 9.217904 21.0 28.00 34.0 41.00 79.0
PAY_1 30000.0 -0.016700 1.123802 -2.0 -1.00 0.0 0.00 8.0
PAY_2 30000.0 -0.133767 1.197186 -2.0 -1.00 0.0 0.00 8.0
PAY_3 30000.0 -0.166200 1.196868 -2.0 -1.00 0.0 0.00 8.0
PAY_4 30000.0 -0.220667 1.169139 -2.0 -1.00 0.0 0.00 8.0
PAY_5 30000.0 -0.266200 1.133187 -2.0 -1.00 0.0 0.00 8.0
PAY_6 30000.0 -0.291100 1.149988 -2.0 -1.00 0.0 0.00 8.0
BILL_AMT1 30000.0 51223.330900 73635.860576 -165580.0 3558.75 22381.5 67091.00 964511.0
BILL_AMT2 30000.0 49179.075167 71173.768783 -69777.0 2984.75 21200.0 64006.25 983931.0
BILL_AMT3 30000.0 47013.154800 69349.387427 -157264.0 2666.25 20088.5 60164.75 1664089.0
BILL_AMT4 30000.0 43262.948967 64332.856134 -170000.0 2326.75 19052.0 54506.00 891586.0
BILL_AMT5 30000.0 40311.400967 60797.155770 -81334.0 1763.00 18104.5 50190.50 927171.0
BILL_AMT6 30000.0 38871.760400 59554.107537 -339603.0 1256.00 17071.0 49198.25 961664.0
PAY_AMT1 30000.0 5663.580500 16563.280354 0.0 1000.00 2100.0 5006.00 873552.0
PAY_AMT2 30000.0 5921.163500 23040.870402 0.0 833.00 2009.0 5000.00 1684259.0
PAY_AMT3 30000.0 5225.681500 17606.961470 0.0 390.00 1800.0 4505.00 896040.0
PAY_AMT4 30000.0 4826.076867 15666.159744 0.0 296.00 1500.0 4013.25 621000.0
PAY_AMT5 30000.0 4799.387633 15278.305679 0.0 252.50 1500.0 4031.50 426529.0
PAY_AMT6 30000.0 5215.502567 17777.465775 0.0 117.75 1500.0 4000.00 528666.0
DEFAULT 30000.0 0.221200 0.415062 0.0 0.00 0.0 0.00 1.0

Correlations#

1sns.set(rc={"figure.figsize": (30, 10)})
2sns.set_context("talk", font_scale=0.7)
3sns.heatmap(
4    dataset.iloc[:, 1:].corr(method="spearman"), cmap="rainbow_r", annot=True
5)
<AxesSubplot:>
../../_images/d793d898dc3232db9d921e77899eca53edb24f028d57bc7973a494104905843c.png
1# Exact correlation coefficients
2dataset.drop("DEFAULT", axis=1).apply(
3    lambda x: x.corr(dataset.DEFAULT, method="spearman")
4)
ID          -0.013952
LIMIT_BAL   -0.169586
SEX         -0.039961
EDUCATION    0.043425
MARRIAGE    -0.028174
AGE          0.005149
PAY_1        0.292213
PAY_2        0.216919
PAY_3        0.194771
PAY_4        0.173690
PAY_5        0.159043
PAY_6        0.142523
BILL_AMT1   -0.025327
BILL_AMT2   -0.015554
BILL_AMT3   -0.012670
BILL_AMT4   -0.008357
BILL_AMT5   -0.006851
BILL_AMT6   -0.000076
PAY_AMT1    -0.160493
PAY_AMT2    -0.150977
PAY_AMT3    -0.139388
PAY_AMT4    -0.127979
PAY_AMT5    -0.116587
PAY_AMT6    -0.121444
dtype: float64