Evaluating the correlation between columns using a heatmap#
Uncovering underlying data structures.
Importing libraries and packages#
1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Plotting
5import seaborn as sns
6
7# Warnings
8import warnings
9
10warnings.filterwarnings("ignore")
11%matplotlib inline
Set paths#
1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"
Loading dataset#
1# load data
2dataset = pd.read_csv(f"{data_path}/preprocessed_default_credit.csv")
3dataset.head().T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
ID | 1 | 2 | 3 | 4 | 5 |
LIMIT_BAL | 20000 | 120000 | 90000 | 50000 | 50000 |
SEX | 2 | 2 | 2 | 2 | 1 |
EDUCATION | 2 | 2 | 2 | 2 | 2 |
MARRIAGE | 1 | 2 | 2 | 1 | 1 |
AGE | 24 | 26 | 34 | 37 | 57 |
PAY_1 | 2 | -1 | 0 | 0 | -1 |
PAY_2 | 2 | 2 | 0 | 0 | 0 |
PAY_3 | -1 | 0 | 0 | 0 | -1 |
PAY_4 | -1 | 0 | 0 | 0 | 0 |
PAY_5 | -2 | 0 | 0 | 0 | 0 |
PAY_6 | -2 | 2 | 0 | 0 | 0 |
BILL_AMT1 | 3913 | 2682 | 29239 | 46990 | 8617 |
BILL_AMT2 | 3102 | 1725 | 14027 | 48233 | 5670 |
BILL_AMT3 | 689 | 2682 | 13559 | 49291 | 35835 |
BILL_AMT4 | 0 | 3272 | 14331 | 28314 | 20940 |
BILL_AMT5 | 0 | 3455 | 14948 | 28959 | 19146 |
BILL_AMT6 | 0 | 3261 | 15549 | 29547 | 19131 |
PAY_AMT1 | 0 | 0 | 1518 | 2000 | 2000 |
PAY_AMT2 | 689 | 1000 | 1500 | 2019 | 36681 |
PAY_AMT3 | 0 | 1000 | 1000 | 1200 | 10000 |
PAY_AMT4 | 0 | 1000 | 1000 | 1100 | 9000 |
PAY_AMT5 | 0 | 0 | 1000 | 1069 | 689 |
PAY_AMT6 | 0 | 2000 | 5000 | 1000 | 679 |
DEFAULT | 1 | 1 | 0 | 0 | 0 |
Exploring dataset#
1# Printing dimensionality of the data, columns, types and missing values
2print(f"Data dimension: {dataset.shape}")
3for col in dataset.columns:
4 print(
5 f"Column: {col:35} | "
6 f"type: {str(dataset[col].dtype):7} | "
7 f"missing values: {dataset[col].isna().sum():3d}"
8 )
Data dimension: (30000, 25)
Column: ID | type: int64 | missing values: 0
Column: LIMIT_BAL | type: int64 | missing values: 0
Column: SEX | type: int64 | missing values: 0
Column: EDUCATION | type: int64 | missing values: 0
Column: MARRIAGE | type: int64 | missing values: 0
Column: AGE | type: int64 | missing values: 0
Column: PAY_1 | type: int64 | missing values: 0
Column: PAY_2 | type: int64 | missing values: 0
Column: PAY_3 | type: int64 | missing values: 0
Column: PAY_4 | type: int64 | missing values: 0
Column: PAY_5 | type: int64 | missing values: 0
Column: PAY_6 | type: int64 | missing values: 0
Column: BILL_AMT1 | type: int64 | missing values: 0
Column: BILL_AMT2 | type: int64 | missing values: 0
Column: BILL_AMT3 | type: int64 | missing values: 0
Column: BILL_AMT4 | type: int64 | missing values: 0
Column: BILL_AMT5 | type: int64 | missing values: 0
Column: BILL_AMT6 | type: int64 | missing values: 0
Column: PAY_AMT1 | type: int64 | missing values: 0
Column: PAY_AMT2 | type: int64 | missing values: 0
Column: PAY_AMT3 | type: int64 | missing values: 0
Column: PAY_AMT4 | type: int64 | missing values: 0
Column: PAY_AMT5 | type: int64 | missing values: 0
Column: PAY_AMT6 | type: int64 | missing values: 0
Column: DEFAULT | type: int64 | missing values: 0
1# Computing statistics on numerical features
2dataset.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
ID | 30000.0 | 15000.500000 | 8660.398374 | 1.0 | 7500.75 | 15000.5 | 22500.25 | 30000.0 |
LIMIT_BAL | 30000.0 | 167484.322667 | 129747.661567 | 10000.0 | 50000.00 | 140000.0 | 240000.00 | 1000000.0 |
SEX | 30000.0 | 1.603733 | 0.489129 | 1.0 | 1.00 | 2.0 | 2.00 | 2.0 |
EDUCATION | 30000.0 | 1.842267 | 0.744494 | 1.0 | 1.00 | 2.0 | 2.00 | 4.0 |
MARRIAGE | 30000.0 | 1.555467 | 0.518137 | 1.0 | 1.00 | 2.0 | 2.00 | 3.0 |
AGE | 30000.0 | 35.485500 | 9.217904 | 21.0 | 28.00 | 34.0 | 41.00 | 79.0 |
PAY_1 | 30000.0 | -0.016700 | 1.123802 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
PAY_2 | 30000.0 | -0.133767 | 1.197186 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
PAY_3 | 30000.0 | -0.166200 | 1.196868 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
PAY_4 | 30000.0 | -0.220667 | 1.169139 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
PAY_5 | 30000.0 | -0.266200 | 1.133187 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
PAY_6 | 30000.0 | -0.291100 | 1.149988 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
BILL_AMT1 | 30000.0 | 51223.330900 | 73635.860576 | -165580.0 | 3558.75 | 22381.5 | 67091.00 | 964511.0 |
BILL_AMT2 | 30000.0 | 49179.075167 | 71173.768783 | -69777.0 | 2984.75 | 21200.0 | 64006.25 | 983931.0 |
BILL_AMT3 | 30000.0 | 47013.154800 | 69349.387427 | -157264.0 | 2666.25 | 20088.5 | 60164.75 | 1664089.0 |
BILL_AMT4 | 30000.0 | 43262.948967 | 64332.856134 | -170000.0 | 2326.75 | 19052.0 | 54506.00 | 891586.0 |
BILL_AMT5 | 30000.0 | 40311.400967 | 60797.155770 | -81334.0 | 1763.00 | 18104.5 | 50190.50 | 927171.0 |
BILL_AMT6 | 30000.0 | 38871.760400 | 59554.107537 | -339603.0 | 1256.00 | 17071.0 | 49198.25 | 961664.0 |
PAY_AMT1 | 30000.0 | 5663.580500 | 16563.280354 | 0.0 | 1000.00 | 2100.0 | 5006.00 | 873552.0 |
PAY_AMT2 | 30000.0 | 5921.163500 | 23040.870402 | 0.0 | 833.00 | 2009.0 | 5000.00 | 1684259.0 |
PAY_AMT3 | 30000.0 | 5225.681500 | 17606.961470 | 0.0 | 390.00 | 1800.0 | 4505.00 | 896040.0 |
PAY_AMT4 | 30000.0 | 4826.076867 | 15666.159744 | 0.0 | 296.00 | 1500.0 | 4013.25 | 621000.0 |
PAY_AMT5 | 30000.0 | 4799.387633 | 15278.305679 | 0.0 | 252.50 | 1500.0 | 4031.50 | 426529.0 |
PAY_AMT6 | 30000.0 | 5215.502567 | 17777.465775 | 0.0 | 117.75 | 1500.0 | 4000.00 | 528666.0 |
DEFAULT | 30000.0 | 0.221200 | 0.415062 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
Correlations#
1sns.set(rc={"figure.figsize": (30, 10)})
2sns.set_context("talk", font_scale=0.7)
3sns.heatmap(
4 dataset.iloc[:, 1:].corr(method="spearman"), cmap="rainbow_r", annot=True
5)
<AxesSubplot:>
1# Exact correlation coefficients
2dataset.drop("DEFAULT", axis=1).apply(
3 lambda x: x.corr(dataset.DEFAULT, method="spearman")
4)
ID -0.013952
LIMIT_BAL -0.169586
SEX -0.039961
EDUCATION 0.043425
MARRIAGE -0.028174
AGE 0.005149
PAY_1 0.292213
PAY_2 0.216919
PAY_3 0.194771
PAY_4 0.173690
PAY_5 0.159043
PAY_6 0.142523
BILL_AMT1 -0.025327
BILL_AMT2 -0.015554
BILL_AMT3 -0.012670
BILL_AMT4 -0.008357
BILL_AMT5 -0.006851
BILL_AMT6 -0.000076
PAY_AMT1 -0.160493
PAY_AMT2 -0.150977
PAY_AMT3 -0.139388
PAY_AMT4 -0.127979
PAY_AMT5 -0.116587
PAY_AMT6 -0.121444
dtype: float64