Exploratory data analysis#

Uncovering underlying data structures.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

Set paths#

# Path to datasets directory
data_path = "./datasets"
# Path to assets directory (for saving results to)
assets_path = "./assets"

Loading dataset#

# load data
dataset = pd.read_csv(f"{data_path}/preprocessed_default_credit.csv")
dataset.head().T

	0	1	2	3	4
ID	1	2	3	4	5
LIMIT_BAL	20000	120000	90000	50000	50000
SEX	2	2	2	2	1
EDUCATION	2	2	2	2	2
MARRIAGE	1	2	2	1	1
AGE	24	26	34	37	57
PAY_1	2	-1	0	0	-1
PAY_2	2	2	0	0	0
PAY_3	-1	0	0	0	-1
PAY_4	-1	0	0	0	0
PAY_5	-2	0	0	0	0
PAY_6	-2	2	0	0	0
BILL_AMT1	3913	2682	29239	46990	8617
BILL_AMT2	3102	1725	14027	48233	5670
BILL_AMT3	689	2682	13559	49291	35835
BILL_AMT4	0	3272	14331	28314	20940
BILL_AMT5	0	3455	14948	28959	19146
BILL_AMT6	0	3261	15549	29547	19131
PAY_AMT1	0	0	1518	2000	2000
PAY_AMT2	689	1000	1500	2019	36681
PAY_AMT3	0	1000	1000	1200	10000
PAY_AMT4	0	1000	1000	1100	9000
PAY_AMT5	0	0	1000	1069	689
PAY_AMT6	0	2000	5000	1000	679
DEFAULT	1	1	0	0	0

Exploring dataset#

# Printing dimensionality of the data, columns, types and missing values
print(f"Data dimension: {dataset.shape}")
for col in dataset.columns:
    print(
        f"Column: {col:35} | "
        f"type: {str(dataset[col].dtype):7} | "
        f"missing values: {dataset[col].isna().sum():3d}"
    )

Data dimension: (30000, 25)
Column: ID                                  | type: int64   | missing values:   0
Column: LIMIT_BAL                           | type: int64   | missing values:   0
Column: SEX                                 | type: int64   | missing values:   0
Column: EDUCATION                           | type: int64   | missing values:   0
Column: MARRIAGE                            | type: int64   | missing values:   0
Column: AGE                                 | type: int64   | missing values:   0
Column: PAY_1                               | type: int64   | missing values:   0
Column: PAY_2                               | type: int64   | missing values:   0
Column: PAY_3                               | type: int64   | missing values:   0
Column: PAY_4                               | type: int64   | missing values:   0
Column: PAY_5                               | type: int64   | missing values:   0
Column: PAY_6                               | type: int64   | missing values:   0
Column: BILL_AMT1                           | type: int64   | missing values:   0
Column: BILL_AMT2                           | type: int64   | missing values:   0
Column: BILL_AMT3                           | type: int64   | missing values:   0
Column: BILL_AMT4                           | type: int64   | missing values:   0
Column: BILL_AMT5                           | type: int64   | missing values:   0
Column: BILL_AMT6                           | type: int64   | missing values:   0
Column: PAY_AMT1                            | type: int64   | missing values:   0
Column: PAY_AMT2                            | type: int64   | missing values:   0
Column: PAY_AMT3                            | type: int64   | missing values:   0
Column: PAY_AMT4                            | type: int64   | missing values:   0
Column: PAY_AMT5                            | type: int64   | missing values:   0
Column: PAY_AMT6                            | type: int64   | missing values:   0
Column: DEFAULT                             | type: int64   | missing values:   0

# Computing statistics on numerical features
dataset.describe().T

	count	mean	std	min	25%	50%	75%	max
ID	30000.0	15000.500000	8660.398374	1.0	7500.75	15000.5	22500.25	30000.0
LIMIT_BAL	30000.0	167484.322667	129747.661567	10000.0	50000.00	140000.0	240000.00	1000000.0
SEX	30000.0	1.603733	0.489129	1.0	1.00	2.0	2.00	2.0
EDUCATION	30000.0	1.842267	0.744494	1.0	1.00	2.0	2.00	4.0
MARRIAGE	30000.0	1.555467	0.518137	1.0	1.00	2.0	2.00	3.0
AGE	30000.0	35.485500	9.217904	21.0	28.00	34.0	41.00	79.0
PAY_1	30000.0	-0.016700	1.123802	-2.0	-1.00	0.0	0.00	8.0
PAY_2	30000.0	-0.133767	1.197186	-2.0	-1.00	0.0	0.00	8.0
PAY_3	30000.0	-0.166200	1.196868	-2.0	-1.00	0.0	0.00	8.0
PAY_4	30000.0	-0.220667	1.169139	-2.0	-1.00	0.0	0.00	8.0
PAY_5	30000.0	-0.266200	1.133187	-2.0	-1.00	0.0	0.00	8.0
PAY_6	30000.0	-0.291100	1.149988	-2.0	-1.00	0.0	0.00	8.0
BILL_AMT1	30000.0	51223.330900	73635.860576	-165580.0	3558.75	22381.5	67091.00	964511.0
BILL_AMT2	30000.0	49179.075167	71173.768783	-69777.0	2984.75	21200.0	64006.25	983931.0
BILL_AMT3	30000.0	47013.154800	69349.387427	-157264.0	2666.25	20088.5	60164.75	1664089.0
BILL_AMT4	30000.0	43262.948967	64332.856134	-170000.0	2326.75	19052.0	54506.00	891586.0
BILL_AMT5	30000.0	40311.400967	60797.155770	-81334.0	1763.00	18104.5	50190.50	927171.0
BILL_AMT6	30000.0	38871.760400	59554.107537	-339603.0	1256.00	17071.0	49198.25	961664.0
PAY_AMT1	30000.0	5663.580500	16563.280354	0.0	1000.00	2100.0	5006.00	873552.0
PAY_AMT2	30000.0	5921.163500	23040.870402	0.0	833.00	2009.0	5000.00	1684259.0
PAY_AMT3	30000.0	5225.681500	17606.961470	0.0	390.00	1800.0	4505.00	896040.0
PAY_AMT4	30000.0	4826.076867	15666.159744	0.0	296.00	1500.0	4013.25	621000.0
PAY_AMT5	30000.0	4799.387633	15278.305679	0.0	252.50	1500.0	4031.50	426529.0
PAY_AMT6	30000.0	5215.502567	17777.465775	0.0	117.75	1500.0	4000.00	528666.0
DEFAULT	30000.0	0.221200	0.415062	0.0	0.00	0.0	0.00	1.0

Univariate analysis#

# Count of the DEFAULT column
sns.countplot(x="DEFAULT", data=dataset)

<AxesSubplot:xlabel='DEFAULT', ylabel='count'>

../../_images/3c9c9e7da522eb365456ebe186b989106746e7f228df461d7e319db0f3137d07.png

# Distribution of the DEFAULT column, that is, the count of
# defaults versus non-defaults
print(dataset["DEFAULT"].value_counts())
print()
print(dataset["DEFAULT"].value_counts(normalize=True))

0    23364
1     6636
Name: DEFAULT, dtype: int64

0    0.7788
1    0.2212
Name: DEFAULT, dtype: float64

Around 6636 customers have defaulted out of 30000 people, which is around 22%.

# The SEX column
sns.countplot(x="SEX", data=dataset)

<AxesSubplot:xlabel='SEX', ylabel='count'>

../../_images/96550a12bbb03cff68e755e68cbc745c58f6551a5916c8d172696b4afd6cc9c0.png

# Calculation exact number of each visitor type
print(dataset["SEX"].value_counts())
print()
print(dataset["SEX"].value_counts(normalize=True))

2    18112
1    11888
Name: SEX, dtype: int64

2    0.603733
1    0.396267
Name: SEX, dtype: float64

There are a total of 18112 females and 11888 males in the given dataset.

# EDUCATION column
sns.countplot(x="EDUCATION", data=dataset)

<AxesSubplot:xlabel='EDUCATION', ylabel='count'>

../../_images/75da63ebad08c7a0fdc7599429c1ea12c8120bc8fd93e51635c3c4f423632473.png

print(dataset["EDUCATION"].value_counts())
print()
print(dataset["EDUCATION"].value_counts(normalize=True))

  14030
  10585
   4917
    468
Name: EDUCATION, dtype: int64

  0.467667
  0.352833
  0.163900
  0.015600
Name: EDUCATION, dtype: float64

Most customers either went to graduate school or university.

# MARRIAGE column
sns.countplot(x="MARRIAGE", data=dataset)

<AxesSubplot:xlabel='MARRIAGE', ylabel='count'>

../../_images/1036a59d2468fc99c2ed780d1c11f1c2bb17839f6898d98ee8d87cb6f79fb3b0.png

# Count of each subcategory in the weekend column
print(dataset["MARRIAGE"].value_counts())
print()
print(dataset["MARRIAGE"].value_counts(normalize=True))

  16018
  13659
    323
Name: MARRIAGE, dtype: int64

  0.533933
  0.455300
  0.010767
Name: MARRIAGE, dtype: float64

Dataset has a high number of people who are single (unmarried) and people who are married.

Bivariate analysis#

# The SEX column versus the DEFAULT column
sns.set(rc={"figure.figsize": (15, 10)})
edu = sns.countplot(x="SEX", hue="DEFAULT", data=dataset)
edu.set_xticklabels(["Male", "Female"])
plt.show()

../../_images/2eb2a081e63b5f3e6717d1c47394568b14313eab0bff7e3e5e1d39f47454251d.png

Females seem to have defaulted more than males.

# Cross-tabulation
pd.crosstab(dataset.SEX, dataset.DEFAULT, normalize="index", margins=True)

DEFAULT	0	1
SEX
1	0.758328	0.241672
2	0.792237	0.207763
All	0.778800	0.221200

The cross tabulation shows that sround 24% of male customers have defaulted and around 20% of female customers have defaulted.

Relationship between the DEFAULT Column and the EDUCATION and MARRIAGE Columns#

sns.set(rc={"figure.figsize": (15, 10)})
edu = sns.countplot(x="EDUCATION", hue="DEFAULT", data=dataset)
edu.set_xticklabels(["Graduate School", "University", "High School", "Other"])
plt.show()

../../_images/764ad595ffaa54bbdcae13cc36b0004e3e71190efaa51e5a970c2c5d2dd1b9cb.png

# Determining which subcategory has a higher default percentage
# with cross-tabulation
pd.crosstab(dataset.EDUCATION, dataset.DEFAULT, normalize="index")

DEFAULT	0	1
EDUCATION
1	0.807652	0.192348
2	0.762651	0.237349
3	0.748424	0.251576
4	0.929487	0.070513

# Relationship between the MARRIAGE and DEFAULT columns
sns.set(rc={"figure.figsize": (12, 10)})
marriage = sns.countplot(x="MARRIAGE", hue="DEFAULT", data=dataset)
marriage.set_xticklabels(["Married", "Single", "Other"])
plt.show()

../../_images/d57bbff475e5d86aef62edb5cea988922c3f759fc8c173d438e982d9705a7d57.png

# Percentage of married/single/other people which have defaulted
pd.crosstab(dataset.MARRIAGE, dataset.DEFAULT, normalize="index", margins=True)

DEFAULT	0	1
MARRIAGE
1	0.765283	0.234717
2	0.791110	0.208890
3	0.739938	0.260062
All	0.778800	0.221200

Relationships#

# Bounce Rate versus Exit Rate
pd.crosstab(dataset.AGE, dataset.DEFAULT)

DEFAULT	0	1
AGE
21	53	14
22	391	169
23	684	247
24	827	300
25	884	302
26	1003	253
27	1164	313
28	1123	286
29	1292	313
30	1121	274
31	988	229
32	933	225
33	931	215
34	931	231
35	887	226
36	854	254
37	812	229
38	750	194
39	755	199
40	683	187
41	639	185
42	609	185
43	520	150
44	538	162
45	501	116
46	413	157
47	381	120
48	362	104
49	333	119
50	310	101
51	252	88
52	226	78
53	251	74
54	191	56
55	152	57
56	129	49
57	95	27
58	91	31
59	62	21
60	44	23
61	35	21
62	37	7
63	23	8
64	22	9
65	19	5
66	18	7
67	11	5
68	4	1
69	12	3
70	8	2
71	3	0
72	2	1
73	1	3
74	1	0
75	2	1
79	1	0

pd.crosstab(dataset.AGE, dataset.DEFAULT, normalize="index", margins=True)

DEFAULT	0	1
AGE
21	0.791045	0.208955
22	0.698214	0.301786
23	0.734694	0.265306
24	0.733807	0.266193
25	0.745363	0.254637
26	0.798567	0.201433
27	0.788084	0.211916
28	0.797019	0.202981
29	0.804984	0.195016
30	0.803584	0.196416
31	0.811832	0.188168
32	0.805699	0.194301
33	0.812391	0.187609
34	0.801205	0.198795
35	0.796945	0.203055
36	0.770758	0.229242
37	0.780019	0.219981
38	0.794492	0.205508
39	0.791405	0.208595
40	0.785057	0.214943
41	0.775485	0.224515
42	0.767003	0.232997
43	0.776119	0.223881
44	0.768571	0.231429
45	0.811994	0.188006
46	0.724561	0.275439
47	0.760479	0.239521
48	0.776824	0.223176
49	0.736726	0.263274
50	0.754258	0.245742
51	0.741176	0.258824
52	0.743421	0.256579
53	0.772308	0.227692
54	0.773279	0.226721
55	0.727273	0.272727
56	0.724719	0.275281
57	0.778689	0.221311
58	0.745902	0.254098
59	0.746988	0.253012
60	0.656716	0.343284
61	0.625000	0.375000
62	0.840909	0.159091
63	0.741935	0.258065
64	0.709677	0.290323
65	0.791667	0.208333
66	0.720000	0.280000
67	0.687500	0.312500
68	0.800000	0.200000
69	0.800000	0.200000
70	0.800000	0.200000
71	1.000000	0.000000
72	0.666667	0.333333
73	0.250000	0.750000
74	1.000000	0.000000
75	0.666667	0.333333
79	1.000000	0.000000
All	0.778800	0.221200

Table of Contents

Books