Preprocessing#

Loading the data and performing some initial exploration on it to acquire some basic knowledge about the data, how the various features are distributed.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

# Warnings
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

Set paths#

# Path to datasets directory
data_path = "./datasets"
# Path to assets directory (for saving results to)
assets_path = "./assets"

Loading dataset#

# load data
dataset = pd.read_excel(f"{data_path}/default_credit.xls")
dataset.head().T

	0	1	2	3	4
ID	1	2	3	4	5
LIMIT_BAL	20000	120000	90000	50000	50000
SEX	2	2	2	2	1
EDUCATION	2	2	2	2	2
MARRIAGE	1	2	2	1	1
AGE	24	26	34	37	57
PAY_0	2	-1	0	0	-1
PAY_2	2	2	0	0	0
PAY_3	-1	0	0	0	-1
PAY_4	-1	0	0	0	0
PAY_5	-2	0	0	0	0
PAY_6	-2	2	0	0	0
BILL_AMT1	3913	2682	29239	46990	8617
BILL_AMT2	3102	1725	14027	48233	5670
BILL_AMT3	689	2682	13559	49291	35835
BILL_AMT4	0	3272	14331	28314	20940
BILL_AMT5	0	3455	14948	28959	19146
BILL_AMT6	0	3261	15549	29547	19131
PAY_AMT1	0	0	1518	2000	2000
PAY_AMT2	689	1000	1500	2019	36681
PAY_AMT3	0	1000	1000	1200	10000
PAY_AMT4	0	1000	1000	1100	9000
PAY_AMT5	0	0	1000	1069	689
PAY_AMT6	0	2000	5000	1000	679
default payment next month	1	1	0	0	0

Exploring dataset#

# Printing dimensionality of the data, columns, types and missing values
print(f"Data dimension: {dataset.shape}")
for col in dataset.columns:
    print(
        f"Column: {col:35} | "
        f"type: {str(dataset[col].dtype):7} | "
        f"missing values: {dataset[col].isna().sum():3d}"
    )

Data dimension: (30000, 25)
Column: ID                                  | type: int64   | missing values:   0
Column: LIMIT_BAL                           | type: int64   | missing values:   0
Column: SEX                                 | type: int64   | missing values:   0
Column: EDUCATION                           | type: int64   | missing values:   0
Column: MARRIAGE                            | type: int64   | missing values:   0
Column: AGE                                 | type: int64   | missing values:   0
Column: PAY_0                               | type: int64   | missing values:   0
Column: PAY_2                               | type: int64   | missing values:   0
Column: PAY_3                               | type: int64   | missing values:   0
Column: PAY_4                               | type: int64   | missing values:   0
Column: PAY_5                               | type: int64   | missing values:   0
Column: PAY_6                               | type: int64   | missing values:   0
Column: BILL_AMT1                           | type: int64   | missing values:   0
Column: BILL_AMT2                           | type: int64   | missing values:   0
Column: BILL_AMT3                           | type: int64   | missing values:   0
Column: BILL_AMT4                           | type: int64   | missing values:   0
Column: BILL_AMT5                           | type: int64   | missing values:   0
Column: BILL_AMT6                           | type: int64   | missing values:   0
Column: PAY_AMT1                            | type: int64   | missing values:   0
Column: PAY_AMT2                            | type: int64   | missing values:   0
Column: PAY_AMT3                            | type: int64   | missing values:   0
Column: PAY_AMT4                            | type: int64   | missing values:   0
Column: PAY_AMT5                            | type: int64   | missing values:   0
Column: PAY_AMT6                            | type: int64   | missing values:   0
Column: default payment next month          | type: int64   | missing values:   0

# Computing statistics on numerical features
dataset.describe().T

	count	mean	std	min	25%	50%	75%	max
ID	30000.0	15000.500000	8660.398374	1.0	7500.75	15000.5	22500.25	30000.0
LIMIT_BAL	30000.0	167484.322667	129747.661567	10000.0	50000.00	140000.0	240000.00	1000000.0
SEX	30000.0	1.603733	0.489129	1.0	1.00	2.0	2.00	2.0
EDUCATION	30000.0	1.853133	0.790349	0.0	1.00	2.0	2.00	6.0
MARRIAGE	30000.0	1.551867	0.521970	0.0	1.00	2.0	2.00	3.0
AGE	30000.0	35.485500	9.217904	21.0	28.00	34.0	41.00	79.0
PAY_0	30000.0	-0.016700	1.123802	-2.0	-1.00	0.0	0.00	8.0
PAY_2	30000.0	-0.133767	1.197186	-2.0	-1.00	0.0	0.00	8.0
PAY_3	30000.0	-0.166200	1.196868	-2.0	-1.00	0.0	0.00	8.0
PAY_4	30000.0	-0.220667	1.169139	-2.0	-1.00	0.0	0.00	8.0
PAY_5	30000.0	-0.266200	1.133187	-2.0	-1.00	0.0	0.00	8.0
PAY_6	30000.0	-0.291100	1.149988	-2.0	-1.00	0.0	0.00	8.0
BILL_AMT1	30000.0	51223.330900	73635.860576	-165580.0	3558.75	22381.5	67091.00	964511.0
BILL_AMT2	30000.0	49179.075167	71173.768783	-69777.0	2984.75	21200.0	64006.25	983931.0
BILL_AMT3	30000.0	47013.154800	69349.387427	-157264.0	2666.25	20088.5	60164.75	1664089.0
BILL_AMT4	30000.0	43262.948967	64332.856134	-170000.0	2326.75	19052.0	54506.00	891586.0
BILL_AMT5	30000.0	40311.400967	60797.155770	-81334.0	1763.00	18104.5	50190.50	927171.0
BILL_AMT6	30000.0	38871.760400	59554.107537	-339603.0	1256.00	17071.0	49198.25	961664.0
PAY_AMT1	30000.0	5663.580500	16563.280354	0.0	1000.00	2100.0	5006.00	873552.0
PAY_AMT2	30000.0	5921.163500	23040.870402	0.0	833.00	2009.0	5000.00	1684259.0
PAY_AMT3	30000.0	5225.681500	17606.961470	0.0	390.00	1800.0	4505.00	896040.0
PAY_AMT4	30000.0	4826.076867	15666.159744	0.0	296.00	1500.0	4013.25	621000.0
PAY_AMT5	30000.0	4799.387633	15278.305679	0.0	252.50	1500.0	4031.50	426529.0
PAY_AMT6	30000.0	5215.502567	17777.465775	0.0	117.75	1500.0	4000.00	528666.0
default payment next month	30000.0	0.221200	0.415062	0.0	0.00	0.0	0.00	1.0

This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study reviewed the literature and used the following 23 variables as explanatory variables:

X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.

X2: Gender (1 = male; 2 = female).

X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).

X4: Marital status (1 = married; 2 = single; 3 = others).

X5: Age (year).

X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; …;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; …; 8 = payment delay for eight months; 9 = payment delay for nine months and above.

X12-X17: Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; …; X17 = amount of bill statement in April, 2005.

X18-X23: Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; …;X23 = amount paid in April, 2005.

Preprocessing#

print("SEX " + str(sorted(dataset["SEX"].unique())))

SEX [1, 2]

print("PAY_0 " + str(sorted(dataset["PAY_0"].unique())))

PAY_0 [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]

# Printing the unique values in the default payment next month column,
# sorted in asc order
print(
    "default.payment.next.month: "
    + str(sorted(dataset["default payment next month"].unique()))
)

default.payment.next.month: [0, 1]

# Renaming the PAY_0 column to PAY_1 and the default payment next
# month column to DEFAULT to maintain consistency with the naming
# of other columns
dataset = dataset.rename(
    columns={"default payment next month": "DEFAULT", "PAY_0": "PAY_1"}
)

print("EDUCATION " + str(sorted(dataset["EDUCATION"].unique())))

EDUCATION [0, 1, 2, 3, 4, 5, 6]

The EDUCATION column has 7 unique values, but the data description lists 4 unique values.

# Joining categories 0, 5, and 6 with category 4
fill = (
    (dataset.EDUCATION == 0)
    | (dataset.EDUCATION == 5)
    | (dataset.EDUCATION == 6)
)
dataset.loc[fill, "EDUCATION"] = 4

print("EDUCATION " + str(sorted(dataset["EDUCATION"].unique())))

EDUCATION [1, 2, 3, 4]

print("MARRIAGE " + str(sorted(dataset["MARRIAGE"].unique())))

MARRIAGE [0, 1, 2, 3]

The data description lists 3 unique values, this lists 4.

# Combining category 0 with category 2 (Single)
fill = dataset.MARRIAGE == 0
dataset.loc[fill, "MARRIAGE"] = 2

dataset.head()

	ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_1	PAY_2	PAY_3	PAY_4	...	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	DEFAULT
0	1	20000	2	2	1	24	2	2	-1	-1	...	0	0	0	0	689	0	0	0	0	1
1	2	120000	2	2	2	26	-1	2	0	0	...	3272	3455	3261	0	1000	1000	1000	0	2000	1
2	3	90000	2	2	2	34	0	0	0	0	...	14331	14948	15549	1518	1500	1000	1000	1000	5000	0
3	4	50000	2	2	1	37	0	0	0	0	...	28314	28959	29547	2000	2019	1200	1100	1069	1000	0
4	5	50000	1	2	1	57	-1	0	-1	0	...	20940	19146	19131	2000	36681	10000	9000	689	679	0

5 rows × 25 columns

dataset.to_csv(f"{data_path}/preprocessed_default_credit.csv", index=False)

Table of Contents

Books

Preprocessing#

Importing libraries and packages#

Set paths#

Loading dataset#

Exploring dataset#

Preprocessing#