Preprocessing#

Load the data and perform some initial exploration to get a basic understanding of it, in particular how the various features are distributed.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

# Warnings
import warnings

# Ignore every warning category from here on, so later cells emit no warning
# output. NOTE(review): this also hides deprecation notices — presumably done
# to keep the notebook output uncluttered; confirm it is still wanted.
warnings.filterwarnings("ignore")

%matplotlib inline

Set paths#

# Assets directory: destination for any saved results
assets_path = "./assets"
# Datasets directory: the CSV files are read from and written to this folder
data_path = "./datasets"

Loading dataset#

# Read the heart CSV from the datasets directory into a DataFrame
raw_csv = f"{data_path}/heart.csv"
dataset = pd.read_csv(raw_csv)
# Preview the first five records, transposed so that each feature is one row
dataset.head().transpose()

	0	1	2	3	4
age	63.0	37.0	41.0	56.0	57.0
sex	1.0	1.0	0.0	1.0	0.0
cp	3.0	2.0	1.0	1.0	0.0
trestbps	145.0	130.0	130.0	120.0	120.0
chol	233.0	250.0	204.0	236.0	354.0
fbs	1.0	0.0	0.0	0.0	0.0
restecg	0.0	1.0	0.0	1.0	1.0
thalach	150.0	187.0	172.0	178.0	163.0
exang	0.0	0.0	0.0	0.0	1.0
oldpeak	2.3	3.5	1.4	0.8	0.6
slope	0.0	0.0	2.0	2.0	2.0
ca	0.0	0.0	0.0	0.0	0.0
thal	1.0	2.0	2.0	2.0	2.0
target	1.0	1.0	1.0	1.0	1.0

Exploring dataset#

# Report the frame's shape, then one aligned line per column giving its name,
# dtype and number of missing values
print(f"Data dimension: {dataset.shape}")
for name, values in dataset.items():
    fields = (
        f"Column: {name:35}",
        f"type: {str(values.dtype):7}",
        f"missing values: {values.isna().sum():3d}",
    )
    print(" | ".join(fields))

Data dimension: (303, 14)
Column: age                                 | type: int64   | missing values:   0
Column: sex                                 | type: int64   | missing values:   0
Column: cp                                  | type: int64   | missing values:   0
Column: trestbps                            | type: int64   | missing values:   0
Column: chol                                | type: int64   | missing values:   0
Column: fbs                                 | type: int64   | missing values:   0
Column: restecg                             | type: int64   | missing values:   0
Column: thalach                             | type: int64   | missing values:   0
Column: exang                               | type: int64   | missing values:   0
Column: oldpeak                             | type: float64 | missing values:   0
Column: slope                               | type: int64   | missing values:   0
Column: ca                                  | type: int64   | missing values:   0
Column: thal                                | type: int64   | missing values:   0
Column: target                              | type: int64   | missing values:   0

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, transposed so that each feature occupies one row
dataset.describe().transpose()

	count	mean	std	min	25%	50%	75%	max
age	303.0	54.366337	9.082101	29.0	47.5	55.0	61.0	77.0
sex	303.0	0.683168	0.466011	0.0	0.0	1.0	1.0	1.0
cp	303.0	0.966997	1.032052	0.0	0.0	1.0	2.0	3.0
trestbps	303.0	131.623762	17.538143	94.0	120.0	130.0	140.0	200.0
chol	303.0	246.264026	51.830751	126.0	211.0	240.0	274.5	564.0
fbs	303.0	0.148515	0.356198	0.0	0.0	0.0	0.0	1.0
restecg	303.0	0.528053	0.525860	0.0	0.0	1.0	1.0	2.0
thalach	303.0	149.646865	22.905161	71.0	133.5	153.0	166.0	202.0
exang	303.0	0.326733	0.469794	0.0	0.0	0.0	1.0	1.0
oldpeak	303.0	1.039604	1.161075	0.0	0.0	0.8	1.6	6.2
slope	303.0	1.399340	0.616226	0.0	1.0	1.0	2.0	2.0
ca	303.0	0.729373	1.022606	0.0	0.0	0.0	1.0	4.0
thal	303.0	2.313531	0.612277	0.0	2.0	2.0	3.0	3.0
target	303.0	0.544554	0.498835	0.0	0.0	1.0	1.0	1.0

Preprocessing#

# Mapping from the terse original column codes to more descriptive names;
# columns not listed here (age, sex, chol, slope, target) keep their names.
column_renames = {
    "cp": "chest_pain",
    "trestbps": "rest_bp",
    "fbs": "fast_bld_sugar",
    "restecg": "rest_ecg",
    "thalach": "max_hr",
    "exang": "ex_angina",
    "oldpeak": "st_depr",
    "ca": "colored_vessels",
    "thal": "thalassemia",
}

# Rename in place. NOTE: index=str passes every row label through str(), so
# the index holds strings ("0", "1", ...) after this call.
dataset.rename(index=str, columns=column_renames, inplace=True)

# Print a summary of the renamed frame: index, column names, non-null counts,
# dtypes and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              303 non-null    int64  
 1   sex              303 non-null    int64  
 2   chest_pain       303 non-null    int64  
 3   rest_bp          303 non-null    int64  
 4   chol             303 non-null    int64  
 5   fast_bld_sugar   303 non-null    int64  
 6   rest_ecg         303 non-null    int64  
 7   max_hr           303 non-null    int64  
 8   ex_angina        303 non-null    int64  
 9   st_depr          303 non-null    float64
 10  slope            303 non-null    int64  
 11  colored_vessels  303 non-null    int64  
 12  thalassemia      303 non-null    int64  
 13  target           303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.5+ KB

# Write the renamed frame to the datasets directory; index=False keeps the
# row index out of the CSV file
preprocessed_csv = f"{data_path}/preprocessed_heart.csv"
dataset.to_csv(preprocessed_csv, index=False)