Preprocessing#

Load the data and perform some initial exploration to gain basic knowledge about it, such as how the various features are distributed.

Importing libraries and packages#

1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Warnings
5import warnings
6
7warnings.filterwarnings("ignore")
8
9%matplotlib inline

Set paths#

# Path to datasets directory (input CSV is read from here and the
# preprocessed CSV is written back to it)
data_path = "./datasets"
# Path to assets directory (for saving results to)
assets_path = "./assets"

Loading dataset#

# Load the raw heart dataset from the datasets directory
dataset = pd.read_csv(f"{data_path}/heart.csv")
# Show the first five rows transposed, so each feature is a row and each
# sample a column
dataset.head().T
0 1 2 3 4
age 63.0 37.0 41.0 56.0 57.0
sex 1.0 1.0 0.0 1.0 0.0
cp 3.0 2.0 1.0 1.0 0.0
trestbps 145.0 130.0 130.0 120.0 120.0
chol 233.0 250.0 204.0 236.0 354.0
fbs 1.0 0.0 0.0 0.0 0.0
restecg 0.0 1.0 0.0 1.0 1.0
thalach 150.0 187.0 172.0 178.0 163.0
exang 0.0 0.0 0.0 0.0 1.0
oldpeak 2.3 3.5 1.4 0.8 0.6
slope 0.0 0.0 2.0 2.0 2.0
ca 0.0 0.0 0.0 0.0 0.0
thal 1.0 2.0 2.0 2.0 2.0
target 1.0 1.0 1.0 1.0 1.0

Exploring dataset#

# Print the dimensionality of the data, then one aligned line per column
# with its dtype and its number of missing (NaN) values
print(f"Data dimension: {dataset.shape}")
for col in dataset.columns:
    print(
        f"Column: {col:35} | "
        f"type: {str(dataset[col].dtype):7} | "
        f"missing values: {dataset[col].isna().sum():3d}"
    )
Data dimension: (303, 14)
Column: age                                 | type: int64   | missing values:   0
Column: sex                                 | type: int64   | missing values:   0
Column: cp                                  | type: int64   | missing values:   0
Column: trestbps                            | type: int64   | missing values:   0
Column: chol                                | type: int64   | missing values:   0
Column: fbs                                 | type: int64   | missing values:   0
Column: restecg                             | type: int64   | missing values:   0
Column: thalach                             | type: int64   | missing values:   0
Column: exang                               | type: int64   | missing values:   0
Column: oldpeak                             | type: float64 | missing values:   0
Column: slope                               | type: int64   | missing values:   0
Column: ca                                  | type: int64   | missing values:   0
Column: thal                                | type: int64   | missing values:   0
Column: target                              | type: int64   | missing values:   0
# Compute summary statistics on the numerical features; transposed so each
# feature is a row and each statistic a column
dataset.describe().T
count mean std min 25% 50% 75% max
age 303.0 54.366337 9.082101 29.0 47.5 55.0 61.0 77.0
sex 303.0 0.683168 0.466011 0.0 0.0 1.0 1.0 1.0
cp 303.0 0.966997 1.032052 0.0 0.0 1.0 2.0 3.0
trestbps 303.0 131.623762 17.538143 94.0 120.0 130.0 140.0 200.0
chol 303.0 246.264026 51.830751 126.0 211.0 240.0 274.5 564.0
fbs 303.0 0.148515 0.356198 0.0 0.0 0.0 0.0 1.0
restecg 303.0 0.528053 0.525860 0.0 0.0 1.0 1.0 2.0
thalach 303.0 149.646865 22.905161 71.0 133.5 153.0 166.0 202.0
exang 303.0 0.326733 0.469794 0.0 0.0 0.0 1.0 1.0
oldpeak 303.0 1.039604 1.161075 0.0 0.0 0.8 1.6 6.2
slope 303.0 1.399340 0.616226 0.0 1.0 1.0 2.0 2.0
ca 303.0 0.729373 1.022606 0.0 0.0 0.0 1.0 4.0
thal 303.0 2.313531 0.612277 0.0 2.0 2.0 3.0 3.0
target 303.0 0.544554 0.498835 0.0 0.0 1.0 1.0 1.0

Preprocessing#

 1dataset.rename(
 2    index=str,
 3    columns={
 4        "cp": "chest_pain",
 5        "trestbps": "rest_bp",
 6        "fbs": "fast_bld_sugar",
 7        "restecg": "rest_ecg",
 8        "thalach": "max_hr",
 9        "exang": "ex_angina",
10        "oldpeak": "st_depr",
11        "ca": "colored_vessels",
12        "thal": "thalassemia",
13    },
14    inplace=True,
15)
# Print the index type, column names, non-null counts, dtypes and memory usage
dataset.info()
<class 'pandas.core.frame.DataFrame'>
Index: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              303 non-null    int64  
 1   sex              303 non-null    int64  
 2   chest_pain       303 non-null    int64  
 3   rest_bp          303 non-null    int64  
 4   chol             303 non-null    int64  
 5   fast_bld_sugar   303 non-null    int64  
 6   rest_ecg         303 non-null    int64  
 7   max_hr           303 non-null    int64  
 8   ex_angina        303 non-null    int64  
 9   st_depr          303 non-null    float64
 10  slope            303 non-null    int64  
 11  colored_vessels  303 non-null    int64  
 12  thalassemia      303 non-null    int64  
 13  target           303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.5+ KB
# Save the frame to the datasets directory; index=False leaves the row
# labels out of the file
dataset.to_csv(f"{data_path}/preprocessed_heart.csv", index=False)