Exploratory data analysis

Importing libraries and packages

# Mathematical operations and data manipulation
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Silence all warnings raised from this point on.
# NOTE(review): this blanket filter also hides deprecation notices from
# pandas/seaborn/matplotlib — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

Set paths

# Directory that generated results are saved to
assets_path = "./assets"
# Directory that the input datasets are read from
data_path = "./datasets"

Loading dataset

# Read the preprocessed CSV from the datasets directory, then preview the
# first five rows transposed so that each column appears on its own line
dataset = pd.read_csv(f"{data_path}/preprocessed_heart.csv")
dataset.head().transpose()

	0	1	2	3	4
age	63.0	37.0	41.0	56.0	57.0
sex	1.0	1.0	0.0	1.0	0.0
chest_pain	3.0	2.0	1.0	1.0	0.0
rest_bp	145.0	130.0	130.0	120.0	120.0
chol	233.0	250.0	204.0	236.0	354.0
fast_bld_sugar	1.0	0.0	0.0	0.0	0.0
rest_ecg	0.0	1.0	0.0	1.0	1.0
max_hr	150.0	187.0	172.0	178.0	163.0
ex_angina	0.0	0.0	0.0	0.0	1.0
st_depr	2.3	3.5	1.4	0.8	0.6
slope	0.0	0.0	2.0	2.0	2.0
colored_vessels	0.0	0.0	0.0	0.0	0.0
thalassemia	1.0	2.0	2.0	2.0	2.0
target	1.0	1.0	1.0	1.0	1.0

Plotting distributions and relationships

Plotting the Distributions and Relationships Between Specific Features

# Theme for the count plots: pastel palette, a 16x10 figure and enlarged
# title / axis-label / tick-label fonts
theme_rc = {
    "figure.figsize": (16, 10),
    "axes.titlesize": 18,
    "axes.labelsize": 16,
    "xtick.labelsize": 14,
    "ytick.labelsize": 16,
}
sns.set(palette="pastel", rc=theme_rc)

# One bar per distinct age value
g = sns.countplot(data=dataset, x="age")
g.set_title("Distribution of Age")
g.set_xlabel("Age")

Text(0.5, 0, 'Age')

../../_images/a3be31c411d2d6f0bbda500d2748832872df0f6c9728124dc7d29e1689877e99.png

# Absolute counts, a blank line, then relative frequencies of `target`
print(
    dataset["target"].value_counts(),
    dataset["target"].value_counts(normalize=True),
    sep="\n\n",
)

1    165
0    138
Name: target, dtype: int64

1    0.544554
0    0.455446
Name: target, dtype: float64

# Count of rows per target value; tick labels are applied in axis order
# (presumably 0 = absent, 1 = present — confirm against the preprocessing)
a = sns.countplot(data=dataset, x="target")
a.set_xticklabels(["Absent", "Present"])
a.set_xlabel("Presence of Heart Disease")
a.set_title("Distribution of Presence of Heart Disease")
plt.show()

../../_images/ef72a52ed0596d708143083ca1409f6ca7554d01e879a8f84d9c07f909ec6766.png

# Absolute counts, a blank line, then relative frequencies of `sex`
print(
    dataset["sex"].value_counts(),
    dataset["sex"].value_counts(normalize=True),
    sep="\n\n",
)

1    207
0     96
Name: sex, dtype: int64

1    0.683168
0    0.316832
Name: sex, dtype: float64

# Target counts split by sex. Legend entries are assigned positionally to
# the hue levels (assumes 0 = female, 1 = male — TODO confirm encoding).
b = sns.countplot(data=dataset, x="target", hue="sex")
b.set_title("Distribution of Presence of Heart Disease by Sex")
b.set_xticklabels(["Absent", "Present"])
b.legend(["Female", "Male"])
plt.show()

../../_images/5129625d2c5bb073db133071062dc38416be7cacd11c9556d07f8ab4cce27333.png

Plotting Distributions and Relationships between Columns with Respect to the Target Column

# Absolute counts, a blank line, then relative frequencies of `chest_pain`
print(
    dataset["chest_pain"].value_counts(),
    dataset["chest_pain"].value_counts(normalize=True),
    sep="\n\n",
)

0    143
2     87
1     50
3     23
Name: chest_pain, dtype: int64

0    0.471947
2    0.287129
1    0.165017
3    0.075908
Name: chest_pain, dtype: float64

# Chest-pain counts split by target. Tick labels are applied in axis order
# (assumes codes 0-3 map to these four types — TODO confirm encoding);
# legend entries are assigned positionally to the target hue levels.
c = sns.countplot(data=dataset, x="chest_pain", hue="target")
c.set_xticklabels(
    ["Typical Angina", "Atypical Angina", "Non-anginal Pain", "Asymptomatic"]
)
c.set_title("Distribution of Presence of Heart Disease by Chest Pain Type")
c.legend(["Absent", "Present"])
plt.show()

../../_images/6d4fcdfda3de1535f364ca1580d31a3a3e366ba4a10ece5220d384518c2d6018.png

# Absolute counts, a blank line, then relative frequencies of
# `colored_vessels`
print(
    dataset["colored_vessels"].value_counts(),
    dataset["colored_vessels"].value_counts(normalize=True),
    sep="\n\n",
)

0    175
1     65
2     38
3     20
4      5
Name: colored_vessels, dtype: int64

0    0.577558
1    0.214521
2    0.125413
3    0.066007
4    0.016502
Name: colored_vessels, dtype: float64

# Counts per `colored_vessels` value split by target; legend entries are
# assigned positionally to the target hue levels
d = sns.countplot(data=dataset, x="colored_vessels", hue="target")
d.set_title(
    "Distribution of Presence of Heart Disease by Number of Major "
    "Vessels Coloured by Fluoroscopy"
)
d.legend(["Absent", "Present"])
plt.show()

../../_images/10e25c5c4649a8d268b10255c4c9bb89599afa90abee12293e98a1f548a3ed50.png

# Absolute counts, a blank line, then relative frequencies of `slope`
print(
    dataset["slope"].value_counts(),
    dataset["slope"].value_counts(normalize=True),
    sep="\n\n",
)

2    142
1    140
0     21
Name: slope, dtype: int64

2    0.468647
1    0.462046
0    0.069307
Name: slope, dtype: float64

# Slope counts split by target. Tick labels are applied in axis order
# (assumes codes 0-2 map to these three slopes — TODO confirm encoding);
# legend entries are assigned positionally to the target hue levels.
f = sns.countplot(data=dataset, x="slope", hue="target")
f.set_xticklabels(["Upsloping", "Flat", "Downsloping"])
f.set_title("Distribution of Presence of Heart Disease by Slope")
f.legend(["Absent", "Present"])
plt.show()

../../_images/826dc95f7a12d1494ffddf0e73dcb328272205455e9903bfc5f3776b73ace836.png

Plotting the Relationship between the Presence of Heart Disease and Maximum Recorded Heart Rate

# Switch the theme for the scatter plots: white grid, colour-blind-safe
# palette, a 12x8 figure and uniform 16pt axis/tick fonts
theme_rc = {
    "figure.figsize": (12, 8),
    "axes.titlesize": 18,
    "axes.labelsize": 16,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
}
sns.set(style="whitegrid", palette="colorblind", rc=theme_rc)

# Age against maximum heart rate; `target` drives both marker colour and
# marker shape
f = sns.scatterplot(
    data=dataset, x="age", y="max_hr", hue="target", style="target"
)
f.set_title("Presence of Heart Disease based on Age and Maximum Heart Rate")
f.set_xlabel("Age")
f.set_ylabel("Maximum Heart Rate")

Text(0, 0.5, 'Maximum Heart Rate')

../../_images/3f82e0d37e6ae5b9a0956c093be28a7aa270edd594218706f1632e04d2fc7bb9.png

# Bucket ages into 5-year, right-closed intervals: (25, 30], ..., (75, 80].
# Ages outside that range get no category (NaN) and are left out of the
# counts below.
dataset["age_category"] = pd.cut(dataset.age, bins=np.arange(25, 85, 5))

# Side-by-side bar charts of the number of rows per age group, one panel per
# target value. `observed=False` is passed explicitly so that empty age
# groups are always kept and both panels share the same x categories,
# independent of the pandas version's default for categorical groupers.
plt.subplot(121)
dataset[dataset.target == 1].groupby("age_category", observed=False)[
    "age"
].count().plot(kind="bar")
plt.title("Present")
plt.xlabel("Age Group")
plt.ylabel("Count")

plt.subplot(122)
dataset[dataset.target == 0].groupby("age_category", observed=False)[
    "age"
].count().plot(kind="bar")
plt.title("Absent")
plt.xlabel("Age Group")
plt.ylabel("Count")

Text(0, 0.5, 'Count')

../../_images/3ec70dbd48a6ac7c7c055d867da6b6a70e3365e7e2c138b3c5ff95f8ea2baf52.png

Plotting the Relationship between the Presence of Heart Disease and the Cholesterol Column

# Age against cholesterol; `target` drives both marker colour and marker
# shape
g = sns.scatterplot(
    data=dataset, x="age", y="chol", hue="target", style="target"
)
g.set_title("Presence of Heart Disease based on Age and Cholesterol")
g.set_xlabel("Age")
g.set_ylabel("Cholesterol")

Text(0, 0.5, 'Cholesterol')

../../_images/8c4ff63b6b7415e61de7b0e652e07a1759897b442891e2c98d6fcc0f984d9dda.png

# Bucket cholesterol into 20-unit, right-closed intervals:
# (120, 140], ..., (340, 360]. Values outside that range get no category
# (NaN) and are left out of the counts below.
# (The original cell performed this identical assignment twice.)
dataset["chol_cat"] = pd.cut(dataset.chol, bins=np.arange(120, 380, 20))

# Side-by-side bar charts of the number of rows per cholesterol group, one
# panel per target value. `observed=False` is passed explicitly so that
# empty groups are always kept and both panels share the same x categories,
# independent of the pandas version's default for categorical groupers.
plt.subplot(121)
dataset[dataset.target == 1].groupby("chol_cat", observed=False)[
    "chol"
].count().plot(kind="bar")
plt.title("Present")
plt.xlabel("Cholesterol Group")
plt.ylabel("Count")

plt.subplot(122)
dataset[dataset.target == 0].groupby("chol_cat", observed=False)[
    "chol"
].count().plot(kind="bar")
plt.title("Absent")
plt.xlabel("Cholesterol Group")
plt.ylabel("Count")

Text(0, 0.5, 'Count')

../../_images/c6dd8878b74451eb2c26ac17cf2c560fd6b7e6d859b70f99e006bcb36714e59b.png