Exploratory data analysis#

Importing libraries and packages#

 1# Mathematical operations and data manipulation
 2import numpy as np
 3import pandas as pd
 4
 5# Visualisation
 6import seaborn as sns
 7import matplotlib.pyplot as plt
 8
 9# Warnings
10import warnings
11
12warnings.filterwarnings("ignore")
13
14%matplotlib inline

Set paths#

# Directory the input CSV files are read from.
data_path = "./datasets"
# Directory intended for saved results.
assets_path = "./assets"

Loading dataset#

# Read the preprocessed heart-disease table from the datasets directory.
csv_file = f"{data_path}/preprocessed_heart.csv"
dataset = pd.read_csv(csv_file)
# Display the first five rows transposed, so every feature becomes a row.
dataset.head().transpose()
0 1 2 3 4
age 63.0 37.0 41.0 56.0 57.0
sex 1.0 1.0 0.0 1.0 0.0
chest_pain 3.0 2.0 1.0 1.0 0.0
rest_bp 145.0 130.0 130.0 120.0 120.0
chol 233.0 250.0 204.0 236.0 354.0
fast_bld_sugar 1.0 0.0 0.0 0.0 0.0
rest_ecg 0.0 1.0 0.0 1.0 1.0
max_hr 150.0 187.0 172.0 178.0 163.0
ex_angina 0.0 0.0 0.0 0.0 1.0
st_depr 2.3 3.5 1.4 0.8 0.6
slope 0.0 0.0 2.0 2.0 2.0
colored_vessels 0.0 0.0 0.0 0.0 0.0
thalassemia 1.0 2.0 2.0 2.0 2.0
target 1.0 1.0 1.0 1.0 1.0

Plotting distributions and relationships#

Plotting the Distributions and Relationships Between Specific Features#

 1sns.set(
 2    palette="pastel",
 3    rc={
 4        "figure.figsize": (16, 10),
 5        "axes.titlesize": 18,
 6        "axes.labelsize": 16,
 7        "xtick.labelsize": 14,
 8        "ytick.labelsize": 16,
 9    },
10)
# One bar per distinct age value in the dataset.
g = sns.countplot(data=dataset, x="age")
g.set_title("Distribution of Age")
# Kept as the last expression so the cell still echoes the label's Text object.
g.set_xlabel("Age")
Text(0.5, 0, 'Age')
../../_images/a3be31c411d2d6f0bbda500d2748832872df0f6c9728124dc7d29e1689877e99.png
# Absolute and relative frequencies of the target column, separated by a
# blank line.
target_col = dataset["target"]
print(
    target_col.value_counts(),
    target_col.value_counts(normalize=True),
    sep="\n\n",
)
1    165
0    138
Name: target, dtype: int64

1    0.544554
0    0.455446
Name: target, dtype: float64
# Bar chart of the two target classes; the tick labels replace the raw
# 0/1 codes (0 -> "Absent", 1 -> "Present").
a = sns.countplot(data=dataset, x="target")
a.set_title("Distribution of Presence of Heart Disease")
a.set_xticklabels(["Absent", "Present"])
a.set_xlabel("Presence of Heart Disease")
plt.show()
../../_images/ef72a52ed0596d708143083ca1409f6ca7554d01e879a8f84d9c07f909ec6766.png
# Absolute and relative frequencies of the sex column, separated by a
# blank line.
sex_col = dataset["sex"]
print(
    sex_col.value_counts(),
    sex_col.value_counts(normalize=True),
    sep="\n\n",
)
1    207
0     96
Name: sex, dtype: int64

1    0.683168
0    0.316832
Name: sex, dtype: float64
# Target classes split by sex; the legend entries are assigned by position
# (first hue level -> "Female", second -> "Male").
b = sns.countplot(data=dataset, x="target", hue="sex")
b.legend(["Female", "Male"])
b.set_title("Distribution of Presence of Heart Disease by Sex")
b.set_xticklabels(["Absent", "Present"])
plt.show()
../../_images/5129625d2c5bb073db133071062dc38416be7cacd11c9556d07f8ab4cce27333.png

Plotting Distributions and Relationships between Columns with Respect to the Target Column#

# Absolute and relative frequencies of the chest-pain codes, separated by a
# blank line.
chest_pain_col = dataset["chest_pain"]
print(
    chest_pain_col.value_counts(),
    chest_pain_col.value_counts(normalize=True),
    sep="\n\n",
)
0    143
2     87
1     50
3     23
Name: chest_pain, dtype: int64

0    0.471947
2    0.287129
1    0.165017
3    0.075908
Name: chest_pain, dtype: float64
# Chest-pain codes split by target class. Legend entries and tick labels are
# both assigned by position.
# NOTE(review): the code -> name mapping below is taken as given; confirm it
# against the dataset's documentation.
chest_pain_names = [
    "Typical Angina",
    "Atypical Angina",
    "Non-anginal Pain",
    "Asymptomatic",
]
c = sns.countplot(data=dataset, x="chest_pain", hue="target")
c.legend(["Absent", "Present"])
c.set_title("Distribution of Presence of Heart Disease by Chest Pain Type")
c.set_xticklabels(chest_pain_names)
plt.show()
../../_images/6d4fcdfda3de1535f364ca1580d31a3a3e366ba4a10ece5220d384518c2d6018.png
# Absolute and relative frequencies of the colored_vessels column, separated
# by a blank line.
vessels_col = dataset["colored_vessels"]
print(
    vessels_col.value_counts(),
    vessels_col.value_counts(normalize=True),
    sep="\n\n",
)
0    175
1     65
2     38
3     20
4      5
Name: colored_vessels, dtype: int64

0    0.577558
1    0.214521
2    0.125413
3    0.066007
4    0.016502
Name: colored_vessels, dtype: float64
# colored_vessels values split by target class; legend entries are assigned
# by position.
vessels_title = (
    "Distribution of Presence of Heart Disease by Number of Major "
    "Vessels Coloured by Fluoroscopy"
)
d = sns.countplot(data=dataset, x="colored_vessels", hue="target")
d.legend(["Absent", "Present"])
d.set_title(vessels_title)
plt.show()
../../_images/10e25c5c4649a8d268b10255c4c9bb89599afa90abee12293e98a1f548a3ed50.png
# Absolute and relative frequencies of the slope codes, separated by a
# blank line.
slope_col = dataset["slope"]
print(
    slope_col.value_counts(),
    slope_col.value_counts(normalize=True),
    sep="\n\n",
)
2    142
1    140
0     21
Name: slope, dtype: int64

2    0.468647
1    0.462046
0    0.069307
Name: slope, dtype: float64
# Slope codes split by target class. Legend entries and tick labels are both
# assigned by position.
# NOTE(review): the code -> name mapping below is taken as given; confirm it
# against the dataset's documentation.
f = sns.countplot(data=dataset, x="slope", hue="target")
f.legend(["Absent", "Present"])
f.set_title("Distribution of Presence of Heart Disease by Slope")
f.set_xticklabels(["Upsloping", "Flat", "Downsloping"])
plt.show()
../../_images/826dc95f7a12d1494ffddf0e73dcb328272205455e9903bfc5f3776b73ace836.png

Plotting the Relationship between the Presence of Heart Disease and Maximum Recorded Heart Rate#

 1# Plotting the Relationship between the Presence of Heart Disease and
 2# Maximum Recorded Heart Rate
 3sns.set(
 4    style="whitegrid",
 5    palette="colorblind",
 6    rc={
 7        "figure.figsize": (12, 8),
 8        "axes.titlesize": 18,
 9        "axes.labelsize": 16,
10        "xtick.labelsize": 16,
11        "ytick.labelsize": 16,
12    },
13)
# Age against maximum heart rate; the target class drives both the colour
# and the marker style of each point.
f = sns.scatterplot(
    data=dataset,
    x="age",
    y="max_hr",
    hue="target",
    style="target",
)
f.set_title("Presence of Heart Disease based on Age and Maximum Heart Rate")
f.set_xlabel("Age")
# Kept as the last expression so the cell still echoes the label's Text object.
f.set_ylabel("Maximum Heart Rate")
Text(0, 0.5, 'Maximum Heart Rate')
../../_images/3f82e0d37e6ae5b9a0956c093be28a7aa270edd594218706f1632e04d2fc7bb9.png
 1dataset["age_category"] = pd.cut(dataset.age, bins=list(np.arange(25, 85, 5)))
 2
 3plt.subplot(121)
 4dataset[dataset.target == 1].groupby("age_category")["age"].count().plot(
 5    kind="bar"
 6)
 7plt.title("Present")
 8plt.xlabel("Age Group")
 9plt.ylabel("Count")
10plt.subplot(122)
11dataset[dataset.target == 0].groupby("age_category")["age"].count().plot(
12    kind="bar"
13)
14plt.title("Absent")
15plt.xlabel("Age Group")
16plt.ylabel("Count")
Text(0, 0.5, 'Count')
../../_images/3ec70dbd48a6ac7c7c055d867da6b6a70e3365e7e2c138b3c5ff95f8ea2baf52.png

Plotting the Relationship between the Presence of Heart Disease and the Cholesterol Column#

# Age against cholesterol; the target class drives both the colour and the
# marker style of each point.
g = sns.scatterplot(
    data=dataset,
    x="age",
    y="chol",
    hue="target",
    style="target",
)
g.set_title("Presence of Heart Disease based on Age and Cholesterol")
g.set_xlabel("Age")
# Kept as the last expression so the cell still echoes the label's Text object.
g.set_ylabel("Cholesterol")
Text(0, 0.5, 'Cholesterol')
../../_images/8c4ff63b6b7415e61de7b0e652e07a1759897b442891e2c98d6fcc0f984d9dda.png
 1dataset["chol_cat"] = pd.cut(dataset.chol, bins=list(np.arange(120, 380, 20)))
 2dataset["chol_cat"] = pd.cut(dataset.chol, bins=list(np.arange(120, 380, 20)))
 3
 4plt.subplot(121)
 5dataset[dataset.target == 1].groupby("chol_cat")["chol"].count().plot(
 6    kind="bar"
 7)
 8plt.title("Present")
 9plt.xlabel("Cholesterol Group")
10plt.ylabel("Count")
11
12plt.subplot(122)
13dataset[dataset.target == 0].groupby("chol_cat")["chol"].count().plot(
14    kind="bar"
15)
16plt.title("Absent")
17plt.xlabel("Cholesterol Group")
18plt.ylabel("Count")
Text(0, 0.5, 'Count')
../../_images/c6dd8878b74451eb2c26ac17cf2c560fd6b7e6d859b70f99e006bcb36714e59b.png