Impact of education#

Investigate whether the education level of people in New York influences their annual salary and weekly working hours.

Importing libraries and packages#

# Warnings
import warnings

# Mathematical operations and data manipulation
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import squarify

# Apply seaborn's default theme to the plots created below
sns.set()
# Suppress all Python warnings from this point on
# NOTE(review): this also hides deprecation warnings from the plotting
# libraries — confirm that silencing everything is intended
warnings.filterwarnings("ignore")

Set paths#

# Directory that results are saved to
assets_path = "./assets"
# Directory that the input datasets are read from
data_path = "./datasets"

Loading dataset#

The dataset was derived from the American Community Survey (ACS) Public Use Microdata Sample (PUMS), using the one-year estimates from 2017.

# Build the CSV location from the datasets directory, then load it
csv_file = f"{data_path}/age_salary_hours.csv"
dataset = pd.read_csv(csv_file)

Exploring dataset#

# Report the (rows, columns) dimensions of the loaded frame
n_rows_cols = dataset.shape
print("Shape of the dataset: ", n_rows_cols)
# Bare expression as the last line so the notebook renders the table
dataset

Shape of the dataset:  (500, 4)

	Age	Annual Salary	Weekly hours	Education
0	72	160000.0	40.0	Bachelor's degree or higher
1	72	100000.0	50.0	Bachelor's degree or higher
2	31	120000.0	40.0	Bachelor's degree or higher
3	28	45000.0	40.0	Bachelor's degree or higher
4	54	85000.0	40.0	Bachelor's degree or higher
...	...	...	...	...
495	27	47000.0	40.0	Bachelor's degree or higher
496	53	132000.0	70.0	Bachelor's degree or higher
497	51	10100.0	20.0	Bachelor's degree or higher
498	32	57000.0	35.0	Bachelor's degree or higher
499	18	18700.0	20.0	Attended college, no degree

500 rows × 4 columns

Preprocessing#

# Collect the distinct education levels that occur in the dataset
education_levels = dataset["Education"]
degrees = {*education_levels}
print(degrees)

{'No diploma', "Associate's degree", "Bachelor's degree or higher", 'High school diploma', 'Attended college, no degree'}

# Count the rows per education level, iterating `degrees` so the result
# lines up element-for-element with any later iteration over the same set
education_column = dataset["Education"]
level_counts = np.array(
    [(education_column == level).sum() for level in degrees]
)
# Convert the absolute counts into shares of the whole dataset, in percent
percentages = (level_counts / level_counts.sum()) * 100
print(percentages)

[ 3.4 10.  47.4 21.4 17.8]

# Tree map labels: the education level, with its share (one decimal place)
# in parentheses on a second line
share_suffix = "\n({0:.1f}%)"
labels = [
    level + share_suffix.format(share)
    for level, share in zip(degrees, percentages)
]
print(labels)

['No diploma\n(3.4%)', "Associate's degree\n(10.0%)", "Bachelor's degree or higher\n(47.4%)", 'High school diploma\n(21.4%)', 'Attended college, no degree\n(17.8%)']

# Education levels ranked from lowest to highest attainment. The ranking is
# spelled out explicitly instead of being derived from index positions in the
# alphabetically sorted category names, which was unreadable and would break
# (IndexError or a silently wrong order) as soon as the set of categories in
# the dataset changed.
education_ranking = [
    "No diploma",
    "High school diploma",
    "Attended college, no degree",
    "Associate's degree",
    "Bachelor's degree or higher",
]
# Keep only the levels that actually occur in the data, in ranked order
ordered_degrees = [level for level in education_ranking if level in degrees]
# Levels missing from the ranking are appended (alphabetically) rather than
# dropped, so they still show up in plots that use this order
ordered_degrees += sorted(set(degrees) - set(education_ranking))
ordered_degrees

['No diploma',
 'High school diploma',
 'Attended college, no degree',
 "Associate's degree",
 "Bachelor's degree or higher"]

# Keep only respondents younger than 65
# NOTE(review): presumably restricts the analysis to the working-age
# population — confirm the intended cut-off
under_65 = dataset["Age"] < 65
data = dataset[under_65]

Visualisations#

# Tree map of the share of each education level
treemap_fig, treemap_ax = plt.subplots(figsize=(9, 6), dpi=200)
# One colorblind-friendly color per education level
tile_colors = sns.color_palette("colorblind", len(degrees))
squarify.plot(
    sizes=percentages,
    label=labels,
    color=tile_colors,
    ax=treemap_ax,
)
# Hide the axes: the tile positions carry no meaning
treemap_ax.axis("off")
treemap_ax.set_title("Degrees")
# Render the figure
plt.show()

../../_images/6eb5d2e8b7155627ea741338c24db599e26e78e9e0649c662e4efaffad4c0aaf.png

# Two stacked violin plots: annual salary (top) and weekly working hours
# (bottom), each split by education level
# Set color palette to colorblind
sns.set_palette("colorblind")
# Create subplot with two rows
fig, ax = plt.subplots(2, 1, dpi=200, figsize=(8, 8))
for panel, column in zip(ax, ("Annual Salary", "Weekly hours")):
    # x and y are passed by keyword: seaborn 0.12+ accepts only `data`
    # positionally, so the previous positional form raises a TypeError there
    sns.violinplot(
        x="Education",
        y=column,
        data=data,
        cut=0,  # do not extend the density past the observed min/max
        order=ordered_degrees,
        ax=panel,
    )
    # Tilt the long category names slightly so they do not overlap
    panel.tick_params(axis="x", labelrotation=10)
# Add the title before computing the layout so that tight_layout can
# reserve room for it instead of letting it overlap the top plot
fig.suptitle(
    "Impact of Education on Annual Salary and Weekly Working Hours in New York"
)
fig.tight_layout()
# Show figure
plt.show()

../../_images/7f535afda5e5f7408ea2719946362560fb8841fca80163423a5e7943a79929b6.png