Impact of education#

Get insights on whether the education of people in New York has an influence on their annual salary and weekly working hours.

Importing libraries and packages#

 1# Warnings
 2import warnings
 3
 4# Mathematical operations and data manipulation
 5import pandas as pd
 6import numpy as np
 7
 8# Plotting
 9import matplotlib.pyplot as plt
10import seaborn as sns
11import squarify
12
13sns.set()
14warnings.filterwarnings("ignore")

Set paths#

# Directories used by the notebook: datasets are read from the first,
# results are saved to the second (assets)
data_path, assets_path = "./datasets", "./assets"

Loading dataset#

The dataset was created from the American Community Survey (ACS) Public Use Microdata Sample (PUMS) data (one-year estimate for 2017).

# Read the CSV file from the datasets directory into a DataFrame
csv_file = f"{data_path}/age_salary_hours.csv"
dataset = pd.read_csv(csv_file)

Exploring dataset#

# Report the dimensions of the dataset as (rows, columns)
dataset_shape = dataset.shape
print("Shape of the dataset: ", dataset_shape)
# Keep the frame as the last expression so the notebook renders it
dataset
Shape of the dataset:  (500, 4)
Age Annual Salary Weekly hours Education
0 72 160000.0 40.0 Bachelor's degree or higher
1 72 100000.0 50.0 Bachelor's degree or higher
2 31 120000.0 40.0 Bachelor's degree or higher
3 28 45000.0 40.0 Bachelor's degree or higher
4 54 85000.0 40.0 Bachelor's degree or higher
... ... ... ... ...
495 27 47000.0 40.0 Bachelor's degree or higher
496 53 132000.0 70.0 Bachelor's degree or higher
497 51 10100.0 20.0 Bachelor's degree or higher
498 32 57000.0 35.0 Bachelor's degree or higher
499 18 18700.0 20.0 Attended college, no degree

500 rows × 4 columns

Preprocessing#

# Collect the distinct education levels that occur in the dataset
education_column = dataset["Education"]
degrees = set(education_column)
print(degrees)
{'No diploma', "Associate's degree", "Bachelor's degree or higher", 'High school diploma', 'Attended college, no degree'}
# Number of rows per education level, in the iteration order of `degrees`
counts = np.array(
    [(dataset["Education"] == degree).sum() for degree in degrees]
)
# Turn the raw counts into each level's share of all rows, in percent
percentages = (counts / counts.sum()) * 100
print(percentages)
[ 3.4 10.  47.4 21.4 17.8]
# Tree map labels: the education level, then its percentage share
# (one decimal place) on a second line
share_suffix = "\n({0:.1f}%)"
labels = []
for degree, percentage in zip(degrees, percentages):
    labels.append(degree + share_suffix.format(percentage))
print(labels)
['No diploma\n(3.4%)', "Associate's degree\n(10.0%)", "Bachelor's degree or higher\n(47.4%)", 'High school diploma\n(21.4%)', 'Attended college, no degree\n(17.8%)']
# Sort alphabetically first so the positions picked below are deterministic
alphabetical = sorted(degrees)
# Pick the levels in the order they are passed to the violin plots
# (`order=ordered_degrees`)
ordered_degrees = [alphabetical[position] for position in (4, 3, 1, 0, 2)]
ordered_degrees
['No diploma',
 'High school diploma',
 'Attended college, no degree',
 "Associate's degree",
 "Bachelor's degree or higher"]
# Keep only the rows whose Age is below 65
is_under_65 = dataset["Age"] < 65
data = dataset.loc[is_under_65]

Visualisations#

 1# Create figure
 2plt.figure(figsize=(9, 6), dpi=200)
 3squarify.plot(
 4    percentages,
 5    label=labels,
 6    color=sns.color_palette("colorblind", len(degrees)),
 7)
 8plt.axis("off")
 9# Add title
10plt.title("Degrees")
11# Show plot
12plt.show()
../../_images/6eb5d2e8b7155627ea741338c24db599e26e78e9e0649c662e4efaffad4c0aaf.png
 1# Visualizing two violin plots for the annual salary and weekly working hours
 2# Set color palette to colorblind
 3sns.set_palette("colorblind")
 4# Create subplot with two rows
 5fig, ax = plt.subplots(2, 1, dpi=200, figsize=(8, 8))
 6sns.violinplot(
 7    "Education",
 8    "Annual Salary",
 9    data=data,
10    cut=0,
11    order=ordered_degrees,
12    ax=ax[0],
13)
14ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=10)
15sns.violinplot(
16    "Education",
17    "Weekly hours",
18    data=data,
19    cut=0,
20    order=ordered_degrees,
21    ax=ax[1],
22)
23ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=10)
24plt.tight_layout()
25# Add title
26fig.suptitle(
27    "Impact of Education on Annual Salary and Weekly Working Hours in New York"
28)
29# Show figure
30plt.show()
../../_images/7f535afda5e5f7408ea2719946362560fb8841fca80163423a5e7943a79929b6.png