Data analysis

Importing libraries and packages

 1# Mathematical operations and data manipulation
 2import numpy as np
 3import pandas as pd
 4
 5# Plotting
 6import seaborn as sns
 7import matplotlib.pyplot as plt
 8
 9# Warnings
10import warnings
11
12warnings.filterwarnings("ignore")
13
14%matplotlib inline

Set paths

# Directory the input datasets are read from
data_path = "./datasets"
# Directory results are saved to
assets_path = "./assets"

Loading dataset

# Read the cleaned air-quality CSV from the datasets directory
dataset = pd.read_csv(f"{data_path}/cleaned_airquality.csv")
# Preview the first five rows, transposed so each column name becomes a row
dataset.head().T
0 1 2 3 4
year 2013 2013 2013 2013 2013
month 3 3 3 3 3
day 1 1 1 1 1
hour 0 1 2 3 4
PM25 4.0 8.0 7.0 6.0 3.0
PM10 4.0 8.0 7.0 6.0 3.0
SO2 4.0 4.0 5.0 11.0 12.0
NO2 7.0 7.0 10.0 11.0 12.0
CO 300.0 300.0 300.0 300.0 300.0
O3 77.0 77.0 73.0 72.0 72.0
TEMP -0.7 -1.1 -1.1 -1.4 -2.0
PRES 1023.0 1023.2 1023.5 1024.5 1025.2
DEWP -18.8 -18.2 -18.2 -19.4 -19.5
RAIN 0.0 0.0 0.0 0.0 0.0
wd NNW N NNW NW N
WSPM 4.4 4.7 5.6 3.1 2.0
station Aotizhongxin Aotizhongxin Aotizhongxin Aotizhongxin Aotizhongxin

Data analysis

The Concentration of PM25 and PM10 per Year

# Median PM25 per year, newest year first.
# Only the numeric columns needed are selected: taking the median of the
# string column "station" raises a TypeError on pandas >= 2.0, and older
# versions silently dropped it from the result anyway.
year_pm25 = (
    dataset[["PM25", "year"]]
    .groupby("year")
    .median()
    .reset_index()
    .sort_values(by="year", ascending=False)
)

year_pm25
year PM25
4 2017 48.0
3 2016 50.0
2 2015 54.0
1 2014 60.0
0 2013 58.0
# Point plot of the yearly median PM25 values computed in year_pm25
sns.pointplot(x="year", y="PM25", data=year_pm25)
<AxesSubplot:xlabel='year', ylabel='PM25'>
../../_images/c7ff9deee5028c6f9f536a43749d5668ce93c0f8d1005bc6802b6c2f7a9f6330.png

Checking for Correlations between Features

# Pairwise correlation of the numeric features only; string columns such as
# "wd" and "station" make DataFrame.corr() raise on pandas >= 2.0.
corr = dataset.select_dtypes(include="number").corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# matrix is symmetric. The builtin bool replaces the np.bool alias, which was
# removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, annot=True, fmt=".2f", mask=mask)

# Heatmap cell i spans [i, i + 1], so labels go at the cell centres (i + 0.5)
# rather than on the cell edges.
tick_positions = np.arange(len(corr.columns)) + 0.5
plt.xticks(tick_positions, corr.columns)
plt.yticks(tick_positions, corr.columns)
plt.show()
../../_images/47a181458db163a0bd39aae83e575f93de6c3143a2c292d2e541c4fb74135a6c.png