Outliers and missing values

Importing libraries and packages

 1# Mathematical operations and data manipulation
 2import pandas as pd
 3
 4# Plotting
 5import seaborn as sns
 6
 7# Warnings
 8import warnings
 9
10warnings.filterwarnings("ignore")
11
12%matplotlib inline

Set paths

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset

1# load data
2dataset = pd.read_csv(f"{data_path}/preprocessed_airquality.csv")
3dataset.head().T
0 1 2 3 4
year 2013 2013 2013 2013 2013
month 3 3 3 3 3
day 1 1 1 1 1
hour 0 1 2 3 4
PM25 4.0 8.0 7.0 6.0 3.0
PM10 4.0 8.0 7.0 6.0 3.0
SO2 4.0 4.0 5.0 11.0 12.0
NO2 7.0 7.0 10.0 11.0 12.0
CO 300.0 300.0 300.0 300.0 300.0
O3 77.0 77.0 73.0 72.0 72.0
TEMP -0.7 -1.1 -1.1 -1.4 -2.0
PRES 1023.0 1023.2 1023.5 1024.5 1025.2
DEWP -18.8 -18.2 -18.2 -19.4 -19.5
RAIN 0.0 0.0 0.0 0.0 0.0
wd NNW N NNW NW N
WSPM 4.4 4.7 5.6 3.1 2.0
station Aotizhongxin Aotizhongxin Aotizhongxin Aotizhongxin Aotizhongxin

Outliers

1pm_25 = sns.boxplot(dataset["PM25"])
../../_images/f21e5795ec817b3ca58de5465cad5dd9842e29f97814e3351652e8a2350d75ea.png
1(dataset["PM25"] >= 250).sum()
18668
1pm25 = dataset.loc[dataset["PM25"] >= 250]
2pm25.head()
year month day hour PM25 PM10 SO2 NO2 CO O3 TEMP PRES DEWP RAIN wd WSPM station
120 2013 3 6 0 284.0 315.0 133.0 174.0 4000.0 28.0 4.9 1008.5 -6.4 0.0 NE 1.2 Aotizhongxin
121 2013 3 6 1 272.0 300.0 131.0 166.0 4000.0 22.0 4.8 1008.3 -5.9 0.0 NE 1.5 Aotizhongxin
140 2013 3 6 20 254.0 396.0 107.0 154.0 4200.0 17.0 4.4 1008.2 -2.5 0.0 SSW 1.1 Aotizhongxin
141 2013 3 6 21 266.0 380.0 117.0 159.0 3799.0 17.0 3.8 1007.7 -2.2 0.0 WSW 1.9 Aotizhongxin
142 2013 3 6 22 254.0 335.0 111.0 148.0 4099.0 17.0 2.4 1007.3 -2.5 0.0 WSW 1.2 Aotizhongxin
1pm_10 = sns.boxplot(dataset.PM10)
../../_images/644eeb68953970c78e4fa6a1efcfed9ebb19796233e747776cacea70016316a6.png
1pm10 = dataset.loc[dataset["PM10"] >= 320]
2pm10.head()
year month day hour PM25 PM10 SO2 NO2 CO O3 TEMP PRES DEWP RAIN wd WSPM station
139 2013 3 6 19 242.0 338.0 124.0 164.0 3700.0 17.0 5.1 1007.6 -2.5 0.0 WSW 1.2 Aotizhongxin
140 2013 3 6 20 254.0 396.0 107.0 154.0 4200.0 17.0 4.4 1008.2 -2.5 0.0 SSW 1.1 Aotizhongxin
141 2013 3 6 21 266.0 380.0 117.0 159.0 3799.0 17.0 3.8 1007.7 -2.2 0.0 WSW 1.9 Aotizhongxin
142 2013 3 6 22 254.0 335.0 111.0 148.0 4099.0 17.0 2.4 1007.3 -2.5 0.0 WSW 1.2 Aotizhongxin
143 2013 3 6 23 260.0 360.0 119.0 145.0 4000.0 17.0 2.3 1007.2 -2.4 0.0 NNE 1.3 Aotizhongxin
1(dataset["PM10"] >= 320).sum()
12865
1dataset.loc[(dataset["PM25"] >= 250) & (dataset["PM10"] >= 320)]
year month day hour PM25 PM10 SO2 NO2 CO O3 TEMP PRES DEWP RAIN wd WSPM station
140 2013 3 6 20 254.0 396.0 107.0 154.0 4200.0 17.0 4.4 1008.2 -2.5 0.0 SSW 1.1 Aotizhongxin
141 2013 3 6 21 266.0 380.0 117.0 159.0 3799.0 17.0 3.8 1007.7 -2.2 0.0 WSW 1.9 Aotizhongxin
142 2013 3 6 22 254.0 335.0 111.0 148.0 4099.0 17.0 2.4 1007.3 -2.5 0.0 WSW 1.2 Aotizhongxin
143 2013 3 6 23 260.0 360.0 119.0 145.0 4000.0 17.0 2.3 1007.2 -2.4 0.0 NNE 1.3 Aotizhongxin
149 2013 3 7 5 275.0 337.0 46.0 103.0 5599.0 17.0 1.6 1006.7 -1.5 0.0 NNE 1.7 Aotizhongxin
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
420187 2017 2 4 19 409.0 409.0 56.0 159.0 400.0 10.0 7.1 1013.5 -8.1 0.0 WNW 2.4 Wanshouxigong
420188 2017 2 4 20 389.0 389.0 42.0 153.0 400.0 7.0 7.7 1014.1 -10.2 0.0 NW 3.0 Wanshouxigong
420358 2017 2 11 22 259.0 369.0 75.0 108.0 1500.0 76.0 -1.7 1023.6 -13.9 0.0 ENE 0.9 Wanshouxigong
420359 2017 2 11 23 303.0 410.0 92.0 109.0 1700.0 2.0 -1.9 1023.5 -14.0 0.0 N 0.8 Wanshouxigong
420360 2017 2 12 0 350.0 388.0 96.0 106.0 2300.0 2.0 -3.2 1023.6 -12.9 0.0 NE 1.1 Wanshouxigong

10047 rows × 17 columns

1((dataset["PM25"] >= 250) & (dataset["PM10"] >= 320)).sum()
10047
1dataset.describe().T
count mean std min 25% 50% 75% max
year 420768.0 2014.662560 1.177198 2013.0000 2014.00 2015.0 2016.00 2017.0
month 420768.0 6.522930 3.448707 1.0000 4.00 7.0 10.00 12.0
day 420768.0 15.729637 8.800102 1.0000 8.00 16.0 23.00 31.0
hour 420768.0 11.500000 6.922195 0.0000 5.75 11.5 17.25 23.0
PM25 412029.0 79.793428 80.822391 2.0000 20.00 55.0 111.00 999.0
PM10 414319.0 104.602618 91.772426 2.0000 36.00 82.0 145.00 999.0
SO2 411747.0 15.830835 21.650603 0.2856 3.00 7.0 20.00 500.0
NO2 408652.0 50.638586 35.127912 1.0265 23.00 43.0 71.00 290.0
CO 400067.0 1230.766454 1160.182716 100.0000 500.00 900.0 1500.00 10000.0
O3 407491.0 57.372271 56.661607 0.2142 11.00 45.0 82.00 1071.0
TEMP 420370.0 13.538976 11.436139 -19.9000 3.10 14.5 23.30 41.6
PRES 420375.0 1010.746982 10.474055 982.4000 1002.30 1010.4 1019.00 1042.8
DEWP 420365.0 2.490822 13.793847 -43.4000 -8.90 3.1 15.10 29.1
RAIN 420378.0 0.064476 0.821004 0.0000 0.00 0.0 0.00 72.5
WSPM 420450.0 1.729711 1.246386 0.0000 0.90 1.4 2.20 13.2
1so2 = sns.boxplot(dataset.SO2)
../../_images/9807cc6e26b048d4b953909057dd9c346baa95313fe1c03a5ae5655cf6c33bf9.png
1(dataset["SO2"] >= 70).sum()
14571
1(
2    (dataset["PM25"] >= 250)
3    & (dataset["PM10"] >= 320)
4    & (dataset["SO2"] >= 70)
5).sum()
2297
1(dataset["SO2"] >= 300).sum()
9
1no2 = sns.boxplot(dataset.NO2)
../../_images/2e8dfc647e0c4da1dfffd6576516222cb933dd2169af39006f0eae8eb9504a7e.png
1(dataset["NO2"] >= 150).sum()
5502
# Count the rows that exceed all four pollutant thresholds at once.
# The boolean masks are intersected first and summed once at the end:
# summing the NO2 mask inside the chain would AND the row masks with an
# integer count instead of a fourth mask.
(
    (dataset["PM25"] >= 250)
    & (dataset["PM10"] >= 320)
    & (dataset["SO2"] >= 200)
    & (dataset["NO2"] >= 150)
).sum()
0
1co = sns.boxplot(dataset.CO)
../../_images/db63a62ec2985ee1b10dabc6592206044436e462554aeba75420fa53d7562942.png
1(dataset["CO"] >= 3000).sum()
30355
1o3 = sns.boxplot(dataset.O3)
../../_images/d6c4b326919b6ee9d24f3264aa1a2068e02e4c39508ed351d1eeeb6cad3fa2a1.png
1(dataset["O3"] >= 200).sum()
13358
1(dataset["O3"] >= 470).sum()
24
1rain = sns.boxplot(dataset.RAIN)
../../_images/2636dc8d905a8245be0be6279c33c6601d6b0d73d5ba73658c0218533e251d2d.png
 1colors = ["windows blue", "amber", "faded green", "dusty purple"]
 2sns.set(
 3    rc={
 4        "figure.figsize": (18, 8),
 5        # "axes.titlesize" : 18,
 6        "axes.labelsize": 14,
 7        "xtick.labelsize": 14,
 8        "ytick.labelsize": 14,
 9    }
10)
11
12sns.boxplot(x="station", y="PM25", data=dataset.dropna(axis=0).reset_index())
<AxesSubplot:xlabel='station', ylabel='PM25'>
../../_images/2f2677cde5461335643fe738cbc12cc44cebd0ec87ada691ddc78a234eca1973.png
1sns.boxplot(x="station", y="PM10", data=dataset.dropna(axis=0).reset_index())
<AxesSubplot:xlabel='station', ylabel='PM10'>
../../_images/4fec5816bdb1cbaabcebeeb77d8feb50450099c548ea159776ffa1706c3a4ec8.png
1sns.boxplot(x="station", y="SO2", data=dataset.dropna(axis=0).reset_index())
<AxesSubplot:xlabel='station', ylabel='SO2'>
../../_images/cbfd133a3407dc9e949e65d7507b96a43aa6eda0686b4869a5a8924c93881757.png
1sns.boxplot(x="station", y="NO2", data=dataset.dropna(axis=0).reset_index())
<AxesSubplot:xlabel='station', ylabel='NO2'>
../../_images/ca2a3ba5524fc2aa694d76868b0e329bf3f490116aa392104b834b44fe9dc46c.png
1sns.boxplot(x="station", y="CO", data=dataset.dropna(axis=0).reset_index())
<AxesSubplot:xlabel='station', ylabel='CO'>
../../_images/78a5f6c5f1f263c1e80c425bd57d307e6fb2e8cd48bddc912c454ae0aef9b038.png
1sns.boxplot(x="station", y="O3", data=dataset.dropna(axis=0).reset_index())
<AxesSubplot:xlabel='station', ylabel='O3'>
../../_images/ffe228dbb833f4b7f04a01486e282b94482d1bd1549e247b697f08dac51ccd54.png
1sns.boxplot(x="station", y="RAIN", data=dataset.dropna(axis=0).reset_index())
<AxesSubplot:xlabel='station', ylabel='RAIN'>
../../_images/a1e109031aead6aa8e2d49cbd9a6c4255b3a0da5303bf2d7c22115a856d71702.png

Missing values

# Work on an independent copy so the imputation steps below do not also
# modify `dataset`; a plain assignment would only bind a second name to the
# same DataFrame.
new_air = dataset.copy()
1new_air.isnull().sum()
year           0
month          0
day            0
hour           0
PM25        8739
PM10        6449
SO2         9021
NO2        12116
CO         20701
O3         13277
TEMP         398
PRES         393
DEWP         403
RAIN         390
wd          1822
WSPM         318
station        0
dtype: int64
# Impute missing PM25 readings with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the in-place form operates on
# an intermediate object and can leave new_air untouched when pandas
# copy-on-write is active.
new_air["PM25"] = new_air["PM25"].fillna(new_air["PM25"].median())
1new_air.isnull().sum()
year           0
month          0
day            0
hour           0
PM25           0
PM10        6449
SO2         9021
NO2        12116
CO         20701
O3         13277
TEMP         398
PRES         393
DEWP         403
RAIN         390
wd          1822
WSPM         318
station        0
dtype: int64
# Fill the categorical wind-direction column with its most frequent value.
# The value is applied to `wd` only: handing it to DataFrame.fillna as a
# scalar would also write the direction string into every numeric column
# that still has gaps (PM10, SO2, NO2, CO, O3, TEMP, ...).
most_common_wd = new_air["wd"].value_counts().index[0]
new_air = new_air.assign(wd=new_air["wd"].fillna(most_common_wd))

# Remaining numeric gaps get their own column median, mirroring the PM25
# step. fillna with a Series keyed by column name fills each listed column
# with its matching value and leaves other columns alone.
numeric_cols = new_air.select_dtypes(include="number").columns
new_air = new_air.fillna(new_air[numeric_cols].median())
1new_air.isnull().sum()
year       0
month      0
day        0
hour       0
PM25       0
PM10       0
SO2        0
NO2        0
CO         0
O3         0
TEMP       0
PRES       0
DEWP       0
RAIN       0
wd         0
WSPM       0
station    0
dtype: int64
1new_air.to_csv(f"{data_path}/cleaned_airquality.csv", index=False)