Preprocessing#

Load the data and perform some initial exploration to get a basic understanding of it, including how the various features are distributed.

Importing libraries and packages#

1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Warnings
5import warnings
6
7warnings.filterwarnings("ignore")
8%matplotlib inline

Set paths#

# Directory that holds the dataset CSV files
data_path = "./datasets"
# Directory that results are saved to
assets_path = "./assets"

Loading dataset#

 1# load datasets
 2aot = pd.read_csv(f"{data_path}/PRSA_Data_Aotizhongxin_20130301-20170228.csv")
 3chan = pd.read_csv(f"{data_path}/PRSA_Data_Changping_20130301-20170228.csv")
 4ding = pd.read_csv(f"{data_path}/PRSA_Data_Dingling_20130301-20170228.csv")
 5dong = pd.read_csv(f"{data_path}/PRSA_Data_Dongsi_20130301-20170228.csv")
 6guan = pd.read_csv(f"{data_path}/PRSA_Data_Guanyuan_20130301-20170228.csv")
 7guch = pd.read_csv(f"{data_path}/PRSA_Data_Gucheng_20130301-20170228.csv")
 8hua = pd.read_csv(f"{data_path}/PRSA_Data_Huairou_20130301-20170228.csv")
 9nong = pd.read_csv(f"{data_path}/PRSA_Data_Nongzhanguan_20130301-20170228.csv")
10shu = pd.read_csv(f"{data_path}/PRSA_Data_Shunyi_20130301-20170228.csv")
11tian = pd.read_csv(f"{data_path}/PRSA_Data_Tiantan_20130301-20170228.csv")
12wan = pd.read_csv(f"{data_path}/PRSA_Data_Wanliu_20130301-20170228.csv")
13wans = pd.read_csv(
14    f"{data_path}/PRSA_Data_Wanshouxigong_20130301-20170228.csv"
15)
1dfs = [aot, chan, ding, dong, guan, guch, hua, nong, shu, tian, wan, wans]
1dataset = pd.concat(dfs)
1dataset.reset_index(drop=True, inplace=True)
2dataset.head()
No year month day hour PM2.5 PM10 SO2 NO2 CO O3 TEMP PRES DEWP RAIN wd WSPM station
0 1 2013 3 1 0 4.0 4.0 4.0 7.0 300.0 77.0 -0.7 1023.0 -18.8 0.0 NNW 4.4 Aotizhongxin
1 2 2013 3 1 1 8.0 8.0 4.0 7.0 300.0 77.0 -1.1 1023.2 -18.2 0.0 N 4.7 Aotizhongxin
2 3 2013 3 1 2 7.0 7.0 5.0 10.0 300.0 73.0 -1.1 1023.5 -18.2 0.0 NNW 5.6 Aotizhongxin
3 4 2013 3 1 3 6.0 6.0 11.0 11.0 300.0 72.0 -1.4 1024.5 -19.4 0.0 NW 3.1 Aotizhongxin
4 5 2013 3 1 4 3.0 3.0 12.0 12.0 300.0 72.0 -2.0 1025.2 -19.5 0.0 N 2.0 Aotizhongxin

Exploring dataset#

# Report the shape of the data, then one line per column with its dtype and
# its number of missing values.
print(f"Data dimension: {dataset.shape}")

# Count the missing values of every column once, up front.
missing_per_column = dataset.isna().sum()
for (column_name, column_dtype), n_missing in zip(
    dataset.dtypes.items(), missing_per_column
):
    report_fields = (
        f"Column: {column_name:35}",
        f"type: {str(column_dtype):7}",
        f"missing values: {n_missing:3d}",
    )
    print(" | ".join(report_fields))
Data dimension: (420768, 18)
Column: No                                  | type: int64   | missing values:   0
Column: year                                | type: int64   | missing values:   0
Column: month                               | type: int64   | missing values:   0
Column: day                                 | type: int64   | missing values:   0
Column: hour                                | type: int64   | missing values:   0
Column: PM2.5                               | type: float64 | missing values: 8739
Column: PM10                                | type: float64 | missing values: 6449
Column: SO2                                 | type: float64 | missing values: 9021
Column: NO2                                 | type: float64 | missing values: 12116
Column: CO                                  | type: float64 | missing values: 20701
Column: O3                                  | type: float64 | missing values: 13277
Column: TEMP                                | type: float64 | missing values: 398
Column: PRES                                | type: float64 | missing values: 393
Column: DEWP                                | type: float64 | missing values: 403
Column: RAIN                                | type: float64 | missing values: 390
Column: wd                                  | type: object  | missing values: 1822
Column: WSPM                                | type: float64 | missing values: 318
Column: station                             | type: object  | missing values:   0

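The pollutant columns (PM2.5, PM10, SO2, NO2, CO, O3) have the most missing values, with CO the highest at about 4.9%, while the weather columns and wd are each missing in fewer than 0.5% of the rows.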

# Summary statistics of the numerical columns, shown with one row per column
numeric_summary = dataset.describe()
numeric_summary.transpose()
count mean std min 25% 50% 75% max
No 420768.0 17532.500000 10122.116943 1.0000 8766.75 17532.5 26298.25 35064.0
year 420768.0 2014.662560 1.177198 2013.0000 2014.00 2015.0 2016.00 2017.0
month 420768.0 6.522930 3.448707 1.0000 4.00 7.0 10.00 12.0
day 420768.0 15.729637 8.800102 1.0000 8.00 16.0 23.00 31.0
hour 420768.0 11.500000 6.922195 0.0000 5.75 11.5 17.25 23.0
PM2.5 412029.0 79.793428 80.822391 2.0000 20.00 55.0 111.00 999.0
PM10 414319.0 104.602618 91.772426 2.0000 36.00 82.0 145.00 999.0
SO2 411747.0 15.830835 21.650603 0.2856 3.00 7.0 20.00 500.0
NO2 408652.0 50.638586 35.127912 1.0265 23.00 43.0 71.00 290.0
CO 400067.0 1230.766454 1160.182716 100.0000 500.00 900.0 1500.00 10000.0
O3 407491.0 57.372271 56.661607 0.2142 11.00 45.0 82.00 1071.0
TEMP 420370.0 13.538976 11.436139 -19.9000 3.10 14.5 23.30 41.6
PRES 420375.0 1010.746982 10.474055 982.4000 1002.30 1010.4 1019.00 1042.8
DEWP 420365.0 2.490822 13.793847 -43.4000 -8.90 3.1 15.10 29.1
RAIN 420378.0 0.064476 0.821004 0.0000 0.00 0.0 0.00 72.5
WSPM 420450.0 1.729711 1.246386 0.0000 0.90 1.4 2.20 13.2

Preprocessing#

# "No" is just a running row number from the source files; the DataFrame
# already has its own index, so the column is dropped.
dataset = dataset.drop(columns=["No"])
# Rename the PM2.5 column to PM25: the dot in the name gets in the way of
# attribute-style access (dataset.PM25).
# Only the column labels are mapped. The previous call also passed
# index=str, which ran every row label through str() and silently turned
# the integer row index into string labels.
dataset.rename(columns={"PM2.5": "PM25"}, inplace=True)
# Number of missing values in each column
dataset.isna().sum()
year           0
month          0
day            0
hour           0
PM25        8739
PM10        6449
SO2         9021
NO2        12116
CO         20701
O3         13277
TEMP         398
PRES         393
DEWP         403
RAIN         390
wd          1822
WSPM         318
station        0
dtype: int64
# Percentage of missing values in each column. The mean of the boolean
# "is missing" mask is the missing fraction; it is scaled to percent first
# and rounded last, so the result carries no floating-point residue from
# multiplying an already-rounded value by 100.
(dataset.isna().mean() * 100).round(2)
year       0.00
month      0.00
day        0.00
hour       0.00
PM25       2.08
PM10       1.53
SO2        2.14
NO2        2.88
CO         4.92
O3         3.16
TEMP       0.09
PRES       0.09
DEWP       0.10
RAIN       0.09
wd         0.43
WSPM       0.08
station    0.00
dtype: float64
# Preview the first five rows of the preprocessed data
dataset.head(5)
year month day hour PM25 PM10 SO2 NO2 CO O3 TEMP PRES DEWP RAIN wd WSPM station
0 2013 3 1 0 4.0 4.0 4.0 7.0 300.0 77.0 -0.7 1023.0 -18.8 0.0 NNW 4.4 Aotizhongxin
1 2013 3 1 1 8.0 8.0 4.0 7.0 300.0 77.0 -1.1 1023.2 -18.2 0.0 N 4.7 Aotizhongxin
2 2013 3 1 2 7.0 7.0 5.0 10.0 300.0 73.0 -1.1 1023.5 -18.2 0.0 NNW 5.6 Aotizhongxin
3 2013 3 1 3 6.0 6.0 11.0 11.0 300.0 72.0 -1.4 1024.5 -19.4 0.0 NW 3.1 Aotizhongxin
4 2013 3 1 4 3.0 3.0 12.0 12.0 300.0 72.0 -2.0 1025.2 -19.5 0.0 N 2.0 Aotizhongxin
# Save the preprocessed data next to the raw files, without the row index
preprocessed_file = f"{data_path}/preprocessed_airquality.csv"
dataset.to_csv(preprocessed_file, index=False)