Preprocessing#

Load the data and perform some initial exploration to get a basic understanding of it, in particular how the various features are distributed.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

# Warnings
import warnings

# Install a global "ignore" filter so no warnings are shown from here on.
# NOTE(review): this also hides deprecation warnings from pandas — confirm
# that is intended.
warnings.filterwarnings("ignore")
# IPython magic (not plain Python); presumably enables inline matplotlib
# figures in the notebook — no plotting is visible in this section.
%matplotlib inline

Set paths#

# Input directory (datasets) and output directory (assets, for saved results)
data_path, assets_path = "./datasets", "./assets"

Loading dataset#

# Load the twelve per-station CSV files. They share one naming scheme and
# differ only in the station name embedded in the file name.
station_names = [
    "Aotizhongxin",
    "Changping",
    "Dingling",
    "Dongsi",
    "Guanyuan",
    "Gucheng",
    "Huairou",
    "Nongzhanguan",
    "Shunyi",
    "Tiantan",
    "Wanliu",
    "Wanshouxigong",
]
dfs = [
    pd.read_csv(f"{data_path}/PRSA_Data_{name}_20130301-20170228.csv")
    for name in station_names
]

# Keep the per-station variable names bound in case later cells use them.
aot, chan, ding, dong, guan, guch, hua, nong, shu, tian, wan, wans = dfs

# Stack all stations into one frame. ignore_index=True yields a fresh
# 0..n-1 index directly, so no separate reset_index(drop=True) is needed.
dataset = pd.concat(dfs, ignore_index=True)
dataset.head()

	No	year	month	day	hour	PM2.5	PM10	SO2	NO2	CO	O3	TEMP	PRES	DEWP	RAIN	wd	WSPM	station
0	1	2013	3	1	0	4.0	4.0	4.0	7.0	300.0	77.0	-0.7	1023.0	-18.8	0.0	NNW	4.4	Aotizhongxin
1	2	2013	3	1	1	8.0	8.0	4.0	7.0	300.0	77.0	-1.1	1023.2	-18.2	0.0	N	4.7	Aotizhongxin
2	3	2013	3	1	2	7.0	7.0	5.0	10.0	300.0	73.0	-1.1	1023.5	-18.2	0.0	NNW	5.6	Aotizhongxin
3	4	2013	3	1	3	6.0	6.0	11.0	11.0	300.0	72.0	-1.4	1024.5	-19.4	0.0	NW	3.1	Aotizhongxin
4	5	2013	3	1	4	3.0	3.0	12.0	12.0	300.0	72.0	-2.0	1025.2	-19.5	0.0	N	2.0	Aotizhongxin

Exploring dataset#

# Report the frame's shape, then one line per column: name, dtype, NaN count
print(f"Data dimension: {dataset.shape}")
missing_counts = dataset.isna().sum()
for col, col_type in dataset.dtypes.items():
    print(
        f"Column: {col:35} | "
        f"type: {str(col_type):7} | "
        f"missing values: {missing_counts[col]:3d}"
    )

Data dimension: (420768, 18)
Column: No                                  | type: int64   | missing values:   0
Column: year                                | type: int64   | missing values:   0
Column: month                               | type: int64   | missing values:   0
Column: day                                 | type: int64   | missing values:   0
Column: hour                                | type: int64   | missing values:   0
Column: PM2.5                               | type: float64 | missing values: 8739
Column: PM10                                | type: float64 | missing values: 6449
Column: SO2                                 | type: float64 | missing values: 9021
Column: NO2                                 | type: float64 | missing values: 12116
Column: CO                                  | type: float64 | missing values: 20701
Column: O3                                  | type: float64 | missing values: 13277
Column: TEMP                                | type: float64 | missing values: 398
Column: PRES                                | type: float64 | missing values: 393
Column: DEWP                                | type: float64 | missing values: 403
Column: RAIN                                | type: float64 | missing values: 390
Column: wd                                  | type: object  | missing values: 1822
Column: WSPM                                | type: float64 | missing values: 318
Column: station                             | type: object  | missing values:   0

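The pollutant columns (PM2.5, PM10, SO2, NO2, CO, O3) have the most missing values, with CO the highest at 20,701 (about 4.9% of rows); the weather columns (TEMP, PRES, DEWP, RAIN, wd, WSPM) are each missing in fewer than 0.5% of rows.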

# Summary statistics of the numeric columns, transposed to one row per feature
dataset.describe().transpose()

	count	mean	std	min	25%	50%	75%	max
No	420768.0	17532.500000	10122.116943	1.0000	8766.75	17532.5	26298.25	35064.0
year	420768.0	2014.662560	1.177198	2013.0000	2014.00	2015.0	2016.00	2017.0
month	420768.0	6.522930	3.448707	1.0000	4.00	7.0	10.00	12.0
day	420768.0	15.729637	8.800102	1.0000	8.00	16.0	23.00	31.0
hour	420768.0	11.500000	6.922195	0.0000	5.75	11.5	17.25	23.0
PM2.5	412029.0	79.793428	80.822391	2.0000	20.00	55.0	111.00	999.0
PM10	414319.0	104.602618	91.772426	2.0000	36.00	82.0	145.00	999.0
SO2	411747.0	15.830835	21.650603	0.2856	3.00	7.0	20.00	500.0
NO2	408652.0	50.638586	35.127912	1.0265	23.00	43.0	71.00	290.0
CO	400067.0	1230.766454	1160.182716	100.0000	500.00	900.0	1500.00	10000.0
O3	407491.0	57.372271	56.661607	0.2142	11.00	45.0	82.00	1071.0
TEMP	420370.0	13.538976	11.436139	-19.9000	3.10	14.5	23.30	41.6
PRES	420375.0	1010.746982	10.474055	982.4000	1002.30	1010.4	1019.00	1042.8
DEWP	420365.0	2.490822	13.793847	-43.4000	-8.90	3.1	15.10	29.1
RAIN	420378.0	0.064476	0.821004	0.0000	0.00	0.0	0.00	72.5
WSPM	420450.0	1.729711	1.246386	0.0000	0.90	1.4	2.20	13.2

Preprocessing#

# The No column seems unnecessary (the DataFrame is already indexed), so
# drop it.
dataset = dataset.drop(columns=["No"])

# Rename PM2.5 to PM25 so the column name contains no dot (presumably to keep
# it usable with attribute access / query strings — confirm against later use).
# Only `columns` is passed: an `index=str` mapper would additionally cast
# every integer row label to a string, which is not wanted here.
dataset = dataset.rename(columns={"PM2.5": "PM25"})

# Number of missing values per column
dataset.isna().sum()

year           0
month          0
day            0
hour           0
PM25        8739
PM10        6449
SO2         9021
NO2        12116
CO         20701
O3         13277
TEMP         398
PRES         393
DEWP         403
RAIN         390
wd          1822
WSPM         318
station        0
dtype: int64

# Share of missing values per column, in percent
dataset.isnull().sum().div(len(dataset.index)).round(4).mul(100)

year       0.00
month      0.00
day        0.00
hour       0.00
PM25       2.08
PM10       1.53
SO2        2.14
NO2        2.88
CO         4.92
O3         3.16
TEMP       0.09
PRES       0.09
DEWP       0.10
RAIN       0.09
wd         0.43
WSPM       0.08
station    0.00
dtype: float64

# Preview the first five rows after preprocessing
dataset.head(n=5)

	year	month	day	hour	PM25	PM10	SO2	NO2	CO	O3	TEMP	PRES	DEWP	RAIN	wd	WSPM	station
0	2013	3	1	0	4.0	4.0	4.0	7.0	300.0	77.0	-0.7	1023.0	-18.8	0.0	NNW	4.4	Aotizhongxin
1	2013	3	1	1	8.0	8.0	4.0	7.0	300.0	77.0	-1.1	1023.2	-18.2	0.0	N	4.7	Aotizhongxin
2	2013	3	1	2	7.0	7.0	5.0	10.0	300.0	73.0	-1.1	1023.5	-18.2	0.0	NNW	5.6	Aotizhongxin
3	2013	3	1	3	6.0	6.0	11.0	11.0	300.0	72.0	-1.4	1024.5	-19.4	0.0	NW	3.1	Aotizhongxin
4	2013	3	1	4	3.0	3.0	12.0	12.0	300.0	72.0	-2.0	1025.2	-19.5	0.0	N	2.0	Aotizhongxin

# Write the preprocessed frame into the datasets directory, without the index
dataset.to_csv(
    path_or_buf=f"{data_path}/preprocessed_airquality.csv",
    index=False,
)