Preprocessing#
Loading the data and performing some initial exploration on it to acquire some basic knowledge about the data, how the various features are distributed.
Importing libraries and packages#
1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Warnings
5import warnings
6
7warnings.filterwarnings("ignore")
8%matplotlib inline
Set paths#
1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"
Loading dataset#
1# load data
2dataset = pd.read_csv(f"{data_path}/energydata_complete.csv")
3dataset.head().T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
date | 2016-01-11 17:00:00 | 2016-01-11 17:10:00 | 2016-01-11 17:20:00 | 2016-01-11 17:30:00 | 2016-01-11 17:40:00 |
Appliances | 60 | 60 | 50 | 50 | 60 |
lights | 30 | 30 | 30 | 40 | 40 |
T1 | 19.89 | 19.89 | 19.89 | 19.89 | 19.89 |
RH_1 | 47.596667 | 46.693333 | 46.3 | 46.066667 | 46.333333 |
T2 | 19.2 | 19.2 | 19.2 | 19.2 | 19.2 |
RH_2 | 44.79 | 44.7225 | 44.626667 | 44.59 | 44.53 |
T3 | 19.79 | 19.79 | 19.79 | 19.79 | 19.79 |
RH_3 | 44.73 | 44.79 | 44.933333 | 45.0 | 45.0 |
T4 | 19.0 | 19.0 | 18.926667 | 18.89 | 18.89 |
RH_4 | 45.566667 | 45.9925 | 45.89 | 45.723333 | 45.53 |
T5 | 17.166667 | 17.166667 | 17.166667 | 17.166667 | 17.2 |
RH_5 | 55.2 | 55.2 | 55.09 | 55.09 | 55.09 |
T6 | 7.026667 | 6.833333 | 6.56 | 6.433333 | 6.366667 |
RH_6 | 84.256667 | 84.063333 | 83.156667 | 83.423333 | 84.893333 |
T7 | 17.2 | 17.2 | 17.2 | 17.133333 | 17.2 |
RH_7 | 41.626667 | 41.56 | 41.433333 | 41.29 | 41.23 |
T8 | 18.2 | 18.2 | 18.2 | 18.1 | 18.1 |
RH_8 | 48.9 | 48.863333 | 48.73 | 48.59 | 48.59 |
T9 | 17.033333 | 17.066667 | 17.0 | 17.0 | 17.0 |
RH_9 | 45.53 | 45.56 | 45.5 | 45.4 | 45.4 |
T_out | 6.6 | 6.483333 | 6.366667 | 6.25 | 6.133333 |
Press_mm_hg | 733.5 | 733.6 | 733.7 | 733.8 | 733.9 |
RH_out | 92.0 | 92.0 | 92.0 | 92.0 | 92.0 |
Windspeed | 7.0 | 6.666667 | 6.333333 | 6.0 | 5.666667 |
Visibility | 63.0 | 59.166667 | 55.333333 | 51.5 | 47.666667 |
Tdewpoint | 5.3 | 5.2 | 5.1 | 5.0 | 4.9 |
rv1 | 13.275433 | 18.606195 | 28.642668 | 45.410389 | 10.084097 |
rv2 | 13.275433 | 18.606195 | 28.642668 | 45.410389 | 10.084097 |
Exploring dataset#
1# Printing dimensionality of the data, columns, types and missing values
2print(f"Data dimension: {dataset.shape}")
3for col in dataset.columns:
4 print(
5 f"Column: {col:35} | "
6 f"type: {str(dataset[col].dtype):7} | "
7 f"missing values: {dataset[col].isna().sum():3d}"
8 )
Data dimension: (19735, 29)
Column: date | type: object | missing values: 0
Column: Appliances | type: int64 | missing values: 0
Column: lights | type: int64 | missing values: 0
Column: T1 | type: float64 | missing values: 0
Column: RH_1 | type: float64 | missing values: 0
Column: T2 | type: float64 | missing values: 0
Column: RH_2 | type: float64 | missing values: 0
Column: T3 | type: float64 | missing values: 0
Column: RH_3 | type: float64 | missing values: 0
Column: T4 | type: float64 | missing values: 0
Column: RH_4 | type: float64 | missing values: 0
Column: T5 | type: float64 | missing values: 0
Column: RH_5 | type: float64 | missing values: 0
Column: T6 | type: float64 | missing values: 0
Column: RH_6 | type: float64 | missing values: 0
Column: T7 | type: float64 | missing values: 0
Column: RH_7 | type: float64 | missing values: 0
Column: T8 | type: float64 | missing values: 0
Column: RH_8 | type: float64 | missing values: 0
Column: T9 | type: float64 | missing values: 0
Column: RH_9 | type: float64 | missing values: 0
Column: T_out | type: float64 | missing values: 0
Column: Press_mm_hg | type: float64 | missing values: 0
Column: RH_out | type: float64 | missing values: 0
Column: Windspeed | type: float64 | missing values: 0
Column: Visibility | type: float64 | missing values: 0
Column: Tdewpoint | type: float64 | missing values: 0
Column: rv1 | type: float64 | missing values: 0
Column: rv2 | type: float64 | missing values: 0
Column Description has some missing values, Customer ID has a lot of (20%) missing values.
1# Computing statistics on numerical features
2dataset.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Appliances | 19735.0 | 97.694958 | 102.524891 | 10.000000 | 50.000000 | 60.000000 | 100.000000 | 1080.000000 |
lights | 19735.0 | 3.801875 | 7.935988 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 70.000000 |
T1 | 19735.0 | 21.686571 | 1.606066 | 16.790000 | 20.760000 | 21.600000 | 22.600000 | 26.260000 |
RH_1 | 19735.0 | 40.259739 | 3.979299 | 27.023333 | 37.333333 | 39.656667 | 43.066667 | 63.360000 |
T2 | 19735.0 | 20.341219 | 2.192974 | 16.100000 | 18.790000 | 20.000000 | 21.500000 | 29.856667 |
RH_2 | 19735.0 | 40.420420 | 4.069813 | 20.463333 | 37.900000 | 40.500000 | 43.260000 | 56.026667 |
T3 | 19735.0 | 22.267611 | 2.006111 | 17.200000 | 20.790000 | 22.100000 | 23.290000 | 29.236000 |
RH_3 | 19735.0 | 39.242500 | 3.254576 | 28.766667 | 36.900000 | 38.530000 | 41.760000 | 50.163333 |
T4 | 19735.0 | 20.855335 | 2.042884 | 15.100000 | 19.530000 | 20.666667 | 22.100000 | 26.200000 |
RH_4 | 19735.0 | 39.026904 | 4.341321 | 27.660000 | 35.530000 | 38.400000 | 42.156667 | 51.090000 |
T5 | 19735.0 | 19.592106 | 1.844623 | 15.330000 | 18.277500 | 19.390000 | 20.619643 | 25.795000 |
RH_5 | 19735.0 | 50.949283 | 9.022034 | 29.815000 | 45.400000 | 49.090000 | 53.663333 | 96.321667 |
T6 | 19735.0 | 7.910939 | 6.090347 | -6.065000 | 3.626667 | 7.300000 | 11.256000 | 28.290000 |
RH_6 | 19735.0 | 54.609083 | 31.149806 | 1.000000 | 30.025000 | 55.290000 | 83.226667 | 99.900000 |
T7 | 19735.0 | 20.267106 | 2.109993 | 15.390000 | 18.700000 | 20.033333 | 21.600000 | 26.000000 |
RH_7 | 19735.0 | 35.388200 | 5.114208 | 23.200000 | 31.500000 | 34.863333 | 39.000000 | 51.400000 |
T8 | 19735.0 | 22.029107 | 1.956162 | 16.306667 | 20.790000 | 22.100000 | 23.390000 | 27.230000 |
RH_8 | 19735.0 | 42.936165 | 5.224361 | 29.600000 | 39.066667 | 42.375000 | 46.536000 | 58.780000 |
T9 | 19735.0 | 19.485828 | 2.014712 | 14.890000 | 18.000000 | 19.390000 | 20.600000 | 24.500000 |
RH_9 | 19735.0 | 41.552401 | 4.151497 | 29.166667 | 38.500000 | 40.900000 | 44.338095 | 53.326667 |
T_out | 19735.0 | 7.411665 | 5.317409 | -5.000000 | 3.666667 | 6.916667 | 10.408333 | 26.100000 |
Press_mm_hg | 19735.0 | 755.522602 | 7.399441 | 729.300000 | 750.933333 | 756.100000 | 760.933333 | 772.300000 |
RH_out | 19735.0 | 79.750418 | 14.901088 | 24.000000 | 70.333333 | 83.666667 | 91.666667 | 100.000000 |
Windspeed | 19735.0 | 4.039752 | 2.451221 | 0.000000 | 2.000000 | 3.666667 | 5.500000 | 14.000000 |
Visibility | 19735.0 | 38.330834 | 11.794719 | 1.000000 | 29.000000 | 40.000000 | 40.000000 | 66.000000 |
Tdewpoint | 19735.0 | 3.760707 | 4.194648 | -6.600000 | 0.900000 | 3.433333 | 6.566667 | 15.500000 |
rv1 | 19735.0 | 24.988033 | 14.496634 | 0.005322 | 12.497889 | 24.897653 | 37.583769 | 49.996530 |
rv2 | 19735.0 | 24.988033 | 14.496634 | 0.005322 | 12.497889 | 24.897653 | 37.583769 | 49.996530 |
Preprocessing#
1dataset.rename(
2 index=str,
3 columns={
4 "date": "date_time",
5 "Appliances": "a_energy",
6 "lights": "l_energy",
7 "T1": "kitchen_temp",
8 "RH_1": "kitchen_hum",
9 "T2": "liv_temp",
10 "RH_2": "liv_hum",
11 "T3": "laun_temp",
12 "RH_3": "laun_hum",
13 "T4": "off_temp",
14 "RH_4": "off_hum",
15 "T5": "bath_temp",
16 "RH_5": "bath_hum",
17 "T6": "out_b_temp",
18 "RH_6": "out_b_hum",
19 "T7": "iron_temp",
20 "RH_7": "iron_hum",
21 "T8": "teen_temp",
22 "RH_8": "teen_hum",
23 "T9": "par_temp",
24 "RH_9": "par_hum",
25 "T_out": "out_temp",
26 "Press_mm_hg": "out_press",
27 "RH_out": "out_hum",
28 "Windspeed": "wind",
29 "Visibility": "visibility",
30 "Tdewpoint": "dew_point",
31 "rv1": "rv1",
32 "rv2": "rv2",
33 },
34 inplace=True,
35)
1dataset.info()
<class 'pandas.core.frame.DataFrame'>
Index: 19735 entries, 0 to 19734
Data columns (total 29 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date_time 19735 non-null object
1 a_energy 19735 non-null int64
2 l_energy 19735 non-null int64
3 kitchen_temp 19735 non-null float64
4 kitchen_hum 19735 non-null float64
5 liv_temp 19735 non-null float64
6 liv_hum 19735 non-null float64
7 laun_temp 19735 non-null float64
8 laun_hum 19735 non-null float64
9 off_temp 19735 non-null float64
10 off_hum 19735 non-null float64
11 bath_temp 19735 non-null float64
12 bath_hum 19735 non-null float64
13 out_b_temp 19735 non-null float64
14 out_b_hum 19735 non-null float64
15 iron_temp 19735 non-null float64
16 iron_hum 19735 non-null float64
17 teen_temp 19735 non-null float64
18 teen_hum 19735 non-null float64
19 par_temp 19735 non-null float64
20 par_hum 19735 non-null float64
21 out_temp 19735 non-null float64
22 out_press 19735 non-null float64
23 out_hum 19735 non-null float64
24 wind 19735 non-null float64
25 visibility 19735 non-null float64
26 dew_point 19735 non-null float64
27 rv1 19735 non-null float64
28 rv2 19735 non-null float64
dtypes: float64(26), int64(2), object(1)
memory usage: 4.5+ MB
1dataset.to_csv(f"{data_path}/preprocessed_energydata.csv", index=False)