Data preparation and feature engineering
Importing libraries and packages
1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Warnings
5import warnings
6
7warnings.filterwarnings("ignore")
8
9%matplotlib inline
Set paths
# Directory that results produced by this notebook are saved to
assets_path = "./assets"
# Directory that the CSV datasets are read from and written to
data_path = "./datasets"
Loading dataset
# Read the cleaned energy dataset from the datasets directory
dataset = pd.read_csv(filepath_or_buffer=f"{data_path}/cleaned_energydata.csv")
# Preview the first rows transposed, so every column is listed as a row
dataset.head().transpose()
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
date_time | 2016-01-11 17:00:00 | 2016-01-11 17:10:00 | 2016-01-11 17:20:00 | 2016-01-11 17:30:00 | 2016-01-11 17:40:00 |
a_energy | 60 | 60 | 50 | 50 | 60 |
kitchen_temp | 19.89 | 19.89 | 19.89 | 19.89 | 19.89 |
kitchen_hum | 47.596667 | 46.693333 | 46.3 | 46.066667 | 46.333333 |
liv_temp | 19.2 | 19.2 | 19.2 | 19.2 | 19.2 |
liv_hum | 44.79 | 44.7225 | 44.626667 | 44.59 | 44.53 |
laun_temp | 19.79 | 19.79 | 19.79 | 19.79 | 19.79 |
laun_hum | 44.73 | 44.79 | 44.933333 | 45.0 | 45.0 |
off_temp | 19.0 | 19.0 | 18.926667 | 18.89 | 18.89 |
off_hum | 45.566667 | 45.9925 | 45.89 | 45.723333 | 45.53 |
bath_temp | 17.166667 | 17.166667 | 17.166667 | 17.166667 | 17.2 |
bath_hum | 55.2 | 55.2 | 55.09 | 55.09 | 55.09 |
out_b_temp | 7.026667 | 6.833333 | 6.56 | 6.433333 | 6.366667 |
out_b_hum | 84.256667 | 84.063333 | 83.156667 | 83.423333 | 84.893333 |
iron_temp | 17.2 | 17.2 | 17.2 | 17.133333 | 17.2 |
iron_hum | 41.626667 | 41.56 | 41.433333 | 41.29 | 41.23 |
teen_temp | 18.2 | 18.2 | 18.2 | 18.1 | 18.1 |
teen_hum | 48.9 | 48.863333 | 48.73 | 48.59 | 48.59 |
par_temp | 17.033333 | 17.066667 | 17.0 | 17.0 | 17.0 |
par_hum | 45.53 | 45.56 | 45.5 | 45.4 | 45.4 |
out_temp | 6.6 | 6.483333 | 6.366667 | 6.25 | 6.133333 |
out_press | 733.5 | 733.6 | 733.7 | 733.8 | 733.9 |
out_hum | 92.0 | 92.0 | 92.0 | 92.0 | 92.0 |
wind | 7.0 | 6.666667 | 6.333333 | 6.0 | 5.666667 |
visibility | 63.0 | 59.166667 | 55.333333 | 51.5 | 47.666667 |
dew_point | 5.3 | 5.2 | 5.1 | 5.0 | 4.9 |
rv1 | 13.275433 | 18.606195 | 28.642668 | 45.410389 | 10.084097 |
rv2 | 13.275433 | 18.606195 | 28.642668 | 45.410389 | 10.084097 |
Data preparation and feature engineering
# Parse the date_time column into datetime values; the explicit format
# matches timestamps written like "2016-01-11 17:00:00"
dataset["date_time"] = pd.to_datetime(
    dataset["date_time"],
    format="%Y-%m-%d %H:%M:%S",
)
# Preview the first rows transposed, so every column is listed as a row
dataset.head().transpose()
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
date_time | 2016-01-11 17:00:00 | 2016-01-11 17:10:00 | 2016-01-11 17:20:00 | 2016-01-11 17:30:00 | 2016-01-11 17:40:00 |
a_energy | 60 | 60 | 50 | 50 | 60 |
kitchen_temp | 19.89 | 19.89 | 19.89 | 19.89 | 19.89 |
kitchen_hum | 47.596667 | 46.693333 | 46.3 | 46.066667 | 46.333333 |
liv_temp | 19.2 | 19.2 | 19.2 | 19.2 | 19.2 |
liv_hum | 44.79 | 44.7225 | 44.626667 | 44.59 | 44.53 |
laun_temp | 19.79 | 19.79 | 19.79 | 19.79 | 19.79 |
laun_hum | 44.73 | 44.79 | 44.933333 | 45.0 | 45.0 |
off_temp | 19.0 | 19.0 | 18.926667 | 18.89 | 18.89 |
off_hum | 45.566667 | 45.9925 | 45.89 | 45.723333 | 45.53 |
bath_temp | 17.166667 | 17.166667 | 17.166667 | 17.166667 | 17.2 |
bath_hum | 55.2 | 55.2 | 55.09 | 55.09 | 55.09 |
out_b_temp | 7.026667 | 6.833333 | 6.56 | 6.433333 | 6.366667 |
out_b_hum | 84.256667 | 84.063333 | 83.156667 | 83.423333 | 84.893333 |
iron_temp | 17.2 | 17.2 | 17.2 | 17.133333 | 17.2 |
iron_hum | 41.626667 | 41.56 | 41.433333 | 41.29 | 41.23 |
teen_temp | 18.2 | 18.2 | 18.2 | 18.1 | 18.1 |
teen_hum | 48.9 | 48.863333 | 48.73 | 48.59 | 48.59 |
par_temp | 17.033333 | 17.066667 | 17.0 | 17.0 | 17.0 |
par_hum | 45.53 | 45.56 | 45.5 | 45.4 | 45.4 |
out_temp | 6.6 | 6.483333 | 6.366667 | 6.25 | 6.133333 |
out_press | 733.5 | 733.6 | 733.7 | 733.8 | 733.9 |
out_hum | 92.0 | 92.0 | 92.0 | 92.0 | 92.0 |
wind | 7.0 | 6.666667 | 6.333333 | 6.0 | 5.666667 |
visibility | 63.0 | 59.166667 | 55.333333 | 51.5 | 47.666667 |
dew_point | 5.3 | 5.2 | 5.1 | 5.0 | 4.9 |
rv1 | 13.275433 | 18.606195 | 28.642668 | 45.410389 | 10.084097 |
rv2 | 13.275433 | 18.606195 | 28.642668 | 45.410389 | 10.084097 |
# Derive calendar features from the timestamp and place them directly after
# the date_time column (positions 1 and 2).
# The membership guards keep this cell re-runnable: DataFrame.insert raises
# ValueError when a column with the same name already exists.
if "month" not in dataset.columns:
    # Month number of the timestamp: 1 (January) through 12 (December)
    dataset.insert(
        loc=1, column="month", value=dataset["date_time"].dt.month
    )
if "day" not in dataset.columns:
    # Day of the WEEK (not of the month): pandas numbers Monday as 0, so the
    # +1 shifts the range to 1 (Monday) through 7 (Sunday)
    dataset.insert(
        loc=2, column="day", value=dataset["date_time"].dt.dayofweek + 1
    )
# Preview the first rows transposed, so every column is listed as a row
dataset.head().transpose()
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
date_time | 2016-01-11 17:00:00 | 2016-01-11 17:10:00 | 2016-01-11 17:20:00 | 2016-01-11 17:30:00 | 2016-01-11 17:40:00 |
month | 1 | 1 | 1 | 1 | 1 |
day | 1 | 1 | 1 | 1 | 1 |
a_energy | 60 | 60 | 50 | 50 | 60 |
kitchen_temp | 19.89 | 19.89 | 19.89 | 19.89 | 19.89 |
kitchen_hum | 47.596667 | 46.693333 | 46.3 | 46.066667 | 46.333333 |
liv_temp | 19.2 | 19.2 | 19.2 | 19.2 | 19.2 |
liv_hum | 44.79 | 44.7225 | 44.626667 | 44.59 | 44.53 |
laun_temp | 19.79 | 19.79 | 19.79 | 19.79 | 19.79 |
laun_hum | 44.73 | 44.79 | 44.933333 | 45.0 | 45.0 |
off_temp | 19.0 | 19.0 | 18.926667 | 18.89 | 18.89 |
off_hum | 45.566667 | 45.9925 | 45.89 | 45.723333 | 45.53 |
bath_temp | 17.166667 | 17.166667 | 17.166667 | 17.166667 | 17.2 |
bath_hum | 55.2 | 55.2 | 55.09 | 55.09 | 55.09 |
out_b_temp | 7.026667 | 6.833333 | 6.56 | 6.433333 | 6.366667 |
out_b_hum | 84.256667 | 84.063333 | 83.156667 | 83.423333 | 84.893333 |
iron_temp | 17.2 | 17.2 | 17.2 | 17.133333 | 17.2 |
iron_hum | 41.626667 | 41.56 | 41.433333 | 41.29 | 41.23 |
teen_temp | 18.2 | 18.2 | 18.2 | 18.1 | 18.1 |
teen_hum | 48.9 | 48.863333 | 48.73 | 48.59 | 48.59 |
par_temp | 17.033333 | 17.066667 | 17.0 | 17.0 | 17.0 |
par_hum | 45.53 | 45.56 | 45.5 | 45.4 | 45.4 |
out_temp | 6.6 | 6.483333 | 6.366667 | 6.25 | 6.133333 |
out_press | 733.5 | 733.6 | 733.7 | 733.8 | 733.9 |
out_hum | 92.0 | 92.0 | 92.0 | 92.0 | 92.0 |
wind | 7.0 | 6.666667 | 6.333333 | 6.0 | 5.666667 |
visibility | 63.0 | 59.166667 | 55.333333 | 51.5 | 47.666667 |
dew_point | 5.3 | 5.2 | 5.1 | 5.0 | 4.9 |
rv1 | 13.275433 | 18.606195 | 28.642668 | 45.410389 | 10.084097 |
rv2 | 13.275433 | 18.606195 | 28.642668 | 45.410389 | 10.084097 |
# Save the engineered dataset into the datasets directory; index=False keeps
# the row index out of the CSV file
dataset.to_csv(
    path_or_buf=f"{data_path}/engineered_energydata.csv",
    index=False,
)