Preprocessing#

Load the data and perform some initial exploration to get a basic understanding of it, including how the various features are distributed.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

# Warnings
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

Set paths#

# Directory the datasets are read from, and directory results are saved to
data_path, assets_path = "./datasets", "./assets"

Loading dataset#

# Read the raw energy CSV from the datasets directory into a DataFrame
dataset = pd.read_csv(filepath_or_buffer=f"{data_path}/energydata_complete.csv")
# Preview the first five rows, transposed so that each column is shown as a row
dataset.head(5).transpose()

	0	1	2	3	4
date	2016-01-11 17:00:00	2016-01-11 17:10:00	2016-01-11 17:20:00	2016-01-11 17:30:00	2016-01-11 17:40:00
Appliances	60	60	50	50	60
lights	30	30	30	40	40
T1	19.89	19.89	19.89	19.89	19.89
RH_1	47.596667	46.693333	46.3	46.066667	46.333333
T2	19.2	19.2	19.2	19.2	19.2
RH_2	44.79	44.7225	44.626667	44.59	44.53
T3	19.79	19.79	19.79	19.79	19.79
RH_3	44.73	44.79	44.933333	45.0	45.0
T4	19.0	19.0	18.926667	18.89	18.89
RH_4	45.566667	45.9925	45.89	45.723333	45.53
T5	17.166667	17.166667	17.166667	17.166667	17.2
RH_5	55.2	55.2	55.09	55.09	55.09
T6	7.026667	6.833333	6.56	6.433333	6.366667
RH_6	84.256667	84.063333	83.156667	83.423333	84.893333
T7	17.2	17.2	17.2	17.133333	17.2
RH_7	41.626667	41.56	41.433333	41.29	41.23
T8	18.2	18.2	18.2	18.1	18.1
RH_8	48.9	48.863333	48.73	48.59	48.59
T9	17.033333	17.066667	17.0	17.0	17.0
RH_9	45.53	45.56	45.5	45.4	45.4
T_out	6.6	6.483333	6.366667	6.25	6.133333
Press_mm_hg	733.5	733.6	733.7	733.8	733.9
RH_out	92.0	92.0	92.0	92.0	92.0
Windspeed	7.0	6.666667	6.333333	6.0	5.666667
Visibility	63.0	59.166667	55.333333	51.5	47.666667
Tdewpoint	5.3	5.2	5.1	5.0	4.9
rv1	13.275433	18.606195	28.642668	45.410389	10.084097
rv2	13.275433	18.606195	28.642668	45.410389	10.084097

Exploring dataset#

# Report the frame's shape, then one line per column giving
# its name, its dtype and how many of its values are missing
print(f"Data dimension: {dataset.shape}")
for col, series in dataset.items():
    print(
        f"Column: {col:35} | "
        f"type: {str(series.dtype):7} | "
        f"missing values: {series.isna().sum():3d}"
    )

Data dimension: (19735, 29)
Column: date                                | type: object  | missing values:   0
Column: Appliances                          | type: int64   | missing values:   0
Column: lights                              | type: int64   | missing values:   0
Column: T1                                  | type: float64 | missing values:   0
Column: RH_1                                | type: float64 | missing values:   0
Column: T2                                  | type: float64 | missing values:   0
Column: RH_2                                | type: float64 | missing values:   0
Column: T3                                  | type: float64 | missing values:   0
Column: RH_3                                | type: float64 | missing values:   0
Column: T4                                  | type: float64 | missing values:   0
Column: RH_4                                | type: float64 | missing values:   0
Column: T5                                  | type: float64 | missing values:   0
Column: RH_5                                | type: float64 | missing values:   0
Column: T6                                  | type: float64 | missing values:   0
Column: RH_6                                | type: float64 | missing values:   0
Column: T7                                  | type: float64 | missing values:   0
Column: RH_7                                | type: float64 | missing values:   0
Column: T8                                  | type: float64 | missing values:   0
Column: RH_8                                | type: float64 | missing values:   0
Column: T9                                  | type: float64 | missing values:   0
Column: RH_9                                | type: float64 | missing values:   0
Column: T_out                               | type: float64 | missing values:   0
Column: Press_mm_hg                         | type: float64 | missing values:   0
Column: RH_out                              | type: float64 | missing values:   0
Column: Windspeed                           | type: float64 | missing values:   0
Column: Visibility                          | type: float64 | missing values:   0
Column: Tdewpoint                           | type: float64 | missing values:   0
Column: rv1                                 | type: float64 | missing values:   0
Column: rv2                                 | type: float64 | missing values:   0

None of the columns has missing values. Note that `date` is loaded as `object` (string) rather than as a datetime type.

# Summary statistics (count, mean, std, min, quartiles, max) for the numerical
# columns, transposed so that each feature is one row
dataset.describe().transpose()

	count	mean	std	min	25%	50%	75%	max
Appliances	19735.0	97.694958	102.524891	10.000000	50.000000	60.000000	100.000000	1080.000000
lights	19735.0	3.801875	7.935988	0.000000	0.000000	0.000000	0.000000	70.000000
T1	19735.0	21.686571	1.606066	16.790000	20.760000	21.600000	22.600000	26.260000
RH_1	19735.0	40.259739	3.979299	27.023333	37.333333	39.656667	43.066667	63.360000
T2	19735.0	20.341219	2.192974	16.100000	18.790000	20.000000	21.500000	29.856667
RH_2	19735.0	40.420420	4.069813	20.463333	37.900000	40.500000	43.260000	56.026667
T3	19735.0	22.267611	2.006111	17.200000	20.790000	22.100000	23.290000	29.236000
RH_3	19735.0	39.242500	3.254576	28.766667	36.900000	38.530000	41.760000	50.163333
T4	19735.0	20.855335	2.042884	15.100000	19.530000	20.666667	22.100000	26.200000
RH_4	19735.0	39.026904	4.341321	27.660000	35.530000	38.400000	42.156667	51.090000
T5	19735.0	19.592106	1.844623	15.330000	18.277500	19.390000	20.619643	25.795000
RH_5	19735.0	50.949283	9.022034	29.815000	45.400000	49.090000	53.663333	96.321667
T6	19735.0	7.910939	6.090347	-6.065000	3.626667	7.300000	11.256000	28.290000
RH_6	19735.0	54.609083	31.149806	1.000000	30.025000	55.290000	83.226667	99.900000
T7	19735.0	20.267106	2.109993	15.390000	18.700000	20.033333	21.600000	26.000000
RH_7	19735.0	35.388200	5.114208	23.200000	31.500000	34.863333	39.000000	51.400000
T8	19735.0	22.029107	1.956162	16.306667	20.790000	22.100000	23.390000	27.230000
RH_8	19735.0	42.936165	5.224361	29.600000	39.066667	42.375000	46.536000	58.780000
T9	19735.0	19.485828	2.014712	14.890000	18.000000	19.390000	20.600000	24.500000
RH_9	19735.0	41.552401	4.151497	29.166667	38.500000	40.900000	44.338095	53.326667
T_out	19735.0	7.411665	5.317409	-5.000000	3.666667	6.916667	10.408333	26.100000
Press_mm_hg	19735.0	755.522602	7.399441	729.300000	750.933333	756.100000	760.933333	772.300000
RH_out	19735.0	79.750418	14.901088	24.000000	70.333333	83.666667	91.666667	100.000000
Windspeed	19735.0	4.039752	2.451221	0.000000	2.000000	3.666667	5.500000	14.000000
Visibility	19735.0	38.330834	11.794719	1.000000	29.000000	40.000000	40.000000	66.000000
Tdewpoint	19735.0	3.760707	4.194648	-6.600000	0.900000	3.433333	6.566667	15.500000
rv1	19735.0	24.988033	14.496634	0.005322	12.497889	24.897653	37.583769	49.996530
rv2	19735.0	24.988033	14.496634	0.005322	12.497889	24.897653	37.583769	49.996530

Preprocessing#

# Mapping from the original column headers to the names used from here on
# (rv1 and rv2 map to themselves, so they keep their names).
column_names = {
    "date": "date_time",
    "Appliances": "a_energy",
    "lights": "l_energy",
    "T1": "kitchen_temp",
    "RH_1": "kitchen_hum",
    "T2": "liv_temp",
    "RH_2": "liv_hum",
    "T3": "laun_temp",
    "RH_3": "laun_hum",
    "T4": "off_temp",
    "RH_4": "off_hum",
    "T5": "bath_temp",
    "RH_5": "bath_hum",
    "T6": "out_b_temp",
    "RH_6": "out_b_hum",
    "T7": "iron_temp",
    "RH_7": "iron_hum",
    "T8": "teen_temp",
    "RH_8": "teen_hum",
    "T9": "par_temp",
    "RH_9": "par_hum",
    "T_out": "out_temp",
    "Press_mm_hg": "out_press",
    "RH_out": "out_hum",
    "Windspeed": "wind",
    "Visibility": "visibility",
    "Tdewpoint": "dew_point",
    "rv1": "rv1",
    "rv2": "rv2",
}

# index=str converts every row label to its string form;
# inplace=True modifies `dataset` directly instead of returning a copy.
dataset.rename(index=str, columns=column_names, inplace=True)

# Print a summary of the frame: index, column names, non-null counts, dtypes and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date_time     19735 non-null  object 
 1   a_energy      19735 non-null  int64  
 2   l_energy      19735 non-null  int64  
 3   kitchen_temp  19735 non-null  float64
 4   kitchen_hum   19735 non-null  float64
 5   liv_temp      19735 non-null  float64
 6   liv_hum       19735 non-null  float64
 7   laun_temp     19735 non-null  float64
 8   laun_hum      19735 non-null  float64
 9   off_temp      19735 non-null  float64
 10  off_hum       19735 non-null  float64
 11  bath_temp     19735 non-null  float64
 12  bath_hum      19735 non-null  float64
 13  out_b_temp    19735 non-null  float64
 14  out_b_hum     19735 non-null  float64
 15  iron_temp     19735 non-null  float64
 16  iron_hum      19735 non-null  float64
 17  teen_temp     19735 non-null  float64
 18  teen_hum      19735 non-null  float64
 19  par_temp      19735 non-null  float64
 20  par_hum       19735 non-null  float64
 21  out_temp      19735 non-null  float64
 22  out_press     19735 non-null  float64
 23  out_hum       19735 non-null  float64
 24  wind          19735 non-null  float64
 25  visibility    19735 non-null  float64
 26  dew_point     19735 non-null  float64
 27  rv1           19735 non-null  float64
 28  rv2           19735 non-null  float64
dtypes: float64(26), int64(2), object(1)
memory usage: 4.5+ MB

# Save the frame as CSV in the datasets directory; index=False leaves the row index out of the file
dataset.to_csv(f"{data_path}/preprocessed_energydata.csv", index=False)