Exploratory data analysis#

Importing libraries and packages#

 1# Mathematical operations and data manipulation
 2import numpy as np
 3import pandas as pd
 4
 5# Plotting
 6import seaborn as sns
 7import matplotlib.pyplot as plt
 8import plotly.graph_objs as go
 9
10# Warnings
11import warnings
12
13warnings.filterwarnings("ignore")
14
15%matplotlib inline

Set paths#

# Directory holding the input datasets.
data_path = "./datasets"
# Directory that results are saved to.
assets_path = "./assets"

Loading dataset#

1# load data
2dataset = pd.read_csv(f"{data_path}/engineered_energydata.csv")
3dataset.head().T
0 1 2 3 4
date_time 2016-01-11 17:00:00 2016-01-11 17:10:00 2016-01-11 17:20:00 2016-01-11 17:30:00 2016-01-11 17:40:00
month 1 1 1 1 1
day 1 1 1 1 1
a_energy 60 60 50 50 60
kitchen_temp 19.89 19.89 19.89 19.89 19.89
kitchen_hum 47.596667 46.693333 46.3 46.066667 46.333333
liv_temp 19.2 19.2 19.2 19.2 19.2
liv_hum 44.79 44.7225 44.626667 44.59 44.53
laun_temp 19.79 19.79 19.79 19.79 19.79
laun_hum 44.73 44.79 44.933333 45.0 45.0
off_temp 19.0 19.0 18.926667 18.89 18.89
off_hum 45.566667 45.9925 45.89 45.723333 45.53
bath_temp 17.166667 17.166667 17.166667 17.166667 17.2
bath_hum 55.2 55.2 55.09 55.09 55.09
out_b_temp 7.026667 6.833333 6.56 6.433333 6.366667
out_b_hum 84.256667 84.063333 83.156667 83.423333 84.893333
iron_temp 17.2 17.2 17.2 17.133333 17.2
iron_hum 41.626667 41.56 41.433333 41.29 41.23
teen_temp 18.2 18.2 18.2 18.1 18.1
teen_hum 48.9 48.863333 48.73 48.59 48.59
par_temp 17.033333 17.066667 17.0 17.0 17.0
par_hum 45.53 45.56 45.5 45.4 45.4
out_temp 6.6 6.483333 6.366667 6.25 6.133333
out_press 733.5 733.6 733.7 733.8 733.9
out_hum 92.0 92.0 92.0 92.0 92.0
wind 7.0 6.666667 6.333333 6.0 5.666667
visibility 63.0 59.166667 55.333333 51.5 47.666667
dew_point 5.3 5.2 5.1 5.0 4.9
rv1 13.275433 18.606195 28.642668 45.410389 10.084097
rv2 13.275433 18.606195 28.642668 45.410389 10.084097

Data analysis#

Visualizing the Dataset#

# Line trace of appliance energy (a_energy) against the date_time column.
app_date = go.Scatter(
    x=dataset["date_time"],
    y=dataset["a_energy"],
    mode="lines",
)

layout = go.Layout(
    title="Appliance Energy Consumed by Date",
    xaxis=dict(title="Date"),
    yaxis=dict(title="Wh"),
)
fig = go.Figure(data=[app_date], layout=layout)
fig.show()
# Total appliance energy per month; as_index=False keeps "month" as a column.
app_mon = dataset.groupby(by=["month"], as_index=False)["a_energy"].sum()
# Show the months with the largest totals first.
app_mon.sort_values(by="a_energy", ascending=False).head()
month a_energy
2 3 283190
3 4 274030
4 5 259120
1 2 258270
0 1 150060
# Bar chart of the monthly totals in app_mon.
plt.subplots(figsize=(15, 6))
# x and y are passed by keyword: seaborn 0.12+ rejects them as positional
# arguments (0.11 only warns).
am = sns.barplot(x="month", y="a_energy", data=app_mon)
plt.xlabel("Month")
plt.ylabel("Energy Consumed by Appliances")
plt.title("Total Energy Consumed by Appliances per Month")
plt.show()
../../_images/2db3d2aea826e4df2aa1ba71dd8519bcbb10fe8cae5fef26b0a4990dcc7fdf73.png

Observing the Trend between a_energy and day#

# Total appliance energy per value of "day"; as_index=False keeps "day" as a
# column.
app_day = dataset.groupby(by=["day"], as_index=False)["a_energy"].sum()
# List every day, largest total first.
app_day.sort_values(by="a_energy", ascending=False)
day a_energy
2 3 191700
6 7 183210
3 4 177830
1 2 175930
5 6 173640
0 1 161190
4 5 161170
# Bar chart of the per-day totals in app_day.
plt.subplots(figsize=(15, 6))
# x and y are passed by keyword: seaborn 0.12+ rejects them as positional
# arguments (0.11 only warns).
ad = sns.barplot(x="day", y="a_energy", data=app_day)
plt.xlabel("Day of the Week")
plt.ylabel("Energy Consumed by Appliances")
plt.title("Total Energy Consumed by Appliances")
plt.show()
../../_images/5adacdb7b28c680315f0d17908c440d896a52bed95d92b8660c32c6d5f580c7a.png

Distributions of the Temperature Columns#

 1col_temp = [
 2    "kitchen_temp",
 3    "liv_temp",
 4    "laun_temp",
 5    "off_temp",
 6    "bath_temp",
 7    "out_b_temp",
 8    "iron_temp",
 9    "teen_temp",
10    "par_temp",
11]
12temp = dataset[col_temp]
13temp.head()
kitchen_temp liv_temp laun_temp off_temp bath_temp out_b_temp iron_temp teen_temp par_temp
0 19.89 19.2 19.79 19.000000 17.166667 7.026667 17.200000 18.2 17.033333
1 19.89 19.2 19.79 19.000000 17.166667 6.833333 17.200000 18.2 17.066667
2 19.89 19.2 19.79 18.926667 17.166667 6.560000 17.200000 18.2 17.000000
3 19.89 19.2 19.79 18.890000 17.166667 6.433333 17.133333 18.1 17.000000
4 19.89 19.2 19.79 18.890000 17.200000 6.366667 17.200000 18.1 17.000000
# One 15-bin histogram per temperature column.
temp.hist(bins=15, figsize=(12, 16))
array([[<AxesSubplot:title={'center':'kitchen_temp'}>,
        <AxesSubplot:title={'center':'liv_temp'}>,
        <AxesSubplot:title={'center':'laun_temp'}>],
       [<AxesSubplot:title={'center':'off_temp'}>,
        <AxesSubplot:title={'center':'bath_temp'}>,
        <AxesSubplot:title={'center':'out_b_temp'}>],
       [<AxesSubplot:title={'center':'iron_temp'}>,
        <AxesSubplot:title={'center':'teen_temp'}>,
        <AxesSubplot:title={'center':'par_temp'}>]], dtype=object)
../../_images/18fee684029f3a898081fdabd5ec5aa3bc75762163aa01b540bcc0780b00c004.png

Distributions of the Humidity Columns#

 1col_hum = [
 2    "kitchen_hum",
 3    "liv_hum",
 4    "laun_hum",
 5    "off_hum",
 6    "bath_hum",
 7    "out_b_hum",
 8    "iron_hum",
 9    "teen_hum",
10    "par_hum",
11]
12hum = dataset[col_hum]
13hum.head()
kitchen_hum liv_hum laun_hum off_hum bath_hum out_b_hum iron_hum teen_hum par_hum
0 47.596667 44.790000 44.730000 45.566667 55.20 84.256667 41.626667 48.900000 45.53
1 46.693333 44.722500 44.790000 45.992500 55.20 84.063333 41.560000 48.863333 45.56
2 46.300000 44.626667 44.933333 45.890000 55.09 83.156667 41.433333 48.730000 45.50
3 46.066667 44.590000 45.000000 45.723333 55.09 83.423333 41.290000 48.590000 45.40
4 46.333333 44.530000 45.000000 45.530000 55.09 84.893333 41.230000 48.590000 45.40
# One 15-bin histogram per humidity column.
hum.hist(bins=15, figsize=(12, 16))
array([[<AxesSubplot:title={'center':'kitchen_hum'}>,
        <AxesSubplot:title={'center':'liv_hum'}>,
        <AxesSubplot:title={'center':'laun_hum'}>],
       [<AxesSubplot:title={'center':'off_hum'}>,
        <AxesSubplot:title={'center':'bath_hum'}>,
        <AxesSubplot:title={'center':'out_b_hum'}>],
       [<AxesSubplot:title={'center':'iron_hum'}>,
        <AxesSubplot:title={'center':'teen_hum'}>,
        <AxesSubplot:title={'center':'par_hum'}>]], dtype=object)
../../_images/11e37f745a95e3f1da4eb394f43c5ad532d6bbb5c29ae8f5e26e1d113ed28ecb.png
 1col_weather = [
 2    "out_temp",
 3    "dew_point",
 4    "out_hum",
 5    "out_press",
 6    "wind",
 7    "visibility",
 8]
 9weath = dataset[col_weather]
10weath.head()
out_temp dew_point out_hum out_press wind visibility
0 6.600000 5.3 92.0 733.5 7.000000 63.000000
1 6.483333 5.2 92.0 733.6 6.666667 59.166667
2 6.366667 5.1 92.0 733.7 6.333333 55.333333
3 6.250000 5.0 92.0 733.8 6.000000 51.500000
4 6.133333 4.9 92.0 733.9 5.666667 47.666667
# One 15-bin histogram per weather column.
weath.hist(bins=15, figsize=(12, 16))
array([[<AxesSubplot:title={'center':'out_temp'}>,
        <AxesSubplot:title={'center':'dew_point'}>],
       [<AxesSubplot:title={'center':'out_hum'}>,
        <AxesSubplot:title={'center':'out_press'}>],
       [<AxesSubplot:title={'center':'wind'}>,
        <AxesSubplot:title={'center':'visibility'}>]], dtype=object)
../../_images/8c397e374cb61ec5a9f30aa6c9c599f2318dc1e92396301a457ba7e468070092.png

Plotting out_b_hum, out_hum, visibility, and wind#

# 2x2 grid of distributions for out_b_hum, out_hum, visibility and wind.
f, ax = plt.subplots(2, 2, figsize=(12, 8))

# sns.distplot is deprecated since seaborn 0.11; histplot with a KDE overlay
# and density-normalised bars is its replacement.
dist_kws = dict(bins=10, kde=True, stat="density")
obh = sns.histplot(hum["out_b_hum"], ax=ax[0][0], **dist_kws)
oh = sns.histplot(weath["out_hum"], ax=ax[0][1], **dist_kws)
vis = sns.histplot(weath["visibility"], ax=ax[1][0], **dist_kws)
wind = sns.histplot(weath["wind"], ax=ax[1][1], **dist_kws)
../../_images/18631981d8a29c8e9365317cef02f2b8085db7391830501e1471e42353b469af.png
# Pairwise correlations of the numeric columns only. Non-numeric columns are
# dropped explicitly because DataFrame.corr() raises on them in pandas 2.0+.
corr = dataset.select_dtypes(include="number").corr()

# Hide the upper triangle (diagonal included); the matrix is symmetric.
# Plain `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(16, 14))
# xticklabels/yticklabels=True makes heatmap label every row and column
# itself; manually placed integer tick positions fall on cell edges rather
# than cell centres.
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    mask=mask,
    xticklabels=True,
    yticklabels=True,
    ax=ax,
)
plt.show()
../../_images/aebd8792cb7ba2d05880e5e14fc8d74db0a7eaab5931d62420da8ea10f931f35.png