ARIMA models#

Autoregressive Integrated Moving Average (ARIMA) models are a class of statistical models that try to explain the behavior of a time series using its own past values. Each member of the class is identified by three parameters (p, d, q), one per component of the model: p is the order of the autoregressive (AR) part, d is the number of times the series is differenced to make it stationary (the "integrated" part), and q is the order of the moving-average (MA) part.

Importing libraries and packages#

# Warnings
import warnings

# Mathematical operations and data manipulation
import pandas as pd

# Models
from pmdarima import auto_arima

# Statistics
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Plotting
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
%matplotlib inline

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Input In [1], in <cell line: 8>()
import pandas as pd
# Models
----> 8 from pmdarima import auto_arima
# Statistics
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/__init__.py:52, in <module>
from . import __check_build
# Stuff we want at top-level
---> 52 from .arima import auto_arima, ARIMA, AutoARIMA, StepwiseContext, decompose
from .utils import acf, autocorr_plot, c, pacf, plot_acf, plot_pacf, \
   tsdisplay
from .utils._show_versions import show_versions

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/arima/__init__.py:5, in <module>
# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
----> 5 from .approx import *
from .arima import *
from .auto import *

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/arima/approx.py:9, in <module>
# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# R approx function
import numpy as np
----> 9 from ..utils.array import c, check_endog
from ..utils import get_callable
from ..compat.numpy import DTYPE

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/utils/__init__.py:7, in <module>
from .array import *
from .metaestimators import *
----> 7 from .visualization import *
from .wrapped import *
def get_callable(key, dct):

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/utils/visualization.py:11, in <module>
from ..compat.pandas import plotting as pd_plotting
from ..compat.matplotlib import get_compatible_pyplot
---> 11 from statsmodels.graphics import tsaplots
import numpy as np
import os

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/statsmodels/graphics/tsaplots.py:11, in <module>
import pandas as pd
from statsmodels.graphics import utils
---> 11 from statsmodels.tsa.stattools import acf, pacf
def _prepare_data_corr_plot(x, lags, zero):
   zero = bool(zero)

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/statsmodels/tsa/stattools.py:19, in <module>
from scipy import stats
from scipy.interpolate import interp1d
---> 19 from scipy.signal import correlate
from statsmodels.regression.linear_model import OLS, yule_walker
from statsmodels.tools.sm_exceptions import (
   CollinearityWarning,
   InfeasibleTestError,
   InterpolationWarning,
   MissingDataError,
)

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/__init__.py:309, in <module>
"""
=======================================
Signal processing (:mod:`scipy.signal`)
   (...)

"""
--> 309 from . import _sigtools, windows
from ._waveforms import *
from ._max_len_seq import max_len_seq

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/windows/__init__.py:41, in <module>
"""
Window functions (:mod:`scipy.signal.windows`)
==============================================
   (...)

"""
---> 41 from ._windows import *
# Deprecated namespaces, to be removed in v2.0.0
from . import windows

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/windows/_windows.py:7, in <module>
import warnings
import numpy as np
----> 7 from scipy import linalg, special, fft as sp_fft
__all__ = ['boxcar', 'triang', 'parzen', 'bohman', 'blackman', 'nuttall',
          'blackmanharris', 'flattop', 'bartlett', 'hanning', 'barthann',
          'hamming', 'kaiser', 'gaussian', 'general_cosine',
          'general_gaussian', 'general_hamming', 'chebwin', 'cosine',
          'hann', 'exponential', 'tukey', 'taylor', 'dpss', 'get_window']
def _len_guards(M):

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/__init__.py:91, in <module>
from ._realtransforms import dct, idct, dst, idst, dctn, idctn, dstn, idstn
from ._fftlog import fht, ifht, fhtoffset
---> 91 from ._helper import next_fast_len
from ._backend import (set_backend, skip_backend, set_global_backend,
                      register_backend)
from numpy.fft import fftfreq, rfftfreq, fftshift, ifftshift

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_helper.py:3, in <module>
from functools import update_wrapper, lru_cache
----> 3 from ._pocketfft import helper as _helper
def next_fast_len(target, real=False):
   """Find the next fast size of input data to ``fft``, for zero-padding, etc.

   SciPy's FFT algorithms gain their speed by a recursive divide and conquer
   (...)

   """

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/__init__.py:3, in <module>
""" FFT backend using pypocketfft """
----> 3 from .basic import *
from .realtransforms import *
from .helper import *

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/basic.py:6, in <module>
import numpy as np
import functools
----> 6 from . import pypocketfft as pfft
from .helper import (_asfarray, _init_nd_shape_and_axes, _datacopied,
                    _fix_shape, _fix_shape_1d, _normalization,
                    _workers)
def c2c(forward, x, n=None, axis=-1, norm=None, overwrite_x=False,
       workers=None, *, plan=None):

ImportError: /home/docs/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /home/docs/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-39-x86_64-linux-gnu.so)

Set paths#

# Directory that generated results (e.g. figures) are saved to
assets_path = "./assets"
# Directory that the input datasets are read from
data_path = "./datasets"

Loading dataset#

# Read the preprocessed hourly records into a DataFrame
hourly_csv = f"{data_path}/preprocessed_hour.csv"
dataset = pd.read_csv(hourly_csv)

# Preview the first rows of the table
dataset.head()

	instant	dteday	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed	casual	registered	cnt
0	1	2011-01-01	winter	2011	1	0	0	Saturday	0	clear	0.24	0.2879	81.0	0.0	3	13	16
1	2	2011-01-01	winter	2011	1	1	0	Saturday	0	clear	0.22	0.2727	80.0	0.0	8	32	40
2	3	2011-01-01	winter	2011	1	2	0	Saturday	0	clear	0.22	0.2727	80.0	0.0	5	27	32
3	4	2011-01-01	winter	2011	1	3	0	Saturday	0	clear	0.24	0.2879	75.0	0.0	3	10	13
4	5	2011-01-01	winter	2011	1	4	0	Saturday	0	clear	0.24	0.2879	75.0	0.0	0	1	1

# Report the table dimensions and the total number of missing cells
print(
    f"Shape of data: {dataset.shape}",
    f"Number of missing values in the data: {dataset.isnull().sum().sum()}",
    sep="\n",
)

# Summary statistics of the numerical columns, transposed so that each
# column of the dataset becomes one row of the report
dataset.describe().transpose()

Shape of data: (17379, 17)
Number of missing values in the data: 0

	count	mean	std	min	25%	50%	75%	max
instant	17379.0	8690.000000	5017.029500	1.00	4345.5000	8690.0000	13034.5000	17379.0000
yr	17379.0	2011.502561	0.500008	2011.00	2011.0000	2012.0000	2012.0000	2012.0000
mnth	17379.0	6.537775	3.438776	1.00	4.0000	7.0000	10.0000	12.0000
hr	17379.0	11.546752	6.914405	0.00	6.0000	12.0000	18.0000	23.0000
holiday	17379.0	0.028770	0.167165	0.00	0.0000	0.0000	0.0000	1.0000
workingday	17379.0	0.682721	0.465431	0.00	0.0000	1.0000	1.0000	1.0000
temp	17379.0	0.496987	0.192556	0.02	0.3400	0.5000	0.6600	1.0000
atemp	17379.0	0.475775	0.171850	0.00	0.3333	0.4848	0.6212	1.0000
hum	17379.0	62.722884	19.292983	0.00	48.0000	63.0000	78.0000	100.0000
windspeed	17379.0	12.736540	8.196795	0.00	7.0015	12.9980	16.9979	56.9969
casual	17379.0	35.676218	49.305030	0.00	4.0000	17.0000	48.0000	367.0000
registered	17379.0	153.786869	151.357286	0.00	34.0000	115.0000	220.0000	886.0000
cnt	17379.0	189.463088	181.387599	1.00	40.0000	142.0000	281.0000	977.0000

Preprocessing#

# Aggregate the hourly records into one row of ride totals per calendar day
daily_rides = dataset.groupby("dteday")[["registered", "casual"]].sum()

# Replace the date values in the index with proper DateTime objects
daily_rides.index = pd.to_datetime(daily_rides.index)

# Make each series stationary by subtracting its 10-day rolling mean; the
# leading days without a full window yield NaN and are discarded
registered = daily_rides["registered"]
registered_ma = registered.rolling(10).mean()
registered_ma_diff = (registered - registered_ma).dropna()

casual = daily_rides["casual"]
casual_ma = casual.rolling(10).mean()
casual_ma_diff = (casual - casual_ma).dropna()

ACF and PACF plots for registered rides#

fig, axes = plt.subplots(3, 3, figsize=(25, 12))

# Daily registered rides, plus the series obtained by differencing it once
# and twice (the leading NaN produced by each diff() is dropped)
original = daily_rides["registered"]
first_order_int = original.diff().dropna()
second_order_int = first_order_int.diff().dropna()

# One grid row per series: the series itself, its ACF and its PACF.
# Each title is paired with the series it describes, so every panel in a row
# is guaranteed to be drawn from the same data as its label.
rows = (
    ("Original series", original),
    ("First order integrated", first_order_int),
    ("Second order integrated", second_order_int),
)
for (series_ax, acf_ax, pacf_ax), (title, series) in zip(axes, rows):
    series_ax.plot(series)
    series_ax.set_title(title)
    plot_acf(series, ax=acf_ax)
    plot_pacf(series, ax=pacf_ax)

fig.savefig(f"{assets_path}/acf_pacf.png", format="png")

../../_images/2fba98c3bd862756c573768df6830a21412a42150efe12b3d59529c93c65e863.png

# Fit an ARIMA model to the registered rides: auto_arima is started at
# p = q = 1, limited to at most 3 for both orders, and told to use AIC as
# its information criterion
search_options = {
    "start_p": 1,
    "start_q": 1,
    "max_p": 3,
    "max_q": 3,
    "information_criterion": "aic",
}
model = auto_arima(registered, **search_options)

print(model.summary())

                               SARIMAX Results                                
==============================================================================
Dep. Variable:                      y   No. Observations:                  731
Model:               SARIMAX(3, 1, 3)   Log Likelihood               -5854.491
Date:                Sun, 20 Mar 2022   AIC                          11722.982
Time:                        21:28:04   BIC                          11755.133
Sample:                             0   HQIC                         11735.386
                                - 731                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.6077      0.050     32.421      0.000       1.511       1.705
ar.L2         -1.4474      0.062    -23.372      0.000      -1.569      -1.326
ar.L3          0.3608      0.049      7.396      0.000       0.265       0.456
ma.L1         -2.1153      0.034    -62.375      0.000      -2.182      -2.049
ma.L2          2.0601      0.047     43.824      0.000       1.968       2.152
ma.L3         -0.8605      0.032    -27.085      0.000      -0.923      -0.798
sigma2      6.239e+05   2.42e+04     25.784      0.000    5.77e+05    6.71e+05
===================================================================================
Ljung-Box (L1) (Q):                   0.38   Jarque-Bera (JB):               756.72
Prob(Q):                              0.54   Prob(JB):                         0.00
Heteroskedasticity (H):               3.35   Skew:                            -1.32
Prob(H) (two-sided):                  0.00   Kurtosis:                         7.23
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

# Put the observed series and the model's in-sample predictions side by side
plot_data = registered.to_frame()
plot_data["predicted"] = model.predict_in_sample()

# Draw both columns on one axes, label it, and save the resulting figure
ax = plot_data.plot(figsize=(12, 8))
ax.set_ylabel("number of registered rides")
ax.set_title("Predicted vs actual number of rides")
ax.figure.savefig(f"{assets_path}/registered_arima_fit.png", format="png")

../../_images/bb114231924aa0e292d92420e9cb5a49fe208220e73a9a09840cdfae7155d745.png