ARIMA models#

Autoregressive Integrated Moving Average (ARIMA) models are a class of statistical models that try to explain the behavior of a time series using its own past values. Being a class of models, ARIMA models are defined by a set of parameters (p,d,q), each one corresponding to a different component of the ARIMA model.

Importing libraries and packages#

 1# Warnings
 2import warnings
 3
 4# Mathematical operations and data manipulation
 5import pandas as pd
 6
 7# Models
 8from pmdarima import auto_arima
 9
10# Statistics
11from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
12
13# Plotting
14import matplotlib.pyplot as plt
15
16warnings.filterwarnings("ignore")
17%matplotlib inline
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Input In [1], in <cell line: 8>()
      5 import pandas as pd
      7 # Models
----> 8 from pmdarima import auto_arima
     10 # Statistics
     11 from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/__init__.py:52, in <module>
     49 from . import __check_build
     51 # Stuff we want at top-level
---> 52 from .arima import auto_arima, ARIMA, AutoARIMA, StepwiseContext, decompose
     53 from .utils import acf, autocorr_plot, c, pacf, plot_acf, plot_pacf, \
     54     tsdisplay
     55 from .utils._show_versions import show_versions

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/arima/__init__.py:5, in <module>
      1 # -*- coding: utf-8 -*-
      2 #
      3 # Author: Taylor Smith <taylor.smith@alkaline-ml.com>
----> 5 from .approx import *
      6 from .arima import *
      7 from .auto import *

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/arima/approx.py:9, in <module>
      1 # -*- coding: utf-8 -*-
      2 #
      3 # Author: Taylor Smith <taylor.smith@alkaline-ml.com>
      4 #
      5 # R approx function
      7 import numpy as np
----> 9 from ..utils.array import c, check_endog
     10 from ..utils import get_callable
     11 from ..compat.numpy import DTYPE

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/utils/__init__.py:7, in <module>
      5 from .array import *
      6 from .metaestimators import *
----> 7 from .visualization import *
      8 from .wrapped import *
     11 def get_callable(key, dct):

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/pmdarima/utils/visualization.py:11, in <module>
      8 from ..compat.pandas import plotting as pd_plotting
      9 from ..compat.matplotlib import get_compatible_pyplot
---> 11 from statsmodels.graphics import tsaplots
     13 import numpy as np
     14 import os

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/statsmodels/graphics/tsaplots.py:11, in <module>
      8 import pandas as pd
     10 from statsmodels.graphics import utils
---> 11 from statsmodels.tsa.stattools import acf, pacf
     14 def _prepare_data_corr_plot(x, lags, zero):
     15     zero = bool(zero)

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/statsmodels/tsa/stattools.py:19, in <module>
     17 from scipy import stats
     18 from scipy.interpolate import interp1d
---> 19 from scipy.signal import correlate
     21 from statsmodels.regression.linear_model import OLS, yule_walker
     22 from statsmodels.tools.sm_exceptions import (
     23     CollinearityWarning,
     24     InfeasibleTestError,
     25     InterpolationWarning,
     26     MissingDataError,
     27 )

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/__init__.py:309, in <module>
      1 """
      2 =======================================
      3 Signal processing (:mod:`scipy.signal`)
   (...)
    307 
    308 """
--> 309 from . import _sigtools, windows
    310 from ._waveforms import *
    311 from ._max_len_seq import max_len_seq

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/windows/__init__.py:41, in <module>
      1 """
      2 Window functions (:mod:`scipy.signal.windows`)
      3 ==============================================
   (...)
     38 
     39 """
---> 41 from ._windows import *
     43 # Deprecated namespaces, to be removed in v2.0.0
     44 from . import windows

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/windows/_windows.py:7, in <module>
      4 import warnings
      6 import numpy as np
----> 7 from scipy import linalg, special, fft as sp_fft
      9 __all__ = ['boxcar', 'triang', 'parzen', 'bohman', 'blackman', 'nuttall',
     10            'blackmanharris', 'flattop', 'bartlett', 'hanning', 'barthann',
     11            'hamming', 'kaiser', 'gaussian', 'general_cosine',
     12            'general_gaussian', 'general_hamming', 'chebwin', 'cosine',
     13            'hann', 'exponential', 'tukey', 'taylor', 'dpss', 'get_window']
     16 def _len_guards(M):

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/__init__.py:91, in <module>
     89 from ._realtransforms import dct, idct, dst, idst, dctn, idctn, dstn, idstn
     90 from ._fftlog import fht, ifht, fhtoffset
---> 91 from ._helper import next_fast_len
     92 from ._backend import (set_backend, skip_backend, set_global_backend,
     93                        register_backend)
     94 from numpy.fft import fftfreq, rfftfreq, fftshift, ifftshift

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_helper.py:3, in <module>
      1 from functools import update_wrapper, lru_cache
----> 3 from ._pocketfft import helper as _helper
      6 def next_fast_len(target, real=False):
      7     """Find the next fast size of input data to ``fft``, for zero-padding, etc.
      8 
      9     SciPy's FFT algorithms gain their speed by a recursive divide and conquer
   (...)
     59 
     60     """

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/__init__.py:3, in <module>
      1 """ FFT backend using pypocketfft """
----> 3 from .basic import *
      4 from .realtransforms import *
      5 from .helper import *

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/basic.py:6, in <module>
      4 import numpy as np
      5 import functools
----> 6 from . import pypocketfft as pfft
      7 from .helper import (_asfarray, _init_nd_shape_and_axes, _datacopied,
      8                      _fix_shape, _fix_shape_1d, _normalization,
      9                      _workers)
     11 def c2c(forward, x, n=None, axis=-1, norm=None, overwrite_x=False,
     12         workers=None, *, plan=None):

ImportError: /home/docs/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /home/docs/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-39-x86_64-linux-gnu.so)

Set paths#

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset#

1# load hourly data
2dataset = pd.read_csv(f"{data_path}/preprocessed_hour.csv")
3dataset.head()
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 winter 2011 1 0 0 Saturday 0 clear 0.24 0.2879 81.0 0.0 3 13 16
1 2 2011-01-01 winter 2011 1 1 0 Saturday 0 clear 0.22 0.2727 80.0 0.0 8 32 40
2 3 2011-01-01 winter 2011 1 2 0 Saturday 0 clear 0.22 0.2727 80.0 0.0 5 27 32
3 4 2011-01-01 winter 2011 1 3 0 Saturday 0 clear 0.24 0.2879 75.0 0.0 3 10 13
4 5 2011-01-01 winter 2011 1 4 0 Saturday 0 clear 0.24 0.2879 75.0 0.0 0 1 1
1# print some generic statistics about the data
2print(f"Shape of data: {dataset.shape}")
3print(f"Number of missing values in the data: {dataset.isnull().sum().sum()}")
4
5# get statistics on the numerical columns
6dataset.describe().T
Shape of data: (17379, 17)
Number of missing values in the data: 0
count mean std min 25% 50% 75% max
instant 17379.0 8690.000000 5017.029500 1.00 4345.5000 8690.0000 13034.5000 17379.0000
yr 17379.0 2011.502561 0.500008 2011.00 2011.0000 2012.0000 2012.0000 2012.0000
mnth 17379.0 6.537775 3.438776 1.00 4.0000 7.0000 10.0000 12.0000
hr 17379.0 11.546752 6.914405 0.00 6.0000 12.0000 18.0000 23.0000
holiday 17379.0 0.028770 0.167165 0.00 0.0000 0.0000 0.0000 1.0000
workingday 17379.0 0.682721 0.465431 0.00 0.0000 1.0000 1.0000 1.0000
temp 17379.0 0.496987 0.192556 0.02 0.3400 0.5000 0.6600 1.0000
atemp 17379.0 0.475775 0.171850 0.00 0.3333 0.4848 0.6212 1.0000
hum 17379.0 62.722884 19.292983 0.00 48.0000 63.0000 78.0000 100.0000
windspeed 17379.0 12.736540 8.196795 0.00 7.0015 12.9980 16.9979 56.9969
casual 17379.0 35.676218 49.305030 0.00 4.0000 17.0000 48.0000 367.0000
registered 17379.0 153.786869 151.357286 0.00 34.0000 115.0000 220.0000 886.0000
cnt 17379.0 189.463088 181.387599 1.00 40.0000 142.0000 281.0000 977.0000

Preprocessing#

1# get daily rides
2daily_rides = dataset[["dteday", "registered", "casual"]]
3daily_rides = daily_rides.groupby("dteday").sum()
4
5# convert index to DateTime object
6daily_rides.index = pd.to_datetime(daily_rides.index)
 1# make time series stationary
 2registered = daily_rides["registered"]
 3registered_ma = registered.rolling(10).mean()
 4registered_ma_diff = registered - registered_ma
 5registered_ma_diff.dropna(inplace=True)
 6
 7casual = daily_rides["casual"]
 8casual_ma = casual.rolling(10).mean()
 9casual_ma_diff = casual - casual_ma
10casual_ma_diff.dropna(inplace=True)

ACF and PACF plots for registered rides#

 1fig, axes = plt.subplots(3, 3, figsize=(25, 12))
 2
 3# Plotting original series
 4original = daily_rides["registered"]
 5axes[0, 0].plot(original)
 6axes[0, 0].set_title("Original series")
 7plot_acf(original, ax=axes[0, 1])
 8plot_pacf(original, ax=axes[0, 2])
 9
10# Plotting first order integrated series
11first_order_int = original.diff().dropna()
12axes[1, 0].plot(first_order_int)
13axes[1, 0].set_title("First order integrated")
14plot_acf(first_order_int, ax=axes[1, 1])
15plot_pacf(first_order_int, ax=axes[1, 2])
16
17# Plotting first order integrated series
18second_order_int = first_order_int.diff().dropna()
19axes[2, 0].plot(first_order_int)
20axes[2, 0].set_title("Second order integrated")
21plot_acf(second_order_int, ax=axes[2, 1])
22plot_pacf(second_order_int, ax=axes[2, 2])
23
24fig.savefig(f"{assets_path}/acf_pacf.png", format="png")
../../_images/2fba98c3bd862756c573768df6830a21412a42150efe12b3d59529c93c65e863.png
 1# Fitting an ARIMA model to the registered rides
 2model = auto_arima(
 3    registered,
 4    start_p=1,
 5    start_q=1,
 6    max_p=3,
 7    max_q=3,
 8    information_criterion="aic",
 9)
10
11print(model.summary())
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                      y   No. Observations:                  731
Model:               SARIMAX(3, 1, 3)   Log Likelihood               -5854.491
Date:                Sun, 20 Mar 2022   AIC                          11722.982
Time:                        21:28:04   BIC                          11755.133
Sample:                             0   HQIC                         11735.386
                                - 731                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.6077      0.050     32.421      0.000       1.511       1.705
ar.L2         -1.4474      0.062    -23.372      0.000      -1.569      -1.326
ar.L3          0.3608      0.049      7.396      0.000       0.265       0.456
ma.L1         -2.1153      0.034    -62.375      0.000      -2.182      -2.049
ma.L2          2.0601      0.047     43.824      0.000       1.968       2.152
ma.L3         -0.8605      0.032    -27.085      0.000      -0.923      -0.798
sigma2      6.239e+05   2.42e+04     25.784      0.000    5.77e+05    6.71e+05
===================================================================================
Ljung-Box (L1) (Q):                   0.38   Jarque-Bera (JB):               756.72
Prob(Q):                              0.54   Prob(JB):                         0.00
Heteroskedasticity (H):               3.35   Skew:                            -1.32
Prob(H) (two-sided):                  0.00   Kurtosis:                         7.23
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
1# plot original and predicted values
2plot_data = pd.DataFrame(registered)
3plot_data["predicted"] = model.predict_in_sample()
4plot_data.plot(figsize=(12, 8))
5plt.ylabel("number of registered rides")
6plt.title("Predicted vs actual number of rides")
7plt.savefig(f"{assets_path}/registered_arima_fit.png", format="png")
../../_images/bb114231924aa0e292d92420e9cb5a49fe208220e73a9a09840cdfae7155d745.png