Regressions#

Load the data and perform some initial exploration to get a basic understanding of it, in particular how the various features are distributed.

Importing libraries and packages#

# Warnings
import warnings

# Mathematical operations and data manipulation
import numpy as np
import pandas as pd

# Statistics
import statsmodels.api as sm

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
%matplotlib inline

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Input In [1], in <cell line: 9>()
      6 import pandas as pd
      8 # Statistics
----> 9 import statsmodels.api as sm
     11 # Plotting
     12 import seaborn as sns

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/statsmodels/api.py:105, in <module>
     94 from .genmod import api as genmod
     95 from .genmod.api import (
     96     GEE,
     97     GLM,
   (...)
    103     families,
    104 )
--> 105 from .graphics import api as graphics
    106 from .graphics.gofplots import ProbPlot, qqline, qqplot, qqplot_2samples
    107 from .imputation.bayes_mi import MI, BayesGaussMI

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/statsmodels/graphics/api.py:1, in <module>
----> 1 from . import tsaplots as tsa
      2 from .agreement import mean_diff_plot
      3 from .boxplots import beanplot, violinplot

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/statsmodels/graphics/tsaplots.py:11, in <module>
      8 import pandas as pd
     10 from statsmodels.graphics import utils
---> 11 from statsmodels.tsa.stattools import acf, pacf
     14 def _prepare_data_corr_plot(x, lags, zero):
     15     zero = bool(zero)

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/statsmodels/tsa/stattools.py:19, in <module>
     17 from scipy import stats
     18 from scipy.interpolate import interp1d
---> 19 from scipy.signal import correlate
     21 from statsmodels.regression.linear_model import OLS, yule_walker
     22 from statsmodels.tools.sm_exceptions import (
     23     CollinearityWarning,
     24     InfeasibleTestError,
     25     InterpolationWarning,
     26     MissingDataError,
     27 )

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/__init__.py:309, in <module>
      1 """
      2 =======================================
      3 Signal processing (:mod:`scipy.signal`)
   (...)
    307 
    308 """
--> 309 from . import _sigtools, windows
    310 from ._waveforms import *
    311 from ._max_len_seq import max_len_seq

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/windows/__init__.py:41, in <module>
      1 """
      2 Window functions (:mod:`scipy.signal.windows`)
      3 ==============================================
   (...)
     38 
     39 """
---> 41 from ._windows import *
     43 # Deprecated namespaces, to be removed in v2.0.0
     44 from . import windows

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/signal/windows/_windows.py:7, in <module>
      4 import warnings
      6 import numpy as np
----> 7 from scipy import linalg, special, fft as sp_fft
      9 __all__ = ['boxcar', 'triang', 'parzen', 'bohman', 'blackman', 'nuttall',
     10            'blackmanharris', 'flattop', 'bartlett', 'hanning', 'barthann',
     11            'hamming', 'kaiser', 'gaussian', 'general_cosine',
     12            'general_gaussian', 'general_hamming', 'chebwin', 'cosine',
     13            'hann', 'exponential', 'tukey', 'taylor', 'dpss', 'get_window']
     16 def _len_guards(M):

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/__init__.py:91, in <module>
     89 from ._realtransforms import dct, idct, dst, idst, dctn, idctn, dstn, idstn
     90 from ._fftlog import fht, ifht, fhtoffset
---> 91 from ._helper import next_fast_len
     92 from ._backend import (set_backend, skip_backend, set_global_backend,
     93                        register_backend)
     94 from numpy.fft import fftfreq, rfftfreq, fftshift, ifftshift

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_helper.py:3, in <module>
      1 from functools import update_wrapper, lru_cache
----> 3 from ._pocketfft import helper as _helper
      6 def next_fast_len(target, real=False):
      7     """Find the next fast size of input data to ``fft``, for zero-padding, etc.
      8 
      9     SciPy's FFT algorithms gain their speed by a recursive divide and conquer
   (...)
     59 
     60     """

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/__init__.py:3, in <module>
      1 """ FFT backend using pypocketfft """
----> 3 from .basic import *
      4 from .realtransforms import *
      5 from .helper import *

File ~/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/basic.py:6, in <module>
      4 import numpy as np
      5 import functools
----> 6 from . import pypocketfft as pfft
      7 from .helper import (_asfarray, _init_nd_shape_and_axes, _datacopied,
      8                      _fix_shape, _fix_shape_1d, _normalization,
      9                      _workers)
     11 def c2c(forward, x, n=None, axis=-1, norm=None, overwrite_x=False,
     12         workers=None, *, plan=None):

ImportError: /home/docs/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /home/docs/checkouts/readthedocs.org/user_builds/analysing/conda/latest/lib/python3.9/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-39-x86_64-linux-gnu.so)

Set paths#

# Directory the datasets are read from, and directory the results
# (assets) are saved to
data_path, assets_path = "./datasets", "./assets"

Loading dataset#

# Read the semicolon-separated CSV into a DataFrame and preview the first
# rows, transposed so that every column is shown on its own line
dataset = pd.read_csv(
    f"{data_path}/bank-additional-full.csv",
    delimiter=";",
)
dataset.head().transpose()

	0	1	2	3	4
age	56	57	37	40	56
job	housemaid	services	services	admin.	services
marital	married	married	married	married	married
education	basic.4y	high.school	high.school	basic.6y	high.school
default	no	unknown	no	no	no
housing	no	no	yes	no	no
loan	no	no	no	no	yes
contact	telephone	telephone	telephone	telephone	telephone
month	may	may	may	may	may
day_of_week	mon	mon	mon	mon	mon
duration	261	149	226	151	307
campaign	1	1	1	1	1
pdays	999	999	999	999	999
previous	0	0	0	0	0
poutcome	nonexistent	nonexistent	nonexistent	nonexistent	nonexistent
emp.var.rate	1.1	1.1	1.1	1.1	1.1
cons.price.idx	93.994	93.994	93.994	93.994	93.994
cons.conf.idx	-36.4	-36.4	-36.4	-36.4	-36.4
euribor3m	4.857	4.857	4.857	4.857	4.857
nr.employed	5191.0	5191.0	5191.0	5191.0	5191.0
y	no	no	no	no	no

Exploring dataset#

# Report the overall shape of the data, then one line per column with its
# dtype and its number of missing values
print(f"Data dimension: {dataset.shape}")
missing_counts = dataset.isna().sum()
for name, dtype in dataset.dtypes.items():
    print(
        f"Column: {name:35} | "
        f"type: {str(dtype):7} | "
        f"missing values: {missing_counts[name]:3d}"
    )

Data dimension: (41188, 21)
Column: age                                 | type: int64   | missing values:   0
Column: job                                 | type: object  | missing values:   0
Column: marital                             | type: object  | missing values:   0
Column: education                           | type: object  | missing values:   0
Column: default                             | type: object  | missing values:   0
Column: housing                             | type: object  | missing values:   0
Column: loan                                | type: object  | missing values:   0
Column: contact                             | type: object  | missing values:   0
Column: month                               | type: object  | missing values:   0
Column: day_of_week                         | type: object  | missing values:   0
Column: duration                            | type: int64   | missing values:   0
Column: campaign                            | type: int64   | missing values:   0
Column: pdays                               | type: int64   | missing values:   0
Column: previous                            | type: int64   | missing values:   0
Column: poutcome                            | type: object  | missing values:   0
Column: emp.var.rate                        | type: float64 | missing values:   0
Column: cons.price.idx                      | type: float64 | missing values:   0
Column: cons.conf.idx                       | type: float64 | missing values:   0
Column: euribor3m                           | type: float64 | missing values:   0
Column: nr.employed                         | type: float64 | missing values:   0
Column: y                                   | type: object  | missing values:   0

# Names of all columns whose dtype is a NumPy numeric type
numerical_features = [
    name
    for name, dtype in dataset.dtypes.items()
    if np.issubdtype(dtype, np.number)
]
print(numerical_features)

['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Descriptive statistics of the numerical features, one feature per row
dataset.loc[:, numerical_features].describe().transpose()

	count	mean	std	min	25%	50%	75%	max
age	41188.0	40.024060	10.421250	17.000	32.000	38.000	47.000	98.000
duration	41188.0	258.285010	259.279249	0.000	102.000	180.000	319.000	4918.000
campaign	41188.0	2.567593	2.770014	1.000	1.000	2.000	3.000	56.000
pdays	41188.0	962.475454	186.910907	0.000	999.000	999.000	999.000	999.000
previous	41188.0	0.172963	0.494901	0.000	0.000	0.000	0.000	7.000
emp.var.rate	41188.0	0.081886	1.570960	-3.400	-1.800	1.100	1.400	1.400
cons.price.idx	41188.0	93.575664	0.578840	92.201	93.075	93.749	93.994	94.767
cons.conf.idx	41188.0	-40.502600	4.628198	-50.800	-42.700	-41.800	-36.400	-26.900
euribor3m	41188.0	3.621291	1.734447	0.634	1.344	4.857	4.961	5.045
nr.employed	41188.0	5167.035911	72.251528	4963.600	5099.100	5191.000	5228.100	5228.100

# Distributions of numerical features: one histogram with a KDE overlay per
# feature, laid out on a two-column grid and saved as a single PNG.
# `sns.distplot` has been deprecated since seaborn 0.11 and is scheduled for
# removal, so `sns.histplot` is used instead; `stat="density"` together with
# `kde=True` keeps the density-scaled histogram-plus-curve look.
n_cols = 2
# Ceiling division so the grid always has room for every feature
n_rows = (len(numerical_features) + n_cols - 1) // n_cols
plt.figure(figsize=(10, 18))
for index, col in enumerate(numerical_features):
    plt.subplot(n_rows, n_cols, index + 1)
    sns.histplot(dataset[col], kde=True, stat="density", kde_kws={"cut": 3})
plt.savefig(
    f"{assets_path}/numerical_distributions.png", format="png", dpi=500
)

../../_images/b20400adbd5b3d63545579dfd36cf6326cbd517aa1c7225ecb54329db0c93fa8.png

# Names of all columns that pandas recognises as holding strings
categorical_features = [
    name
    for name, column in dataset.items()
    if pd.api.types.is_string_dtype(column)
]

# Horizontal count plot for every categorical feature, arranged on a
# 6x2 grid and saved as a single PNG
label_size = 20
plt.figure(figsize=(25, 35))
for position, col in enumerate(categorical_features, start=1):
    plt.subplot(6, 2, position)
    ax = sns.countplot(data=dataset, y=col)
    ax.set_xlabel("count", fontsize=label_size)
    ax.set_ylabel(col, fontsize=label_size)
    ax.tick_params(labelsize=label_size)

plt.savefig(f"{assets_path}/categorical_counts.png", format="png", dpi=500)

../../_images/48ceed440e2250aea7e9942e4a035da6b5dee4d701ce6faa877baf463ecc04cf.png

# Absolute and relative frequency of each value in the y column,
# least frequent value first
target_column = dataset["y"]
absolute_counts = target_column.value_counts(ascending=True)
relative_counts = target_column.value_counts(normalize=True, ascending=True)

print("Total number of entries:")
print(absolute_counts)
print("\nPercentages:")
print(relative_counts * 100)

Total number of entries:
yes     4640
no     36548
Name: y, dtype: int64

Percentages:
yes    11.265417
no     88.734583
Name: y, dtype: float64

Linear regression on financial columns#

# Regressors: the three financial columns plus a constant column that
# serves as the intercept term
X = sm.add_constant(dataset[["emp.var.rate", "cons.price.idx", "euribor3m"]])
# Response variable
y = dataset["cons.conf.idx"]

# Ordinary least squares fit of y on X, followed by the fit report
lineare_regression_model = sm.OLS(endog=y, exog=X)
result = lineare_regression_model.fit()
print(result.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:          cons.conf.idx   R-squared:                       0.177
Model:                            OLS   Adj. R-squared:                  0.177
Method:                 Least Squares   F-statistic:                     2960.
Date:                Mon, 11 Apr 2022   Prob (F-statistic):               0.00
Time:                        01:41:25   Log-Likelihood:            -1.1753e+05
No. Observations:               41188   AIC:                         2.351e+05
Df Residuals:                   41184   BIC:                         2.351e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const            -82.4025      5.999    -13.736      0.000     -94.161     -70.644
emp.var.rate      -4.1814      0.072    -57.960      0.000      -4.323      -4.040
cons.price.idx     0.2828      0.063      4.478      0.000       0.159       0.407
euribor3m          4.3582      0.057     76.618      0.000       4.247       4.470
==============================================================================
Omnibus:                     3246.559   Durbin-Watson:                   0.001
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             4034.493
Skew:                           0.761   Prob(JB):                         0.00
Kurtosis:                       2.811   Cond. No.                     2.72e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.72e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

Logistic regression on campaign columns#

# Evaluate the logit (log-odds) function on a grid of probabilities
# strictly inside (0, 1) and plot it
x = np.arange(0.001, 1, 0.01)
logit = np.log(x / (1 - x))

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(x, logit)
ax.set_xlabel("p")
ax.set_ylabel(r"$\log(\frac{p}{1-p})$")
ax.grid()
fig.savefig(f"{assets_path}/logit_function.png", format="png", dpi=300)

../../_images/a36b43e9fcb0c646a5b11923d0150309f69ff8fa8cb9fc4003028c07bb3be708.png

# Regressors: four numerical columns plus a constant column that serves
# as the intercept term
X = sm.add_constant(dataset[["age", "duration", "campaign", "previous"]])
# Binary target: 1 where y is "yes", 0 otherwise (it has to be numeric)
y = np.where(dataset["y"].eq("yes"), 1, 0)

# Logit fit of y on X, followed by the fit report
logistic_regression_model = sm.Logit(endog=y, exog=X)
result = logistic_regression_model.fit()
print(result.summary())

# One-hot encoding demo on the education column: list its distinct values,
# then show the indicator columns next to the original value
education = dataset["education"]
print(education.unique())

hot_encoded = pd.get_dummies(education).assign(education=education)
hot_encoded.head(10)

Logistic regression on the full marketing campaign data#

# Transforming all features into numerical ones using the
# get_dummies() function.
# drop_first=True drops one reference level per categorical feature: with
# the intercept column added below, keeping every level would make the
# dummy columns of each feature sum to the constant column (perfect
# multicollinearity, i.e. a singular design matrix). The remaining dummy
# coefficients are then read relative to the dropped level.
# dtype=float yields float dummy columns regardless of the default dummy
# dtype of the installed pandas version, so the design matrix is numeric.
X = dataset.drop("y", axis=1)
X = pd.get_dummies(X, drop_first=True, dtype=float)
X = sm.add_constant(X)  # add constant value for the intercept term
print(X.columns)

# Extracting and transforming target variable (1 for "yes", 0 otherwise)
y = np.where(dataset["y"] == "yes", 1, 0)

# Defining and fitting model; maxiter is raised to 500 to give the
# optimizer more iterations on this wider design matrix
full_logistic_regression_model = sm.Logit(y, X)
result = full_logistic_regression_model.fit(maxiter=500)
print(result.summary())