Source code for statsmodels.tsa.ardl.model
from __future__ import annotations
from statsmodels.compat.pandas import Appender, Substitution, call_cached_func
from collections import defaultdict
from collections.abc import Hashable, Mapping, Sequence
import datetime as dt
from itertools import combinations, product
import textwrap
from types import SimpleNamespace
from typing import TYPE_CHECKING, Any, Literal, NamedTuple, Optional, Union
import warnings
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.base.data import PandasData
import statsmodels.base.wrapper as wrap
from statsmodels.iolib.summary import Summary, summary_params
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.decorators import cache_readonly
from statsmodels.tools.docstring import Docstring, Parameter, remove_parameters
from statsmodels.tools.sm_exceptions import SpecificationWarning
from statsmodels.tools.typing import (
ArrayLike1D,
ArrayLike2D,
Float64Array,
NDArray,
)
from statsmodels.tools.validation import (
array_like,
bool_like,
float_like,
int_like,
)
from statsmodels.tsa.ar_model import (
AROrderSelectionResults,
AutoReg,
AutoRegResults,
sumofsq,
)
from statsmodels.tsa.ardl import pss_critical_values
from statsmodels.tsa.arima_process import arma2ma
from statsmodels.tsa.base import tsa_model
from statsmodels.tsa.base.prediction import PredictionResults
from statsmodels.tsa.deterministic import DeterministicProcess
from statsmodels.tsa.tsatools import lagmat
if TYPE_CHECKING:
import matplotlib.figure
__all__ = [
"ARDL",
"ARDLResults",
"ardl_select_order",
"ARDLOrderSelectionResults",
"UECM",
"UECMResults",
"BoundsTestResult",
]
[docs]
class BoundsTestResult(NamedTuple):
stat: float
crit_vals: pd.DataFrame
p_values: pd.Series
null: str
alternative: str
def __repr__(self):
return f"""\
{self.__class__.__name__}
Stat: {self.stat:0.5f}
Upper P-value: {self.p_values["upper"]:0.3g}
Lower P-value: {self.p_values["lower"]:0.3g}
Null: {self.null}
Alternative: {self.alternative}
"""
_UECMOrder = Union[None, int, dict[Hashable, Optional[int]]]
_ARDLOrder = Union[
None,
int,
_UECMOrder,
Sequence[int],
dict[Hashable, Union[int, Sequence[int], None]],
]
_INT_TYPES = (int, np.integer)
def _check_order(order: int | Sequence[int] | None, causal: bool) -> bool:
if order is None:
return True
if isinstance(order, (int, np.integer)):
if int(order) < int(causal):
raise ValueError(
f"integer orders must be at least {int(causal)} when causal "
f"is {causal}."
)
return True
for v in order:
if not isinstance(v, (int, np.integer)):
raise TypeError("sequence orders must contain non-negative integer values")
order = [int(v) for v in order]
if len(set(order)) != len(order) or min(order) < 0:
raise ValueError("sequence orders must contain distinct non-negative values")
if int(causal) and min(order) < 1:
raise ValueError(
"sequence orders must be strictly positive when causal is True"
)
return True
def _format_order(
exog: ArrayLike2D, order: _ARDLOrder, causal: bool
) -> dict[Hashable, list[int]]:
keys: list[Hashable]
exog_order: dict[Hashable, int | Sequence[int] | None]
if exog is None and order in (0, None):
return {}
if not isinstance(exog, pd.DataFrame):
exog = array_like(exog, "exog", ndim=2, maxdim=2)
keys = list(range(exog.shape[1]))
else:
keys = [col for col in exog.columns]
if order is None:
exog_order = {k: None for k in keys}
elif isinstance(order, Mapping):
exog_order = order
missing = set(keys).difference(order.keys())
extra = set(order.keys()).difference(keys)
if extra:
msg = (
"order dictionary contains keys for exogenous "
"variable(s) that are not contained in exog"
)
msg += " Extra keys: "
msg += ", ".join(list(sorted([str(v) for v in extra]))) + "."
raise ValueError(msg)
if missing:
msg = (
"exog contains variables that are missing from the order "
"dictionary. Missing keys: "
)
msg += ", ".join([str(k) for k in missing]) + "."
warnings.warn(msg, SpecificationWarning, stacklevel=2)
for key in exog_order:
_check_order(exog_order[key], causal)
elif isinstance(order, _INT_TYPES):
_check_order(order, causal)
exog_order = {k: int(order) for k in keys}
else:
_check_order(order, causal)
exog_order = {k: list(order) for k in keys}
final_order: dict[Hashable, list[int]] = {}
for key in exog_order:
value = exog_order[key]
if value is None:
continue
assert value is not None
if isinstance(value, int):
final_order[key] = list(range(int(causal), value + 1))
else:
final_order[key] = [int(lag) for lag in value]
return final_order
[docs]
class ARDL(AutoReg):
r"""
Autoregressive Distributed Lag (ARDL) Model
Parameters
----------
endog : array_like
A 1-d endogenous response variable. The dependent variable.
lags : {int, list[int]}
The number of lags to include in the model if an integer or the
list of lag indices to include. For example, [1, 4] will only
include lags 1 and 4 while lags=4 will include lags 1, 2, 3, and 4.
exog : array_like
Exogenous variables to include in the model. Either a DataFrame or
an 2-d array-like structure that can be converted to a NumPy array.
order : {int, sequence[int], dict}
If int, uses lags 0, 1, ..., order for all exog variables. If
sequence[int], uses the ``order`` for all variables. If a dict,
applies the lags series by series. If ``exog`` is anything other
than a DataFrame, the keys are the column index of exog (e.g., 0,
1, ...). If a DataFrame, keys are column names.
fixed : array_like
Additional fixed regressors that are not lagged.
causal : bool, optional
Whether to include lag 0 of exog variables. If True, only includes
lags 1, 2, ...
trend : {'n', 'c', 't', 'ct'}, optional
The trend to include in the model:
* 'n' - No trend.
* 'c' - Constant only.
* 't' - Time trend only.
* 'ct' - Constant and time trend.
The default is 'c'.
seasonal : bool, optional
Flag indicating whether to include seasonal dummies in the model. If
seasonal is True and trend includes 'c', then the first period
is excluded from the seasonal terms.
deterministic : DeterministicProcess, optional
A deterministic process. If provided, trend and seasonal are ignored.
A warning is raised if trend is not "n" and seasonal is not False.
hold_back : {None, int}, optional
Initial observations to exclude from the estimation sample. If None,
then hold_back is equal to the maximum lag in the model. Set to a
non-zero value to produce comparable models with different lag
length. For example, to compare the fit of a model with lags=3 and
lags=1, set hold_back=3 which ensures that both models are estimated
using observations 3,...,nobs. hold_back must be >= the maximum lag in
the model.
period : {None, int}, optional
The period of the data. Only used if seasonal is True. This parameter
can be omitted if using a pandas object for endog that contains a
recognized frequency.
missing : {"none", "drop", "raise"}, optional
Available options are 'none', 'drop', and 'raise'. If 'none', no NaN
checking is done. If 'drop', any observations with NaNs are dropped.
If 'raise', an error is raised. Default is 'none'.
Notes
-----
The full specification of an ARDL is
.. math ::
Y_t = \delta_0 + \delta_1 t + \delta_2 t^2
+ \sum_{i=1}^{s-1} \gamma_i I_{[(\mod(t,s) + 1) = i]}
+ \sum_{j=1}^p \phi_j Y_{t-j}
+ \sum_{l=1}^k \sum_{m=0}^{o_l} \beta_{l,m} X_{l, t-m}
+ Z_t \lambda
+ \epsilon_t
where :math:`\delta_\bullet` capture trends, :math:`\gamma_\bullet`
capture seasonal shifts, s is the period of the seasonality, p is the
lag length of the endogenous variable, k is the number of exogenous
variables :math:`X_{l}`, :math:`o_l` is included the lag length of
:math:`X_{l}`, :math:`Z_t` are ``r`` included fixed regressors and
:math:`\epsilon_t` is a white noise shock. If ``causal`` is ``True``,
then the 0-th lag of the exogenous variables is not included and the
sum starts at ``m=1``.
See the notebook `Autoregressive Distributed Lag Models
<../examples/notebooks/generated/autoregressive_distributed_lag.html>`__
for an overview.
See Also
--------
statsmodels.tsa.ar_model.AutoReg
Autoregressive model estimation with optional exogenous regressors
statsmodels.tsa.ardl.UECM
Unconstrained Error Correction Model estimation
statsmodels.tsa.statespace.sarimax.SARIMAX
Seasonal ARIMA model estimation with optional exogenous regressors
statsmodels.tsa.arima.model.ARIMA
ARIMA model estimation
Examples
--------
>>> from statsmodels.tsa.api import ARDL
>>> from statsmodels.datasets import danish_data
>>> data = danish_data.load_pandas().data
>>> lrm = data.lrm
>>> exog = data[["lry", "ibo", "ide"]]
A basic model where all variables have 3 lags included
>>> ARDL(data.lrm, 3, data[["lry", "ibo", "ide"]], 3)
A dictionary can be used to pass custom lag orders
>>> ARDL(data.lrm, [1, 3], exog, {"lry": 1, "ibo": 3, "ide": 2})
Setting causal removes the 0-th lag from the exogenous variables
>>> exog_lags = {"lry": 1, "ibo": 3, "ide": 2}
>>> ARDL(data.lrm, [1, 3], exog, exog_lags, causal=True)
A dictionary can also be used to pass specific lags to include.
Sequences hold the specific lags to include, while integers are expanded
to include [0, 1, ..., lag]. If causal is False, then the 0-th lag is
excluded.
>>> ARDL(lrm, [1, 3], exog, {"lry": [0, 1], "ibo": [0, 1, 3], "ide": 2})
When using NumPy arrays, the dictionary keys are the column index.
>>> import numpy as np
>>> lrma = np.asarray(lrm)
>>> exoga = np.asarray(exog)
>>> ARDL(lrma, 3, exoga, {0: [0, 1], 1: [0, 1, 3], 2: 2})
"""
def __init__(
self,
endog: Sequence[float] | pd.Series | ArrayLike2D,
lags: int | Sequence[int] | None,
exog: ArrayLike2D | None = None,
order: _ARDLOrder = 0,
trend: Literal["n", "c", "ct", "ctt"] = "c",
*,
fixed: ArrayLike2D | None = None,
causal: bool = False,
seasonal: bool = False,
deterministic: DeterministicProcess | None = None,
hold_back: int | None = None,
period: int | None = None,
missing: Literal["none", "drop", "raise"] = "none",
) -> None:
self._x = np.empty((0, 0))
self._y = np.empty((0,))
super().__init__(
endog,
lags,
trend=trend,
seasonal=seasonal,
exog=exog,
hold_back=hold_back,
period=period,
missing=missing,
deterministic=deterministic,
old_names=False,
)
# Reset hold back which was set in AutoReg.__init__
self._causal = bool_like(causal, "causal", strict=True)
self.data.orig_fixed = fixed
if fixed is not None:
fixed_arr = array_like(fixed, "fixed", ndim=2, maxdim=2)
if fixed_arr.shape[0] != self.data.endog.shape[0] or not np.all(
np.isfinite(fixed_arr)
):
raise ValueError(
"fixed must be an (nobs, m) array where nobs matches the "
"number of observations in the endog variable, and all"
"values must be finite"
)
if isinstance(fixed, pd.DataFrame):
self._fixed_names = list(fixed.columns)
else:
self._fixed_names = [f"z.{i}" for i in range(fixed_arr.shape[1])]
self._fixed = fixed_arr
else:
self._fixed = np.empty((self.data.endog.shape[0], 0))
self._fixed_names = []
self._blocks: dict[str, np.ndarray] = {}
self._names: dict[str, Sequence[str]] = {}
# 1. Check and update order
self._order = self._check_order(order)
# 2. Construct Regressors
self._y, self._x = self._construct_regressors(hold_back)
# 3. Construct variable names
self._endog_name, self._exog_names = self._construct_variable_names()
self.data.param_names = self.data.xnames = self._exog_names
self.data.ynames = self._endog_name
self._causal = True
if self._order:
min_lags = [min(val) for val in self._order.values()]
self._causal = min(min_lags) > 0
self._results_class = ARDLResults
self._results_wrapper = ARDLResultsWrapper
@property
def fixed(self) -> NDArray | pd.DataFrame | None:
"""The fixed data used to construct the model"""
return self.data.orig_fixed
@property
def causal(self) -> bool:
"""Flag indicating that the ARDL is causal"""
return self._causal
@property
def ar_lags(self) -> list[int] | None:
"""The autoregressive lags included in the model"""
return None if not self._lags else self._lags
@property
def dl_lags(self) -> dict[Hashable, list[int]]:
"""The lags of exogenous variables included in the model"""
return self._order
@property
def ardl_order(self) -> tuple[int, ...]:
"""The order of the ARDL(p,q)"""
ar_order = 0 if not self._lags else int(max(self._lags))
ardl_order = [ar_order]
for lags in self._order.values():
if lags is not None:
ardl_order.append(int(max(lags)))
return tuple(ardl_order)
def _setup_regressors(self) -> None:
"""Place holder to let AutoReg init complete"""
self._y = np.empty((self.endog.shape[0] - self._hold_back, 0))
@staticmethod
def _format_exog(
exog: ArrayLike2D, order: dict[Hashable, list[int]]
) -> dict[Hashable, np.ndarray]:
"""Transform exogenous variables and orders to regressors"""
if not order:
return {}
max_order = 0
for val in order.values():
if val is not None:
max_order = max(max(val), max_order)
if not isinstance(exog, pd.DataFrame):
exog = array_like(exog, "exog", ndim=2, maxdim=2)
exog_lags = {}
for key in order:
if order[key] is None:
continue
if isinstance(exog, np.ndarray):
assert isinstance(key, int)
col = exog[:, key]
else:
col = exog[key]
lagged_col = lagmat(col, max_order, original="in")
lags = order[key]
exog_lags[key] = lagged_col[:, lags]
return exog_lags
def _check_order(self, order: _ARDLOrder) -> dict[Hashable, list[int]]:
"""Validate and standardize the model order"""
return _format_order(self.data.orig_exog, order, self._causal)
def _fit(
self,
cov_type: str = "nonrobust",
cov_kwds: dict[str, Any] = None,
use_t: bool = True,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
if self._x.shape[1] == 0:
return np.empty((0,)), np.empty((0, 0)), np.empty((0, 0))
ols_mod = OLS(self._y, self._x)
ols_res = ols_mod.fit(cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t)
cov_params = ols_res.cov_params()
use_t = ols_res.use_t
if cov_type == "nonrobust" and not use_t:
nobs = self._y.shape[0]
k = self._x.shape[1]
scale = nobs / (nobs - k)
cov_params /= scale
return ols_res.params, cov_params, ols_res.normalized_cov_params
[docs]
def fit(
self,
*,
cov_type: str = "nonrobust",
cov_kwds: dict[str, Any] = None,
use_t: bool = True,
) -> ARDLResults:
"""
Estimate the model parameters.
Parameters
----------
cov_type : str
The covariance estimator to use. The most common choices are listed
below. Supports all covariance estimators that are available
in ``OLS.fit``.
* 'nonrobust' - The class OLS covariance estimator that assumes
homoskedasticity.
* 'HC0', 'HC1', 'HC2', 'HC3' - Variants of White's
(or Eiker-Huber-White) covariance estimator. `HC0` is the
standard implementation. The other make corrections to improve
the finite sample performance of the heteroskedasticity robust
covariance estimator.
* 'HAC' - Heteroskedasticity-autocorrelation robust covariance
estimation. Supports cov_kwds.
- `maxlags` integer (required) : number of lags to use.
- `kernel` callable or str (optional) : kernel
currently available kernels are ['bartlett', 'uniform'],
default is Bartlett.
- `use_correction` bool (optional) : If true, use small sample
correction.
cov_kwds : dict, optional
A dictionary of keyword arguments to pass to the covariance
estimator. `nonrobust` and `HC#` do not support cov_kwds.
use_t : bool, optional
A flag indicating that inference should use the Student's t
distribution that accounts for model degree of freedom. If False,
uses the normal distribution. If None, defers the choice to
the cov_type. It also removes degree of freedom corrections from
the covariance estimator when cov_type is 'nonrobust'.
Returns
-------
ARDLResults
Estimation results.
See Also
--------
statsmodels.tsa.ar_model.AutoReg
Ordinary Least Squares estimation.
statsmodels.regression.linear_model.OLS
Ordinary Least Squares estimation.
statsmodels.regression.linear_model.RegressionResults
See ``get_robustcov_results`` for a detailed list of available
covariance estimators and options.
Notes
-----
Use ``OLS`` to estimate model parameters and to estimate parameter
covariance.
"""
params, cov_params, norm_cov_params = self._fit(
cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t
)
res = ARDLResults(self, params, cov_params, norm_cov_params, use_t=use_t)
return ARDLResultsWrapper(res)
def _construct_regressors(
self, hold_back: int | None
) -> tuple[np.ndarray, np.ndarray]:
"""Construct and format model regressors"""
# TODO: Missing adjustment
self._maxlag = max(self._lags) if self._lags else 0
_endog_reg, _endog = lagmat(self.data.endog, self._maxlag, original="sep")
assert isinstance(_endog, np.ndarray)
assert isinstance(_endog_reg, np.ndarray)
self._endog_reg, self._endog = _endog_reg, _endog
if self._endog_reg.shape[1] != len(self._lags):
lag_locs = [lag - 1 for lag in self._lags]
self._endog_reg = self._endog_reg[:, lag_locs]
orig_exog = self.data.orig_exog
self._exog = self._format_exog(orig_exog, self._order)
exog_maxlag = 0
for val in self._order.values():
exog_maxlag = max(exog_maxlag, max(val) if val is not None else 0)
self._maxlag = max(self._maxlag, exog_maxlag)
self._deterministic_reg = self._deterministics.in_sample()
self._blocks = {
"endog": self._endog_reg,
"exog": self._exog,
"deterministic": self._deterministic_reg,
"fixed": self._fixed,
}
x = [self._deterministic_reg, self._endog_reg]
x += [ex for ex in self._exog.values()] + [self._fixed]
reg = np.column_stack(x)
if hold_back is None:
self._hold_back = int(self._maxlag)
if self._hold_back < self._maxlag:
raise ValueError(
"hold_back must be >= the maximum lag of the endog and exog "
"variables"
)
reg = reg[self._hold_back :]
if reg.shape[1] > reg.shape[0]:
raise ValueError(
f"The number of regressors ({reg.shape[1]}) including "
"deterministics, lags of the endog, lags of the exogenous, "
"and fixed regressors is larger than the sample available "
f"for estimation ({reg.shape[0]})."
)
return self.data.endog[self._hold_back :], reg
def _construct_variable_names(self):
"""Construct model variables names"""
y_name = self.data.ynames
endog_lag_names = [f"{y_name}.L{i}" for i in self._lags]
exog = self.data.orig_exog
exog_names = {}
for key in self._order:
if isinstance(exog, np.ndarray):
base = f"x{key}"
else:
base = str(key)
lags = self._order[key]
exog_names[key] = [f"{base}.L{lag}" for lag in lags]
self._names = {
"endog": endog_lag_names,
"exog": exog_names,
"deterministic": self._deterministic_reg.columns,
"fixed": self._fixed_names,
}
x_names = list(self._deterministic_reg.columns)
x_names += endog_lag_names
for key in exog_names:
x_names += exog_names[key]
x_names += self._fixed_names
return y_name, x_names
def _forecasting_x(
self,
start: int,
end: int,
num_oos: int,
exog: ArrayLike2D | None,
exog_oos: ArrayLike2D | None,
fixed: ArrayLike2D | None,
fixed_oos: ArrayLike2D | None,
) -> np.ndarray:
"""Construct exog matrix for forecasts"""
def pad_x(x: np.ndarray, pad: int) -> np.ndarray:
if pad == 0:
return x
k = x.shape[1]
return np.vstack([np.full((pad, k), np.nan), x])
pad = 0 if start >= self._hold_back else self._hold_back - start
# Shortcut if all in-sample and no new data
if (end + 1) < self.endog.shape[0] and exog is None and fixed is None:
adjusted_start = max(start - self._hold_back, 0)
return pad_x(self._x[adjusted_start : end + 1 - self._hold_back], pad)
# If anything changed, rebuild x array
exog = self.data.exog if exog is None else np.asarray(exog)
if exog_oos is not None:
exog = np.vstack([exog, np.asarray(exog_oos)[:num_oos]])
fixed = self._fixed if fixed is None else np.asarray(fixed)
if fixed_oos is not None:
fixed = np.vstack([fixed, np.asarray(fixed_oos)[:num_oos]])
det = self._deterministics.in_sample()
if num_oos:
oos_det = self._deterministics.out_of_sample(num_oos)
det = pd.concat([det, oos_det], axis=0)
endog = self.data.endog
if num_oos:
endog = np.hstack([endog, np.full(num_oos, np.nan)])
x = [det]
if self._lags:
endog_reg = lagmat(endog, max(self._lags), original="ex")
x.append(endog_reg[:, [lag - 1 for lag in self._lags]])
if self.ardl_order[1:]:
if isinstance(self.data.orig_exog, pd.DataFrame):
exog = pd.DataFrame(exog, columns=self.data.orig_exog.columns)
exog = self._format_exog(exog, self._order)
x.extend([np.asarray(arr) for arr in exog.values()])
if fixed.shape[1] > 0:
x.append(fixed)
_x = np.column_stack(x)
_x[: self._hold_back] = np.nan
return _x[start:]
[docs]
def predict(
self,
params: ArrayLike1D,
start: int | str | dt.datetime | pd.Timestamp | None = None,
end: int | str | dt.datetime | pd.Timestamp | None = None,
dynamic: bool = False,
exog: NDArray | pd.DataFrame | None = None,
exog_oos: NDArray | pd.DataFrame | None = None,
fixed: NDArray | pd.DataFrame | None = None,
fixed_oos: NDArray | pd.DataFrame | None = None,
):
"""
In-sample prediction and out-of-sample forecasting.
Parameters
----------
params : array_like
The fitted model parameters.
start : int, str, or datetime, optional
Zero-indexed observation number at which to start forecasting,
i.e., the first forecast is start. Can also be a date string to
parse or a datetime type. Default is the the zeroth observation.
end : int, str, or datetime, optional
Zero-indexed observation number at which to end forecasting, i.e.,
the last forecast is end. Can also be a date string to
parse or a datetime type. However, if the dates index does not
have a fixed frequency, end must be an integer index if you
want out-of-sample prediction. Default is the last observation in
the sample. Unlike standard python slices, end is inclusive so
that all the predictions [start, start+1, ..., end-1, end] are
returned.
dynamic : {bool, int, str, datetime, Timestamp}, optional
Integer offset relative to `start` at which to begin dynamic
prediction. Prior to this observation, true endogenous values
will be used for prediction; starting with this observation and
continuing through the end of prediction, forecasted endogenous
values will be used instead. Datetime-like objects are not
interpreted as offsets. They are instead used to find the index
location of `dynamic` which is then used to to compute the offset.
exog : array_like
A replacement exogenous array. Must have the same shape as the
exogenous data array used when the model was created.
exog_oos : array_like
An array containing out-of-sample values of the exogenous
variables. Must have the same number of columns as the exog
used when the model was created, and at least as many rows as
the number of out-of-sample forecasts.
fixed : array_like
A replacement fixed array. Must have the same shape as the
fixed data array used when the model was created.
fixed_oos : array_like
An array containing out-of-sample values of the fixed variables.
Must have the same number of columns as the fixed used when the
model was created, and at least as many rows as the number of
out-of-sample forecasts.
Returns
-------
predictions : {ndarray, Series}
Array of out of in-sample predictions and / or out-of-sample
forecasts.
"""
params, exog, exog_oos, start, end, num_oos = self._prepare_prediction(
params, exog, exog_oos, start, end
)
def check_exog(arr, name, orig, exact):
if isinstance(orig, pd.DataFrame):
if not isinstance(arr, pd.DataFrame):
raise TypeError(
f"{name} must be a DataFrame when the original exog "
"was a DataFrame"
)
if sorted(arr.columns) != sorted(self.data.orig_exog.columns):
raise ValueError(
f"{name} must have the same columns as the original exog"
)
else:
arr = array_like(arr, name, ndim=2, optional=False)
if arr.ndim != 2 or arr.shape[1] != orig.shape[1]:
raise ValueError(
f"{name} must have the same number of columns as the "
f"original data, {orig.shape[1]}"
)
if exact and arr.shape[0] != orig.shape[0]:
raise ValueError(
f"{name} must have the same number of rows as the "
f"original data ({n})."
)
return arr
n = self.data.endog.shape[0]
if exog is not None:
exog = check_exog(exog, "exog", self.data.orig_exog, True)
if exog_oos is not None:
exog_oos = check_exog(exog_oos, "exog_oos", self.data.orig_exog, False)
if fixed is not None:
fixed = check_exog(fixed, "fixed", self._fixed, True)
if fixed_oos is not None:
fixed_oos = check_exog(
np.asarray(fixed_oos), "fixed_oos", self._fixed, False
)
# The maximum number of 1-step predictions that can be made,
# which depends on the model and lags
if self._fixed.shape[1] or not self._causal:
max_1step = 0
else:
max_1step = np.inf if not self._lags else min(self._lags)
if self._order:
min_exog = min([min(v) for v in self._order.values()])
max_1step = min(max_1step, min_exog)
if num_oos > max_1step:
if self._order and exog_oos is None:
raise ValueError(
"exog_oos must be provided when out-of-sample "
"observations require values of the exog not in the "
"original sample"
)
elif self._order and (exog_oos.shape[0] + max_1step) < num_oos:
raise ValueError(
f"exog_oos must have at least {num_oos - max_1step} "
f"observations to produce {num_oos} forecasts based on "
"the model specification."
)
if self._fixed.shape[1] and fixed_oos is None:
raise ValueError(
"fixed_oos must be provided when predicting "
"out-of-sample observations"
)
elif self._fixed.shape[1] and fixed_oos.shape[0] < num_oos:
raise ValueError(
f"fixed_oos must have at least {num_oos} observations "
f"to produce {num_oos} forecasts."
)
# Extend exog_oos if fcast is valid for horizon but no exog_oos given
if self.exog is not None and exog_oos is None and num_oos:
exog_oos = np.full((num_oos, self.exog.shape[1]), np.nan)
if isinstance(self.data.orig_exog, pd.DataFrame):
exog_oos = pd.DataFrame(exog_oos, columns=self.data.orig_exog.columns)
x = self._forecasting_x(start, end, num_oos, exog, exog_oos, fixed, fixed_oos)
if dynamic is False:
dynamic_start = end + 1 - start
else:
dynamic_step = self._parse_dynamic(dynamic, start)
dynamic_start = dynamic_step
if start < self._hold_back:
dynamic_start = max(dynamic_start, self._hold_back - start)
fcasts = np.full(x.shape[0], np.nan)
fcasts[:dynamic_start] = x[:dynamic_start] @ params
offset = self._deterministic_reg.shape[1]
for i in range(dynamic_start, fcasts.shape[0]):
for j, lag in enumerate(self._lags):
loc = i - lag
if loc >= dynamic_start:
val = fcasts[loc]
else:
# Actual data
val = self.endog[start + loc]
x[i, offset + j] = val
fcasts[i] = x[i] @ params
return self._wrap_prediction(fcasts, start, end + 1 + num_oos, 0)
[docs]
@classmethod
def from_formula(
cls,
formula: str,
data: pd.DataFrame,
lags: int | Sequence[int] | None = 0,
order: _ARDLOrder = 0,
trend: Literal["n", "c", "ct", "ctt"] = "n",
*,
causal: bool = False,
seasonal: bool = False,
deterministic: DeterministicProcess | None = None,
hold_back: int | None = None,
period: int | None = None,
missing: Literal["none", "raise"] = "none",
) -> ARDL | UECM:
"""
Construct an ARDL from a formula
Parameters
----------
formula : str
Formula with form dependent ~ independent | fixed. See Examples
below.
data : DataFrame
DataFrame containing the variables in the formula.
lags : {int, list[int]}
The number of lags to include in the model if an integer or the
list of lag indices to include. For example, [1, 4] will only
include lags 1 and 4 while lags=4 will include lags 1, 2, 3,
and 4.
order : {int, sequence[int], dict}
If int, uses lags 0, 1, ..., order for all exog variables. If
sequence[int], uses the ``order`` for all variables. If a dict,
applies the lags series by series. If ``exog`` is anything other
than a DataFrame, the keys are the column index of exog (e.g., 0,
1, ...). If a DataFrame, keys are column names.
causal : bool, optional
Whether to include lag 0 of exog variables. If True, only
includes lags 1, 2, ...
trend : {'n', 'c', 't', 'ct'}, optional
The trend to include in the model:
* 'n' - No trend.
* 'c' - Constant only.
* 't' - Time trend only.
* 'ct' - Constant and time trend.
The default is 'c'.
seasonal : bool, optional
Flag indicating whether to include seasonal dummies in the model.
If seasonal is True and trend includes 'c', then the first period
is excluded from the seasonal terms.
deterministic : DeterministicProcess, optional
A deterministic process. If provided, trend and seasonal are
ignored. A warning is raised if trend is not "n" and seasonal
is not False.
hold_back : {None, int}, optional
Initial observations to exclude from the estimation sample. If
None, then hold_back is equal to the maximum lag in the model.
Set to a non-zero value to produce comparable models with
different lag length. For example, to compare the fit of a model
with lags=3 and lags=1, set hold_back=3 which ensures that both
models are estimated using observations 3,...,nobs. hold_back
must be >= the maximum lag in the model.
period : {None, int}, optional
The period of the data. Only used if seasonal is True. This
parameter can be omitted if using a pandas object for endog
that contains a recognized frequency.
missing : {"none", "drop", "raise"}, optional
Available options are 'none', 'drop', and 'raise'. If 'none', no
NaN checking is done. If 'drop', any observations with NaNs are
dropped. If 'raise', an error is raised. Default is 'none'.
Returns
-------
ARDL
The ARDL model instance
Examples
--------
A simple ARDL using the Danish data
>>> from statsmodels.datasets.danish_data import load
>>> from statsmodels.tsa.api import ARDL
>>> data = load().data
>>> mod = ARDL.from_formula("lrm ~ ibo", data, 2, 2)
Fixed regressors can be specified using a |
>>> mod = ARDL.from_formula("lrm ~ ibo | ide", data, 2, 2)
"""
index = data.index
fixed_formula = None
if "|" in formula:
formula, fixed_formula = formula.split("|")
fixed_formula = fixed_formula.strip()
mod = OLS.from_formula(formula + " -1", data)
exog = mod.data.orig_exog
exog.index = index
endog = mod.data.orig_endog
endog.index = index
if fixed_formula is not None:
endog_name = formula.split("~")[0].strip()
fixed_formula = f"{endog_name} ~ {fixed_formula} - 1"
mod = OLS.from_formula(fixed_formula, data)
fixed: pd.DataFrame | None = mod.data.orig_exog
fixed.index = index
else:
fixed = None
return cls(
endog,
lags,
exog,
order,
trend=trend,
fixed=fixed,
causal=causal,
seasonal=seasonal,
deterministic=deterministic,
hold_back=hold_back,
period=period,
missing=missing,
)
doc = Docstring(ARDL.predict.__doc__)
_predict_params = doc.extract_parameters(
["start", "end", "dynamic", "exog", "exog_oos", "fixed", "fixed_oos"], 8
)
[docs]
class ARDLResults(AutoRegResults):
"""
Class to hold results from fitting an ARDL model.
Parameters
----------
model : ARDL
Reference to the model that is fit.
params : ndarray
The fitted parameters from the AR Model.
cov_params : ndarray
The estimated covariance matrix of the model parameters.
normalized_cov_params : ndarray
The array inv(dot(x.T,x)) where x contains the regressors in the
model.
scale : float, optional
An estimate of the scale of the model.
use_t : bool
Whether use_t was set in fit
"""
_cache = {} # for scale setter
def __init__(
self,
model: ARDL,
params: np.ndarray,
cov_params: np.ndarray,
normalized_cov_params: Float64Array | None = None,
scale: float = 1.0,
use_t: bool = False,
):
super().__init__(model, params, normalized_cov_params, scale, use_t=use_t)
self._cache = {}
self._params = params
self._nobs = model.nobs
self._n_totobs = model.endog.shape[0]
self._df_model = model.df_model
self._ar_lags = model.ar_lags
self._max_lag = 0
if self._ar_lags:
self._max_lag = max(self._ar_lags)
self._hold_back = self.model.hold_back
self.cov_params_default = cov_params
[docs]
@Appender(remove_parameters(ARDL.predict.__doc__, "params"))
def predict(
self,
start: int | str | dt.datetime | pd.Timestamp | None = None,
end: int | str | dt.datetime | pd.Timestamp | None = None,
dynamic: bool = False,
exog: NDArray | pd.DataFrame | None = None,
exog_oos: NDArray | pd.DataFrame | None = None,
fixed: NDArray | pd.DataFrame | None = None,
fixed_oos: NDArray | pd.DataFrame | None = None,
):
return self.model.predict(
self._params,
start=start,
end=end,
dynamic=dynamic,
exog=exog,
exog_oos=exog_oos,
fixed=fixed,
fixed_oos=fixed_oos,
)
[docs]
def forecast(
self,
steps: int = 1,
exog: NDArray | pd.DataFrame | None = None,
fixed: NDArray | pd.DataFrame | None = None,
) -> np.ndarray | pd.Series:
"""
Out-of-sample forecasts
Parameters
----------
steps : {int, str, datetime}, default 1
If an integer, the number of steps to forecast from the end of the
sample. Can also be a date string to parse or a datetime type.
However, if the dates index does not have a fixed frequency,
steps must be an integer.
exog : array_like, optional
Exogenous values to use out-of-sample. Must have same number of
columns as original exog data and at least `steps` rows
fixed : array_like, optional
Fixed values to use out-of-sample. Must have same number of
columns as original fixed data and at least `steps` rows
Returns
-------
array_like
Array of out of in-sample predictions and / or out-of-sample
forecasts.
See Also
--------
ARDLResults.predict
In- and out-of-sample predictions
ARDLResults.get_prediction
In- and out-of-sample predictions and confidence intervals
"""
start = self.model.data.orig_endog.shape[0]
if isinstance(steps, (int, np.integer)):
end = start + steps - 1
else:
end = steps
return self.predict(
start=start, end=end, dynamic=False, exog_oos=exog, fixed_oos=fixed
)
def _lag_repr(self) -> np.ndarray:
"""Returns poly repr of an AR, (1 -phi1 L -phi2 L^2-...)"""
ar_lags = self._ar_lags if self._ar_lags is not None else []
k_ar = len(ar_lags)
ar_params = np.zeros(self._max_lag + 1)
ar_params[0] = 1
offset = self.model._deterministic_reg.shape[1]
params = self._params[offset : offset + k_ar]
for i, lag in enumerate(ar_lags):
ar_params[lag] = -params[i]
return ar_params
[docs]
def get_prediction(
self,
start: int | str | dt.datetime | pd.Timestamp | None = None,
end: int | str | dt.datetime | pd.Timestamp | None = None,
dynamic: bool = False,
exog: NDArray | pd.DataFrame | None = None,
exog_oos: NDArray | pd.DataFrame | None = None,
fixed: NDArray | pd.DataFrame | None = None,
fixed_oos: NDArray | pd.DataFrame | None = None,
) -> np.ndarray | pd.Series:
"""
Predictions and prediction intervals
Parameters
----------
start : int, str, or datetime, optional
Zero-indexed observation number at which to start forecasting,
i.e., the first forecast is start. Can also be a date string to
parse or a datetime type. Default is the the zeroth observation.
end : int, str, or datetime, optional
Zero-indexed observation number at which to end forecasting, i.e.,
the last forecast is end. Can also be a date string to
parse or a datetime type. However, if the dates index does not
have a fixed frequency, end must be an integer index if you
want out-of-sample prediction. Default is the last observation in
the sample. Unlike standard python slices, end is inclusive so
that all the predictions [start, start+1, ..., end-1, end] are
returned.
dynamic : {bool, int, str, datetime, Timestamp}, optional
Integer offset relative to `start` at which to begin dynamic
prediction. Prior to this observation, true endogenous values
will be used for prediction; starting with this observation and
continuing through the end of prediction, forecasted endogenous
values will be used instead. Datetime-like objects are not
interpreted as offsets. They are instead used to find the index
location of `dynamic` which is then used to to compute the offset.
exog : array_like
A replacement exogenous array. Must have the same shape as the
exogenous data array used when the model was created.
exog_oos : array_like
An array containing out-of-sample values of the exogenous variable.
Must has the same number of columns as the exog used when the
model was created, and at least as many rows as the number of
out-of-sample forecasts.
fixed : array_like
A replacement fixed array. Must have the same shape as the
fixed data array used when the model was created.
fixed_oos : array_like
An array containing out-of-sample values of the fixed variables.
Must have the same number of columns as the fixed used when the
model was created, and at least as many rows as the number of
out-of-sample forecasts.
Returns
-------
PredictionResults
Prediction results with mean and prediction intervals
"""
mean = self.predict(
start=start,
end=end,
dynamic=dynamic,
exog=exog,
exog_oos=exog_oos,
fixed=fixed,
fixed_oos=fixed_oos,
)
mean_var = np.full_like(mean, fill_value=self.sigma2)
mean_var[np.isnan(mean)] = np.nan
start = 0 if start is None else start
end = self.model._index[-1] if end is None else end
_, _, oos, _ = self.model._get_prediction_index(start, end)
if oos > 0:
ar_params = self._lag_repr()
ma = arma2ma(ar_params, np.ones(1), lags=oos)
mean_var[-oos:] = self.sigma2 * np.cumsum(ma**2)
if isinstance(mean, pd.Series):
mean_var = pd.Series(mean_var, index=mean.index)
return PredictionResults(mean, mean_var)
[docs]
@Substitution(predict_params=_predict_params)
def plot_predict(
self,
start: int | str | dt.datetime | pd.Timestamp | None = None,
end: int | str | dt.datetime | pd.Timestamp | None = None,
dynamic: bool = False,
exog: NDArray | pd.DataFrame | None = None,
exog_oos: NDArray | pd.DataFrame | None = None,
fixed: NDArray | pd.DataFrame | None = None,
fixed_oos: NDArray | pd.DataFrame | None = None,
alpha: float = 0.05,
in_sample: bool = True,
fig: matplotlib.figure.Figure = None,
figsize: tuple[int, int] | None = None,
) -> matplotlib.figure.Figure:
"""
Plot in- and out-of-sample predictions
Parameters
----------\n%(predict_params)s
alpha : {float, None}
The tail probability not covered by the confidence interval. Must
be in (0, 1). Confidence interval is constructed assuming normally
distributed shocks. If None, figure will not show the confidence
interval.
in_sample : bool
Flag indicating whether to include the in-sample period in the
plot.
fig : Figure
An existing figure handle. If not provided, a new figure is
created.
figsize: tuple[float, float]
Tuple containing the figure size values.
Returns
-------
Figure
Figure handle containing the plot.
"""
predictions = self.get_prediction(
start=start,
end=end,
dynamic=dynamic,
exog=exog,
exog_oos=exog_oos,
fixed=fixed,
fixed_oos=fixed_oos,
)
return self._plot_predictions(
predictions, start, end, alpha, in_sample, fig, figsize
)
[docs]
def summary(self, alpha: float = 0.05) -> Summary:
"""
Summarize the Model
Parameters
----------
alpha : float, optional
Significance level for the confidence intervals.
Returns
-------
Summary
This holds the summary table and text, which can be printed or
converted to various output formats.
See Also
--------
statsmodels.iolib.summary.Summary
"""
model = self.model
title = model.__class__.__name__ + " Model Results"
method = "Conditional MLE"
# get sample
start = self._hold_back
if self.data.dates is not None:
dates = self.data.dates
sample = [dates[start].strftime("%m-%d-%Y")]
sample += ["- " + dates[-1].strftime("%m-%d-%Y")]
else:
sample = [str(start), str(len(self.data.orig_endog))]
model = self.model.__class__.__name__ + str(self.model.ardl_order)
if self.model.seasonal:
model = "Seas. " + model
dep_name = str(self.model.endog_names)
top_left = [
("Dep. Variable:", [dep_name]),
("Model:", [model]),
("Method:", [method]),
("Date:", None),
("Time:", None),
("Sample:", [sample[0]]),
("", [sample[1]]),
]
top_right = [
("No. Observations:", [str(len(self.model.endog))]),
("Log Likelihood", ["%#5.3f" % self.llf]),
("S.D. of innovations", ["%#5.3f" % self.sigma2**0.5]),
("AIC", ["%#5.3f" % self.aic]),
("BIC", ["%#5.3f" % self.bic]),
("HQIC", ["%#5.3f" % self.hqic]),
]
smry = Summary()
smry.add_table_2cols(self, gleft=top_left, gright=top_right, title=title)
smry.add_table_params(self, alpha=alpha, use_t=False)
return smry
class ARDLResultsWrapper(wrap.ResultsWrapper):
_attrs = {}
_wrap_attrs = wrap.union_dicts(
tsa_model.TimeSeriesResultsWrapper._wrap_attrs, _attrs
)
_methods = {}
_wrap_methods = wrap.union_dicts(
tsa_model.TimeSeriesResultsWrapper._wrap_methods, _methods
)
wrap.populate_wrapper(ARDLResultsWrapper, ARDLResults)
[docs]
class ARDLOrderSelectionResults(AROrderSelectionResults):
"""
Results from an ARDL order selection
Contains the information criteria for all fitted model orders.
"""
def __init__(self, model, ics, trend, seasonal, period):
_ics = (((0,), (0, 0, 0)),)
super().__init__(model, _ics, trend, seasonal, period)
def _to_dict(d):
return d[0], dict(d[1:])
self._aic = pd.Series({v[0]: _to_dict(k) for k, v in ics.items()}, dtype=object)
self._aic.index.name = self._aic.name = "AIC"
self._aic = self._aic.sort_index()
self._bic = pd.Series({v[1]: _to_dict(k) for k, v in ics.items()}, dtype=object)
self._bic.index.name = self._bic.name = "BIC"
self._bic = self._bic.sort_index()
self._hqic = pd.Series(
{v[2]: _to_dict(k) for k, v in ics.items()}, dtype=object
)
self._hqic.index.name = self._hqic.name = "HQIC"
self._hqic = self._hqic.sort_index()
@property
def dl_lags(self) -> dict[Hashable, list[int]]:
"""The lags of exogenous variables in the selected model"""
return self._model.dl_lags
[docs]
def ardl_select_order(
endog: ArrayLike1D | ArrayLike2D,
maxlag: int,
exog: ArrayLike2D,
maxorder: int | dict[Hashable, int],
trend: Literal["n", "c", "ct", "ctt"] = "c",
*,
fixed: ArrayLike2D | None = None,
causal: bool = False,
ic: Literal["aic", "bic"] = "bic",
glob: bool = False,
seasonal: bool = False,
deterministic: DeterministicProcess | None = None,
hold_back: int | None = None,
period: int | None = None,
missing: Literal["none", "raise"] = "none",
) -> ARDLOrderSelectionResults:
r"""
ARDL order selection
Parameters
----------
endog : array_like
A 1-d endogenous response variable. The dependent variable.
maxlag : int
The maximum lag to consider for the endogenous variable.
exog : array_like
Exogenous variables to include in the model. Either a DataFrame or
an 2-d array-like structure that can be converted to a NumPy array.
maxorder : {int, dict}
If int, sets a common max lag length for all exog variables. If
a dict, then sets individual lag length. They keys are column names
if exog is a DataFrame or column indices otherwise.
trend : {'n', 'c', 't', 'ct'}, optional
The trend to include in the model:
* 'n' - No trend.
* 'c' - Constant only.
* 't' - Time trend only.
* 'ct' - Constant and time trend.
The default is 'c'.
fixed : array_like
Additional fixed regressors that are not lagged.
causal : bool, optional
Whether to include lag 0 of exog variables. If True, only includes
lags 1, 2, ...
ic : {"aic", "bic", "hqic"}
The information criterion to use in model selection.
glob : bool
Whether to consider all possible submodels of the largest model
or only if smaller order lags must be included if larger order
lags are. If ``True``, the number of model considered is of the
order 2**(maxlag + k * maxorder) assuming maxorder is an int. This
can be very large unless k and maxorder are bot relatively small.
If False, the number of model considered is of the order
maxlag*maxorder**k which may also be substantial when k and maxorder
are large.
seasonal : bool, optional
Flag indicating whether to include seasonal dummies in the model. If
seasonal is True and trend includes 'c', then the first period
is excluded from the seasonal terms.
deterministic : DeterministicProcess, optional
A deterministic process. If provided, trend and seasonal are ignored.
A warning is raised if trend is not "n" and seasonal is not False.
hold_back : {None, int}, optional
Initial observations to exclude from the estimation sample. If None,
then hold_back is equal to the maximum lag in the model. Set to a
non-zero value to produce comparable models with different lag
length. For example, to compare the fit of a model with lags=3 and
lags=1, set hold_back=3 which ensures that both models are estimated
using observations 3,...,nobs. hold_back must be >= the maximum lag in
the model.
period : {None, int}, optional
The period of the data. Only used if seasonal is True. This parameter
can be omitted if using a pandas object for endog that contains a
recognized frequency.
missing : {"none", "drop", "raise"}, optional
Available options are 'none', 'drop', and 'raise'. If 'none', no NaN
checking is done. If 'drop', any observations with NaNs are dropped.
If 'raise', an error is raised. Default is 'none'.
Returns
-------
ARDLSelectionResults
A results holder containing the selected model and the complete set
of information criteria for all models fit.
"""
orig_hold_back = int_like(hold_back, "hold_back", optional=True)
def compute_ics(y, x, df):
if x.shape[1]:
resid = y - x @ np.linalg.lstsq(x, y, rcond=None)[0]
else:
resid = y
nobs = resid.shape[0]
sigma2 = 1.0 / nobs * sumofsq(resid)
llf = -nobs * (np.log(2 * np.pi * sigma2) + 1) / 2
res = SimpleNamespace(
nobs=nobs, df_model=df + x.shape[1], sigma2=sigma2, llf=llf
)
aic = call_cached_func(ARDLResults.aic, res)
bic = call_cached_func(ARDLResults.bic, res)
hqic = call_cached_func(ARDLResults.hqic, res)
return aic, bic, hqic
base = ARDL(
endog,
maxlag,
exog,
maxorder,
trend,
fixed=fixed,
causal=causal,
seasonal=seasonal,
deterministic=deterministic,
hold_back=hold_back,
period=period,
missing=missing,
)
hold_back = base.hold_back
blocks = base._blocks
always = np.column_stack([blocks["deterministic"], blocks["fixed"]])
always = always[hold_back:]
select = []
iter_orders = []
select.append(blocks["endog"][hold_back:])
iter_orders.append(list(range(blocks["endog"].shape[1] + 1)))
var_names = []
for var in blocks["exog"]:
block = blocks["exog"][var][hold_back:]
select.append(block)
iter_orders.append(list(range(block.shape[1] + 1)))
var_names.append(var)
y = base._y
if always.shape[1]:
pinv_always = np.linalg.pinv(always)
for i in range(len(select)):
x = select[i]
select[i] = x - always @ (pinv_always @ x)
y = y - always @ (pinv_always @ y)
def perm_to_tuple(keys, perm):
if perm == ():
d = {k: 0 for k, _ in keys if k is not None}
return (0,) + tuple((k, v) for k, v in d.items())
d = defaultdict(list)
y_lags = []
for v in perm:
key = keys[v]
if key[0] is None:
y_lags.append(key[1])
else:
d[key[0]].append(key[1])
d = dict(d)
if not y_lags or y_lags == [0]:
y_lags = 0
else:
y_lags = tuple(y_lags)
for key in keys:
if key[0] not in d and key[0] is not None:
d[key[0]] = None
for key in d:
if d[key] is not None:
d[key] = tuple(d[key])
return (y_lags,) + tuple((k, v) for k, v in d.items())
always_df = always.shape[1]
ics = {}
if glob:
ar_lags = base.ar_lags if base.ar_lags is not None else []
keys = [(None, i) for i in ar_lags]
for k, v in base._order.items():
keys += [(k, i) for i in v]
x = np.column_stack([a for a in select])
all_columns = list(range(x.shape[1]))
for i in range(x.shape[1]):
for perm in combinations(all_columns, i):
key = perm_to_tuple(keys, perm)
ics[key] = compute_ics(y, x[:, perm], always_df)
else:
for io in product(*iter_orders):
x = np.column_stack([a[:, : io[i]] for i, a in enumerate(select)])
key = [io[0] if io[0] else None]
for j, val in enumerate(io[1:]):
var = var_names[j]
if causal:
key.append((var, None if val == 0 else val))
else:
key.append((var, val - 1 if val - 1 >= 0 else None))
key = tuple(key)
ics[key] = compute_ics(y, x, always_df)
index = {"aic": 0, "bic": 1, "hqic": 2}[ic]
lowest = np.inf
for key in ics:
val = ics[key][index]
if val < lowest:
lowest = val
selected_order = key
exog_order = {k: v for k, v in selected_order[1:]}
model = ARDL(
endog,
selected_order[0],
exog,
exog_order,
trend,
fixed=fixed,
causal=causal,
seasonal=seasonal,
deterministic=deterministic,
hold_back=orig_hold_back,
period=period,
missing=missing,
)
return ARDLOrderSelectionResults(model, ics, trend, seasonal, period)
lags_descr = textwrap.wrap(
"The number of lags of the endogenous variable to include in the model. "
"Must be at least 1.",
71,
)
lags_param = Parameter(name="lags", type="int", desc=lags_descr)
order_descr = textwrap.wrap(
"If int, uses lags 0, 1, ..., order for all exog variables. If a dict, "
"applies the lags series by series. If ``exog`` is anything other than a "
"DataFrame, the keys are the column index of exog (e.g., 0, 1, ...). If "
"a DataFrame, keys are column names.",
71,
)
order_param = Parameter(name="order", type="int, dict", desc=order_descr)
from_formula_doc = Docstring(ARDL.from_formula.__doc__)
from_formula_doc.replace_block("Summary", "Construct an UECM from a formula")
from_formula_doc.remove_parameters("lags")
from_formula_doc.remove_parameters("order")
from_formula_doc.insert_parameters("data", lags_param)
from_formula_doc.insert_parameters("lags", order_param)
fit_doc = Docstring(ARDL.fit.__doc__)
fit_doc.replace_block(
"Returns", [Parameter("", "UECMResults", ["Estimation results."])]
)
if fit_doc._ds is not None:
see_also = fit_doc._ds["See Also"]
see_also.insert(
0,
(
[("statsmodels.tsa.ardl.ARDL", None)],
["Autoregressive distributed lag model estimation"],
),
)
fit_doc.replace_block("See Also", see_also)
[docs]
class UECM(ARDL):
r"""
Unconstrained Error Correlation Model(UECM)
Parameters
----------
endog : array_like
A 1-d endogenous response variable. The dependent variable.
lags : {int, list[int]}
The number of lags of the endogenous variable to include in the
model. Must be at least 1.
exog : array_like
Exogenous variables to include in the model. Either a DataFrame or
an 2-d array-like structure that can be converted to a NumPy array.
order : {int, sequence[int], dict}
If int, uses lags 0, 1, ..., order for all exog variables. If a
dict, applies the lags series by series. If ``exog`` is anything
other than a DataFrame, the keys are the column index of exog
(e.g., 0, 1, ...). If a DataFrame, keys are column names.
fixed : array_like
Additional fixed regressors that are not lagged.
causal : bool, optional
Whether to include lag 0 of exog variables. If True, only includes
lags 1, 2, ...
trend : {'n', 'c', 't', 'ct'}, optional
The trend to include in the model:
* 'n' - No trend.
* 'c' - Constant only.
* 't' - Time trend only.
* 'ct' - Constant and time trend.
The default is 'c'.
seasonal : bool, optional
Flag indicating whether to include seasonal dummies in the model. If
seasonal is True and trend includes 'c', then the first period
is excluded from the seasonal terms.
deterministic : DeterministicProcess, optional
A deterministic process. If provided, trend and seasonal are ignored.
A warning is raised if trend is not "n" and seasonal is not False.
hold_back : {None, int}, optional
Initial observations to exclude from the estimation sample. If None,
then hold_back is equal to the maximum lag in the model. Set to a
non-zero value to produce comparable models with different lag
length. For example, to compare the fit of a model with lags=3 and
lags=1, set hold_back=3 which ensures that both models are estimated
using observations 3,...,nobs. hold_back must be >= the maximum lag in
the model.
period : {None, int}, optional
The period of the data. Only used if seasonal is True. This parameter
can be omitted if using a pandas object for endog that contains a
recognized frequency.
missing : {"none", "drop", "raise"}, optional
Available options are 'none', 'drop', and 'raise'. If 'none', no NaN
checking is done. If 'drop', any observations with NaNs are dropped.
If 'raise', an error is raised. Default is 'none'.
Notes
-----
The full specification of an UECM is
.. math ::
\Delta Y_t = \delta_0 + \delta_1 t + \delta_2 t^2
+ \sum_{i=1}^{s-1} \gamma_i I_{[(\mod(t,s) + 1) = i]}
+ \lambda_0 Y_{t-1} + \lambda_1 X_{1,t-1} + \ldots
+ \lambda_{k} X_{k,t-1}
+ \sum_{j=1}^{p-1} \phi_j \Delta Y_{t-j}
+ \sum_{l=1}^k \sum_{m=0}^{o_l-1} \beta_{l,m} \Delta X_{l, t-m}
+ Z_t \lambda
+ \epsilon_t
where :math:`\delta_\bullet` capture trends, :math:`\gamma_\bullet`
capture seasonal shifts, s is the period of the seasonality, p is the
lag length of the endogenous variable, k is the number of exogenous
variables :math:`X_{l}`, :math:`o_l` is included the lag length of
:math:`X_{l}`, :math:`Z_t` are ``r`` included fixed regressors and
:math:`\epsilon_t` is a white noise shock. If ``causal`` is ``True``,
then the 0-th lag of the exogenous variables is not included and the
sum starts at ``m=1``.
See Also
--------
statsmodels.tsa.ardl.ARDL
Autoregressive distributed lag model estimation
statsmodels.tsa.ar_model.AutoReg
Autoregressive model estimation with optional exogenous regressors
statsmodels.tsa.statespace.sarimax.SARIMAX
Seasonal ARIMA model estimation with optional exogenous regressors
statsmodels.tsa.arima.model.ARIMA
ARIMA model estimation
Examples
--------
>>> from statsmodels.tsa.api import UECM
>>> from statsmodels.datasets import danish_data
>>> data = danish_data.load_pandas().data
>>> lrm = data.lrm
>>> exog = data[["lry", "ibo", "ide"]]
A basic model where all variables have 3 lags included
>>> UECM(data.lrm, 3, data[["lry", "ibo", "ide"]], 3)
A dictionary can be used to pass custom lag orders
>>> UECM(data.lrm, [1, 3], exog, {"lry": 1, "ibo": 3, "ide": 2})
Setting causal removes the 0-th lag from the exogenous variables
>>> exog_lags = {"lry": 1, "ibo": 3, "ide": 2}
>>> UECM(data.lrm, 3, exog, exog_lags, causal=True)
When using NumPy arrays, the dictionary keys are the column index.
>>> import numpy as np
>>> lrma = np.asarray(lrm)
>>> exoga = np.asarray(exog)
>>> UECM(lrma, 3, exoga, {0: 1, 1: 3, 2: 2})
"""
def __init__(
self,
endog: ArrayLike1D | ArrayLike2D,
lags: int | None,
exog: ArrayLike2D | None = None,
order: _UECMOrder = 0,
trend: Literal["n", "c", "ct", "ctt"] = "c",
*,
fixed: ArrayLike2D | None = None,
causal: bool = False,
seasonal: bool = False,
deterministic: DeterministicProcess | None = None,
hold_back: int | None = None,
period: int | None = None,
missing: Literal["none", "drop", "raise"] = "none",
) -> None:
super().__init__(
endog,
lags,
exog,
order,
trend=trend,
fixed=fixed,
seasonal=seasonal,
causal=causal,
hold_back=hold_back,
period=period,
missing=missing,
deterministic=deterministic,
)
self._results_class = UECMResults
self._results_wrapper = UECMResultsWrapper
def _check_lags(
self, lags: int | Sequence[int] | None, hold_back: int | None
) -> tuple[list[int], int]:
"""Check lags value conforms to requirement"""
if not (isinstance(lags, _INT_TYPES) or lags is None):
raise TypeError("lags must be an integer or None")
return super()._check_lags(lags, hold_back)
def _check_order(self, order: _ARDLOrder):
"""Check order conforms to requirement"""
if isinstance(order, Mapping):
for k, v in order.items():
if not isinstance(v, _INT_TYPES) and v is not None:
raise TypeError("order values must be positive integers or None")
elif not (isinstance(order, _INT_TYPES) or order is None):
raise TypeError(
"order must be None, a positive integer, or a dict "
"containing positive integers or None"
)
# TODO: Check order is >= 1
order = super()._check_order(order)
if not order:
raise ValueError("Model must contain at least one exogenous variable")
for key, val in order.items():
if val == [0]:
raise ValueError(
"All included exog variables must have a lag length >= 1"
)
return order
def _construct_variable_names(self):
"""Construct model variables names"""
endog = self.data.orig_endog
if isinstance(endog, pd.Series):
y_base = endog.name or "y"
elif isinstance(endog, pd.DataFrame):
y_base = endog.squeeze().name or "y"
else:
y_base = "y"
y_name = f"D.{y_base}"
# 1. Deterministics
x_names = list(self._deterministic_reg.columns)
# 2. Levels
x_names.append(f"{y_base}.L1")
orig_exog = self.data.orig_exog
exog_pandas = isinstance(orig_exog, pd.DataFrame)
dexog_names = []
for key, val in self._order.items():
if val is not None:
if exog_pandas:
x_name = f"{key}.L1"
else:
x_name = f"x{key}.L1"
x_names.append(x_name)
lag_base = x_name[:-1]
for lag in val[:-1]:
dexog_names.append(f"D.{lag_base}{lag}")
# 3. Lagged endog
y_lags = max(self._lags) if self._lags else 0
dendog_names = [f"{y_name}.L{lag}" for lag in range(1, y_lags)]
x_names.extend(dendog_names)
x_names.extend(dexog_names)
x_names.extend(self._fixed_names)
return y_name, x_names
def _construct_regressors(
self, hold_back: int | None
) -> tuple[np.ndarray, np.ndarray]:
"""Construct and format model regressors"""
# 1. Endogenous and endogenous lags
self._maxlag = max(self._lags) if self._lags else 0
dendog = np.full_like(self.data.endog, np.nan)
dendog[1:] = np.diff(self.data.endog, axis=0)
dlag = max(0, self._maxlag - 1)
self._endog_reg, self._endog = lagmat(dendog, dlag, original="sep")
# 2. Deterministics
self._deterministic_reg = self._deterministics.in_sample()
# 3. Levels
orig_exog = self.data.orig_exog
exog_pandas = isinstance(orig_exog, pd.DataFrame)
lvl = np.full_like(self.data.endog, np.nan)
lvl[1:] = self.data.endog[:-1]
lvls = [lvl.copy()]
for key, val in self._order.items():
if val is not None:
if exog_pandas:
loc = orig_exog.columns.get_loc(key)
else:
loc = key
lvl[1:] = self.data.exog[:-1, loc]
lvls.append(lvl.copy())
self._levels = np.column_stack(lvls)
# 4. exog Lags
if exog_pandas:
dexog = orig_exog.diff()
else:
dexog = np.full_like(self.data.exog, np.nan)
dexog[1:] = np.diff(orig_exog, axis=0)
adj_order = {}
for key, val in self._order.items():
val = None if (val is None or val == [1]) else val[:-1]
adj_order[key] = val
self._exog = self._format_exog(dexog, adj_order)
self._blocks = {
"deterministic": self._deterministic_reg,
"levels": self._levels,
"endog": self._endog_reg,
"exog": self._exog,
"fixed": self._fixed,
}
blocks = [self._endog]
for key, val in self._blocks.items():
if key != "exog":
blocks.append(np.asarray(val))
else:
for subval in val.values():
blocks.append(np.asarray(subval))
y = blocks[0]
reg = np.column_stack(blocks[1:])
exog_maxlag = 0
for val in self._order.values():
exog_maxlag = max(exog_maxlag, max(val) if val is not None else 0)
self._maxlag = max(self._maxlag, exog_maxlag)
# Must be at least 1 since the endog is differenced
self._maxlag = max(self._maxlag, 1)
if hold_back is None:
self._hold_back = int(self._maxlag)
if self._hold_back < self._maxlag:
raise ValueError(
"hold_back must be >= the maximum lag of the endog and exog "
"variables"
)
reg = reg[self._hold_back :]
if reg.shape[1] > reg.shape[0]:
raise ValueError(
f"The number of regressors ({reg.shape[1]}) including "
"deterministics, lags of the endog, lags of the exogenous, "
"and fixed regressors is larger than the sample available "
f"for estimation ({reg.shape[0]})."
)
return np.squeeze(y)[self._hold_back :], reg
[docs]
@Appender(str(fit_doc))
def fit(
self,
*,
cov_type: str = "nonrobust",
cov_kwds: dict[str, Any] = None,
use_t: bool = True,
) -> UECMResults:
params, cov_params, norm_cov_params = self._fit(
cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t
)
res = UECMResults(self, params, cov_params, norm_cov_params, use_t=use_t)
return UECMResultsWrapper(res)
[docs]
@classmethod
def from_ardl(cls, ardl: ARDL, missing: Literal["none", "drop", "raise"] = "none"):
"""
Construct a UECM from an ARDL model
Parameters
----------
ardl : ARDL
The ARDL model instance
missing : {"none", "drop", "raise"}, default "none"
How to treat missing observations.
Returns
-------
UECM
The UECM model instance
Notes
-----
The lag requirements for a UECM are stricter than for an ARDL.
Any variable that is included in the UECM must have a lag length
of at least 1. Additionally, the included lags must be contiguous
starting at 0 if non-causal or 1 if causal.
"""
err = (
"UECM can only be created from ARDL models that include all "
"{var_typ} lags up to the maximum lag in the model."
)
uecm_lags = {}
dl_lags = ardl.dl_lags
for key, val in dl_lags.items():
max_val = max(val)
if len(dl_lags[key]) < (max_val + int(not ardl.causal)):
raise ValueError(err.format(var_typ="exogenous"))
uecm_lags[key] = max_val
if ardl.ar_lags is None:
ar_lags = None
else:
max_val = max(ardl.ar_lags)
if len(ardl.ar_lags) != max_val:
raise ValueError(err.format(var_typ="endogenous"))
ar_lags = max_val
return cls(
ardl.data.orig_endog,
ar_lags,
ardl.data.orig_exog,
uecm_lags,
trend=ardl.trend,
fixed=ardl.fixed,
seasonal=ardl.seasonal,
hold_back=ardl.hold_back,
period=ardl.period,
causal=ardl.causal,
missing=missing,
deterministic=ardl.deterministic,
)
[docs]
def predict(
self,
params: ArrayLike1D,
start: int | str | dt.datetime | pd.Timestamp | None = None,
end: int | str | dt.datetime | pd.Timestamp | None = None,
dynamic: bool = False,
exog: NDArray | pd.DataFrame | None = None,
exog_oos: NDArray | pd.DataFrame | None = None,
fixed: NDArray | pd.DataFrame | None = None,
fixed_oos: NDArray | pd.DataFrame | None = None,
) -> np.ndarray:
"""
In-sample prediction and out-of-sample forecasting.
Parameters
----------
params : array_like
The fitted model parameters.
start : int, str, or datetime, optional
Zero-indexed observation number at which to start forecasting,
i.e., the first forecast is start. Can also be a date string to
parse or a datetime type. Default is the the zeroth observation.
end : int, str, or datetime, optional
Zero-indexed observation number at which to end forecasting, i.e.,
the last forecast is end. Can also be a date string to
parse or a datetime type. However, if the dates index does not
have a fixed frequency, end must be an integer index if you
want out-of-sample prediction. Default is the last observation in
the sample. Unlike standard python slices, end is inclusive so
that all the predictions [start, start+1, ..., end-1, end] are
returned.
dynamic : {bool, int, str, datetime, Timestamp}, optional
Integer offset relative to `start` at which to begin dynamic
prediction. Prior to this observation, true endogenous values
will be used for prediction; starting with this observation and
continuing through the end of prediction, forecasted endogenous
values will be used instead. Datetime-like objects are not
interpreted as offsets. They are instead used to find the index
location of `dynamic` which is then used to to compute the offset.
exog : array_like
A replacement exogenous array. Must have the same shape as the
exogenous data array used when the model was created.
exog_oos : array_like
An array containing out-of-sample values of the exogenous
variables. Must have the same number of columns as the exog
used when the model was created, and at least as many rows as
the number of out-of-sample forecasts.
fixed : array_like
A replacement fixed array. Must have the same shape as the
fixed data array used when the model was created.
fixed_oos : array_like
An array containing out-of-sample values of the fixed variables.
Must have the same number of columns as the fixed used when the
model was created, and at least as many rows as the number of
out-of-sample forecasts.
Returns
-------
predictions : {ndarray, Series}
Array of out of in-sample predictions and / or out-of-sample
forecasts.
"""
if dynamic is not False:
raise NotImplementedError("dynamic forecasts are not supported")
params, exog, exog_oos, start, end, num_oos = self._prepare_prediction(
params, exog, exog_oos, start, end
)
if num_oos != 0:
raise NotImplementedError("Out-of-sample forecasts are not supported")
pred = np.full(self.endog.shape[0], np.nan)
pred[-self._x.shape[0] :] = self._x @ params
return pred[start : end + 1]
[docs]
@classmethod
@Appender(from_formula_doc.__str__().replace("ARDL", "UECM"))
def from_formula(
cls,
formula: str,
data: pd.DataFrame,
lags: int | Sequence[int] | None = 0,
order: _ARDLOrder = 0,
trend: Literal["n", "c", "ct", "ctt"] = "n",
*,
causal: bool = False,
seasonal: bool = False,
deterministic: DeterministicProcess | None = None,
hold_back: int | None = None,
period: int | None = None,
missing: Literal["none", "raise"] = "none",
) -> UECM:
return super().from_formula(
formula,
data,
lags,
order,
trend,
causal=causal,
seasonal=seasonal,
deterministic=deterministic,
hold_back=hold_back,
period=period,
missing=missing,
)
[docs]
class UECMResults(ARDLResults):
"""
Class to hold results from fitting an UECM model.
Parameters
----------
model : UECM
Reference to the model that is fit.
params : ndarray
The fitted parameters from the AR Model.
cov_params : ndarray
The estimated covariance matrix of the model parameters.
normalized_cov_params : ndarray
The array inv(dot(x.T,x)) where x contains the regressors in the
model.
scale : float, optional
An estimate of the scale of the model.
"""
_cache: dict[str, Any] = {} # for scale setter
def _ci_wrap(
self, val: np.ndarray, name: str = ""
) -> NDArray | pd.Series | pd.DataFrame:
if not isinstance(self.model.data, PandasData):
return val
ndet = self.model._blocks["deterministic"].shape[1]
nlvl = self.model._blocks["levels"].shape[1]
lbls = self.model.exog_names[: (ndet + nlvl)]
for i in range(ndet, ndet + nlvl):
lbl = lbls[i]
if lbl.endswith(".L1"):
lbls[i] = lbl[:-3]
if val.ndim == 2:
return pd.DataFrame(val, columns=lbls, index=lbls)
return pd.Series(val, index=lbls, name=name)
@cache_readonly
def resid(self):
"""
The residuals of the model.
"""
model = self.model
return model._y - self.fittedvalues
@cache_readonly
def ci_params(self) -> np.ndarray | pd.Series:
"""Parameters of normalized cointegrating relationship"""
ndet = self.model._blocks["deterministic"].shape[1]
nlvl = self.model._blocks["levels"].shape[1]
base = np.asarray(self.params)[ndet]
return self._ci_wrap(self.params[: ndet + nlvl] / base, "ci_params")
@cache_readonly
def ci_bse(self) -> np.ndarray | pd.Series:
"""Standard Errors of normalized cointegrating relationship"""
bse = np.sqrt(np.diag(self.ci_cov_params()))
return self._ci_wrap(bse, "ci_bse")
@cache_readonly
def ci_tvalues(self) -> np.ndarray | pd.Series:
"""T-values of normalized cointegrating relationship"""
ndet = self.model._blocks["deterministic"].shape[1]
with warnings.catch_warnings():
warnings.simplefilter("ignore")
tvalues = np.asarray(self.ci_params) / np.asarray(self.ci_bse)
tvalues[ndet] = np.nan
return self._ci_wrap(tvalues, "ci_tvalues")
@cache_readonly
def ci_pvalues(self) -> np.ndarray | pd.Series:
"""P-values of normalized cointegrating relationship"""
with warnings.catch_warnings():
warnings.simplefilter("ignore")
pvalues = 2 * (1 - stats.norm.cdf(np.abs(self.ci_tvalues)))
return self._ci_wrap(pvalues, "ci_pvalues")
[docs]
def ci_conf_int(self, alpha: float = 0.05) -> Float64Array | pd.DataFrame:
alpha = float_like(alpha, "alpha")
if self.use_t:
q = stats.t(self.df_resid).ppf(1 - alpha / 2)
else:
q = stats.norm().ppf(1 - alpha / 2)
p = self.ci_params
se = self.ci_bse
out = [p - q * se, p + q * se]
if not isinstance(p, pd.Series):
return np.column_stack(out)
df = pd.concat(out, axis=1)
df.columns = ["lower", "upper"]
return df
[docs]
def ci_summary(self, alpha: float = 0.05) -> Summary:
def _ci(alpha=alpha):
return np.asarray(self.ci_conf_int(alpha))
smry = Summary()
ndet = self.model._blocks["deterministic"].shape[1]
nlvl = self.model._blocks["levels"].shape[1]
exog_names = list(self.model.exog_names)[: (ndet + nlvl)]
model = SimpleNamespace(
endog_names=self.model.endog_names, exog_names=exog_names
)
data = SimpleNamespace(
params=self.ci_params,
bse=self.ci_bse,
tvalues=self.ci_tvalues,
pvalues=self.ci_pvalues,
conf_int=_ci,
model=model,
)
tab = summary_params(data)
tab.title = "Cointegrating Vector"
smry.tables.append(tab)
return smry
@cache_readonly
def ci_resids(self) -> np.ndarray | pd.Series:
d = self.model._blocks["deterministic"]
exog = self.model.data.orig_exog
is_pandas = isinstance(exog, pd.DataFrame)
exog = exog if is_pandas else self.model.exog
cols = [np.asarray(d), self.model.endog]
for key, value in self.model.dl_lags.items():
if value is not None:
if is_pandas:
cols.append(np.asarray(exog[key]))
else:
cols.append(exog[:, key])
ci_x = np.column_stack(cols)
resids = ci_x @ self.ci_params
if not isinstance(self.model.data, PandasData):
return resids
index = self.model.data.orig_endog.index
return pd.Series(resids, index=index, name="ci_resids")
[docs]
def ci_cov_params(self) -> Float64Array | pd.DataFrame:
"""Covariance of normalized of cointegrating relationship"""
ndet = self.model._blocks["deterministic"].shape[1]
nlvl = self.model._blocks["levels"].shape[1]
loc = list(range(ndet + nlvl))
cov = self.cov_params()
cov_a = np.asarray(cov)
ci_cov = cov_a[np.ix_(loc, loc)]
m = ci_cov.shape[0]
params = np.asarray(self.params)[: ndet + nlvl]
base = params[ndet]
d = np.zeros((m, m))
for i in range(m):
if i == ndet:
continue
d[i, i] = 1 / base
d[i, ndet] = -params[i] / (base**2)
ci_cov = d @ ci_cov @ d.T
return self._ci_wrap(ci_cov)
def _lag_repr(self):
"""Returns poly repr of an AR, (1 -phi1 L -phi2 L^2-...)"""
# TODO
[docs]
def bounds_test(
self,
case: Literal[1, 2, 3, 4, 5],
cov_type: str = "nonrobust",
cov_kwds: dict[str, Any] = None,
use_t: bool = True,
asymptotic: bool = True,
nsim: int = 100_000,
seed: (
int | Sequence[int] | np.random.RandomState | np.random.Generator | None
) = None,
):
r"""
Cointegration bounds test of Pesaran, Shin, and Smith
Parameters
----------
case : {1, 2, 3, 4, 5}
One of the cases covered in the PSS test.
cov_type : str
The covariance estimator to use. The asymptotic distribution of
the PSS test has only been established in the homoskedastic case,
which is the default.
The most common choices are listed below. Supports all covariance
estimators that are available in ``OLS.fit``.
* 'nonrobust' - The class OLS covariance estimator that assumes
homoskedasticity.
* 'HC0', 'HC1', 'HC2', 'HC3' - Variants of White's
(or Eiker-Huber-White) covariance estimator. `HC0` is the
standard implementation. The other make corrections to improve
the finite sample performance of the heteroskedasticity robust
covariance estimator.
* 'HAC' - Heteroskedasticity-autocorrelation robust covariance
estimation. Supports cov_kwds.
- `maxlags` integer (required) : number of lags to use.
- `kernel` callable or str (optional) : kernel
currently available kernels are ['bartlett', 'uniform'],
default is Bartlett.
- `use_correction` bool (optional) : If true, use small sample
correction.
cov_kwds : dict, optional
A dictionary of keyword arguments to pass to the covariance
estimator. `nonrobust` and `HC#` do not support cov_kwds.
use_t : bool, optional
A flag indicating that small-sample corrections should be applied
to the covariance estimator.
asymptotic : bool
Flag indicating whether to use asymptotic critical values which
were computed by simulation (True, default) or to simulate a
sample-size specific set of critical values. Tables are only
available for up to 10 components in the cointegrating
relationship, so if more variables are included then simulation
is always used. The simulation computed the test statistic under
and assumption that the residuals are homoskedastic.
nsim : int
Number of simulations to run when computing exact critical values.
Only used if ``asymptotic`` is ``True``.
seed : {None, int, sequence[int], RandomState, Generator}, optional
Seed to use when simulating critical values. Must be provided if
reproducible critical value and p-values are required when
``asymptotic`` is ``False``.
Returns
-------
BoundsTestResult
Named tuple containing ``stat``, ``crit_vals``, ``p_values``,
``null` and ``alternative``. The statistic is the F-type
test statistic favored in PSS.
Notes
-----
The PSS bounds test has 5 cases which test the coefficients on the
level terms in the model
.. math::
\Delta Y_{t}=\delta_{0} + \delta_{1}t + Z_{t-1}\beta
+ \sum_{j=0}^{P}\Delta X_{t-j}\Gamma + \epsilon_{t}
where :math:`Z_{t-1}` contains both :math:`Y_{t-1}` and
:math:`X_{t-1}`.
The cases determine which deterministic terms are included in the
model and which are tested as part of the test.
Cases:
1. No deterministic terms
2. Constant included in both the model and the test
3. Constant included in the model but not in the test
4. Constant and trend included in the model, only trend included in
the test
5. Constant and trend included in the model, neither included in the
test
The test statistic is a Wald-type quadratic form test that all of the
coefficients in :math:`\beta` are 0 along with any included
deterministic terms, which depends on the case. The statistic returned
is an F-type test statistic which is the standard quadratic form test
statistic divided by the number of restrictions.
References
----------
.. [*] Pesaran, M. H., Shin, Y., & Smith, R. J. (2001). Bounds testing
approaches to the analysis of level relationships. Journal of
applied econometrics, 16(3), 289-326.
"""
model = self.model
trend: Literal["n", "c", "ct"]
if case == 1:
trend = "n"
elif case in (2, 3):
trend = "c"
else:
trend = "ct"
order = {key: max(val) for key, val in model._order.items()}
uecm = UECM(
model.data.endog,
max(model.ar_lags),
model.data.orig_exog,
order=order,
causal=model.causal,
trend=trend,
)
res = uecm.fit(cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t)
cov = res.cov_params()
nvar = len(res.model.ardl_order)
if case == 1:
rest = np.arange(nvar)
elif case == 2:
rest = np.arange(nvar + 1)
elif case == 3:
rest = np.arange(1, nvar + 1)
elif case == 4:
rest = np.arange(1, nvar + 2)
elif case == 5:
rest = np.arange(2, nvar + 2)
r = np.zeros((rest.shape[0], cov.shape[1]))
for i, loc in enumerate(rest):
r[i, loc] = 1
vcv = r @ cov @ r.T
coef = r @ res.params
stat = coef.T @ np.linalg.inv(vcv) @ coef / r.shape[0]
k = nvar
if asymptotic and k <= 10:
cv = pss_critical_values.crit_vals
key = (k, case)
upper = cv[key + (True,)]
lower = cv[key + (False,)]
crit_vals = pd.DataFrame(
{"lower": lower, "upper": upper},
index=pss_critical_values.crit_percentiles,
)
crit_vals.index.name = "percentile"
p_values = pd.Series(
{
"lower": _pss_pvalue(stat, k, case, False),
"upper": _pss_pvalue(stat, k, case, True),
}
)
else:
nobs = res.resid.shape[0]
crit_vals, p_values = _pss_simulate(
stat, k, case, nobs=nobs, nsim=nsim, seed=seed
)
return BoundsTestResult(
stat,
crit_vals,
p_values,
"No Cointegration",
"Possible Cointegration",
)
def _pss_pvalue(stat: float, k: int, case: int, i1: bool) -> float:
key = (k, case, i1)
large_p = pss_critical_values.large_p[key]
small_p = pss_critical_values.small_p[key]
threshold = pss_critical_values.stat_star[key]
log_stat = np.log(stat)
p = small_p if stat > threshold else large_p
x = [log_stat**i for i in range(len(p))]
return 1 - stats.norm.cdf(x @ np.array(p))
def _pss_simulate(
stat: float,
k: int,
case: Literal[1, 2, 3, 4, 5],
nobs: int,
nsim: int,
seed: int | Sequence[int] | np.random.RandomState | np.random.Generator | None,
) -> tuple[pd.DataFrame, pd.Series]:
rs: np.random.RandomState | np.random.Generator
if not isinstance(seed, np.random.RandomState):
rs = np.random.default_rng(seed)
else:
assert isinstance(seed, np.random.RandomState)
rs = seed
def _vectorized_ols_resid(rhs, lhs):
rhs_t = np.transpose(rhs, [0, 2, 1])
xpx = np.matmul(rhs_t, rhs)
xpy = np.matmul(rhs_t, lhs)
b = np.linalg.solve(xpx, xpy)
return np.squeeze(lhs - np.matmul(rhs, b))
block_size = 100_000_000 // (8 * nobs * k)
remaining = nsim
loc = 0
f_upper = np.empty(nsim)
f_lower = np.empty(nsim)
while remaining > 0:
to_do = min(remaining, block_size)
e = rs.standard_normal((to_do, nobs + 1, k))
y = np.cumsum(e[:, :, :1], axis=1)
x_upper = np.cumsum(e[:, :, 1:], axis=1)
x_lower = e[:, :, 1:]
lhs = np.diff(y, axis=1)
if case in (2, 3):
rhs = np.empty((to_do, nobs, k + 1))
rhs[:, :, -1] = 1
elif case in (4, 5):
rhs = np.empty((to_do, nobs, k + 2))
rhs[:, :, -2] = np.arange(nobs, dtype=float)
rhs[:, :, -1] = 1
else:
rhs = np.empty((to_do, nobs, k))
rhs[:, :, :1] = y[:, :-1]
rhs[:, :, 1:k] = x_upper[:, :-1]
u = _vectorized_ols_resid(rhs, lhs)
df = rhs.shape[1] - rhs.shape[2]
s2 = (u**2).sum(1) / df
if case in (3, 4):
rhs_r = rhs[:, :, -1:]
elif case == 5: # case 5
rhs_r = rhs[:, :, -2:]
if case in (3, 4, 5):
ur = _vectorized_ols_resid(rhs_r, lhs)
nrest = rhs.shape[-1] - rhs_r.shape[-1]
else:
ur = np.squeeze(lhs)
nrest = rhs.shape[-1]
f = ((ur**2).sum(1) - (u**2).sum(1)) / nrest
f /= s2
f_upper[loc : loc + to_do] = f
# Lower
rhs[:, :, 1:k] = x_lower[:, :-1]
u = _vectorized_ols_resid(rhs, lhs)
s2 = (u**2).sum(1) / df
if case in (3, 4):
rhs_r = rhs[:, :, -1:]
elif case == 5: # case 5
rhs_r = rhs[:, :, -2:]
if case in (3, 4, 5):
ur = _vectorized_ols_resid(rhs_r, lhs)
nrest = rhs.shape[-1] - rhs_r.shape[-1]
else:
ur = np.squeeze(lhs)
nrest = rhs.shape[-1]
f = ((ur**2).sum(1) - (u**2).sum(1)) / nrest
f /= s2
f_lower[loc : loc + to_do] = f
loc += to_do
remaining -= to_do
crit_percentiles = pss_critical_values.crit_percentiles
crit_vals = pd.DataFrame(
{
"lower": np.percentile(f_lower, crit_percentiles),
"upper": np.percentile(f_upper, crit_percentiles),
},
index=crit_percentiles,
)
crit_vals.index.name = "percentile"
p_values = pd.Series(
{"lower": (stat < f_lower).mean(), "upper": (stat < f_upper).mean()}
)
return crit_vals, p_values
class UECMResultsWrapper(wrap.ResultsWrapper):
_attrs = {}
_wrap_attrs = wrap.union_dicts(
tsa_model.TimeSeriesResultsWrapper._wrap_attrs, _attrs
)
_methods = {}
_wrap_methods = wrap.union_dicts(
tsa_model.TimeSeriesResultsWrapper._wrap_methods, _methods
)
wrap.populate_wrapper(UECMResultsWrapper, UECMResults)
Last update:
Dec 16, 2024