# Source code for statsmodels.tsa.forecasting.theta

r"""
Implementation of the Theta forecasting method of

Assimakopoulos, V., & Nikolopoulos, K. (2000). The theta model: a decomposition
approach to forecasting. International journal of forecasting, 16(4), 521-530.

and updates in

Hyndman, R. J., & Billah, B. (2003). Unmasking the Theta method. International
Journal of Forecasting, 19(2), 287-290.

Fioruci, J. A., Pellegrini, T. R., Louzada, F., & Petropoulos, F. (2015).
The optimized theta method. arXiv preprint arXiv:1503.03529.
"""
from typing import TYPE_CHECKING, Optional, Tuple

import numpy as np
import pandas as pd
from scipy import stats

from statsmodels.iolib.summary import Summary
from statsmodels.iolib.table import SimpleTable
from statsmodels.tools.validation import (
    array_like,
    bool_like,
    float_like,
    int_like,
    string_like,
)
from statsmodels.tsa.deterministic import DeterministicTerm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.exponential_smoothing import (
    ExponentialSmoothing,
)
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.tsatools import add_trend, freq_to_period

if TYPE_CHECKING:
    import matplotlib.figure


def extend_index(steps: int, index: pd.Index) -> pd.Index:
    """
    Build the index used for out-of-sample values.

    Thin wrapper that forwards to ``DeterministicTerm._extend_index`` with
    the argument order swapped.

    Parameters
    ----------
    steps : int
        The number of out-of-sample periods.
    index : pd.Index
        The index of the in-sample data.

    Returns
    -------
    pd.Index
        Whatever ``DeterministicTerm._extend_index(index, steps)`` returns;
        presumably the ``steps`` labels that follow ``index`` (confirm
        against the helper).
    """
    extended = DeterministicTerm._extend_index(index, steps)
    return extended


class ThetaModel:
    r"""
    The Theta forecasting model of Assimakopoulos and Nikolopoulos (2000)

    Parameters
    ----------
    endog : array_like, 1d
        The data to forecast.
    period : int, default None
        The period of the data that is used in the seasonality test and
        adjustment. If None then the period is determined from y's index,
        if available.
    deseasonalize : bool, default True
        A flag indicating whether to deseasonalize the data. If True and
        use_test is True, the data is only deseasonalized if the null of no
        seasonal component is rejected.
    use_test : bool, default True
        A flag indicating whether to test the period-th autocorrelation. If
        this test rejects using a size of 10%, then decomposition is used.
        Set to False to skip the test.
    method : {"auto", "additive", "multiplicative"}, default "auto"
        The model used for the seasonal decomposition. "auto" uses a
        multiplicative if y is positive and all estimated seasonal
        components are positive. If either of these conditions is False,
        then it uses an additive decomposition. "add" and "mul" are
        accepted as abbreviations. An explicit multiplicative request is
        subject to the same two conditions and falls back to an additive
        decomposition when they do not hold.
    difference : bool, default False
        A flag indicating to difference the data before testing for
        seasonality.

    See Also
    --------
    statsmodels.tsa.statespace.exponential_smoothing.ExponentialSmoothing
        Exponential smoothing parameter estimation and forecasting
    statsmodels.tsa.statespace.sarimax.SARIMAX
        Seasonal ARIMA parameter estimation and forecasting

    Notes
    -----
    The Theta model forecasts the future as a weighted combination of two
    Theta lines.  This class supports combinations of models with two
    thetas: 0 and a user-specified choice (default 2). The forecasts are
    then

    .. math::

       \hat{X}_{T+h|T} = \frac{\theta-1}{\theta} b_0
                         \left[h - 1 + \frac{1}{\alpha}
                         - \frac{(1-\alpha)^T}{\alpha} \right]
                         + \tilde{X}_{T+h|T}

    where :math:`\tilde{X}_{T+h|T}` is the SES forecast of the endogenous
    variable using the parameter :math:`\alpha`. :math:`b_0` is the
    slope of a time trend line fitted to X using the terms 0, 1, ..., T-1.

    The model is estimated in steps:

    1. Test for seasonality
    2. Deseasonalize if seasonality detected
    3. Estimate :math:`\alpha` by fitting a SES model to the data and
       :math:`b_0` by OLS.
    4. Forecast the series
    5. Reseasonalize if the data was deseasonalized.

    The seasonality test examines whether the autocorrelation at the
    seasonal period is different from zero. The seasonality is then
    removed using a seasonal decomposition with a multiplicative trend.
    If the seasonality estimate is non-positive then an additive trend
    is used instead. The default deseasonalizing method can be changed
    using the options.

    References
    ----------
    .. [1] Assimakopoulos, V., & Nikolopoulos, K. (2000). The theta model: a
       decomposition approach to forecasting. International Journal of
       Forecasting, 16(4), 521-530.
    .. [2] Hyndman, R. J., & Billah, B. (2003). Unmasking the Theta method.
       International Journal of Forecasting, 19(2), 287-290.
    .. [3] Fioruci, J. A., Pellegrini, T. R., Louzada, F., & Petropoulos, F.
       (2015). The optimized theta method. arXiv preprint arXiv:1503.03529.
    """

    def __init__(
        self,
        endog,
        *,
        period: Optional[int] = None,
        deseasonalize: bool = True,
        use_test: bool = True,
        method: str = "auto",
        difference: bool = False
    ) -> None:
        self._y = array_like(endog, "endog", ndim=1)
        # Keep the caller's object (first column of a DataFrame) so that
        # any pandas index is available when labelling forecasts.
        if isinstance(endog, pd.DataFrame):
            self.endog_orig = endog.iloc[:, 0]
        else:
            self.endog_orig = endog
        self._period = int_like(period, "period", optional=True)
        self._deseasonalize = bool_like(deseasonalize, "deseasonalize")
        # The seasonality test is only meaningful when deseasonalizing.
        self._use_test = (
            bool_like(use_test, "use_test") and self._deseasonalize
        )
        self._diff = bool_like(difference, "difference")
        # Validate under the argument's real name so that error messages
        # point at ``method``.
        self._method = string_like(
            method,
            "method",
            options=("auto", "additive", "multiplicative", "mul", "add"),
        )
        if self._period is None and self._deseasonalize:
            # Infer the period from the frequency of a pandas index.
            idx = getattr(endog, "index", None)
            pfreq = None
            if idx is not None:
                pfreq = getattr(idx, "freq", None)
                if pfreq is None:
                    pfreq = getattr(idx, "inferred_freq", None)
            if pfreq is None:
                raise ValueError(
                    "You must specify a period or endog must be a "
                    "pandas object with a DatetimeIndex with "
                    "a freq not set to None"
                )
            self._period = freq_to_period(pfreq)

        # Until a test says otherwise, seasonality is assumed whenever
        # deseasonalizing was requested.
        self._has_seasonality = self._deseasonalize

    def _test_seasonality(self) -> None:
        """
        Test the autocorrelation at the seasonal period.

        Stores the outcome in ``_has_seasonality``. The data is first
        differenced when the model was created with ``difference=True``.
        """
        y = self._y
        if self._diff:
            y = np.diff(y)
        rho = acf(y, nlags=self._period, fft=True)
        nobs = y.shape[0]
        # Squared autocorrelation at the last requested lag scaled by the
        # sum of squares of all earlier entries returned by ``acf``.
        stat = nobs * rho[-1] ** 2 / np.sum(rho[:-1] ** 2)
        # CV is 10% from a chi2(1), 1.645**2
        self._has_seasonality = stat > 2.705543454095404

    def _select_decomposition(self) -> str:
        """
        Choose the decomposition to attempt first.

        Returns
        -------
        str
            "add" if an additive decomposition was requested or the data
            contains non-positive values, otherwise "mul". Both "auto" and
            an explicit multiplicative request therefore start with "mul"
            on strictly positive data.
        """
        if self._method.startswith("add") or not self._y.min() > 0:
            return "add"
        return "mul"

    def _deseasonalize_data(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Remove the seasonal component from the data.

        Updates ``method`` to the decomposition that was actually used.

        Returns
        -------
        y : ndarray
            The seasonally adjusted data, or the original data if no
            seasonality is being removed.
        seasonal : ndarray
            The first ``period`` seasonal terms, or an empty array if no
            seasonality is being removed.
        """
        y = self._y
        if not self._has_seasonality:
            return y, np.empty(0)

        method = self._select_decomposition()
        res = seasonal_decompose(y, model=method, period=self._period)
        if method == "mul" and res.seasonal.min() <= 0:
            # Non-positive factors cannot be divided out; use additive.
            method = "add"
            res = seasonal_decompose(y, model="add", period=self._period)
        # Record what was used so that forecasts are reseasonalized with
        # the matching operation. Storing "mul" is stable across repeated
        # calls because it is treated the same as the original request.
        self._method = method

        seasonal = res.seasonal[: self._period]
        if method == "mul":
            return y / res.seasonal, seasonal
        return y - res.seasonal, seasonal

    def fit(
        self, use_mle: bool = False, disp: bool = False
    ) -> "ThetaModelResults":
        r"""
        Estimate model parameters.

        Parameters
        ----------
        use_mle : bool, default False
            Estimate the parameters using MLE by fitting an ARIMA(0,1,1) with
            a drift.  If False (the default), estimates parameters using OLS
            of a constant and a time-trend and by fitting a SES to the model
            data.
        disp : bool, default False
            Display iterative output from fitting the model.

        Notes
        -----
        When using MLE, the parameters are estimated from the ARIMA(0,1,1)

        .. math::

           X_t = X_{t-1} + b_0 + (\alpha-1)\epsilon_{t-1} + \epsilon_t

        When estimating the model using 2-step estimation, the model
        parameters are estimated using the OLS regression

        .. math::

           X_t = a_0 + b_0 (t-1) + \eta_t

        and the SES

        .. math::

           \tilde{X}_{t+1} = \alpha X_{t} + (1-\alpha)\tilde{X}_{t}

        Returns
        -------
        ThetaModelResults
            Model results and forecasting
        """
        # ``_use_test`` can only be True when deseasonalizing.
        if self._use_test:
            self._test_seasonality()
        y, seasonal = self._deseasonalize_data()
        if use_mle:
            mod = SARIMAX(y, order=(0, 1, 1), trend="c")
            res = mod.fit(disp=disp)
            params = np.asarray(res.params)
            # The MA(1) coefficient is alpha - 1.
            alpha = params[1] + 1
            if alpha > 1:
                # Keep alpha inside the unit interval and re-estimate the
                # remaining parameters with the MA coefficient fixed.
                alpha = 0.9998
                res = mod.fit_constrained({"ma.L1": alpha - 1})
                params = np.asarray(res.params)
            b0 = params[0]
            sigma2 = params[-1]
            # Remove the drift so that only the level forecast remains.
            one_step = res.forecast(1) - b0
        else:
            ct = add_trend(y, "ct", prepend=True)[:, :2]
            # Shift the trend column by one before the regression.
            ct[:, 1] -= 1
            _, b0 = np.linalg.lstsq(ct, y, rcond=None)[0]
            res = ExponentialSmoothing(
                y, initial_level=y[0], initialization_method="known"
            ).fit(disp=disp)
            alpha = res.params[0]
            # Left unset here; the results object estimates it on demand.
            sigma2 = None
            one_step = res.forecast(1)
        return ThetaModelResults(
            b0, alpha, sigma2, one_step, seasonal, use_mle, self
        )

    @property
    def deseasonalize(self) -> bool:
        """Whether to deseasonalize the data"""
        return self._deseasonalize

    @property
    def period(self) -> int:
        """The period of the seasonality"""
        return self._period

    @property
    def use_test(self) -> bool:
        """Whether to test the data for seasonality"""
        return self._use_test

    @property
    def difference(self) -> bool:
        """Whether the data is differenced in the seasonality test"""
        return self._diff

    @property
    def method(self) -> str:
        """The method used to deseasonalize the data"""
        return self._method


class ThetaModelResults:
    """
    Results class from estimated Theta Models.

    Parameters
    ----------
    b0 : float
        The estimated trend slope.
    alpha : float
        The estimated SES parameter.
    sigma2 : float
        The estimated residual variance from the SES/IMA model.
    one_step : float
        The one-step forecast from the SES.
    seasonal : ndarray
        An array of estimated seasonal terms. Empty when the data was not
        deseasonalized.
    use_mle : bool
        A flag indicating that the parameters were estimated using MLE.
    model : ThetaModel
        The model used to produce the results.
    """

    def __init__(
        self,
        b0: float,
        alpha: float,
        sigma2: Optional[float],
        one_step: float,
        seasonal: np.ndarray,
        use_mle: bool,
        model: "ThetaModel",
    ) -> None:
        self._b0 = b0
        self._alpha = alpha
        self._sigma2 = sigma2
        self._one_step = one_step
        # len() also works for plain sequences, which have no ``shape``.
        self._nobs = len(model.endog_orig)
        self._model = model
        self._seasonal = seasonal
        self._use_mle = use_mle

    @property
    def params(self) -> pd.Series:
        """The forecasting model parameters"""
        return pd.Series([self._b0, self._alpha], index=["b0", "alpha"])

    @property
    def sigma2(self) -> float:
        """The estimated residual variance"""
        if self._sigma2 is None:
            # Not supplied at construction: estimate it once from an
            # ARIMA(0,1,1) with drift fitted to the model's data and cache.
            mod = SARIMAX(self.model._y, order=(0, 1, 1), trend="c")
            res = mod.fit(disp=False)
            self._sigma2 = np.asarray(res.params)[-1]
        assert self._sigma2 is not None
        return self._sigma2

    @property
    def model(self) -> "ThetaModel":
        """The model used to produce the results"""
        return self._model

    def forecast(self, steps: int = 1, theta: float = 2) -> pd.Series:
        r"""
        Forecast the model for a given theta

        Parameters
        ----------
        steps : int
            The number of steps ahead to compute the forecast components.
        theta : float
            The theta value to use when computing the weight to combine
            the trend and the SES forecasts.

        Returns
        -------
        Series
            A Series containing the forecasts

        Raises
        ------
        ValueError
            If steps is less than 1 or theta is less than 1.

        Notes
        -----
        The forecast is computed as

        .. math::

           \hat{X}_{T+h|T} = \frac{\theta-1}{\theta} b_0
                             \left[h - 1 + \frac{1}{\alpha}
                             - \frac{(1-\alpha)^T}{\alpha} \right]
                             + \tilde{X}_{T+h|T}

        where :math:`\tilde{X}_{T+h|T}` is the SES forecast of the endogenous
        variable using the parameter :math:`\alpha`. :math:`b_0` is the
        slope of a time trend line fitted to X using the terms 0, 1, ..., T-1.

        This expression follows from [1]_ and [2]_ when the combination
        weights are restricted to be (theta-1)/theta and 1/theta. This nests
        the original implementation when theta=2 and the two weights are both
        1/2.

        References
        ----------
        .. [1] Hyndman, R. J., & Billah, B. (2003). Unmasking the Theta method.
           International Journal of Forecasting, 19(2), 287-290.
        .. [2] Fioruci, J. A., Pellegrini, T. R., Louzada, F., & Petropoulos,
           F. (2015). The optimized theta method. arXiv preprint
           arXiv:1503.03529.
        """

        steps = int_like(steps, "steps")
        if steps < 1:
            raise ValueError("steps must be a positive integer")
        theta = float_like(theta, "theta")
        if theta < 1:
            raise ValueError("theta must be a float >= 1")
        # For very large (or infinite) theta the weight is 1 to working
        # precision; using the limit avoids inf / inf.
        thresh = 4.0 / np.finfo(np.double).eps
        trend_weight = (theta - 1) / theta if theta < thresh else 1.0
        comp = self.forecast_components(steps=steps)
        fcast = trend_weight * comp.trend + np.asarray(comp.ses)
        # Re-seasonalize only when seasonal terms were actually estimated.
        # When deseasonalizing was requested but no seasonality was removed
        # the seasonal column only holds placeholder values that must not
        # be applied to the forecast.
        if self.model.deseasonalize and self._seasonal.shape[0]:
            seasonal = np.asarray(comp.seasonal)
            if self.model.method.startswith("mul"):
                fcast *= seasonal
            else:
                fcast += seasonal
        fcast.name = "forecast"

        return fcast

    def forecast_components(self, steps: int = 1) -> pd.DataFrame:
        r"""
        Compute the three components of the Theta model forecast

        Parameters
        ----------
        steps : int
            The number of steps ahead to compute the forecast components.

        Returns
        -------
        DataFrame
            A DataFrame with three columns: trend, ses and seasonal containing
            the forecast values of each of the three components.

        Raises
        ------
        ValueError
            If steps is less than 1.

        Notes
        -----
        For a given value of :math:`\theta`, the deseasonalized forecast is
        `fcast = w * trend + ses` where
        :math:`w = \frac{\theta - 1}{\theta}`.
        The reseasonalized forecasts are then `seasonal * fcast` if the
        seasonality is multiplicative or `seasonal + fcast` if the seasonality
        is additive.

        When no seasonal terms were estimated, the seasonal column is filled
        with zeros if the model's method starts with "add" and with ones
        otherwise.
        """
        steps = int_like(steps, "steps")
        if steps < 1:
            raise ValueError("steps must be a positive integer")
        alpha = self._alpha
        b0 = self._b0
        nobs = self._nobs
        # h - 1 for h = 1, ..., steps
        h = np.arange(1, steps + 1, dtype=np.float64) - 1
        if alpha > 0:
            h += 1 / alpha - ((1 - alpha) ** nobs / alpha)
        trend = b0 * h
        ses = self._one_step * np.ones(steps)
        if self.model.method.startswith("add"):
            season = np.zeros(steps)
        else:
            season = np.ones(steps)
        # Re-seasonalize
        if self.model.deseasonalize:
            seasonal = self._seasonal
            period = self.model.period
            # Position within the seasonal cycle of each forecast period.
            oos_idx = nobs + np.arange(steps)
            seasonal_locs = oos_idx % period
            if seasonal.shape[0]:
                season[:] = seasonal[seasonal_locs]
        index = getattr(self.model.endog_orig, "index", None)
        # Only a pandas index is usable; e.g. a list exposes an unrelated
        # ``index`` method under the same name.
        if not isinstance(index, pd.Index):
            index = pd.RangeIndex(0, nobs)
        index = extend_index(steps, index)

        df = pd.DataFrame(
            {"trend": trend, "ses": ses, "seasonal": season}, index=index
        )
        return df

    def summary(self) -> "Summary":
        """
        Summarize the model

        Returns
        -------
        Summary
            This holds the summary table and text, which can be printed or
            converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary
        """
        model = self.model
        smry = Summary()

        model_name = type(model).__name__
        title = model_name + " Results"
        est_method = "MLE" if self._use_mle else "OLS/SES"

        is_series = isinstance(model.endog_orig, pd.Series)
        index = getattr(model.endog_orig, "index", None)
        if is_series and isinstance(index, (pd.DatetimeIndex, pd.PeriodIndex)):
            sample = [index[0].strftime("%m-%d-%Y")]
            sample += ["- " + index[-1].strftime("%m-%d-%Y")]
        else:
            sample = [str(0), str(self._nobs)]

        dep_name = getattr(model.endog_orig, "name", "endog") or "endog"
        top_left = [
            ("Dep. Variable:", [dep_name]),
            ("Method:", [est_method]),
            ("Date:", None),
            ("Time:", None),
            ("Sample:", [sample[0]]),
            ("", [sample[1]]),
        ]
        deseas_method = (
            "Multiplicative" if model.method.startswith("mul") else "Additive"
        )
        top_right = [
            ("No. Observations:", [str(self._nobs)]),
            ("Deseasonalized:", [str(model.deseasonalize)]),
        ]

        # Pad the right column to the same number of rows as the left.
        if model.deseasonalize:
            top_right.extend(
                [
                    ("Deseas. Method:", [deseas_method]),
                    ("Period:", [str(model.period)]),
                    ("", [""]),
                    ("", [""]),
                ]
            )
        else:
            top_right.extend([("", [""])] * 4)

        smry.add_table_2cols(
            self, gleft=top_left, gright=top_right, title=title
        )
        table_fmt = {"data_fmts": ["%s", "%#0.4g"], "data_aligns": "r"}

        params = self.params
        data = np.asarray(params)[:, None]
        st = SimpleTable(
            data,
            ["Parameters", "Estimate"],
            list(params.index),
            title="Parameter Estimates",
            txt_fmt=table_fmt,
        )
        smry.tables.append(st)

        return smry

    def _forecast_std(self, steps: int) -> np.ndarray:
        """Standard deviations used for the 1, ..., steps ahead intervals."""
        # NOTE(review): the usual h-step variance of an IMA(1,1) is
        # sigma2 * (1 + (h - 1) * alpha ** 2). The expression this class
        # has always used is kept unchanged here; confirm which is intended.
        horizon = np.arange(steps)
        variance = (1 + horizon * (1 + (self._alpha - 1) ** 2)) * self.sigma2
        return np.sqrt(variance)

    def prediction_intervals(
        self, steps: int = 1, theta: float = 2, alpha: float = 0.05
    ) -> pd.DataFrame:
        r"""
        Compute prediction intervals for the forecasts

        Parameters
        ----------
        steps : int, default 1
            The number of steps ahead to compute the forecast components.
        theta : float, default 2
            The theta value to use when computing the weight to combine
            the trend and the SES forecasts.
        alpha : float, default 0.05
            Significance level for the confidence intervals.

        Returns
        -------
        DataFrame
            DataFrame with columns lower and upper

        Notes
        -----
        The variance of the h-step forecast is assumed to follow from the
        integrated Moving Average structure of the Theta model, and so is
        :math:`\sigma^2(1 + (h-1)(1 + (\alpha-1)^2))`. The prediction interval
        assumes that innovations are normally distributed.
        """
        # Forecast first so that steps and theta are validated before any
        # other work is done.
        predictions = self.forecast(steps, theta)
        # The smoothing parameter is read from the stored estimate rather
        # than by integer position from the label-indexed params Series.
        sigma_h = self._forecast_std(predictions.shape[0])
        quantile = stats.norm.ppf(alpha / 2)
        return pd.DataFrame(
            {
                "lower": predictions + sigma_h * quantile,
                "upper": predictions + sigma_h * -quantile,
            }
        )

    def plot_predict(
        self,
        steps: int = 1,
        theta: float = 2,
        alpha: Optional[float] = 0.05,
        in_sample: bool = False,
        fig: Optional["matplotlib.figure.Figure"] = None,
        figsize: Optional[Tuple[float, float]] = None,
    ) -> "matplotlib.figure.Figure":
        r"""
        Plot forecasts, prediction intervals and in-sample values

        Parameters
        ----------
        steps : int, default 1
            The number of steps ahead to compute the forecast components.
        theta : float, default 2
            The theta value to use when computing the weight to combine
            the trend and the SES forecasts.
        alpha : {float, None}, default 0.05
            The tail probability not covered by the confidence interval. Must
            be in (0, 1). Confidence interval is constructed assuming normally
            distributed shocks. If None, figure will not show the confidence
            interval.
        in_sample : bool, default False
            Flag indicating whether to include the in-sample period in the
            plot.
        fig : Figure, default None
            An existing figure handle. If not provided, a new figure is
            created.
        figsize: tuple[float, float], default None
            Tuple containing the figure size.

        Returns
        -------
        Figure
            Figure handle containing the plot.

        Notes
        -----
        The variance of the h-step forecast is assumed to follow from the
        integrated Moving Average structure of the Theta model, and so is
        :math:`\sigma^2(1 + (h-1)(1 + (\alpha-1)^2))`. The prediction interval
        assumes that innovations are normally distributed.
        """
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig

        _import_mpl()
        fig = create_mpl_fig(fig, figsize)
        assert fig is not None
        predictions = self.forecast(steps, theta)
        pred_index = predictions.index

        ax = fig.add_subplot(111)
        if in_sample:
            endog = self.model.endog_orig
            if isinstance(endog, pd.Series):
                index = endog.index
            else:
                index = pd.Index(np.arange(self._nobs))
            ax.plot(index, endog)
        ax.plot(pred_index, predictions)
        if alpha is not None:
            pi = self.prediction_intervals(steps, theta, alpha)
            label = "{0:.0%} confidence interval".format(1 - alpha)
            ax.fill_between(
                pred_index,
                pi["lower"],
                pi["upper"],
                color="gray",
                alpha=0.5,
                label=label,
            )

        # Only the interval band carries a label; skip the legend when there
        # is nothing to show so that matplotlib does not warn.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(loc="best", frameon=False)
        fig.tight_layout(pad=1.0)

        return fig