Source code for statsmodels.stats._lilliefors

"""
Implements Lilliefors corrected Kolmogorov-Smirnov tests for normal and
exponential distributions.

`kstest_fit` is provided as a top-level function to access both tests.
`kstest_normal` and `kstest_exponential` are provided as convenience functions
with the appropriate test as the default.
`lilliefors` is provided as an alias for `kstest_fit`.

Created on Sat Oct 01 13:16:49 2011

Author: Josef Perktold
License: BSD-3

pvalues for Lilliefors test are based on formula and table in

An Analytic Approximation to the Distribution of Lilliefors's Test Statistic
for Normality
Author(s): Gerard E. Dallal and Leland WilkinsonSource: The American
Statistician, Vol. 40, No. 4 (Nov., 1986), pp. 294-296
Published by: American Statistical Association
Stable URL: http://www.jstor.org/stable/2684607 .

On the Kolmogorov-Smirnov Test for Normality with Mean and Variance Unknown
Hubert W. Lilliefors
Journal of the American Statistical Association, Vol. 62, No. 318.
(Jun., 1967), pp. 399-402.

---

Updated 2017-07-23
Jacob C. Kimmel

Ref:
Lilliefors, H.W.
On the Kolmogorov-Smirnov test for the exponential distribution with mean
unknown. Journal of the American Statistical Association, Vol 64, No. 325.
(1969), pp. 387–389.
"""
from functools import partial

import numpy as np
from scipy import stats

from statsmodels.tools.validation import string_like
from ._lilliefors_critical_values import (critical_values,
                                          asymp_critical_values,
                                          PERCENTILES)
from .tabledist import TableDist


def _make_asymptotic_function(params):
    """
    Generates an asymptotic distribution callable from a param matrix

    Polynomial is a[0] * x**(-1/2) + a[1] * x**(-1) + a[2] * x**(-3/2)

    Parameters
    ----------
    params : ndarray
        Array with shape (nalpha, 3) where nalpha is the number of
        significance levels
    """

    def f(n):
        poly = np.array([1, np.log(n), np.log(n) ** 2])
        return np.exp(poly.dot(params.T))

    return f


def ksstat(x, cdf, alternative='two_sided', args=()):
    """
    Calculate statistic for the Kolmogorov-Smirnov test for goodness of fit

    This calculates the test statistic for a test of the distribution G(x) of
    an observed variable against a given distribution F(x). Under the null
    hypothesis the two distributions are identical, G(x)=F(x). The
    alternative hypothesis can be either 'two_sided' (default), 'less'
    or 'greater'. The KS test is only valid for continuous distributions.

    Parameters
    ----------
    x : array_like, 1d
        array of observations
    cdf : str or callable
        string: name of a distribution in scipy.stats
        callable: function to evaluate cdf
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)
    args : tuple, sequence
        distribution parameters for call to cdf


    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-

    See Also
    --------
    scipy.stats.kstest

    Notes
    -----

    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less"
    or "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    In contrast to scipy.stats.kstest, this function only calculates the
    statistic which can be used either as distance measure or to implement
    case specific p-values.
    """
    nobs = float(len(x))

    if isinstance(cdf, str):
        cdf = getattr(stats.distributions, cdf).cdf
    elif hasattr(cdf, 'cdf'):
        cdf = getattr(cdf, 'cdf')

    x = np.sort(x)
    cdfvals = cdf(x, *args)

    d_plus = (np.arange(1.0, nobs + 1) / nobs - cdfvals).max()
    d_min = (cdfvals - np.arange(0.0, nobs) / nobs).max()
    if alternative == 'greater':
        return d_plus
    elif alternative == 'less':
        return d_min

    return np.max([d_plus, d_min])


def get_lilliefors_table(dist='norm'):
    """
    Generates tables for significance levels of Lilliefors test statistics

    Tables for available normal and exponential distribution testing,
    as specified in Lilliefors references above

    Parameters
    ----------
    dist : str
        distribution being tested in set {'norm', 'exp'}.

    Returns
    -------
    lf : TableDist object.
        table of critical values
    """
    # function just to keep things together
    # for this test alpha is sf probability, i.e. right tail probability

    alpha = 1 - np.array(PERCENTILES) / 100.0
    alpha = alpha[::-1]
    dist = 'normal' if dist == 'norm' else dist
    if dist not in critical_values:
        raise ValueError("Invalid dist parameter. Must be 'norm' or 'exp'")
    cv_data = critical_values[dist]
    acv_data = asymp_critical_values[dist]

    size = np.array(sorted(cv_data), dtype=float)
    crit_lf = np.array([cv_data[key] for key in sorted(cv_data)])
    crit_lf = crit_lf[:, ::-1]

    asym_params = np.array([acv_data[key] for key in sorted(acv_data)])
    asymp_fn = _make_asymptotic_function(asym_params[::-1])

    lf = TableDist(alpha, size, crit_lf, asymptotic=asymp_fn)
    return lf


lilliefors_table_norm = get_lilliefors_table(dist='norm')
lilliefors_table_expon = get_lilliefors_table(dist='exp')


def pval_lf(d_max, n):
    """
    Approximate pvalues for Lilliefors test

    This is only valid for pvalues smaller than 0.1 which is not checked in
    this function.

    Parameters
    ----------
    d_max : array_like
        two-sided Kolmogorov-Smirnov test statistic
    n : int or float
        sample size

    Returns
    -------
    p-value : float or ndarray
        pvalue according to approximation formula of Dallal and Wilkinson.

    Notes
    -----
    This is mainly a helper function where the calling code should dispatch
    on bound violations. Therefore it does not check whether the pvalue is in
    the valid range.

    Precision for the pvalues is around 2 to 3 decimals. This approximation is
    also used by other statistical packages (e.g. R:fBasics) but might not be
    the most precise available.

    References
    ----------
    DallalWilkinson1986
    """
    # todo: check boundaries, valid range for n and Dmax
    if n > 100:
        d_max *= (n / 100.) ** 0.49
        n = 100
    pval = np.exp(-7.01256 * d_max ** 2 * (n + 2.78019)
                  + 2.99587 * d_max * np.sqrt(n + 2.78019) - 0.122119
                  + 0.974598 / np.sqrt(n) + 1.67997 / n)
    return pval



[docs]
def kstest_fit(x, dist='norm', pvalmethod="table"):
    """
    Test assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated parameters.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic. In
        general, 'table' is preferred and makes use of a very large simulation.
        'approx' is only valid for normality. if `dist = 'exp'` `table` is
        always used. 'approx' uses the approximation formula of Dalal and
        Wilkinson, valid for pvalues < 0.1. If the pvalue is larger than 0.1,
        then the result of `table` is returned.

    Returns
    -------
    ksstat : float
        Kolmogorov-Smirnov test statistic with estimated mean and variance.
    pvalue : float
        If the pvalue is lower than some threshold, e.g. 0.05, then we can
        reject the Null hypothesis that the sample comes from a normal
        distribution.

    Notes
    -----
    'table' uses an improved table based on 10,000,000 simulations. The
    critical values are approximated using
    log(cv_alpha) = b_alpha + c[0] log(n) + c[1] log(n)**2
    where cv_alpha is the critical value for a test with size alpha,
    b_alpha is an alpha-specific intercept term and c[1] and c[2] are
    coefficients that are shared all alphas.
    Values in the table are linearly interpolated. Values outside the
    range are be returned as bounds, 0.990 for large and 0.001 for small
    pvalues.

    For implementation details, see  lilliefors_critical_value_simulation.py in
    the test directory.
    """
    pvalmethod = string_like(pvalmethod,
                             "pvalmethod",
                             options=("approx", "table"))
    x = np.asarray(x)
    if x.ndim == 2 and x.shape[1] == 1:
        x = x[:, 0]
    elif x.ndim != 1:
        raise ValueError("Invalid parameter `x`: must be a one-dimensional"
                         " array-like or a single-column DataFrame")

    nobs = len(x)

    if dist == 'norm':
        z = (x - x.mean()) / x.std(ddof=1)
        test_d = stats.norm.cdf
        lilliefors_table = lilliefors_table_norm
    elif dist == 'exp':
        z = x / x.mean()
        test_d = stats.expon.cdf
        lilliefors_table = lilliefors_table_expon
        pvalmethod = 'table'
    else:
        raise ValueError("Invalid dist parameter, must be 'norm' or 'exp'")

    min_nobs = 4 if dist == 'norm' else 3
    if nobs < min_nobs:
        raise ValueError('Test for distribution {} requires at least {} '
                         'observations'.format(dist, min_nobs))

    d_ks = ksstat(z, test_d, alternative='two_sided')

    if pvalmethod == 'approx':
        pval = pval_lf(d_ks, nobs)
        # check pval is in desired range
        if pval > 0.1:
            pval = lilliefors_table.prob(d_ks, nobs)
    else:  # pvalmethod == 'table'
        pval = lilliefors_table.prob(d_ks, nobs)

    return d_ks, pval



lilliefors = kstest_fit
kstest_normal = kstest_fit
kstest_exponential = partial(kstest_fit, dist='exp')