Source code for statsmodels.stats.stattools
"""
Statistical tests to be used in conjunction with the models
Notes
-----
These functions have not been formally tested.
"""
from scipy import stats
import numpy as np
from statsmodels.tools.sm_exceptions import ValueWarning
[docs]
def durbin_watson(resids, axis=0):
r"""
Calculates the Durbin-Watson statistic.
Parameters
----------
resids : array_like
Data for which to compute the Durbin-Watson statistic. Usually
regression model residuals.
axis : int, optional
Axis to use if data has more than 1 dimension. Default is 0.
Returns
-------
dw : float, array_like
The Durbin-Watson statistic.
Notes
-----
The null hypothesis of the test is that there is no serial correlation
in the residuals.
The Durbin-Watson test statistic is defined as:
.. math::
\sum_{t=2}^T((e_t - e_{t-1})^2)/\sum_{t=1}^Te_t^2
The test statistic is approximately equal to 2*(1-r) where ``r`` is the
sample autocorrelation of the residuals. Thus, for r == 0, indicating no
serial correlation, the test statistic equals 2. This statistic will
always be between 0 and 4. The closer to 0 the statistic, the more
evidence for positive serial correlation. The closer to 4, the more
evidence for negative serial correlation.
"""
resids = np.asarray(resids)
diff_resids = np.diff(resids, 1, axis=axis)
dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
return dw
[docs]
def omni_normtest(resids, axis=0):
"""
Omnibus test for normality
Parameters
----------
resid : array_like
axis : int, optional
Default is 0
Returns
-------
Chi^2 score, two-tail probability
"""
# TODO: change to exception in summary branch and catch in summary()
# behavior changed between scipy 0.9 and 0.10
resids = np.asarray(resids)
n = resids.shape[axis]
if n < 8:
from warnings import warn
warn("omni_normtest is not valid with less than 8 observations; %i "
"samples were given." % int(n), ValueWarning)
return np.nan, np.nan
return stats.normaltest(resids, axis=axis)
[docs]
def jarque_bera(resids, axis=0):
r"""
The Jarque-Bera test of normality.
Parameters
----------
resids : array_like
Data to test for normality. Usually regression model residuals that
are mean 0.
axis : int, optional
Axis to use if data has more than 1 dimension. Default is 0.
Returns
-------
JB : {float, ndarray}
The Jarque-Bera test statistic.
JBpv : {float, ndarray}
The pvalue of the test statistic.
skew : {float, ndarray}
Estimated skewness of the data.
kurtosis : {float, ndarray}
Estimated kurtosis of the data.
Notes
-----
Each output returned has 1 dimension fewer than data
The Jarque-Bera test statistic tests the null that the data is normally
distributed against an alternative that the data follow some other
distribution. The test statistic is based on two moments of the data,
the skewness, and the kurtosis, and has an asymptotic :math:`\chi^2_2`
distribution.
The test statistic is defined
.. math:: JB = n(S^2/6+(K-3)^2/24)
where n is the number of data points, S is the sample skewness, and K is
the sample kurtosis of the data.
"""
resids = np.atleast_1d(np.asarray(resids, dtype=float))
if resids.size < 2:
raise ValueError("resids must contain at least 2 elements")
# Calculate residual skewness and kurtosis
skew = stats.skew(resids, axis=axis)
kurtosis = 3 + stats.kurtosis(resids, axis=axis)
# Calculate the Jarque-Bera test for normality
n = resids.shape[axis]
jb = (n / 6.) * (skew ** 2 + (1 / 4.) * (kurtosis - 3) ** 2)
jb_pv = stats.chi2.sf(jb, 2)
return jb, jb_pv, skew, kurtosis
[docs]
def robust_skewness(y, axis=0):
"""
Calculates the four skewness measures in Kim & White
Parameters
----------
y : array_like
Data to compute use in the estimator.
axis : int or None, optional
Axis along which the skewness measures are computed. If `None`, the
entire array is used.
Returns
-------
sk1 : ndarray
The standard skewness estimator.
sk2 : ndarray
Skewness estimator based on quartiles.
sk3 : ndarray
Skewness estimator based on mean-median difference, standardized by
absolute deviation.
sk4 : ndarray
Skewness estimator based on mean-median difference, standardized by
standard deviation.
Notes
-----
The robust skewness measures are defined
.. math::
SK_{2}=\\frac{\\left(q_{.75}-q_{.5}\\right)
-\\left(q_{.5}-q_{.25}\\right)}{q_{.75}-q_{.25}}
.. math::
SK_{3}=\\frac{\\mu-\\hat{q}_{0.5}}
{\\hat{E}\\left[\\left|y-\\hat{\\mu}\\right|\\right]}
.. math::
SK_{4}=\\frac{\\mu-\\hat{q}_{0.5}}{\\hat{\\sigma}}
.. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of
skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
March 2004.
"""
if axis is None:
y = y.ravel()
axis = 0
y = np.sort(y, axis)
q1, q2, q3 = np.percentile(y, [25.0, 50.0, 75.0], axis=axis)
mu = y.mean(axis)
shape = (y.size,)
if axis is not None:
shape = list(mu.shape)
shape.insert(axis, 1)
shape = tuple(shape)
mu_b = np.reshape(mu, shape)
q2_b = np.reshape(q2, shape)
sigma = np.sqrt(np.mean(((y - mu_b)**2), axis))
sk1 = stats.skew(y, axis=axis)
sk2 = (q1 + q3 - 2.0 * q2) / (q3 - q1)
sk3 = (mu - q2) / np.mean(abs(y - q2_b), axis=axis)
sk4 = (mu - q2) / sigma
return sk1, sk2, sk3, sk4
def _kr3(y, alpha=5.0, beta=50.0):
"""
KR3 estimator from Kim & White
Parameters
----------
y : array_like, 1-d
Data to compute use in the estimator.
alpha : float, optional
Lower cut-off for measuring expectation in tail.
beta : float, optional
Lower cut-off for measuring expectation in center.
Returns
-------
kr3 : float
Robust kurtosis estimator based on standardized lower- and upper-tail
expected values
Notes
-----
.. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of
skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
March 2004.
"""
perc = (alpha, 100.0 - alpha, beta, 100.0 - beta)
lower_alpha, upper_alpha, lower_beta, upper_beta = np.percentile(y, perc)
l_alpha = np.mean(y[y < lower_alpha])
u_alpha = np.mean(y[y > upper_alpha])
l_beta = np.mean(y[y < lower_beta])
u_beta = np.mean(y[y > upper_beta])
return (u_alpha - l_alpha) / (u_beta - l_beta)
[docs]
def expected_robust_kurtosis(ab=(5.0, 50.0), dg=(2.5, 25.0)):
"""
Calculates the expected value of the robust kurtosis measures in Kim and
White assuming the data are normally distributed.
Parameters
----------
ab : iterable, optional
Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
quantile cut-off for measuring the extreme tail and beta is the central
quantile cutoff for the standardization of the measure
db : iterable, optional
Contains 100*(delta, gamma) in the kr4 measure where delta is the tail
quantile for measuring extreme values and gamma is the central quantile
used in the the standardization of the measure
Returns
-------
ekr : ndarray, 4-element
Contains the expected values of the 4 robust kurtosis measures
Notes
-----
See `robust_kurtosis` for definitions of the robust kurtosis measures
"""
alpha, beta = ab
delta, gamma = dg
expected_value = np.zeros(4)
ppf = stats.norm.ppf
pdf = stats.norm.pdf
q1, q2, q3, q5, q6, q7 = ppf(np.array((1.0, 2.0, 3.0, 5.0, 6.0, 7.0)) / 8)
expected_value[0] = 3
expected_value[1] = ((q7 - q5) + (q3 - q1)) / (q6 - q2)
q_alpha, q_beta = ppf(np.array((alpha / 100.0, beta / 100.0)))
expected_value[2] = (2 * pdf(q_alpha) / alpha) / (2 * pdf(q_beta) / beta)
q_delta, q_gamma = ppf(np.array((delta / 100.0, gamma / 100.0)))
expected_value[3] = (-2.0 * q_delta) / (-2.0 * q_gamma)
return expected_value
[docs]
def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True):
"""
Calculates the four kurtosis measures in Kim & White
Parameters
----------
y : array_like
Data to compute use in the estimator.
axis : int or None, optional
Axis along which the kurtosis are computed. If `None`, the
entire array is used.
a iterable, optional
Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
quantile cut-off for measuring the extreme tail and beta is the central
quantile cutoff for the standardization of the measure
db : iterable, optional
Contains 100*(delta, gamma) in the kr4 measure where delta is the tail
quantile for measuring extreme values and gamma is the central quantile
used in the the standardization of the measure
excess : bool, optional
If true (default), computed values are excess of those for a standard
normal distribution.
Returns
-------
kr1 : ndarray
The standard kurtosis estimator.
kr2 : ndarray
Kurtosis estimator based on octiles.
kr3 : ndarray
Kurtosis estimators based on exceedance expectations.
kr4 : ndarray
Kurtosis measure based on the spread between high and low quantiles.
Notes
-----
The robust kurtosis measures are defined
.. math::
KR_{2}=\\frac{\\left(\\hat{q}_{.875}-\\hat{q}_{.625}\\right)
+\\left(\\hat{q}_{.375}-\\hat{q}_{.125}\\right)}
{\\hat{q}_{.75}-\\hat{q}_{.25}}
.. math::
KR_{3}=\\frac{\\hat{E}\\left(y|y>\\hat{q}_{1-\\alpha}\\right)
-\\hat{E}\\left(y|y<\\hat{q}_{\\alpha}\\right)}
{\\hat{E}\\left(y|y>\\hat{q}_{1-\\beta}\\right)
-\\hat{E}\\left(y|y<\\hat{q}_{\\beta}\\right)}
.. math::
KR_{4}=\\frac{\\hat{q}_{1-\\delta}-\\hat{q}_{\\delta}}
{\\hat{q}_{1-\\gamma}-\\hat{q}_{\\gamma}}
where :math:`\\hat{q}_{p}` is the estimated quantile at :math:`p`.
.. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of
skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
March 2004.
"""
if (axis is None or
(y.squeeze().ndim == 1 and y.ndim != 1)):
y = y.ravel()
axis = 0
alpha, beta = ab
delta, gamma = dg
perc = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5,
delta, 100.0 - delta, gamma, 100.0 - gamma)
e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y, perc,
axis=axis)
expected_value = (expected_robust_kurtosis(ab, dg)
if excess else np.zeros(4))
kr1 = stats.kurtosis(y, axis, False) - expected_value[0]
kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value[1]
if y.ndim == 1:
kr3 = _kr3(y, alpha, beta)
else:
kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta)
kr3 -= expected_value[2]
kr4 = (f1md - fd) / (f1mg - fg) - expected_value[3]
return kr1, kr2, kr3, kr4
def _medcouple_1d(y):
"""
Calculates the medcouple robust measure of skew.
Parameters
----------
y : array_like, 1-d
Data to compute use in the estimator.
Returns
-------
mc : float
The medcouple statistic
Notes
-----
The current algorithm requires a O(N**2) memory allocations, and so may
not work for very large arrays (N>10000).
.. [*] M. Hubert and E. Vandervieren, "An adjusted boxplot for skewed
distributions" Computational Statistics & Data Analysis, vol. 52, pp.
5186-5201, August 2008.
"""
# Parameter changes the algorithm to the slower for large n
y = np.squeeze(np.asarray(y))
if y.ndim != 1:
raise ValueError("y must be squeezable to a 1-d array")
y = np.sort(y)
n = y.shape[0]
if n % 2 == 0:
mf = (y[n // 2 - 1] + y[n // 2]) / 2
else:
mf = y[(n - 1) // 2]
z = y - mf
lower = z[z <= 0.0]
upper = z[z >= 0.0]
upper = upper[:, None]
standardization = upper - lower
is_zero = np.logical_and(lower == 0.0, upper == 0.0)
standardization[is_zero] = np.inf
spread = upper + lower
h = spread / standardization
# GH5395
num_ties = np.sum(lower == 0.0)
if num_ties:
# Replacements has -1 above the anti-diagonal, 0 on the anti-diagonal,
# and 1 below the anti-diagonal
replacements = np.ones((num_ties, num_ties)) - np.eye(num_ties)
replacements -= 2 * np.triu(replacements)
# Convert diagonal to anti-diagonal
replacements = np.fliplr(replacements)
# Always replace upper right block
h[:num_ties, -num_ties:] = replacements
return np.median(h)
[docs]
def medcouple(y, axis=0):
"""
Calculate the medcouple robust measure of skew.
Parameters
----------
y : array_like
Data to compute use in the estimator.
axis : {int, None}
Axis along which the medcouple statistic is computed. If `None`, the
entire array is used.
Returns
-------
mc : ndarray
The medcouple statistic with the same shape as `y`, with the specified
axis removed.
Notes
-----
The current algorithm requires a O(N**2) memory allocations, and so may
not work for very large arrays (N>10000).
.. [*] M. Hubert and E. Vandervieren, "An adjusted boxplot for skewed
distributions" Computational Statistics & Data Analysis, vol. 52, pp.
5186-5201, August 2008.
"""
y = np.asarray(y, dtype=np.double) # GH 4243
if axis is None:
return _medcouple_1d(y.ravel())
return np.apply_along_axis(_medcouple_1d, axis, y)
Last update:
Dec 16, 2024