Source code for statsmodels.stats.multivariate

# -*- coding: utf-8 -*-
"""
Created on Sun Nov  5 14:48:19 2017

Author: Josef Perktold
License: BSD-3
"""

import numpy as np
from scipy import stats

from statsmodels.stats.moment_helpers import cov2corr
from statsmodels.stats.base import HolderTuple
from statsmodels.tools.validation import array_like


# shortcut function
def _logdet(x):
    return np.linalg.slogdet(x)[1]


[docs]def test_mvmean(data, mean_null=0, return_results=True): """Hotellings test for multivariate mean in one sample Parameters ---------- data : array_like data with observations in rows and variables in columns mean_null : array_like mean of the multivariate data under the null hypothesis return_results : bool If true, then a results instance is returned. If False, then only the test statistic and pvalue are returned. Returns ------- results : instance of a results class with attributes statistic, pvalue, t2 and df (statistic, pvalue) : tuple If return_results is false, then only the test statistic and the pvalue are returned. """ x = np.asarray(data) nobs, k_vars = x.shape mean = x.mean(0) cov = np.cov(x, rowvar=False, ddof=1) diff = mean - mean_null t2 = nobs * diff.dot(np.linalg.solve(cov, diff)) factor = (nobs - 1) * k_vars / (nobs - k_vars) statistic = t2 / factor df = (k_vars, nobs - k_vars) pvalue = stats.f.sf(statistic, df[0], df[1]) if return_results: res = HolderTuple(statistic=statistic, pvalue=pvalue, df=df, t2=t2, distr="F") return res else: return statistic, pvalue
[docs]def test_mvmean_2indep(data1, data2): """Hotellings test for multivariate mean in two independent samples The null hypothesis is that both samples have the same mean. The alternative hypothesis is that means differ. Parameters ---------- data1 : array_like first sample data with observations in rows and variables in columns data2 : array_like second sample data with observations in rows and variables in columns Returns ------- results : instance of a results class with attributes statistic, pvalue, t2 and df """ x1 = array_like(data1, "x1", ndim=2) x2 = array_like(data2, "x2", ndim=2) nobs1, k_vars = x1.shape nobs2, k_vars2 = x2.shape if k_vars2 != k_vars: msg = "both samples need to have the same number of columns" raise ValueError(msg) mean1 = x1.mean(0) mean2 = x2.mean(0) cov1 = np.cov(x1, rowvar=False, ddof=1) cov2 = np.cov(x2, rowvar=False, ddof=1) nobs_t = nobs1 + nobs2 combined_cov = ((nobs1 - 1) * cov1 + (nobs2 - 1) * cov2) / (nobs_t - 2) diff = mean1 - mean2 t2 = (nobs1 * nobs2) / nobs_t * diff @ np.linalg.solve(combined_cov, diff) factor = ((nobs_t - 2) * k_vars) / (nobs_t - k_vars - 1) statistic = t2 / factor df = (k_vars, nobs_t - 1 - k_vars) pvalue = stats.f.sf(statistic, df[0], df[1]) return HolderTuple(statistic=statistic, pvalue=pvalue, df=df, t2=t2, distr="F")
[docs]def confint_mvmean(data, lin_transf=None, alpha=0.5, simult=False): """Confidence interval for linear transformation of a multivariate mean Either pointwise or simultaneous confidence intervals are returned. Parameters ---------- data : array_like data with observations in rows and variables in columns lin_transf : array_like or None The linear transformation or contrast matrix for transforming the vector of means. If this is None, then the identity matrix is used which specifies the means themselves. alpha : float in (0, 1) confidence level for the confidence interval, commonly used is alpha=0.05. simult : bool If ``simult`` is False (default), then the pointwise confidence interval is returned. Otherwise, a simultaneous confidence interval is returned. Warning: additional simultaneous confidence intervals might be added and the default for those might change. Returns ------- low : ndarray lower confidence bound on the linear transformed upp : ndarray upper confidence bound on the linear transformed values : ndarray mean or their linear transformation, center of the confidence region Notes ----- Pointwise confidence interval is based on Johnson and Wichern equation (5-21) page 224. Simultaneous confidence interval is based on Johnson and Wichern Result 5.3 page 225. This looks like Sheffe simultaneous confidence intervals. Bonferroni corrected simultaneous confidence interval might be added in future References ---------- Johnson, Richard A., and Dean W. Wichern. 2007. Applied Multivariate Statistical Analysis. 6th ed. Upper Saddle River, N.J: Pearson Prentice Hall. """ x = np.asarray(data) nobs, k_vars = x.shape if lin_transf is None: lin_transf = np.eye(k_vars) mean = x.mean(0) cov = np.cov(x, rowvar=False, ddof=0) ci = confint_mvmean_fromstats(mean, cov, nobs, lin_transf=lin_transf, alpha=alpha, simult=simult) return ci
[docs]def confint_mvmean_fromstats(mean, cov, nobs, lin_transf=None, alpha=0.05, simult=False): """Confidence interval for linear transformation of a multivariate mean Either pointwise or simultaneous confidence intervals are returned. Data is provided in the form of summary statistics, mean, cov, nobs. Parameters ---------- mean : ndarray cov : ndarray nobs : int lin_transf : array_like or None The linear transformation or contrast matrix for transforming the vector of means. If this is None, then the identity matrix is used which specifies the means themselves. alpha : float in (0, 1) confidence level for the confidence interval, commonly used is alpha=0.05. simult : bool If simult is False (default), then pointwise confidence interval is returned. Otherwise, a simultaneous confidence interval is returned. Warning: additional simultaneous confidence intervals might be added and the default for those might change. Notes ----- Pointwise confidence interval is based on Johnson and Wichern equation (5-21) page 224. Simultaneous confidence interval is based on Johnson and Wichern Result 5.3 page 225. This looks like Sheffe simultaneous confidence intervals. Bonferroni corrected simultaneous confidence interval might be added in future References ---------- Johnson, Richard A., and Dean W. Wichern. 2007. Applied Multivariate Statistical Analysis. 6th ed. Upper Saddle River, N.J: Pearson Prentice Hall. """ mean = np.asarray(mean) cov = np.asarray(cov) c = np.atleast_2d(lin_transf) k_vars = len(mean) if simult is False: values = c.dot(mean) quad_form = (c * cov.dot(c.T).T).sum(1) df = nobs - 1 t_critval = stats.t.isf(alpha / 2, df) ci_diff = np.sqrt(quad_form / df) * t_critval low = values - ci_diff upp = values + ci_diff else: values = c.dot(mean) quad_form = (c * cov.dot(c.T).T).sum(1) factor = (nobs - 1) * k_vars / (nobs - k_vars) / nobs df = (k_vars, nobs - k_vars) f_critval = stats.f.isf(alpha, df[0], df[1]) ci_diff = np.sqrt(factor * quad_form * f_critval) low = values - ci_diff upp = values + ci_diff return low, upp, values # , (f_critval, factor, quad_form, df)
""" Created on Tue Nov 7 13:22:44 2017 Author: Josef Perktold References ---------- Stata manual for mvtest covariances Rencher and Christensen 2012 Bartlett 1954 Stata refers to Rencher and Christensen for the formulas. Those correspond to the formula collection in Bartlett 1954 for several of them. """ # pylint: disable=W0105
[docs]def test_cov(cov, nobs, cov_null): """One sample hypothesis test for covariance equal to null covariance The Null hypothesis is that cov = cov_null, against the alternative that it is not equal to cov_null Parameters ---------- cov : array_like Covariance matrix of the data, estimated with denominator ``(N - 1)``, i.e. `ddof=1`. nobs : int number of observations used in the estimation of the covariance cov_null : nd_array covariance under the null hypothesis Returns ------- res : instance of HolderTuple results with ``statistic, pvalue`` and other attributes like ``df`` References ---------- Bartlett, M. S. 1954. “A Note on the Multiplying Factors for Various Χ2 Approximations.” Journal of the Royal Statistical Society. Series B (Methodological) 16 (2): 296–98. Rencher, Alvin C., and William F. Christensen. 2012. Methods of Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc. https://doi.org/10.1002/9781118391686. StataCorp, L. P. Stata Multivariate Statistics: Reference Manual. Stata Press Publication. """ # using Stata formulas where cov_sample use nobs in denominator # Bartlett 1954 has fewer terms S = np.asarray(cov) * (nobs - 1) / nobs S0 = np.asarray(cov_null) k = cov.shape[0] n = nobs fact = nobs - 1. fact *= 1 - (2 * k + 1 - 2 / (k + 1)) / (6 * (n - 1) - 1) fact2 = _logdet(S0) - _logdet(n / (n - 1) * S) fact2 += np.trace(n / (n - 1) * np.linalg.solve(S0, S)) - k statistic = fact * fact2 df = k * (k + 1) / 2 pvalue = stats.chi2.sf(statistic, df) return HolderTuple(statistic=statistic, pvalue=pvalue, df=df, distr="chi2", null="equal value", cov_null=cov_null )
[docs]def test_cov_spherical(cov, nobs): r"""One sample hypothesis test that covariance matrix is spherical The Null and alternative hypotheses are .. math:: H0 &: \Sigma = \sigma I \\ H1 &: \Sigma \neq \sigma I where :math:`\sigma_i` is the common variance with unspecified value. Parameters ---------- cov : array_like Covariance matrix of the data, estimated with denominator ``(N - 1)``, i.e. `ddof=1`. nobs : int number of observations used in the estimation of the covariance Returns ------- res : instance of HolderTuple results with ``statistic, pvalue`` and other attributes like ``df`` References ---------- Bartlett, M. S. 1954. “A Note on the Multiplying Factors for Various Χ2 Approximations.” Journal of the Royal Statistical Society. Series B (Methodological) 16 (2): 296–98. Rencher, Alvin C., and William F. Christensen. 2012. Methods of Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc. https://doi.org/10.1002/9781118391686. StataCorp, L. P. Stata Multivariate Statistics: Reference Manual. Stata Press Publication. """ # unchanged Stata formula, but denom is cov cancels, AFAICS # Bartlett 1954 correction factor in IIIc cov = np.asarray(cov) k = cov.shape[0] statistic = nobs - 1 - (2 * k**2 + k + 2) / (6 * k) statistic *= k * np.log(np.trace(cov)) - _logdet(cov) - k * np.log(k) df = k * (k + 1) / 2 - 1 pvalue = stats.chi2.sf(statistic, df) return HolderTuple(statistic=statistic, pvalue=pvalue, df=df, distr="chi2", null="spherical" )
[docs]def test_cov_diagonal(cov, nobs): r"""One sample hypothesis test that covariance matrix is diagonal matrix. The Null and alternative hypotheses are .. math:: H0 &: \Sigma = diag(\sigma_i) \\ H1 &: \Sigma \neq diag(\sigma_i) where :math:`\sigma_i` are the variances with unspecified values. Parameters ---------- cov : array_like Covariance matrix of the data, estimated with denominator ``(N - 1)``, i.e. `ddof=1`. nobs : int number of observations used in the estimation of the covariance Returns ------- res : instance of HolderTuple results with ``statistic, pvalue`` and other attributes like ``df`` References ---------- Rencher, Alvin C., and William F. Christensen. 2012. Methods of Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc. https://doi.org/10.1002/9781118391686. StataCorp, L. P. Stata Multivariate Statistics: Reference Manual. Stata Press Publication. """ cov = np.asarray(cov) k = cov.shape[0] R = cov2corr(cov) statistic = -(nobs - 1 - (2 * k + 5) / 6) * _logdet(R) df = k * (k - 1) / 2 pvalue = stats.chi2.sf(statistic, df) return HolderTuple(statistic=statistic, pvalue=pvalue, df=df, distr="chi2", null="diagonal" )
def _get_blocks(mat, block_len): """get diagonal blocks from matrix """ k = len(mat) idx = np.cumsum(block_len) if idx[-1] == k: idx = idx[:-1] elif idx[-1] > k: raise ValueError("sum of block_len larger than shape of mat") else: # allow one missing block that is the remainder pass idx_blocks = np.split(np.arange(k), idx) blocks = [] for ii in idx_blocks: blocks.append(mat[ii[:, None], ii]) return blocks, idx_blocks
[docs]def test_cov_blockdiagonal(cov, nobs, block_len): r"""One sample hypothesis test that covariance is block diagonal. The Null and alternative hypotheses are .. math:: H0 &: \Sigma = diag(\Sigma_i) \\ H1 &: \Sigma \neq diag(\Sigma_i) where :math:`\Sigma_i` are covariance blocks with unspecified values. Parameters ---------- cov : array_like Covariance matrix of the data, estimated with denominator ``(N - 1)``, i.e. `ddof=1`. nobs : int number of observations used in the estimation of the covariance block_len : list list of length of each square block Returns ------- res : instance of HolderTuple results with ``statistic, pvalue`` and other attributes like ``df`` References ---------- Rencher, Alvin C., and William F. Christensen. 2012. Methods of Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc. https://doi.org/10.1002/9781118391686. StataCorp, L. P. Stata Multivariate Statistics: Reference Manual. Stata Press Publication. """ cov = np.asarray(cov) cov_blocks = _get_blocks(cov, block_len)[0] k = cov.shape[0] k_blocks = [c.shape[0] for c in cov_blocks] if k != sum(k_blocks): msg = "sample covariances and blocks do not have matching shape" raise ValueError(msg) logdet_blocks = sum(_logdet(c) for c in cov_blocks) a2 = k**2 - sum(ki**2 for ki in k_blocks) a3 = k**3 - sum(ki**3 for ki in k_blocks) statistic = (nobs - 1 - (2 * a3 + 3 * a2) / (6. * a2)) statistic *= logdet_blocks - _logdet(cov) df = a2 / 2 pvalue = stats.chi2.sf(statistic, df) return HolderTuple(statistic=statistic, pvalue=pvalue, df=df, distr="chi2", null="block-diagonal" )
[docs]def test_cov_oneway(cov_list, nobs_list): r"""Multiple sample hypothesis test that covariance matrices are equal. This is commonly known as Box-M test. The Null and alternative hypotheses are .. math:: H0 &: \Sigma_i = \Sigma_j \text{ for all i and j} \\ H1 &: \Sigma_i \neq \Sigma_j \text{ for at least one i and j} where :math:`\Sigma_i` is the covariance of sample `i`. Parameters ---------- cov_list : list of array_like Covariance matrices of the sample, estimated with denominator ``(N - 1)``, i.e. `ddof=1`. nobs_list : list List of the number of observations used in the estimation of the covariance for each sample. Returns ------- res : instance of HolderTuple Results contains test statistic and pvalues for both chisquare and F distribution based tests, identified by the name ending "_chi2" and "_f". Attributes ``statistic, pvalue`` refer to the F-test version. Notes ----- approximations to distribution of test statistic is by Box References ---------- Rencher, Alvin C., and William F. Christensen. 2012. Methods of Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc. https://doi.org/10.1002/9781118391686. StataCorp, L. P. Stata Multivariate Statistics: Reference Manual. Stata Press Publication. """ # Note stata uses nobs in cov, this uses nobs - 1 cov_list = list(map(np.asarray, cov_list)) m = len(cov_list) nobs = sum(nobs_list) # total number of observations k = cov_list[0].shape[0] cov_pooled = sum((n - 1) * c for (n, c) in zip(nobs_list, cov_list)) cov_pooled /= (nobs - m) stat0 = (nobs - m) * _logdet(cov_pooled) stat0 -= sum((n - 1) * _logdet(c) for (n, c) in zip(nobs_list, cov_list)) # Box's chi2 c1 = sum(1 / (n - 1) for n in nobs_list) - 1 / (nobs - m) c1 *= (2 * k*k + 3 * k - 1) / (6 * (k + 1) * (m - 1)) df_chi2 = (m - 1) * k * (k + 1) / 2 statistic_chi2 = (1 - c1) * stat0 pvalue_chi2 = stats.chi2.sf(statistic_chi2, df_chi2) c2 = sum(1 / (n - 1)**2 for n in nobs_list) - 1 / (nobs - m)**2 c2 *= (k - 1) * (k + 2) / (6 * (m - 1)) a1 = df_chi2 a2 = (a1 + 2) / abs(c2 - c1**2) b1 = (1 - c1 - a1 / a2) / a1 b2 = (1 - c1 + 2 / a2) / a2 if c2 > c1**2: statistic_f = b1 * stat0 else: tmp = b2 * stat0 statistic_f = a2 / a1 * tmp / (1 + tmp) df_f = (a1, a2) pvalue_f = stats.f.sf(statistic_f, *df_f) return HolderTuple(statistic=statistic_f, # name convention, using F here pvalue=pvalue_f, # name convention, using F here statistic_base=stat0, statistic_chi2=statistic_chi2, pvalue_chi2=pvalue_chi2, df_chi2=df_chi2, distr_chi2='chi2', statistic_f=statistic_f, pvalue_f=pvalue_f, df_f=df_f, distr_f='F')