Source code for statsmodels.multivariate.cancorr

# -*- coding: utf-8 -*-

"""Canonical correlation analysis

author: Yichuan Liu
"""
from __future__ import division

import numpy as np
from numpy.linalg import svd
import scipy
import pandas as pd

from statsmodels.base.model import Model
from statsmodels.iolib import summary2
from .multivariate_ols import multivariate_stats


[docs]class CanCorr(Model):
    """
    Canonical correlation analysis using singluar value decomposition

    For matrices exog=x and endog=y, find projections x_cancoef and y_cancoef
    such that:
        
        x1 = x * x_cancoef, x1' * x1 is identity matrix
        y1 = y * y_cancoef, y1' * y1 is identity matrix
    
    and the correlation between x1 and y1 is maximized.

    Attributes
    -----------
    endog : array
        See Parameters.
    exog : array
        See Parameters.
    cancorr : array
        The canonical correlation values
    y_cancoeff: array
        The canonical coeefficients for endog
    x_cancoeff: array
        The canonical coefficients for exog

    References
    ----------
    .. [*] http://numerical.recipes/whp/notes/CanonCorrBySVD.pdf
    .. [*] http://www.csun.edu/~ata20315/psy524/docs/Psy524%20Lecture%208%20CC.pdf
    .. [*] http://www.mathematica-journal.com/2014/06/canonical-correlation-analysis/
    """
    def __init__(self, endog, exog, tolerance=1e-8, missing='none', hasconst=None, **kwargs):
        super(CanCorr, self).__init__(endog, exog, missing=missing,
                                      hasconst=hasconst, **kwargs)
        self._fit(tolerance)

    def _fit(self, tolerance=1e-8):
        """Fit the model

        A ValueError is raised if there are singular values smaller than the
        tolerance. The treatment of singular arrays might change in future.

        Parameters
        ----------
        tolerance : float
            eigenvalue tolerance, values smaller than which is considered 0
        """
        nobs, k_yvar = self.endog.shape
        nobs, k_xvar = self.exog.shape
        k = np.min([k_yvar, k_xvar])

        x = np.array(self.exog)
        x = x - x.mean(0)
        y = np.array(self.endog)
        y = y - y.mean(0)

        ux, sx, vx = svd(x, 0)
        # vx_ds = vx.T divided by sx
        vx_ds = vx.T
        mask = sx > tolerance
        if mask.sum() < len(mask):
            raise ValueError('exog is collinear.')
        vx_ds[:, mask] /= sx[mask]
        uy, sy, vy = svd(y, 0)
        # vy_ds = vy.T divided by sy
        vy_ds = vy.T
        mask = sy > tolerance
        if mask.sum() < len(mask):
            raise ValueError('endog is collinear.')
        vy_ds[:, mask] /= sy[mask]
        u, s, v = svd(ux.T.dot(uy), 0)

        # Correct any roundoff
        self.cancorr = np.array([max(0, min(s[i], 1)) for i in range(len(s))])

        self.x_cancoef = vx_ds.dot(u[:, :k])
        self.y_cancoef = vy_ds.dot(v.T[:, :k])

[docs]    def corr_test(self):
        """Approximate F test
        Perform multivariate statistical tests of the hypothesis that
        there is no canonical correlation between endog and exog.
        For each canonical correlation, testing its significance based on
        Wilks' lambda.

        Returns
        -------
        CanCorrTestResults instance

        """
        nobs, k_yvar = self.endog.shape
        nobs, k_xvar = self.exog.shape
        eigenvals = np.power(self.cancorr, 2)
        stats = pd.DataFrame(columns=['Canonical Correlation', "Wilks' lambda",
                                      'Num DF','Den DF', 'F Value','Pr > F'],
                             index=list(range(len(eigenvals) - 1, -1, -1)))
        prod = 1
        for i in range(len(eigenvals) - 1, -1, -1):
            prod *= 1 - eigenvals[i]
            p = k_yvar - i
            q = k_xvar - i
            r = (nobs - k_yvar - 1) - (p - q + 1) / 2
            u = (p * q - 2) / 4
            df1 = p * q
            if p ** 2 + q ** 2 - 5 > 0:
                t = np.sqrt(((p * q) ** 2 - 4) / (p ** 2 + q ** 2 - 5))
            else:
                t = 1
            df2 = r * t - 2 * u
            lmd = np.power(prod, 1 / t)
            F = (1 - lmd) / lmd * df2 / df1
            stats.loc[i, 'Canonical Correlation'] = self.cancorr[i]
            stats.loc[i, "Wilks' lambda"] = prod
            stats.loc[i, 'Num DF'] = df1
            stats.loc[i, 'Den DF'] = df2
            stats.loc[i, 'F Value'] = F
            pval = scipy.stats.f.sf(F, df1, df2)
            stats.loc[i, 'Pr > F'] = pval
            '''
            # Wilk's Chi square test of each canonical correlation
            df = (p - i + 1) * (q - i + 1)
            chi2 = a * np.log(prod)
            pval = stats.chi2.sf(chi2, df)
            stats.loc[i, 'Canonical correlation'] = self.cancorr[i]
            stats.loc[i, 'Chi-square'] = chi2
            stats.loc[i, 'DF'] = df
            stats.loc[i, 'Pr > ChiSq'] = pval
            '''
        ind = stats.index.values[::-1]
        stats = stats.loc[ind, :]

        # Multivariate tests (remember x has mean removed)
        stats_mv = multivariate_stats(eigenvals,
                                      k_yvar, k_xvar, nobs - k_xvar - 1)
        return CanCorrTestResults(stats, stats_mv)


class CanCorrTestResults(object):
    """
    Canonical correlation results class

    Attributes
    -----------
    stats : DataFrame
        Contain statistical tests results for each canonical correlation
    stats_mv : DataFrame
        Contain the multivariate statistical tests results
    """
    def __init__(self, stats, stats_mv):
        self.stats = stats
        self.stats_mv = stats_mv

    def __str__(self):
        return self.summary().__str__()

    def summary(self):
        summ = summary2.Summary()
        summ.add_title('Cancorr results')
        summ.add_df(self.stats)
        summ.add_dict({'': ''})
        summ.add_dict({'Multivariate Statistics and F Approximations': ''})
        summ.add_df(self.stats_mv)
        return summ