Source code for statsmodels.multivariate.pca

"""Principal Component Analysis

Author: josef-pktd
Modified by Kevin Sheppard
"""

import numpy as np
import pandas as pd

from statsmodels.tools.sm_exceptions import (ValueWarning,
                                             EstimationWarning)
from statsmodels.tools.validation import (string_like,
                                          array_like,
                                          bool_like,
                                          float_like,
                                          int_like,
                                          )


def _norm(x):
    return np.sqrt(np.sum(x * x))



[docs]
class PCA:
    """
    Principal Component Analysis

    Parameters
    ----------
    data : array_like
        Variables in columns, observations in rows.
    ncomp : int, optional
        Number of components to return.  If None, returns the as many as the
        smaller of the number of rows or columns in data.
    standardize : bool, optional
        Flag indicating to use standardized data with mean 0 and unit
        variance.  standardized being True implies demean.  Using standardized
        data is equivalent to computing principal components from the
        correlation matrix of data.
    demean : bool, optional
        Flag indicating whether to demean data before computing principal
        components.  demean is ignored if standardize is True. Demeaning data
        but not standardizing is equivalent to computing principal components
        from the covariance matrix of data.
    normalize : bool , optional
        Indicates whether to normalize the factors to have unit inner product.
        If False, the loadings will have unit inner product.
    gls : bool, optional
        Flag indicating to implement a two-step GLS estimator where
        in the first step principal components are used to estimate residuals,
        and then the inverse residual variance is used as a set of weights to
        estimate the final principal components.  Setting gls to True requires
        ncomp to be less then the min of the number of rows or columns.
    weights : ndarray, optional
        Series weights to use after transforming data according to standardize
        or demean when computing the principal components.
    method : str, optional
        Sets the linear algebra routine used to compute eigenvectors:

        * 'svd' uses a singular value decomposition (default).
        * 'eig' uses an eigenvalue decomposition of a quadratic form
        * 'nipals' uses the NIPALS algorithm and can be faster than SVD when
          ncomp is small and nvars is large. See notes about additional changes
          when using NIPALS.
    missing : {str, None}
        Method for missing data.  Choices are:

        * 'drop-row' - drop rows with missing values.
        * 'drop-col' - drop columns with missing values.
        * 'drop-min' - drop either rows or columns, choosing by data retention.
        * 'fill-em' - use EM algorithm to fill missing value.  ncomp should be
          set to the number of factors required.
        * `None` raises if data contains NaN values.
    tol : float, optional
        Tolerance to use when checking for convergence when using NIPALS.
    max_iter : int, optional
        Maximum iterations when using NIPALS.
    tol_em : float
        Tolerance to use when checking for convergence of the EM algorithm.
    max_em_iter : int
        Maximum iterations for the EM algorithm.
    svd_full_matrices : bool, optional
        If the 'svd' method is selected, this flag is used to set the parameter
        'full_matrices' in the singular value decomposition method. Is set to
        False by default.

    Attributes
    ----------
    factors : array or DataFrame
        nobs by ncomp array of principal components (scores)
    scores :  array or DataFrame
        nobs by ncomp array of principal components - identical to factors
    loadings : array or DataFrame
        ncomp by nvar array of principal component loadings for constructing
        the factors
    coeff : array or DataFrame
        nvar by ncomp array of principal component loadings for constructing
        the projections
    projection : array or DataFrame
        nobs by var array containing the projection of the data onto the ncomp
        estimated factors
    rsquare : array or Series
        ncomp array where the element in the ith position is the R-square
        of including the fist i principal components.  Note: values are
        calculated on the transformed data, not the original data
    ic : array or DataFrame
        ncomp by 3 array containing the Bai and Ng (2003) Information
        criteria.  Each column is a different criteria, and each row
        represents the number of included factors.
    eigenvals : array or Series
        nvar array of eigenvalues
    eigenvecs : array or DataFrame
        nvar by nvar array of eigenvectors
    weights : ndarray
        nvar array of weights used to compute the principal components,
        normalized to unit length
    transformed_data : ndarray
        Standardized, demeaned and weighted data used to compute
        principal components and related quantities
    cols : ndarray
        Array of indices indicating columns used in the PCA
    rows : ndarray
        Array of indices indicating rows used in the PCA

    Notes
    -----
    The default options perform principal component analysis on the
    demeaned, unit variance version of data.  Setting standardize to False
    will instead only demean, and setting both standardized and
    demean to False will not alter the data.

    Once the data have been transformed, the following relationships hold when
    the number of components (ncomp) is the same as tne minimum of the number
    of observation or the number of variables.

    .. math:

        X' X = V \\Lambda V'

    .. math:

        F = X V

    .. math:

        X = F V'

    where X is the `data`, F is the array of principal components (`factors`
    or `scores`), and V is the array of eigenvectors (`loadings`) and V' is
    the array of factor coefficients (`coeff`).

    When weights are provided, the principal components are computed from the
    modified data

    .. math:

        \\Omega^{-\\frac{1}{2}} X

    where :math:`\\Omega` is a diagonal matrix composed of the weights. For
    example, when using the GLS version of PCA, the elements of :math:`\\Omega`
    will be the inverse of the variances of the residuals from

    .. math:

        X - F V'

    where the number of factors is less than the rank of X

    References
    ----------
    .. [*] J. Bai and S. Ng, "Determining the number of factors in approximate
       factor models," Econometrica, vol. 70, number 1, pp. 191-221, 2002

    Examples
    --------
    Basic PCA using the correlation matrix of the data

    >>> import numpy as np
    >>> from statsmodels.multivariate.pca import PCA
    >>> x = np.random.randn(100)[:, None]
    >>> x = x + np.random.randn(100, 100)
    >>> pc = PCA(x)

    Note that the principal components are computed using a SVD and so the
    correlation matrix is never constructed, unless method='eig'.

    PCA using the covariance matrix of the data

    >>> pc = PCA(x, standardize=False)

    Limiting the number of factors returned to 1 computed using NIPALS

    >>> pc = PCA(x, ncomp=1, method='nipals')
    >>> pc.factors.shape
    (100, 1)
    """

    def __init__(self, data, ncomp=None, standardize=True, demean=True,
                 normalize=True, gls=False, weights=None, method='svd',
                 missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
                 max_em_iter=100, svd_full_matrices=False):
        self._index = None
        self._columns = []
        if isinstance(data, pd.DataFrame):
            self._index = data.index
            self._columns = data.columns

        self.data = array_like(data, "data", ndim=2)
        # Store inputs
        self._gls = bool_like(gls, "gls")
        self._normalize = bool_like(normalize, "normalize")
        self._svd_full_matrices = bool_like(svd_full_matrices, "svd_fm")
        self._tol = float_like(tol, "tol")
        if not 0 < self._tol < 1:
            raise ValueError('tol must be strictly between 0 and 1')
        self._max_iter = int_like(max_iter, "int_like")
        self._max_em_iter = int_like(max_em_iter, "max_em_iter")
        self._tol_em = float_like(tol_em, "tol_em")

        # Prepare data
        self._standardize = bool_like(standardize, "standardize")
        self._demean = bool_like(demean, "demean")

        self._nobs, self._nvar = self.data.shape
        weights = array_like(weights, "weights", maxdim=1, optional=True)
        if weights is None:
            weights = np.ones(self._nvar)
        else:
            weights = np.array(weights).flatten()
            if weights.shape[0] != self._nvar:
                raise ValueError('weights should have nvar elements')
            weights = weights / np.sqrt((weights ** 2.0).mean())
        self.weights = weights

        # Check ncomp against maximum
        min_dim = min(self._nobs, self._nvar)
        self._ncomp = min_dim if ncomp is None else ncomp
        if self._ncomp > min_dim:
            import warnings

            warn = 'The requested number of components is more than can be ' \
                   'computed from data. The maximum number of components is ' \
                   'the minimum of the number of observations or variables'
            warnings.warn(warn, ValueWarning)
            self._ncomp = min_dim

        self._method = method
        # Workaround to avoid instance methods in __dict__
        if self._method not in ('eig', 'svd', 'nipals'):
            raise ValueError(f'method {method} is not known.')
        if self._method == 'svd':
            self._svd_full_matrices = True

        self.rows = np.arange(self._nobs)
        self.cols = np.arange(self._nvar)
        # Handle missing
        self._missing = string_like(missing, "missing", optional=True)
        self._adjusted_data = self.data
        self._adjust_missing()

        # Update size
        self._nobs, self._nvar = self._adjusted_data.shape
        if self._ncomp == np.min(self.data.shape):
            self._ncomp = np.min(self._adjusted_data.shape)
        elif self._ncomp > np.min(self._adjusted_data.shape):
            raise ValueError('When adjusting for missing values, user '
                             'provided ncomp must be no larger than the '
                             'smallest dimension of the '
                             'missing-value-adjusted data size.')

        # Attributes and internal values
        self._tss = 0.0
        self._ess = None
        self.transformed_data = None
        self._mu = None
        self._sigma = None
        self._ess_indiv = None
        self._tss_indiv = None
        self.scores = self.factors = None
        self.loadings = None
        self.coeff = None
        self.eigenvals = None
        self.eigenvecs = None
        self.projection = None
        self.rsquare = None
        self.ic = None

        # Prepare data
        self.transformed_data = self._prepare_data()
        # Perform the PCA
        self._pca()
        if gls:
            self._compute_gls_weights()
            self.transformed_data = self._prepare_data()
            self._pca()

        # Final calculations
        self._compute_rsquare_and_ic()
        if self._index is not None:
            self._to_pandas()

    def _adjust_missing(self):
        """
        Implements alternatives for handling missing values
        """

        def keep_col(x):
            index = np.logical_not(np.any(np.isnan(x), 0))
            return x[:, index], index

        def keep_row(x):
            index = np.logical_not(np.any(np.isnan(x), 1))
            return x[index, :], index

        if self._missing == 'drop-col':
            self._adjusted_data, index = keep_col(self.data)
            self.cols = np.where(index)[0]
            self.weights = self.weights[index]
        elif self._missing == 'drop-row':
            self._adjusted_data, index = keep_row(self.data)
            self.rows = np.where(index)[0]
        elif self._missing == 'drop-min':
            drop_col, drop_col_index = keep_col(self.data)
            drop_col_size = drop_col.size

            drop_row, drop_row_index = keep_row(self.data)
            drop_row_size = drop_row.size

            if drop_row_size > drop_col_size:
                self._adjusted_data = drop_row
                self.rows = np.where(drop_row_index)[0]
            else:
                self._adjusted_data = drop_col
                self.weights = self.weights[drop_col_index]
                self.cols = np.where(drop_col_index)[0]
        elif self._missing == 'fill-em':
            self._adjusted_data = self._fill_missing_em()
        elif self._missing is None:
            if not np.isfinite(self._adjusted_data).all():
                raise ValueError("""\
data contains non-finite values (inf, NaN). You should drop these values or
use one of the methods for adjusting data for missing-values.""")
        else:
            raise ValueError('missing method is not known.')

        if self._index is not None:
            self._columns = self._columns[self.cols]
            self._index = self._index[self.rows]

        # Check adjusted data size
        if self._adjusted_data.size == 0:
            raise ValueError('Removal of missing values has eliminated '
                             'all data.')

    def _compute_gls_weights(self):
        """
        Computes GLS weights based on percentage of data fit
        """
        projection = np.asarray(self.project(transform=False))
        errors = self.transformed_data - projection
        if self._ncomp == self._nvar:
            raise ValueError('gls can only be used when ncomp < nvar '
                             'so that residuals have non-zero variance')
        var = (errors ** 2.0).mean(0)
        weights = 1.0 / var
        weights = weights / np.sqrt((weights ** 2.0).mean())
        nvar = self._nvar
        eff_series_perc = (1.0 / sum((weights / weights.sum()) ** 2.0)) / nvar
        if eff_series_perc < 0.1:
            eff_series = int(np.round(eff_series_perc * nvar))
            import warnings

            warn = f"""\
Many series are being down weighted by GLS. Of the {nvar} series, the GLS
estimates are based on only {eff_series} (effective) series."""
            warnings.warn(warn, EstimationWarning)

        self.weights = weights

    def _pca(self):
        """
        Main PCA routine
        """
        self._compute_eig()
        self._compute_pca_from_eig()
        self.projection = self.project()

    def __repr__(self):
        string = self.__str__()
        string = string[:-1]
        string += ', id: ' + hex(id(self)) + ')'
        return string

    def __str__(self):
        string = 'Principal Component Analysis('
        string += 'nobs: ' + str(self._nobs) + ', '
        string += 'nvar: ' + str(self._nvar) + ', '
        if self._standardize:
            kind = 'Standardize (Correlation)'
        elif self._demean:
            kind = 'Demean (Covariance)'
        else:
            kind = 'None'
        string += 'transformation: ' + kind + ', '
        if self._gls:
            string += 'GLS, '
        string += 'normalization: ' + str(self._normalize) + ', '
        string += 'number of components: ' + str(self._ncomp) + ', '
        string += 'method: ' + 'Eigenvalue' if self._method == 'eig' else 'SVD'
        string += ')'
        return string

    def _prepare_data(self):
        """
        Standardize or demean data.
        """
        adj_data = self._adjusted_data
        if np.all(np.isnan(adj_data)):
            return np.empty(adj_data.shape[1]).fill(np.nan)

        self._mu = np.nanmean(adj_data, axis=0)
        self._sigma = np.sqrt(np.nanmean((adj_data - self._mu) ** 2.0, axis=0))
        if self._standardize:
            data = (adj_data - self._mu) / self._sigma
        elif self._demean:
            data = (adj_data - self._mu)
        else:
            data = adj_data
        return data / np.sqrt(self.weights)

    def _compute_eig(self):
        """
        Wrapper for actual eigenvalue method

        This is a workaround to avoid instance methods in __dict__
        """
        if self._method == 'eig':
            return self._compute_using_eig()
        elif self._method == 'svd':
            return self._compute_using_svd()
        else:  # self._method == 'nipals'
            return self._compute_using_nipals()

    def _compute_using_svd(self):
        """SVD method to compute eigenvalues and eigenvecs"""
        x = self.transformed_data
        u, s, v = np.linalg.svd(x, full_matrices=self._svd_full_matrices)
        self.eigenvals = s ** 2.0
        self.eigenvecs = v.T

    def _compute_using_eig(self):
        """
        Eigenvalue decomposition method to compute eigenvalues and eigenvectors
        """
        x = self.transformed_data
        self.eigenvals, self.eigenvecs = np.linalg.eigh(x.T.dot(x))

    def _compute_using_nipals(self):
        """
        NIPALS implementation to compute small number of eigenvalues
        and eigenvectors
        """
        x = self.transformed_data
        if self._ncomp > 1:
            x = x + 0.0  # Copy

        tol, max_iter, ncomp = self._tol, self._max_iter, self._ncomp
        vals = np.zeros(self._ncomp)
        vecs = np.zeros((self._nvar, self._ncomp))
        for i in range(ncomp):
            max_var_ind = np.argmax(x.var(0))
            factor = x[:, [max_var_ind]]
            _iter = 0
            diff = 1.0
            while diff > tol and _iter < max_iter:
                vec = x.T.dot(factor) / (factor.T.dot(factor))
                vec = vec / np.sqrt(vec.T.dot(vec))
                factor_last = factor
                factor = x.dot(vec) / (vec.T.dot(vec))
                diff = _norm(factor - factor_last) / _norm(factor)
                _iter += 1
            vals[i] = (factor ** 2).sum()
            vecs[:, [i]] = vec
            if ncomp > 1:
                x -= factor.dot(vec.T)

        self.eigenvals = vals
        self.eigenvecs = vecs

    def _fill_missing_em(self):
        """
        EM algorithm to fill missing values
        """
        non_missing = np.logical_not(np.isnan(self.data))

        # If nothing missing, return without altering the data
        if np.all(non_missing):
            return self.data

        # 1. Standardized data as needed
        data = self.transformed_data = np.asarray(self._prepare_data())

        ncomp = self._ncomp

        # 2. Check for all nans
        col_non_missing = np.sum(non_missing, 1)
        row_non_missing = np.sum(non_missing, 0)
        if np.any(col_non_missing < ncomp) or np.any(row_non_missing < ncomp):
            raise ValueError('Implementation requires that all columns and '
                             'all rows have at least ncomp non-missing values')
        # 3. Get mask
        mask = np.isnan(data)

        # 4. Compute mean
        mu = np.nanmean(data, 0)

        # 5. Replace missing with mean
        projection = np.ones((self._nobs, 1)) * mu
        projection_masked = projection[mask]
        data[mask] = projection_masked

        # 6. Compute eigenvalues and fit
        diff = 1.0
        _iter = 0
        while diff > self._tol_em and _iter < self._max_em_iter:
            last_projection_masked = projection_masked
            # Set transformed data to compute eigenvalues
            self.transformed_data = data
            # Call correct eig function here
            self._compute_eig()
            # Call function to compute factors and projection
            self._compute_pca_from_eig()
            projection = np.asarray(self.project(transform=False,
                                                 unweight=False))
            projection_masked = projection[mask]
            data[mask] = projection_masked
            delta = last_projection_masked - projection_masked
            diff = _norm(delta) / _norm(projection_masked)
            _iter += 1
        # Must copy to avoid overwriting original data since replacing values
        data = self._adjusted_data + 0.0
        projection = np.asarray(self.project())
        data[mask] = projection[mask]

        return data

    def _compute_pca_from_eig(self):
        """
        Compute relevant statistics after eigenvalues have been computed
        """
        # Ensure sorted largest to smallest
        vals, vecs = self.eigenvals, self.eigenvecs
        indices = np.argsort(vals)
        indices = indices[::-1]
        vals = vals[indices]
        vecs = vecs[:, indices]
        if (vals <= 0).any():
            # Discard and warn
            num_good = vals.shape[0] - (vals <= 0).sum()
            if num_good < self._ncomp:
                import warnings

                warnings.warn('Only {num:d} eigenvalues are positive.  '
                              'This is the maximum number of components '
                              'that can be extracted.'.format(num=num_good),
                              EstimationWarning)

                self._ncomp = num_good
                vals[num_good:] = np.finfo(np.float64).tiny
        # Use ncomp for the remaining calculations
        vals = vals[:self._ncomp]
        vecs = vecs[:, :self._ncomp]
        self.eigenvals, self.eigenvecs = vals, vecs
        # Select correct number of components to return
        self.scores = self.factors = self.transformed_data.dot(vecs)
        self.loadings = vecs
        self.coeff = vecs.T
        if self._normalize:
            self.coeff = (self.coeff.T * np.sqrt(vals)).T
            self.factors /= np.sqrt(vals)
            self.scores = self.factors

    def _compute_rsquare_and_ic(self):
        """
        Final statistics to compute
        """
        # TSS and related calculations
        # TODO: This needs careful testing, with and without weights,
        #   gls, standardized and demean
        weights = self.weights
        ss_data = self.transformed_data * np.sqrt(weights)
        self._tss_indiv = np.sum(ss_data ** 2, 0)
        self._tss = np.sum(self._tss_indiv)
        self._ess = np.zeros(self._ncomp + 1)
        self._ess_indiv = np.zeros((self._ncomp + 1, self._nvar))
        for i in range(self._ncomp + 1):
            # Projection in the same space as transformed_data
            projection = self.project(ncomp=i, transform=False, unweight=False)
            indiv_rss = (projection ** 2).sum(axis=0)
            rss = indiv_rss.sum()
            self._ess[i] = self._tss - rss
            self._ess_indiv[i, :] = self._tss_indiv - indiv_rss
        self.rsquare = 1.0 - self._ess / self._tss
        # Information Criteria
        ess = self._ess
        invalid = ess <= 0  # Prevent log issues of 0
        if invalid.any():
            last_obs = (np.where(invalid)[0]).min()
            ess = ess[:last_obs]

        log_ess = np.log(ess)
        r = np.arange(ess.shape[0])

        nobs, nvar = self._nobs, self._nvar
        sum_to_prod = (nobs + nvar) / (nobs * nvar)
        min_dim = min(nobs, nvar)
        penalties = np.array([sum_to_prod * np.log(1.0 / sum_to_prod),
                              sum_to_prod * np.log(min_dim),
                              np.log(min_dim) / min_dim])
        penalties = penalties[:, None]
        ic = log_ess + r * penalties
        self.ic = ic.T


[docs]
    def project(self, ncomp=None, transform=True, unweight=True):
        """
        Project series onto a specific number of factors.

        Parameters
        ----------
        ncomp : int, optional
            Number of components to use.  If omitted, all components
            initially computed are used.
        transform : bool, optional
            Flag indicating whether to return the projection in the original
            space of the data (True, default) or in the space of the
            standardized/demeaned data.
        unweight : bool, optional
            Flag indicating whether to undo the effects of the estimation
            weights.

        Returns
        -------
        array_like
            The nobs by nvar array of the projection onto ncomp factors.

        Notes
        -----
        """
        # Projection needs to be scaled/shifted based on inputs
        ncomp = self._ncomp if ncomp is None else ncomp
        if ncomp > self._ncomp:
            raise ValueError('ncomp must be smaller than the number of '
                             'components computed.')
        factors = np.asarray(self.factors)
        coeff = np.asarray(self.coeff)

        projection = factors[:, :ncomp].dot(coeff[:ncomp, :])
        if transform or unweight:
            projection *= np.sqrt(self.weights)
        if transform:
            # Remove the weights, which do not depend on transformation
            if self._standardize:
                projection *= self._sigma
            if self._standardize or self._demean:
                projection += self._mu
        if self._index is not None:
            projection = pd.DataFrame(projection,
                                      columns=self._columns,
                                      index=self._index)
        return projection


    def _to_pandas(self):
        """
        Returns pandas DataFrames for all values
        """
        index = self._index
        # Principal Components
        num_zeros = np.ceil(np.log10(self._ncomp))
        comp_str = 'comp_{0:0' + str(int(num_zeros)) + 'd}'
        cols = [comp_str.format(i) for i in range(self._ncomp)]
        df = pd.DataFrame(self.factors, columns=cols, index=index)
        self.scores = self.factors = df
        # Projections
        df = pd.DataFrame(self.projection,
                          columns=self._columns,
                          index=index)
        self.projection = df
        # Weights
        df = pd.DataFrame(self.coeff, index=cols,
                          columns=self._columns)
        self.coeff = df
        # Loadings
        df = pd.DataFrame(self.loadings,
                          index=self._columns, columns=cols)
        self.loadings = df
        # eigenvals
        self.eigenvals = pd.Series(self.eigenvals)
        self.eigenvals.name = 'eigenvals'
        # eigenvecs
        vec_str = comp_str.replace('comp', 'eigenvec')
        cols = [vec_str.format(i) for i in range(self.eigenvecs.shape[1])]
        self.eigenvecs = pd.DataFrame(self.eigenvecs, columns=cols)
        # R2
        self.rsquare = pd.Series(self.rsquare)
        self.rsquare.index.name = 'ncomp'
        self.rsquare.name = 'rsquare'
        # IC
        self.ic = pd.DataFrame(self.ic, columns=['IC_p1', 'IC_p2', 'IC_p3'])
        self.ic.index.name = 'ncomp'


[docs]
    def plot_scree(self, ncomp=None, log_scale=True,
                   cumulative=False, ax=None):
        """
        Plot of the ordered eigenvalues

        Parameters
        ----------
        ncomp : int, optional
            Number of components ot include in the plot.  If None, will
            included the same as the number of components computed
        log_scale : boot, optional
            Flag indicating whether ot use a log scale for the y-axis
        cumulative : bool, optional
            Flag indicating whether to plot the eigenvalues or cumulative
            eigenvalues
        ax : AxesSubplot, optional
            An axes on which to draw the graph.  If omitted, new a figure
            is created

        Returns
        -------
        matplotlib.figure.Figure
            The handle to the figure.
        """
        import statsmodels.graphics.utils as gutils

        fig, ax = gutils.create_mpl_ax(ax)

        ncomp = self._ncomp if ncomp is None else ncomp
        vals = np.asarray(self.eigenvals)
        vals = vals[:self._ncomp]
        if cumulative:
            vals = np.cumsum(vals)

        if log_scale:
            ax.set_yscale('log')
        ax.plot(np.arange(ncomp), vals[: ncomp], 'bo')
        ax.autoscale(tight=True)
        xlim = np.array(ax.get_xlim())
        sp = xlim[1] - xlim[0]
        xlim += 0.02 * np.array([-sp, sp])
        ax.set_xlim(xlim)

        ylim = np.array(ax.get_ylim())
        scale = 0.02
        if log_scale:
            sp = np.log(ylim[1] / ylim[0])
            ylim = np.exp(np.array([np.log(ylim[0]) - scale * sp,
                                    np.log(ylim[1]) + scale * sp]))
        else:
            sp = ylim[1] - ylim[0]
            ylim += scale * np.array([-sp, sp])
        ax.set_ylim(ylim)
        ax.set_title('Scree Plot')
        ax.set_ylabel('Eigenvalue')
        ax.set_xlabel('Component Number')
        fig.tight_layout()

        return fig



[docs]
    def plot_rsquare(self, ncomp=None, ax=None):
        """
        Box plots of the individual series R-square against the number of PCs.

        Parameters
        ----------
        ncomp : int, optional
            Number of components ot include in the plot.  If None, will
            plot the minimum of 10 or the number of computed components.
        ax : AxesSubplot, optional
            An axes on which to draw the graph.  If omitted, new a figure
            is created.

        Returns
        -------
        matplotlib.figure.Figure
            The handle to the figure.
        """
        import statsmodels.graphics.utils as gutils

        fig, ax = gutils.create_mpl_ax(ax)

        ncomp = 10 if ncomp is None else ncomp
        ncomp = min(ncomp, self._ncomp)
        # R2s in rows, series in columns
        r2s = 1.0 - self._ess_indiv / self._tss_indiv
        r2s = r2s[1:]
        r2s = r2s[:ncomp]
        ax.boxplot(r2s.T)
        ax.set_title('Individual Input $R^2$')
        ax.set_ylabel('$R^2$')
        ax.set_xlabel('Number of Included Principal Components')

        return fig





[docs]
def pca(data, ncomp=None, standardize=True, demean=True, normalize=True,
        gls=False, weights=None, method='svd'):
    """
    Perform Principal Component Analysis (PCA).

    Parameters
    ----------
    data : ndarray
        Variables in columns, observations in rows.
    ncomp : int, optional
        Number of components to return.  If None, returns the as many as the
        smaller to the number of rows or columns of data.
    standardize : bool, optional
        Flag indicating to use standardized data with mean 0 and unit
        variance.  standardized being True implies demean.
    demean : bool, optional
        Flag indicating whether to demean data before computing principal
        components.  demean is ignored if standardize is True.
    normalize : bool , optional
        Indicates whether th normalize the factors to have unit inner
        product.  If False, the loadings will have unit inner product.
    gls : bool, optional
        Flag indicating to implement a two-step GLS estimator where
        in the first step principal components are used to estimate residuals,
        and then the inverse residual variance is used as a set of weights to
        estimate the final principal components
    weights : ndarray, optional
        Series weights to use after transforming data according to standardize
        or demean when computing the principal components.
    method : str, optional
        Determines the linear algebra routine uses.  'eig', the default,
        uses an eigenvalue decomposition. 'svd' uses a singular value
        decomposition.

    Returns
    -------
    factors : {ndarray, DataFrame}
        Array (nobs, ncomp) of principal components (also known as scores).
    loadings : {ndarray, DataFrame}
        Array (ncomp, nvar) of principal component loadings for constructing
        the factors.
    projection : {ndarray, DataFrame}
        Array (nobs, nvar) containing the projection of the data onto the ncomp
        estimated factors.
    rsquare : {ndarray, Series}
        Array (ncomp,) where the element in the ith position is the R-square
        of including the fist i principal components.  The values are
        calculated on the transformed data, not the original data.
    ic : {ndarray, DataFrame}
        Array (ncomp, 3) containing the Bai and Ng (2003) Information
        criteria.  Each column is a different criteria, and each row
        represents the number of included factors.
    eigenvals : {ndarray, Series}
        Array of eigenvalues (nvar,).
    eigenvecs : {ndarray, DataFrame}
        Array of eigenvectors. (nvar, nvar).

    Notes
    -----
    This is a simple function wrapper around the PCA class. See PCA for
    more information and additional methods.
    """
    pc = PCA(data, ncomp=ncomp, standardize=standardize, demean=demean,
             normalize=normalize, gls=gls, weights=weights, method=method)

    return (pc.factors, pc.loadings, pc.projection, pc.rsquare, pc.ic,
            pc.eigenvals, pc.eigenvecs)
Last update: Feb 19, 2025