Source code for statsmodels.multivariate.factor

import warnings

import numpy as np
from numpy.linalg import eigh, inv, matrix_rank, norm
import pandas as pd
from scipy.optimize import minimize

from statsmodels.base.model import Model
from statsmodels.graphics.utils import _import_mpl
from statsmodels.iolib import summary2
from statsmodels.tools.decorators import cache_readonly

from .factor_rotation import promax, rotate_factors

# Default options handed to scipy.optimize.minimize by Factor._fit_ml when
# the caller does not supply `opt`.
_opt_defaults = {'gtol': 1e-7}


def _check_args_1(endog, n_factor, corr, nobs):
    """
    Validate the constructor arguments that do not depend on data shape.

    Parameters
    ----------
    endog : array_like or None
        Data array, may be None if `corr` is given.
    n_factor : int
        Requested number of factors.
    corr : array_like or None
        Correlation matrix, may be None if `endog` is given.
    nobs : int or None
        Number of observations supplied by the caller.

    Raises
    ------
    ValueError
        If neither or both of `endog` and `corr` are given, or if
        `n_factor` is not strictly positive.

    Warns
    -----
    UserWarning
        If `nobs` is given together with `endog`.
    """
    if endog is None and corr is None:
        raise ValueError("Either endog or corr must be provided.")

    if endog is not None and corr is not None:
        # Factor.__init__ recomputes the correlation matrix from endog
        # whenever endog is given, so a user-supplied corr would be
        # silently discarded.  Keep rejecting the combination, but say why.
        raise ValueError("Only one of endog or corr may be provided, "
                         "not both.")

    if n_factor <= 0:
        raise ValueError('n_factor must be larger than 0! %d <= 0' %
                         (n_factor))

    if nobs is not None and endog is not None:
        warnings.warn("nobs is ignored when endog is provided")


def _check_args_2(endog, n_factor, corr, nobs, k_endog):
    """
    Validate the arguments that depend on the shape of the data.

    Only `n_factor`, `corr` and `k_endog` are inspected; `endog` and
    `nobs` are accepted but not used.

    Raises
    ------
    ValueError
        If more factors than variables are requested, if the diagonal of
        `corr` deviates from 1 by more than 1e-10, or if `corr` is not
        square.
    """
    too_many_factors = n_factor > k_endog
    if too_many_factors:
        raise ValueError('n_factor cannot be greater than the number'
                         ' of variables! %d > %d' %
                         (n_factor, k_endog))

    # A correlation matrix has a unit diagonal.
    diag_deviation = np.abs(np.diag(corr) - 1)
    if np.max(diag_deviation) > 1e-10:
        raise ValueError("corr must be a correlation matrix")

    n_rows, n_cols = corr.shape[0], corr.shape[1]
    if n_rows != n_cols:
        raise ValueError('Correlation matrix corr must be a square '
                         '(rows %d != cols %d)' % corr.shape)



[docs]
class Factor(Model):
    """
    Factor analysis

    Parameters
    ----------
    endog : array_like
        Variables in columns, observations in rows.  May be `None` if
        `corr` is not `None`.
    n_factor : int
        The number of factors to extract
    corr : array_like
        Directly specify the correlation matrix instead of estimating
        it from `endog`.  If provided, `endog` is not used for the
        factor analysis, it may be used in post-estimation.
    method : str
        The method to extract factors, currently must be either 'pa'
        for principal axis factor analysis or 'ml' for maximum
        likelihood estimation.
    smc : True or False
        Whether or not to apply squared multiple correlations (method='pa')
    endog_names : array_like of str
        Names of endogenous variables, one per variable.  If specified,
        they will be used instead of the column names in endog.
    nobs : int
        The number of observations, not used if endog is present. Needs to
        be provided for inference if endog is None.
    missing : 'none', 'drop', or 'raise'
        Missing value handling for endog, default is row-wise deletion 'drop'
        If 'none', no nan checking is done. If 'drop', any observations with
        nans are dropped. If 'raise', an error is raised.


    Notes
    -----
    **Experimental**

    Supported rotations: 'varimax', 'quartimax', 'biquartimax',
    'equamax', 'oblimin', 'parsimax', 'parsimony', 'biquartimin',
    'promax'

    If method='ml', the factors are rotated to satisfy condition IC3
    of Bai and Li (2012).  This means that the scores have covariance
    I, so the model for the covariance matrix is L * L' + diag(U),
    where L are the loadings and U are the uniquenesses.  In addition,
    L' * diag(U)^{-1} L must be diagonal.

    References
    ----------
    .. [*] Hofacker, C. (2004). Exploratory Factor Analysis, Mathematical
       Marketing. http://www.openaccesstexts.org/pdf/Quant_Chapter_11_efa.pdf
    .. [*] J Bai, K Li (2012).  Statistical analysis of factor models of high
       dimension.  Annals of Statistics. https://arxiv.org/pdf/1205.6617.pdf
    """
    def __init__(self, endog=None, n_factor=1, corr=None, method='pa',
                 smc=True, endog_names=None, nobs=None, missing='drop'):
        """
        Validate the inputs and store the correlation matrix to be factored.

        When `endog` is given, the correlation matrix and `nobs` are
        derived from it after missing-value handling.  Otherwise `corr` is
        used directly and `self.endog` is set to None.

        Raises
        ------
        ValueError
            If the argument checks fail or neither `endog` nor `corr` is
            available.
        """

        _check_args_1(endog, n_factor, corr, nobs)

        # Labels carried by a DataFrame `corr` have to be captured before
        # the conversion to ndarray below, which discards them.
        corr_labels = None

        if endog is not None:
            super().__init__(endog, exog=None, missing=missing)
            endog = self.endog   # after preprocessing like missing, asarray
            k_endog = endog.shape[1]
            nobs = endog.shape[0]
            corr = self.corr = np.corrcoef(endog, rowvar=0)
        elif corr is not None:
            if isinstance(corr, pd.DataFrame):
                corr_labels = corr.columns
            corr = self.corr = np.asarray(corr)
            k_endog = self.corr.shape[0]
            self.endog = None
        else:
            msg = "Either endog or corr must be provided."
            raise ValueError(msg)

        _check_args_2(endog, n_factor, corr, nobs, k_endog)

        self.n_factor = n_factor
        self.loadings = None
        self.communality = None
        self.method = method
        self.smc = smc
        self.nobs = nobs
        self.corr = corr
        self.k_endog = k_endog

        # Explicit names win; otherwise fall back to the labels of a
        # DataFrame `corr`.  With neither, the endog_names property
        # supplies the names.
        if endog_names is None:
            endog_names = corr_labels
        self.endog_names = endog_names

    @property
    def endog_names(self):
        """Names of endogenous variables

        Explicitly assigned names are returned when present.  Otherwise the
        names come from the data container if `endog` is available, or are
        generated as zero-padded ``var<i>`` labels from the size of `corr`.
        """
        assigned = self._endog_names
        if assigned is not None:
            return assigned

        if self.endog is not None:
            return self.data.ynames

        # Pad to the number of decimal digits of the largest index so that
        # all generated names have the same width.
        width = 0
        largest = self.corr.shape[0] - 1
        while largest > 0:
            width += 1
            largest //= 10
        template = 'var%0' + str(width) + 'd'
        return [template % idx for idx in range(self.corr.shape[0])]

    @endog_names.setter
    def endog_names(self, value):
        """Store `value` as the variable names, or clear them with None.

        Raises
        ------
        ValueError
            If the number of names differs from the number of variables
            in `corr`.
        """
        if value is None:
            self._endog_names = None
            return

        if len(value) != self.corr.shape[0]:
            raise ValueError('The length of `endog_names` must '
                             'equal the number of variables.')
        self._endog_names = np.asarray(value)


[docs]
    def fit(self, maxiter=50, tol=1e-8, start=None, opt_method='BFGS',
            opt=None, em_iter=3):
        """
        Estimate factor model parameters.

        Dispatches on ``self.method`` (case-insensitive) to principal axis
        ('pa') or maximum likelihood ('ml') estimation.

        Parameters
        ----------
        maxiter : int
            Maximum number of iterations, used for 'pa' estimation.
        tol : float
            Stopping criteria (error tolerance), used for 'pa' estimation.
        start : array_like
            Starting values, only used for ML estimation
        opt_method : str
            Optimization method for ML estimation
        opt : dict-like
            Keyword arguments passed to optimizer, only used for ML estimation
        em_iter : int
            The number of EM iterations before starting gradient optimization,
            only used for ML estimation.

        Returns
        -------
        FactorResults
            Results class instance.

        Raises
        ------
        ValueError
            If ``self.method`` is neither 'pa' nor 'ml'.
        """
        extraction = self.method.lower()

        if extraction == 'ml':
            return self._fit_ml(start, em_iter, opt_method, opt)
        if extraction == 'pa':
            return self._fit_pa(maxiter=maxiter, tol=tol)

        msg = "Unknown factor extraction approach '%s'" % self.method
        raise ValueError(msg)


    def _fit_pa(self, maxiter=50, tol=1e-8):
        """
        Extract factors using the iterative principal axis method

        Each iteration replaces the diagonal of the correlation matrix by
        the current communalities, takes the leading eigenpairs with
        positive eigenvalues, and recomputes loadings and communalities.

        Parameters
        ----------
        maxiter : int
            Maximum number of iterations for communality estimation
        tol : float
            If `norm(communality - last_communality)  < tolerance`,
            estimation stops

        Returns
        -------
        results : FactorResults instance

        Raises
        ------
        ValueError
            If `n_factor` exceeds the rank of the correlation matrix, if
            `maxiter` is not positive, or if `tol` is outside (0, 0.01].
        """

        # Work on a copy: the diagonal is overwritten in every iteration.
        reduced = self.corr.copy()

        # Parameter validation
        self.n_comp = matrix_rank(reduced)
        if self.n_factor > self.n_comp:
            raise ValueError('n_factor must be smaller or equal to the rank'
                             ' of endog! %d > %d' %
                             (self.n_factor, self.n_comp))
        if maxiter <= 0:
            raise ValueError('n_max_iter must be larger than 0! %d < 0' %
                             (maxiter))
        if tol <= 0 or tol > 0.01:
            raise ValueError('tolerance must be larger than 0 and smaller than'
                             ' 0.01! Got %f instead' % (tol))

        # Initial communalities: squared multiple correlations, or ones.
        if self.smc:
            comm = 1 - 1 / np.diag(inv(reduced))
        else:
            comm = np.ones(len(reduced))

        eigenvals = None
        for _ in range(maxiter):
            np.fill_diagonal(reduced, comm)
            evals, evecs = eigh(reduced, UPLO='U')
            comm_prev = np.array(comm)

            # eigh returns ascending eigenvalues; put the largest first.
            order = np.argsort(evals)[::-1]
            evals = evals[order]
            evecs = evecs[:, order]
            eigenvals = np.array(evals)

            # Keep at most n_factor eigenvectors with positive eigenvalues.
            n_keep = np.min([(evals > 0).sum(), self.n_factor])
            scale = np.diag(np.sqrt(evals[:n_keep]))

            # New loadings and the communalities they imply.
            loadings = evecs[:, :n_keep].dot(scale)
            comm = np.power(loadings, 2).sum(axis=1)
            if norm(comm_prev - comm) < tol:
                break

        self.eigenvals = eigenvals
        self.communality = comm
        self.uniqueness = 1 - comm
        self.loadings = loadings
        return FactorResults(self)

    # Unpacks the model parameters from a flat vector, used for ML
    # estimation.  The first k_endog elements of par are the square
    # roots of the uniquenesses.  The remaining elements are the
    # factor loadings, packed one factor at a time.
    def _unpack(self, par):
        """Split a flat parameter vector into (uniquenesses, loadings).

        The first `k_endog` entries are squared to give the uniquenesses;
        the rest is reshaped into a matrix with `k_endog` rows.
        """
        k = self.k_endog
        uniq = par[0:k]**2
        load = np.reshape(par[k:], (-1, k)).T
        return (uniq, load)

    # Packs the model parameters into a flat parameter, used for ML
    # estimation.
    def _pack(self, load, uniq):
        """Return the flat vector of square-root uniquenesses followed by
        the loadings, one factor (column of `load`) at a time."""
        root_uniq = np.sqrt(uniq)
        by_factor = load.T.flat
        return np.concatenate((root_uniq, by_factor))


[docs]
    def loglike(self, par):
        """
        Evaluate the log-likelihood function.

        The value returned is scaled by ``1 / (2 * k_endog)``.

        Parameters
        ----------
        par : ndarray or tuple of 2 ndarray's
            The model parameters, either a packed representation of
            the model parameters or a 2-tuple containing a `k_endog x
            n_factor` matrix of factor loadings and a `k_endog` vector
            of uniquenesses.

        Returns
        -------
        float
            The value of the log-likelihood evaluated at par.
        """

        if type(par) is np.ndarray:
            uniq, load = self._unpack(par)
        else:
            load, uniq = par[0], par[1]

        # S^{-1} G and the small matrix I + G' S^{-1} G that both the
        # determinant lemma and the Woodbury identity rely on.
        scaled_load = load / uniq[:, None]
        core = np.dot(load.T, scaled_load)
        core.flat[::core.shape[0]+1] += 1

        # log|GG' + S| = log|I + G'S^{-1}G| + log|S|
        _, core_logdet = np.linalg.slogdet(core)
        logdet = np.sum(np.log(uniq)) + core_logdet

        # tr((GG' + S)^{-1}C) via Sherman-Morrison-Woodbury
        trace_term = np.sum(1 / uniq)
        correction = np.dot(load.T, self.corr / uniq[:, None])
        correction = np.linalg.solve(core, correction)
        correction = np.dot(scaled_load, correction)
        trace_term -= np.trace(correction)

        # Scaled log-likelihood
        return -(logdet + trace_term) / (2*self.k_endog)



[docs]
    def score(self, par):
        """
        Evaluate the score function (first derivative of loglike).

        The result is laid out like the packed parameter vector: first the
        derivatives with respect to the square-root uniquenesses, then the
        derivatives with respect to the loadings, one factor at a time.
        It carries the same ``1 / (2 * k_endog)`` scaling as `loglike`.

        Parameters
        ----------
        par : ndarray or tuple of 2 ndarray's
            The model parameters, either a packed representation of
            the model parameters or a 2-tuple containing a `k_endog x
            n_factor` matrix of factor loadings and a `k_endog` vector
            of uniquenesses.

        Returns
        -------
        ndarray
            The score function evaluated at par.
        """

        if type(par) is np.ndarray:
            uniq, load = self._unpack(par)
        else:
            load, uniq = par[0], par[1]

        # Center term of SMW
        # c = I + L' U^{-1} L and d = c^{-1} L'
        loadu = load / uniq[:, None]
        c = np.dot(load.T, loadu)
        c.flat[::c.shape[0]+1] += 1
        d = np.linalg.solve(c, load.T)

        # Precompute these terms
        # cu is corr scaled by 1/uniq on both rows and columns.
        lud = np.dot(loadu, d)
        cu = (self.corr / uniq) / uniq[:, None]
        r = np.dot(cu, load)
        lul = np.dot(lud.T, load)
        luz = np.dot(cu, lul)

        # First term
        # The factor 2*sqrt(uniq) is the chain rule for differentiating
        # with respect to the square root of the uniquenesses.
        du = 2*np.sqrt(uniq) * (1/uniq - (d * load.T).sum(0) / uniq**2)
        dl = 2*(loadu - np.dot(lud, loadu))

        # Second term
        h = np.dot(lud, cu)
        f = np.dot(h, lud.T)
        du -= 2*np.sqrt(uniq) * (np.diag(cu) - 2*np.diag(h) + np.diag(f))
        dl -= 2*r
        dl += 2*np.dot(lud, r)
        dl += 2*luz
        dl -= 2*np.dot(lud, luz)

        # Cannot use _pack because we are working with the square root
        # uniquenesses directly.
        return -np.concatenate((du, dl.T.flat)) / (2*self.k_endog)


    # Maximum likelihood factor analysis.
    def _fit_ml(self, start, em_iter, opt_method, opt):
        """estimate Factor model using Maximum Likelihood

        Parameters
        ----------
        start : None or 2-sequence
            If None, starting values come from `em_iter` EM iterations.
            Otherwise a pair ``(loadings, uniquenesses)``.
        em_iter : int
            Number of EM iterations used to build starting values.
        opt_method : str
            `method` argument passed to scipy.optimize.minimize.
        opt : dict-like or None
            `options` passed to the optimizer; module defaults if None.

        Returns
        -------
        FactorResults

        Raises
        ------
        ValueError
            If `start` is not None and is not a compatible pair.
        """

        # Starting values, packed into a flat vector
        if start is None:
            start = self._pack(*self._fit_ml_em(em_iter))
        elif len(start) != 2:
            raise ValueError("Invalid starting values")
        else:
            if len(start[1]) != start[0].shape[0]:
                msg = "Starting values have incompatible dimensions"
                raise ValueError(msg)
            start = self._pack(start[0], start[1])

        # Minimize the negative log-likelihood with its analytic gradient
        options = _opt_defaults if opt is None else opt
        result = minimize(lambda p: -self.loglike(p), start,
                          jac=lambda p: -self.score(p),
                          method=opt_method, options=options)
        if not result.success:
            warnings.warn("Fitting did not converge")

        uniq, load = self._unpack(result.x)
        if uniq.min() < 1e-10:
            warnings.warn("Some uniquenesses are nearly zero")

        # Rotate solution to satisfy IC3 of Bai and Li
        load = self._rotate(load, uniq)

        self.uniqueness = uniq
        self.communality = 1 - uniq
        self.loadings = load
        self.mle_retvals = result

        return FactorResults(self)

    def _fit_ml_em(self, iter, random_state=None):
        """estimate Factor model using EM algorithm

        Parameters
        ----------
        iter : int
            Number of EM updates to perform.
        random_state : numpy.random.RandomState or None
            Generator used to draw the initial loadings.  If None, a
            RandomState with fixed seed 3427 is created, which makes the
            starting values reproducible.

        Returns
        -------
        load : ndarray
            Loadings after the final update, `k_endog` by `n_factor`.
        uniq : ndarray
            Uniquenesses after the final update, length `k_endog`.
        """
        # Starting values
        # Small random loadings and uniquenesses of 0.5 for every variable.
        if random_state is None:
            random_state = np.random.RandomState(3427)
        load = 0.1 * random_state.standard_normal(size=(self.k_endog, self.n_factor))
        uniq = 0.5 * np.ones(self.k_endog)

        for k in range(iter):

            # Loadings with each row divided by its uniqueness.
            loadu = load / uniq[:, None]

            # f = I + L' U^{-1} L
            f = np.dot(load.T, loadu)
            f.flat[::f.shape[0]+1] += 1

            r = np.linalg.solve(f, loadu.T)
            q = np.dot(loadu.T, load)
            h = np.dot(r, load)

            c = load - np.dot(load, h)
            c /= uniq[:, None]

            g = np.dot(q, r)
            e = np.dot(g, self.corr)
            d = np.dot(loadu.T, self.corr) - e

            a = np.dot(d, c)
            a -= np.dot(load.T, c)
            a.flat[::a.shape[0]+1] += 1

            b = np.dot(self.corr, c)

            # Updated loadings, then uniquenesses from the diagonal of corr.
            load = np.linalg.solve(a, b.T).T
            uniq = np.diag(self.corr) - (load * d.T).sum(1)

        return load, uniq

    def _rotate(self, load, uniq):
        """rotate loadings for MLE

        The loadings are replaced by their left singular vectors scaled by
        the singular values, then multiplied by the eigenvectors of
        ``L' diag(uniq)^{-1} L / nobs`` (with ``nobs`` taken as 1 when it
        is not available).
        """
        # Rotations used in ML estimation.
        basis, singular_values, _ = np.linalg.svd(load, 0)
        basis *= singular_values

        nobs = 1 if self.nobs is None else self.nobs

        weighted_gram = np.dot(basis.T, basis / uniq[:, None]) / nobs
        _, eigvecs = np.linalg.eig(weighted_gram)
        return np.dot(basis, eigvecs)




[docs]
class FactorResults:
    """
    Factor results class

    For result summary, scree/loading plots and factor rotations

    Parameters
    ----------
    factor : Factor
        Fitted Factor class

    Attributes
    ----------
    uniqueness : ndarray
        The uniqueness (variance of uncorrelated errors unique to
        each variable)
    communality : ndarray
        1 - uniqueness
    loadings : ndarray
        Each column is the loading vector for one factor
    loadings_no_rot : ndarray
        Unrotated loadings, not available under maximum likelihood
        analysis.
    eigenvals : ndarray
        The eigenvalues for a factor analysis obtained using
        principal components; not available under ML estimation.
    n_comp : int
        Number of components (factors)
    nobs : int
        Number of observations
    fa_method : str
        The method used to obtain the decomposition, either 'pa' for
        'principal axes' or 'ml' for maximum likelihood.
    df : int
        Degrees of freedom of the factor model.

    Notes
    -----
    Under ML estimation, the default rotation (used for `loadings`) is
    condition IC3 of Bai and Li (2012).  Under this rotation, the
    factor scores are iid and standardized.  If `G` is the canonical
    loadings and `U` is the vector of uniquenesses, then the
    covariance matrix implied by the factor analysis is `GG' +
    diag(U)`.

    Status: experimental, Some refactoring will be necessary when new
        features are added.
    """
    def __init__(self, factor):
        """Copy the estimation results from a fitted `Factor` instance.

        `eigenvals` and `mle_retvals` are copied only if the model has
        them.  No rotation is applied initially: `loadings` equals
        `loadings_no_rot` and `rotation_matrix` is the identity.
        """
        self.model = factor
        self._factor = factor
        self.endog_names = factor.endog_names
        self.fa_method = factor.method
        self.nobs = factor.nobs

        # Attributes that exist only for some estimation methods.
        for optional in ("eigenvals", "mle_retvals"):
            if hasattr(factor, optional):
                setattr(self, optional, getattr(factor, optional))

        self.communality = factor.communality
        self.uniqueness = factor.uniqueness

        unrotated = factor.loadings
        n_vars, n_factors = unrotated.shape
        self.loadings_no_rot = unrotated
        self.n_comp = n_factors
        self.df = ((n_vars - n_factors)**2 - (n_vars + n_factors)) // 2

        # no rotation, overwritten in `rotate`
        self.rotation_method = None
        self.loadings = unrotated
        self.rotation_matrix = np.eye(self.n_comp)


    def __str__(self):
        """Return the text of `summary()`."""
        report = self.summary()
        return report.__str__()


[docs]
    def rotate(self, method):
        """
        Apply rotation, inplace modification of this Results instance

        The rotation always starts from the unrotated loadings, so calling
        this method repeatedly does not compound rotations.  The instance
        is only modified after the rotation has been computed; an unknown
        `method` leaves it unchanged.

        Parameters
        ----------
        method : str
            Rotation to be applied.  Allowed methods are varimax,
            quartimax, biquartimax, equamax, oblimin, parsimax,
            parsimony, biquartimin, promax.

        Returns
        -------
        None : nothing returned, modifications are inplace

        Raises
        ------
        ValueError
            If `method` is not one of the allowed rotations.

        Notes
        -----
        Warning: 'varimax', 'quartimax' and 'oblimin' are verified against R or
        Stata. Some rotation methods such as promax do not produce the same
        results as the R or Stata default functions.

        See Also
        --------
        factor_rotation : subpackage that implements rotation methods
        """
        if method in ('varimax', 'quartimax', 'biquartimax', 'equamax',
                      'parsimax', 'parsimony', 'biquartimin'):
            loadings, T = rotate_factors(self.loadings_no_rot, method)
        elif method == 'oblimin':
            # 'oblimin' is computed with the 'quartimin' criterion.
            loadings, T = rotate_factors(self.loadings_no_rot,
                                         'quartimin')
        elif method == 'promax':
            loadings, T = promax(self.loadings_no_rot)
        else:
            raise ValueError('Unknown rotation method %s' % (method))

        # Commit only after success so that a failed call cannot leave
        # `rotation_method` out of sync with `loadings`.
        self.loadings = loadings
        self.rotation_matrix = T
        self.rotation_method = method


    def _corr_factors(self):
        """correlation of factors implied by rotation

        Computed as ``T' T`` from the current rotation matrix ``T``.  If
        the rotation is oblique, then the factors are correlated.

        currently not cached

        Returns
        -------
        corr_f : ndarray
            correlation matrix of rotated factors, assuming initial factors are
            orthogonal
        """
        rotation = self.rotation_matrix
        return rotation.T.dot(rotation)


[docs]
    def factor_score_params(self, method='bartlett'):
        """
        Compute factor scoring coefficient matrix

        The coefficient matrix is not cached.

        Parameters
        ----------
        method : 'bartlett' or 'regression'
            Method to use for factor scoring.
            'regression' can be abbreviated to `reg`

        Returns
        -------
        coeff_matrix : ndarray
            matrix s to compute factors f from a standardized endog ys.
            ``f = ys dot s``

        Raises
        ------
        ValueError
            If `method` is not recognized.

        Notes
        -----
        The `regression` method follows the Stata definition.
        Method bartlett and regression are verified against Stata.
        Two unofficial methods, 'ols' and 'gls', produce similar factor scores
        but are not verified.

        See Also
        --------
        statsmodels.multivariate.factor.FactorResults.factor_scoring
        """
        L = self.loadings
        #TODO: check row versus column convention for T
        uni = 1 - self.communality #self.uniqueness

        if method == 'bartlett':
            s_mat = np.linalg.inv(L.T.dot(L/(uni[:,None]))).dot(L.T / uni).T
        elif method.startswith('reg'):
            corr = self.model.corr
            corr_f = self._corr_factors()
            # if orthogonal then corr_f is just eye
            s_mat = corr_f.dot(L.T.dot(np.linalg.inv(corr))).T
        elif method == 'ols':
            # not verified
            corr_f = self._corr_factors()
            s_mat = corr_f.dot(np.linalg.pinv(L)).T
        elif method == 'gls':
            # not verified
            corr_f = self._corr_factors()
            s_mat = np.linalg.inv(np.linalg.inv(corr_f) + L.T.dot(L/(uni[:,None])))
            s_mat = s_mat.dot(L.T / uni).T
        else:
            raise ValueError('method not available, use "bartlett" '
                             'or "regression"')
        return s_mat



[docs]
    def factor_scoring(self, endog=None, method='bartlett', transform=True):
        """
        factor scoring: compute factors for endog

        If endog was not provided when creating the factor class, then
        a standardized endog needs to be provided here.

        Parameters
        ----------
        endog : array_like, optional
            Data to score.  If None, the endog stored in the Factor
            instance is used.
        method : 'bartlett' or 'regression'
            Method to use for factor scoring.
            'regression' can be abbreviated to `reg`
        transform : bool
            If transform is true and endog is provided, then it will be
            standardized using mean and scale of original data, which has to
            be available in this case.
            If transform is False, then a provided endog will be used unchanged.
            The original endog in the Factor class will
            always be standardized if endog is None, independently of `transform`.

        Returns
        -------
        factor_score : ndarray
            estimated factors using scoring matrix s and standardized endog ys
            ``f = ys dot s``

        Raises
        ------
        ValueError
            If standardization is required but the Factor instance has no
            endog.

        Notes
        -----
        Status: transform option is experimental and might change.

        See Also
        --------
        statsmodels.multivariate.factor.FactorResults.factor_score_params
        """
        use_as_is = endog is not None and transform is False

        if use_as_is:
            # no transformation in this case
            endog = np.asarray(endog)
        else:
            # we need to standardize with the original mean and scale
            original = self.model.endog
            if original is None:
                raise ValueError('If transform is True, then `endog` needs ' +
                                 'to be available in the Factor instance.')
            m = original.mean(0)
            s = original.std(ddof=1, axis=0)
            endog = original if endog is None else np.asarray(endog)
            endog = (endog - m) / s

        s_mat = self.factor_score_params(method=method)
        return endog.dot(s_mat)



[docs]
    def summary(self):
        """Summary

        Builds a summary with the eigenvalues (when available), the
        communalities, the pre-rotated loadings and, if a rotation has
        been applied, the rotated loadings.
        """
        names = self.endog_names

        def one_row(values):
            # single unlabeled row with one column per variable
            return pd.DataFrame([values], columns=names, index=[''])

        def loadings_table(matrix):
            labels = ["factor %d" % (i) for i in range(matrix.shape[1])]
            return pd.DataFrame(matrix, columns=labels, index=names)

        summ = summary2.Summary()
        summ.add_title('Factor analysis results')

        if hasattr(self, "eigenvals"):
            # eigenvals not available for ML method
            summ.add_dict({'': 'Eigenvalues'})
            summ.add_df(one_row(self.eigenvals))

        summ.add_dict({'': ''})
        summ.add_dict({'': 'Communality'})
        summ.add_df(one_row(self.communality))

        summ.add_dict({'': ''})
        summ.add_dict({'': 'Pre-rotated loadings'})
        summ.add_df(loadings_table(self.loadings_no_rot))
        summ.add_dict({'': ''})

        if self.rotation_method is not None:
            summ.add_dict({'': '%s rotated loadings' % (self.rotation_method)})
            summ.add_df(loadings_table(self.loadings))
        return summ



[docs]
    def get_loadings_frame(self, style='display', sort_=True, threshold=0.3,
                           highlight_max=True, color_max='yellow',
                           decimals=None):
        """get loadings matrix as DataFrame or pandas Styler

        Parameters
        ----------
        style : 'display' (default), 'raw' or 'strings'
            Style to use for display

            * 'raw' returns just a DataFrame of the loadings matrix, no options are
               applied
            * 'display' add sorting and styling as defined by other keywords
            * 'strings' returns a DataFrame with string elements with optional sorting
               and suppressing small loading coefficients.

        sort_ : bool
            If True, then the rows of the DataFrame are sorted by the factor
            on which they load most strongly, and within a factor by the
            size of that loading. Applies if style is either 'display' or
            'strings'
        threshold : float
            If the threshold is larger than zero, then loading coefficients
            smaller in absolute value are either colored white (if style is
            'display') or replaced by an empty string (if style is 'strings').
        highlight_max : bool
            This adds a background color to the largest absolute coefficient
            in each row.
        color_max : html color
            default is 'yellow'. color for background of row maximum
        decimals : None or int
            If None, then pandas default precision applies. Otherwise values are
            rounded to the specified decimals. If style is 'display', then the
            underlying dataframe is not changed. If style is 'strings', then
            values are rounded before conversion to strings.

        Returns
        -------
        loadings : DataFrame or pandas Styler instance
            The return is a pandas Styler instance, if style is 'display' and
            at least one of highlight_max, threshold or decimals is applied.
            Otherwise, the returned loadings is a DataFrame.

        Raises
        ------
        ValueError
            If `style` is not one of 'raw', 'display', 'strings'.

        Examples
        --------
        >>> mod = Factor(df, 3, smc=True)
        >>> res = mod.fit()
        >>> res.get_loadings_frame(style='display', decimals=3, threshold=0.2)

        To get a sorted DataFrame, all styling options need to be turned off:

        >>> df_sorted = res.get_loadings_frame(style='display',
        ...             highlight_max=False, decimals=None, threshold=0)

        Options except for highlighting are available for plain text or Latex
        usage:

        >>> lds = res_u.get_loadings_frame(style='strings', decimals=3,
        ...                                threshold=0.3)
        >>> print(lds.to_latex())
        """
        labels = ["factor %d" % (i) for i in range(self.loadings.shape[1])]
        frame = pd.DataFrame(self.loadings, columns=labels,
                             index=self.endog_names)

        if style not in ['raw', 'display', 'strings']:
            msg = "style has to be one of 'raw', 'display', 'strings'"
            raise ValueError(msg)

        if style == 'raw':
            return frame

        # add sorting and some formatting
        if sort_ is True:
            keyed = frame.copy()
            n_rows = len(keyed)
            # position of the dominant factor for every variable
            dominant = np.abs(keyed.values).argmax(1)
            keyed['high'] = dominant
            keyed['largest'] = np.abs(
                frame.values[np.arange(n_rows), dominant])
            keyed.sort_values(by=['high', 'largest'],
                              ascending=[True, False], inplace=True)
            frame = keyed.drop(['high', 'largest'], axis=1)

        if style == 'strings':
            text = frame
            if decimals is not None:
                text = text.round(decimals)
            text = text.astype(str)
            if threshold > 0:
                # the mask is based on the unrounded values
                text[frame.abs() < threshold] = ''
            return text

        # style == 'display'
        styler = None

        if threshold > 0:
            def fade_small(val):
                """CSS text color: white below `threshold`, black otherwise."""
                color = 'white' if np.abs(val) < threshold else 'black'
                return 'color: %s' % color

            try:
                styler = frame.style.map(fade_small)
            except AttributeError:
                # Deprecated in pandas 2.1
                styler = frame.style.applymap(fade_small)

        if highlight_max is True:
            def mark_row_max(row):
                """CSS background for the largest absolute value in a row."""
                magnitude = np.abs(row)
                hits = magnitude == magnitude.max()
                return ['background-color: '+ color_max if hit else ''
                        for hit in hits]

            if styler is None:
                styler = frame.style
            styler = styler.apply(mark_row_max, axis=1)

        if decimals is not None:
            if styler is None:
                styler = frame.style
            styler.format("{:.%sf}" % decimals)

        return frame if styler is None else styler



[docs]
    def plot_scree(self, ncomp=None):
        """
        Plot of the ordered eigenvalues and variance explained for the loadings

        Parameters
        ----------
        ncomp : int, optional
            Number of loadings to include in the plot.  If None, will
            include the same as the number of maximum possible loadings

        Returns
        -------
        Figure
            Handle to the figure.

        Notes
        -----
        This reads ``self.eigenvals``, which the results instance only has
        when the fitted model provided eigenvalues.
        """
        # Called before the plotting helper is imported; presumably this
        # checks that matplotlib is available -- confirm.
        _import_mpl()
        from .plots import plot_scree
        return plot_scree(self.eigenvals, self.n_comp, ncomp)



[docs]
    def plot_loadings(self, loading_pairs=None, plot_prerotated=False):
        """
        Plot factor loadings in 2-d plots

        Parameters
        ----------
        loading_pairs : None or a list of tuples
            Specify plots. Each tuple (i, j) represent one figure, i and j is
            the loading number for x-axis and y-axis, respectively. If `None`,
            all combinations of the loadings will be plotted.
        plot_prerotated : True or False
            If True, the loadings before rotation applied will be plotted. If
            False, rotated loadings will be plotted.  When no rotation has
            been applied, the pre-rotated loadings are always plotted.

        Returns
        -------
        figs : a list of figure handles
        """
        _import_mpl()
        from .plots import plot_loadings

        # Without a rotation there is nothing but the pre-rotated pattern.
        show_prerotated = plot_prerotated or self.rotation_method is None

        if show_prerotated:
            loadings = self.loadings_no_rot
            title = 'Prerotated Factor Pattern'
        else:
            loadings = self.loadings
            title = '%s Rotated Factor Pattern' % (self.rotation_method)

        # NOTE(review): n_comp is the number of extracted factors on this
        # class; confirm that it is the intended denominator here.
        var_explained = self.eigenvals / self.n_comp * 100

        return plot_loadings(loadings, loading_pairs=loading_pairs,
                             title=title, row_names=self.endog_names,
                             percent_variance=var_explained)


    @cache_readonly
    def fitted_cov(self):
        """
        Returns the fitted covariance matrix.

        Computed from the current `loadings` as ``L L'`` with the
        uniquenesses added to the diagonal.
        """
        load = self.loadings
        cov = np.dot(load, load.T)
        diag = np.arange(cov.shape[0])
        cov[diag, diag] += self.uniqueness
        return cov

    @cache_readonly
    def uniq_stderr(self, kurt=0):
        """
        The standard errors of the uniquenesses.

        Parameters
        ----------
        kurt : float
            Excess kurtosis

        Raises
        ------
        ValueError
            If the model was not fit by maximum likelihood, or if `nobs`
            is not available.

        Notes
        -----
        If excess kurtosis is known, provide as `kurt`.  Standard
        errors are only available if the model was fit using maximum
        likelihood.  If `endog` is not provided, `nobs` must be
        provided to obtain standard errors.

        These are asymptotic standard errors.  See Bai and Li (2012)
        for conditions under which the standard errors are valid.

        The standard errors are only applicable to the original,
        unrotated maximum likelihood solution.
        """
        # NOTE(review): with the cache_readonly decorator this is presumably
        # read as an attribute, in which case `kurt` always takes its
        # default -- confirm.
        fitted_by_ml = self.fa_method.lower() == "ml"
        if not fitted_by_ml:
            msg = "Standard errors only available under ML estimation"
            raise ValueError(msg)

        n_obs = self.nobs
        if n_obs is None:
            msg = "nobs is required to obtain standard errors."
            raise ValueError(msg)

        variance = self.uniqueness**2 * (2 + kurt)
        return np.sqrt(variance / n_obs)

    @cache_readonly
    def load_stderr(self):
        """
        The standard errors of the loadings.

        Every loading of a variable gets the same standard error,
        ``sqrt(uniqueness / nobs)`` of that variable.

        Standard errors are only available if the model was fit using
        maximum likelihood.  If `endog` is not provided, `nobs` must be
        provided to obtain standard errors.

        These are asymptotic standard errors.  See Bai and Li (2012)
        for conditions under which the standard errors are valid.

        The standard errors are only applicable to the original,
        unrotated maximum likelihood solution.

        Raises
        ------
        ValueError
            If the model was not fit by maximum likelihood, or if `nobs`
            is not available.
        """
        fitted_by_ml = self.fa_method.lower() == "ml"
        if not fitted_by_ml:
            msg = "Standard errors only available under ML estimation"
            raise ValueError(msg)

        n_obs = self.nobs
        if n_obs is None:
            msg = "nobs is required to obtain standard errors."
            raise ValueError(msg)

        # Repeat each variable's uniqueness across all factor columns.
        n_factors = self.loadings.shape[1]
        variance = np.outer(self.uniqueness, np.ones(n_factors))
        return np.sqrt(variance / n_obs)
Last update: Feb 19, 2025