Source code for statsmodels.regression.quantile_regression

#!/usr/bin/env python

'''
Quantile regression model

Model parameters are estimated using iterated reweighted least squares. The
asymptotic covariance matrix estimated using kernel density estimation.

Author: Vincent Arel-Bundock
License: BSD-3
Created: 2013-03-19

The original IRLS function was written for Matlab by Shapour Mohammadi,
University of Tehran, 2008 (shmohammadi@gmail.com), with some lines based on
code written by James P. Lesage in Applied Econometrics Using MATLAB(1999).PP.
73-4.  Translated to python with permission from original author by Christian
Prinoth (christian at prinoth dot name).
'''

import numpy as np
import warnings
import scipy.stats as stats
from numpy.linalg import pinv
from scipy.stats import norm
from statsmodels.tools.decorators import cache_readonly
from statsmodels.regression.linear_model import (RegressionModel,
                                                 RegressionResults,
                                                 RegressionResultsWrapper)
from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
                                             IterationLimitWarning)

[docs]class QuantReg(RegressionModel): '''Quantile Regression Estimate a quantile regression model using iterative reweighted least squares. Parameters ---------- endog : array or dataframe endogenous/response variable exog : array or dataframe exogenous/explanatory variable(s) Notes ----- The Least Absolute Deviation (LAD) estimator is a special case where quantile is set to 0.5 (q argument of the fit method). The asymptotic covariance matrix is estimated following the procedure in Greene (2008, p.407-408), using either the logistic or gaussian kernels (kernel argument of the fit method). References ---------- General: * Birkes, D. and Y. Dodge(1993). Alternative Methods of Regression, John Wiley and Sons. * Green,W. H. (2008). Econometric Analysis. Sixth Edition. International Student Edition. * Koenker, R. (2005). Quantile Regression. New York: Cambridge University Press. * LeSage, J. P.(1999). Applied Econometrics Using MATLAB, Kernels (used by the fit method): * Green (2008) Table 14.2 Bandwidth selection (used by the fit method): * Bofinger, E. (1975). Estimation of a density function using order statistics. Australian Journal of Statistics 17: 1-17. * Chamberlain, G. (1994). Quantile regression, censoring, and the structure of wages. In Advances in Econometrics, Vol. 1: Sixth World Congress, ed. C. A. Sims, 171-209. Cambridge: Cambridge University Press. * Hall, P., and S. Sheather. (1988). On the distribution of the Studentized quantile. Journal of the Royal Statistical Society, Series B 50: 381-391. Keywords: Least Absolute Deviation(LAD) Regression, Quantile Regression, Regression, Robust Estimation. ''' def __init__(self, endog, exog, **kwargs): super(QuantReg, self).__init__(endog, exog, **kwargs)
[docs] def whiten(self, data): """ QuantReg model whitener does nothing: returns data. """ return data
[docs] def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-6, **kwargs): """ Solve by Iterative Weighted Least Squares Parameters ---------- q : float Quantile must be strictly between 0 and 1 vcov : str, method used to calculate the variance-covariance matrix of the parameters. Default is ``robust``: - robust : heteroskedasticity robust standard errors (as suggested in Greene 6th edition) - iid : iid errors (as in Stata 12) kernel : str, kernel to use in the kernel density estimation for the asymptotic covariance matrix: - epa: Epanechnikov - cos: Cosine - gau: Gaussian - par: Parzene bandwidth : str, Bandwidth selection method in kernel density estimation for asymptotic covariance estimate (full references in QuantReg docstring): - hsheather: Hall-Sheather (1988) - bofinger: Bofinger (1975) - chamberlain: Chamberlain (1994) """ if q <= 0 or q >= 1: raise Exception('q must be strictly between 0 and 1') kern_names = ['biw', 'cos', 'epa', 'gau', 'par'] if kernel not in kern_names: raise Exception("kernel must be one of " + ', '.join(kern_names)) else: kernel = kernels[kernel] if bandwidth == 'hsheather': bandwidth = hall_sheather elif bandwidth == 'bofinger': bandwidth = bofinger elif bandwidth == 'chamberlain': bandwidth = chamberlain else: raise Exception("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'") endog = self.endog exog = self.exog nobs = self.nobs exog_rank = np.linalg.matrix_rank(self.exog) self.rank = exog_rank self.df_model = float(self.rank - self.k_constant) self.df_resid = self.nobs - self.rank n_iter = 0 xstar = exog beta = np.ones(exog_rank) # TODO: better start, initial beta is used only for convergence check # Note the following does not work yet, # the iteration loop always starts with OLS as initial beta # if start_params is not None: # if len(start_params) != rank: # raise ValueError('start_params has wrong length') # beta = start_params # else: # # start with OLS # beta = np.dot(np.linalg.pinv(exog), endog) diff = 10 cycle = False history = dict(params = [], mse=[]) while n_iter < max_iter and diff > p_tol and not cycle: n_iter += 1 beta0 = beta xtx = np.dot(xstar.T, exog) xty = np.dot(xstar.T, endog) beta = np.dot(pinv(xtx), xty) resid = endog - np.dot(exog, beta) mask = np.abs(resid) < .000001 resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001 resid = np.where(resid < 0, q * resid, (1-q) * resid) resid = np.abs(resid) xstar = exog / resid[:, np.newaxis] diff = np.max(np.abs(beta - beta0)) history['params'].append(beta) history['mse'].append(np.mean(resid*resid)) if (n_iter >= 300) and (n_iter % 100 == 0): # check for convergence circle, should not happen for ii in range(2, 10): if np.all(beta == history['params'][-ii]): cycle = True warnings.warn("Convergence cycle detected", ConvergenceWarning) break if n_iter == max_iter: warnings.warn("Maximum number of iterations (" + str(max_iter) + ") reached.", IterationLimitWarning) e = endog - np.dot(exog, beta) # Greene (2008, p.407) writes that Stata 6 uses this bandwidth: # h = 0.9 * np.std(e) / (nobs**0.2) # Instead, we calculate bandwidth as in Stata 12 iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25) h = bandwidth(nobs, q) h = min(np.std(endog), iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h)) fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h)) if vcov == 'robust': d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2) xtxi = pinv(np.dot(exog.T, exog)) xtdx = np.dot(exog.T * d[np.newaxis, :], exog) vcov = xtxi @ xtdx @ xtxi elif vcov == 'iid': vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog)) else: raise Exception("vcov must be 'robust' or 'iid'") lfit = QuantRegResults(self, beta, normalized_cov_params=vcov) lfit.q = q lfit.iterations = n_iter lfit.sparsity = 1. / fhat0 lfit.bandwidth = h lfit.history = history return RegressionResultsWrapper(lfit)
def _parzen(u): z = np.where(np.abs(u) <= .5, 4./3 - 8. * u**2 + 8. * np.abs(u)**3, 8. * (1 - np.abs(u))**3 / 3.) z[np.abs(u) > 1] = 0 return z kernels = {} kernels['biw'] = lambda u: 15. / 16 * (1 - u**2)**2 * np.where(np.abs(u) <= 1, 1, 0) kernels['cos'] = lambda u: np.where(np.abs(u) <= .5, 1 + np.cos(2 * np.pi * u), 0) kernels['epa'] = lambda u: 3. / 4 * (1-u**2) * np.where(np.abs(u) <= 1, 1, 0) kernels['gau'] = lambda u: norm.pdf(u) kernels['par'] = _parzen #kernels['bet'] = lambda u: np.where(np.abs(u) <= 1, .75 * (1 - u) * (1 + u), 0) #kernels['log'] = lambda u: logistic.pdf(u) * (1 - logistic.pdf(u)) #kernels['tri'] = lambda u: np.where(np.abs(u) <= 1, 1 - np.abs(u), 0) #kernels['trw'] = lambda u: 35. / 32 * (1 - u**2)**3 * np.where(np.abs(u) <= 1, 1, 0) #kernels['uni'] = lambda u: 1. / 2 * np.where(np.abs(u) <= 1, 1, 0) def hall_sheather(n, q, alpha=.05): z = norm.ppf(q) num = 1.5 * norm.pdf(z)**2. den = 2. * z**2. + 1. h = n**(-1. / 3) * norm.ppf(1. - alpha / 2.)**(2./3) * (num / den)**(1./3) return h def bofinger(n, q): num = 9. / 2 * norm.pdf(2 * norm.ppf(q))**4 den = (2 * norm.ppf(q)**2 + 1)**2 h = n**(-1. / 5) * (num / den)**(1. / 5) return h def chamberlain(n, q, alpha=.05): return norm.ppf(1 - alpha / 2) * np.sqrt(q*(1 - q) / n)
[docs]class QuantRegResults(RegressionResults): '''Results instance for the QuantReg model''' @cache_readonly def prsquared(self): q = self.q endog = self.model.endog e = self.resid e = np.where(e < 0, (1 - q) * e, q * e) e = np.abs(e) ered = endog - stats.scoreatpercentile(endog, q * 100) ered = np.where(ered < 0, (1 - q) * ered, q * ered) ered = np.abs(ered) return 1 - np.sum(e) / np.sum(ered) #@cache_readonly def scale(self): return 1. @cache_readonly def bic(self): return np.nan @cache_readonly def aic(self): return np.nan @cache_readonly def llf(self): return np.nan @cache_readonly def rsquared(self): return np.nan @cache_readonly def rsquared_adj(self): return np.nan @cache_readonly def mse(self): return np.nan @cache_readonly def mse_model(self): return np.nan @cache_readonly def mse_total(self): return np.nan @cache_readonly def centered_tss(self): return np.nan @cache_readonly def uncentered_tss(self): return np.nan @cache_readonly def HC0_se(self): raise NotImplementedError @cache_readonly def HC1_se(self): raise NotImplementedError @cache_readonly def HC2_se(self): raise NotImplementedError @cache_readonly def HC3_se(self): raise NotImplementedError def summary(self, yname=None, xname=None, title=None, alpha=.05): """Summarize the Regression Results Parameters ---------- yname : str, optional Default is `y` xname : list[str], optional Names for the exogenous variables. Default is `var_##` for ## in the number of regressors. Must match the number of parameters in the model title : str, optional Title for the top table. If not None, then this replaces the default title alpha : float significance level for the confidence intervals Returns ------- smry : Summary instance this holds the summary tables and text, which can be printed or converted to various output formats. See Also -------- statsmodels.iolib.summary.Summary : class to hold summary results """ eigvals = self.eigenvals condno = self.condition_number top_left = [('Dep. Variable:', None), ('Model:', None), ('Method:', ['Least Squares']), ('Date:', None), ('Time:', None) ] top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]), ('Bandwidth:', ["%#8.4g" % self.bandwidth]), ('Sparsity:', ["%#8.4g" % self.sparsity]), ('No. Observations:', None), ('Df Residuals:', None), ('Df Model:', None) ] if title is None: title = self.model.__class__.__name__ + ' ' + "Regression Results" # create summary table instance from statsmodels.iolib.summary import Summary smry = Summary() smry.add_table_2cols(self, gleft=top_left, gright=top_right, yname=yname, xname=xname, title=title) smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha, use_t=self.use_t) # add warnings/notes, added to text format only etext = [] if eigvals[-1] < 1e-10: wstr = "The smallest eigenvalue is %6.3g. This might indicate " wstr += "that there are\n" wstr += "strong multicollinearity problems or that the design " wstr += "matrix is singular." wstr = wstr % eigvals[-1] etext.append(wstr) elif condno > 1000: # TODO: what is recommended wstr = "The condition number is large, %6.3g. This might " wstr += "indicate that there are\n" wstr += "strong multicollinearity or other numerical " wstr += "problems." wstr = wstr % condno etext.append(wstr) if etext: smry.add_extra_txt(etext) return smry