# -*- coding: utf-8 -*-
u"""
Beta regression for modeling rates and proportions.
References
----------
GrĂ¼n, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression
in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in
Economics and Statistics, 2011.
Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer?
Maximum-likelihood regression with beta-distributed dependent variables."
Psychological methods 11.1 (2006): 54.
"""
import numpy as np
from scipy.special import gammaln as lgamma
import patsy
import statsmodels.base.wrapper as wrap
import statsmodels.regression.linear_model as lm
from statsmodels.tools.decorators import cache_readonly
from statsmodels.base.model import (
GenericLikelihoodModel, GenericLikelihoodModelResults, _LLRMixin)
from statsmodels.genmod import families
_init_example = """
Beta regression with default of logit-link for exog and log-link
for precision.
>>> mod = BetaModel(endog, exog)
>>> rslt = mod.fit()
>>> print(rslt.summary())
We can also specify a formula and a specific structure and use the
identity-link for precision.
>>> from sm.families.links import identity
>>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
>>> mod = BetaModel.from_formula('iyield ~ C(batch, Treatment(10)) + temp',
... dat, exog_precision=Z,
... link_precision=identity())
In the case of proportion-data, we may think that the precision depends on
the number of measurements. E.g for sequence data, on the number of
sequence reads covering a site:
>>> Z = patsy.dmatrix('~ coverage', df)
>>> formula = 'methylation ~ disease + age + gender + coverage'
>>> mod = BetaModel.from_formula(formula, df, Z)
>>> rslt = mod.fit()
"""
[docs]class BetaModel(GenericLikelihoodModel):
__doc__ = """Beta Regression.
The Model is parameterized by mean and precision. Both can depend on
explanatory variables through link functions.
Parameters
----------
endog : array_like
1d array of endogenous response variable.
exog : array_like
A nobs x k array where `nobs` is the number of observations and `k`
is the number of regressors. An intercept is not included by default
and should be added by the user (models specified using a formula
include an intercept by default). See `statsmodels.tools.add_constant`.
exog_precision : array_like
2d array of variables for the precision.
link : link
Any link in sm.families.links for mean, should have range in
interval [0, 1]. Default is logit-link.
link_precision : link
Any link in sm.families.links for precision, should have
range in positive line. Default is log-link.
**kwds : extra keywords
Keyword options that will be handled by super classes.
Not all general keywords will be supported in this class.
Notes
-----
Status: experimental, new in 0.13.
Core results are verified, but api can change and some extra results
specific to Beta regression are missing.
Examples
--------
{example}
See Also
--------
:ref:`links`
""".format(example=_init_example)
def __init__(self, endog, exog, exog_precision=None,
link=families.links.Logit(),
link_precision=families.links.Log(), **kwds):
etmp = np.array(endog)
assert np.all((0 < etmp) & (etmp < 1))
if exog_precision is None:
extra_names = ['precision']
exog_precision = np.ones((len(endog), 1), dtype='f')
else:
extra_names = ['precision-%s' % zc for zc in
(exog_precision.columns
if hasattr(exog_precision, 'columns')
else range(1, exog_precision.shape[1] + 1))]
kwds['extra_params_names'] = extra_names
super(BetaModel, self).__init__(endog, exog,
exog_precision=exog_precision,
**kwds)
self.link = link
self.link_precision = link_precision
# not needed, handled by super:
# self.exog_precision = exog_precision
# inherited df do not account for precision params
self.nobs = self.endog.shape[0]
self.df_model = self.nparams - 1
self.df_resid = self.nobs - self.nparams
assert len(self.exog_precision) == len(self.endog)
self.hess_type = "oim"
if 'exog_precision' not in self._init_keys:
self._init_keys.extend(['exog_precision'])
self._init_keys.extend(['link', 'link_precision'])
self._null_drop_keys = ['exog_precision']
self.results_class = BetaResults
self.results_class_wrapper = BetaResultsWrapper
[docs] def predict(self, params, exog=None, exog_precision=None, which="mean"):
"""Predict values for mean or precision
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for precision.
which : str
- "mean" : mean, conditional expectation E(endog | exog)
- "precision" : predicted precision
- "linpred" : linear predictor for the mean function
- "linpred_precision" : linear predictor for the precision function
Returns
-------
ndarray, predicted values
"""
k_mean = self.exog.shape[1]
if which in ["mean", "linpred"]:
if exog is None:
exog = self.exog
params_mean = params[:k_mean]
# Zparams = params[k_mean:]
linpred = np.dot(exog, params_mean)
if which == "mean":
mu = self.link.inverse(linpred)
return mu
else:
return linpred
elif which in ["precision", "linpred_precision"]:
if exog_precision is None:
exog_precision = self.exog_precision
params_prec = params[k_mean:]
linpred_prec = np.dot(exog_precision, params_prec)
if which == "precision":
phi = self.link_precision.inverse(linpred_prec)
return phi
else:
return linpred_prec
[docs] def predict_precision(self, params, exog_precision=None):
"""Predict values for precision function for given exog_precision.
Parameters
----------
params : array_like
The model parameters.
exog_precision : array_like
Array of predictor variables for precision.
Returns
-------
Predicted precision.
"""
if exog_precision is None:
exog_precision = self.exog_precision
k_mean = self.exog.shape[1]
params_precision = params[k_mean:]
linpred_prec = np.dot(exog_precision, params_precision)
phi = self.link_precision.inverse(linpred_prec)
return phi
[docs] def predict_var(self, params, exog=None, exog_precision=None):
"""predict values for conditional variance V(endog | exog)
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for precision.
Returns
-------
Predicted conditional variance.
"""
mean = self.predict(params, exog=exog)
precision = self.predict_precision(params,
exog_precision=exog_precision)
var_endog = mean * (1 - mean) / (1 + precision)
return var_endog
[docs] def loglikeobs(self, params):
"""
Loglikelihood for observations of the Beta regressionmodel.
Parameters
----------
params : ndarray
The parameters of the model, coefficients for linear predictors
of the mean and of the precision function.
Returns
-------
loglike : ndarray
The log likelihood for each observation of the model evaluated
at `params`.
"""
return self._llobs(self.endog, self.exog, self.exog_precision, params)
def _llobs(self, endog, exog, exog_precision, params):
"""
Loglikelihood for observations with data arguments.
Parameters
----------
endog : ndarray
1d array of endogenous variable.
exog : ndarray
2d array of explanatory variables.
exog_precision : ndarray
2d array of explanatory variables for precision.
params : ndarray
The parameters of the model, coefficients for linear predictors
of the mean and of the precision function.
Returns
-------
loglike : ndarray
The log likelihood for each observation of the model evaluated
at `params`.
"""
y, X, Z = endog, exog, exog_precision
nz = Z.shape[1]
params_mean = params[:-nz]
params_prec = params[-nz:]
linpred = np.dot(X, params_mean)
linpred_prec = np.dot(Z, params_prec)
mu = self.link.inverse(linpred)
phi = self.link_precision.inverse(linpred_prec)
eps_lb = 1e-200
alpha = np.clip(mu * phi, eps_lb, np.inf)
beta = np.clip((1 - mu) * phi, eps_lb, np.inf)
ll = (lgamma(phi) - lgamma(alpha)
- lgamma(beta)
+ (mu * phi - 1) * np.log(y)
+ (((1 - mu) * phi) - 1) * np.log(1 - y))
return ll
[docs] def score(self, params):
"""
Returns the score vector of the log-likelihood.
http://www.tandfonline.com/doi/pdf/10.1080/00949650903389993
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
Returns
-------
score : ndarray
First derivative of loglikelihood function.
"""
sf = self.score_factor(params)
d1 = np.dot(sf[:, 0], self.exog)
d2 = np.dot(sf[:, 1], self.exog_precision)
return np.concatenate((d1, d2))
def _score_check(self, params):
"""Inherited score with finite differences
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
Returns
-------
score based on numerical derivatives
"""
return super(BetaModel, self).score(params)
[docs] def score_factor(self, params):
"""Derivative of loglikelihood function w.r.t. linear predictors.
This needs to be multiplied with the exog to obtain the score_obs.
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
Returns
-------
score_factor : ndarray, 2-D
A 2d weight vector used in the calculation of the score_obs.
Notes
-----
The score_obs can be obtained from score_factor ``sf`` using
- d1 = sf[:, :1] * exog
- d2 = sf[:, 1:2] * exog_precision
"""
from scipy import special
digamma = special.psi
y, X, Z = self.endog, self.exog, self.exog_precision
nz = Z.shape[1]
Xparams = params[:-nz]
Zparams = params[-nz:]
# NO LINKS
mu = self.link.inverse(np.dot(X, Xparams))
phi = self.link_precision.inverse(np.dot(Z, Zparams))
eps_lb = 1e-200 # lower bound for evaluating digamma, avoids -inf
alpha = np.clip(mu * phi, eps_lb, np.inf)
beta = np.clip((1 - mu) * phi, eps_lb, np.inf)
ystar = np.log(y / (1. - y))
dig_beta = digamma(beta)
mustar = digamma(alpha) - dig_beta
yt = np.log(1 - y)
mut = dig_beta - digamma(phi)
t = 1. / self.link.deriv(mu)
h = 1. / self.link_precision.deriv(phi)
#
sf1 = phi * t * (ystar - mustar)
sf2 = h * (mu * (ystar - mustar) + yt - mut)
return np.column_stack((sf1, sf2))
[docs] def score_hessian_factor(self, params, return_hessian=False,
observed=True):
"""Derivatives of loglikelihood function w.r.t. linear predictors.
This calculates score and hessian factors at the same time, because
there is a large overlap in calculations.
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
return_hessian : bool
If False, then only score_factors are returned
If True, the both score and hessian factors are returned
observed : bool
If True, then the observed Hessian is returned (default).
If False, then the expected information matrix is returned.
Returns
-------
score_factor : ndarray, 2-D
A 2d weight vector used in the calculation of the score_obs.
(-jbb, -jbg, -jgg) : tuple
A tuple with 3 hessian factors, corresponding to the upper
triangle of the Hessian matrix.
TODO: check why there are minus
"""
from scipy import special
digamma = special.psi
y, X, Z = self.endog, self.exog, self.exog_precision
nz = Z.shape[1]
Xparams = params[:-nz]
Zparams = params[-nz:]
# NO LINKS
mu = self.link.inverse(np.dot(X, Xparams))
phi = self.link_precision.inverse(np.dot(Z, Zparams))
# We need to prevent mu = 0 and (1-mu) = 0 in digamma call
eps_lb = 1e-200 # lower bound for evaluating digamma, avoids -inf
alpha = np.clip(mu * phi, eps_lb, np.inf)
beta = np.clip((1 - mu) * phi, eps_lb, np.inf)
ystar = np.log(y / (1. - y))
dig_beta = digamma(beta)
mustar = digamma(alpha) - dig_beta
yt = np.log(1 - y)
mut = dig_beta - digamma(phi)
t = 1. / self.link.deriv(mu)
h = 1. / self.link_precision.deriv(phi)
ymu_star = (ystar - mustar)
sf1 = phi * t * ymu_star
sf2 = h * (mu * ymu_star + yt - mut)
if return_hessian:
trigamma = lambda x: special.polygamma(1, x) # noqa
trig_beta = trigamma(beta)
var_star = trigamma(alpha) + trig_beta
var_t = trig_beta - trigamma(phi)
c = - trig_beta
s = self.link.deriv2(mu)
q = self.link_precision.deriv2(phi)
jbb = (phi * t) * var_star
if observed:
jbb += s * t**2 * ymu_star
jbb *= t * phi
jbg = phi * t * h * (mu * var_star + c)
if observed:
jbg -= ymu_star * t * h
jgg = h**2 * (mu**2 * var_star + 2 * mu * c + var_t)
if observed:
jgg += (mu * ymu_star + yt - mut) * q * h**3 # **3 ?
return np.column_stack((sf1, sf2)), (-jbb, -jbg, -jgg)
else:
return np.column_stack((sf1, sf2))
[docs] def score_obs(self, params):
"""
Score, first derivative of the loglikelihood for each observation.
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
Returns
-------
score_obs : ndarray, 2d
The first derivative of the loglikelihood function evaluated at
params for each observation.
"""
sf = self.score_factor(params)
# elementwise product for each row (observation)
d1 = sf[:, :1] * self.exog
d2 = sf[:, 1:2] * self.exog_precision
return np.column_stack((d1, d2))
[docs] def hessian(self, params, observed=None):
"""Hessian, second derivative of loglikelihood function
Parameters
----------
params : ndarray
Parameter at which Hessian is evaluated.
observed : bool
If True, then the observed Hessian is returned (default).
If False, then the expected information matrix is returned.
Returns
-------
hessian : ndarray
Hessian, i.e. observed information, or expected information matrix.
"""
if self.hess_type == "eim":
observed = False
else:
observed = True
_, hf = self.score_hessian_factor(params, return_hessian=True,
observed=observed)
hf11, hf12, hf22 = hf
# elementwise product for each row (observation)
d11 = (self.exog.T * hf11).dot(self.exog)
d12 = (self.exog.T * hf12).dot(self.exog_precision)
d22 = (self.exog_precision.T * hf22).dot(self.exog_precision)
return np.bmat([[d11, d12], [d12.T, d22]]).A
def _start_params(self, niter=2, return_intermediate=False):
"""find starting values
Parameters
----------
niter : int
Number of iterations of WLS approximation
return_intermediate : bool
If False (default), then only the preliminary parameter estimate
will be returned.
If True, then also the two results instances of the WLS estimate
for mean parameters and for the precision parameters will be
returned.
Returns
-------
sp : ndarray
start parameters for the optimization
res_m2 : results instance (optional)
Results instance for the WLS regression of the mean function.
res_p2 : results instance (optional)
Results instance for the WLS regression of the precision function.
Notes
-----
This calculates a few iteration of weighted least squares. This is not
a full scoring algorithm.
"""
# WLS of the mean equation uses the implied weights (inverse variance),
# WLS for the precision equations uses weights that only take
# account of the link transformation of the precision endog.
from statsmodels.regression.linear_model import OLS, WLS
res_m = OLS(self.link(self.endog), self.exog).fit()
fitted = self.link.inverse(res_m.fittedvalues)
resid = self.endog - fitted
prec_i = fitted * (1 - fitted) / np.maximum(np.abs(resid), 1e-2)**2 - 1
res_p = OLS(self.link_precision(prec_i), self.exog_precision).fit()
prec_fitted = self.link_precision.inverse(res_p.fittedvalues)
# sp = np.concatenate((res_m.params, res_p.params))
for _ in range(niter):
y_var_inv = (1 + prec_fitted) / (fitted * (1 - fitted))
# y_var = fitted * (1 - fitted) / (1 + prec_fitted)
ylink_var_inv = y_var_inv / self.link.deriv(fitted)**2
res_m2 = WLS(self.link(self.endog), self.exog,
weights=ylink_var_inv).fit()
fitted = self.link.inverse(res_m2.fittedvalues)
resid2 = self.endog - fitted
prec_i2 = (fitted * (1 - fitted) /
np.maximum(np.abs(resid2), 1e-2)**2 - 1)
w_p = 1. / self.link_precision.deriv(prec_fitted)**2
res_p2 = WLS(self.link_precision(prec_i2), self.exog_precision,
weights=w_p).fit()
prec_fitted = self.link_precision.inverse(res_p2.fittedvalues)
sp2 = np.concatenate((res_m2.params, res_p2.params))
if return_intermediate:
return sp2, res_m2, res_p2
return sp2
[docs] def fit(self, start_params=None, maxiter=1000, disp=False,
method='bfgs', **kwds):
"""
Fit the model by maximum likelihood.
Parameters
----------
start_params : array-like
A vector of starting values for the regression
coefficients. If None, a default is chosen.
maxiter : integer
The maximum number of iterations
disp : bool
Show convergence stats.
method : str
The optimization method to use.
kwds :
Keyword arguments for the optimizer.
Returns
-------
BetaResults instance.
"""
if start_params is None:
start_params = self._start_params()
# # http://www.ime.usp.br/~sferrari/beta.pdf suggests starting phi
# # on page 8
if "cov_type" in kwds:
# this is a workaround because we cannot tell super to use eim
if kwds["cov_type"].lower() == "eim":
self.hess_type = "eim"
del kwds["cov_type"]
else:
self.hess_type = "oim"
res = super(BetaModel, self).fit(start_params=start_params,
maxiter=maxiter, method=method,
disp=disp, **kwds)
if not isinstance(res, BetaResultsWrapper):
# currently GenericLikelihoodModel doe not add wrapper
res = BetaResultsWrapper(res)
return res
# code duplication with results class
[docs] def get_distribution_params(self, params, exog=None, exog_precision=None):
"""
Return distribution parameters converted from model prediction.
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for mean.
Returns
-------
(alpha, beta) : tuple of ndarrays
Parameters for the scipy distribution to evaluate predictive
distribution.
"""
mean = self.predict(params, exog=exog)
precision = self.predict(params, exog_precision=exog_precision,
which="precision")
return precision * mean, precision * (1 - mean)
[docs] def get_distribution(self, params, exog=None, exog_precision=None):
"""
Return a instance of the predictive distribution.
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for mean.
Returns
-------
Instance of a scipy frozen distribution based on estimated
parameters.
See Also
--------
predict
Notes
-----
This function delegates to the predict method to handle exog and
exog_precision, which in turn makes any required transformations.
Due to the behavior of ``scipy.stats.distributions objects``, the
returned random number generator must be called with ``gen.rvs(n)``
where ``n`` is the number of observations in the data set used
to fit the model. If any other value is used for ``n``, misleading
results will be produced.
"""
from scipy import stats
args = self.get_distribution_params(params, exog=exog,
exog_precision=exog_precision)
distr = stats.beta(*args)
return distr
[docs]class BetaResults(GenericLikelihoodModelResults, _LLRMixin):
"""Results class for Beta regression
This class inherits from GenericLikelihoodModelResults and not all
inherited methods might be appropriate in this case.
"""
# GenericLikeihoodmodel doesn't define fittedvalues, residuals and similar
@cache_readonly
def fittedvalues(self):
"""In-sample predicted mean, conditional expectation."""
return self.model.predict(self.params)
@cache_readonly
def fitted_precision(self):
"""In-sample predicted precision"""
return self.model.predict_precision(self.params)
@cache_readonly
def resid(self):
"""Response residual"""
return self.model.endog - self.fittedvalues
@cache_readonly
def resid_pearson(self):
"""Pearson standardize residual"""
return self.resid / np.sqrt(self.model.predict_var(self.params))
@cache_readonly
def prsquared(self):
"""Cox-Snell Likelihood-Ratio pseudo-R-squared.
1 - exp((llnull - .llf) * (2 / nobs))
"""
return self.pseudo_rsquared(kind="lr")
[docs] def get_distribution_params(self, exog=None, exog_precision=None,
transform=True):
"""
Return distribution parameters converted from model prediction.
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
transform : bool
If transform is True and formulas have been used, then predictor
``exog`` is passed through the formula processing. Default is True.
Returns
-------
(alpha, beta) : tuple of ndarrays
Parameters for the scipy distribution to evaluate predictive
distribution.
"""
mean = self.predict(exog=exog, transform=transform)
precision = self.predict(exog_precision=exog_precision,
which="precision", transform=transform)
return precision * mean, precision * (1 - mean)
[docs] def get_distribution(self, exog=None, exog_precision=None, transform=True):
"""
Return a instance of the predictive distribution.
Parameters
----------
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for mean.
transform : bool
If transform is True and formulas have been used, then predictor
``exog`` is passed through the formula processing. Default is True.
Returns
-------
Instance of a scipy frozen distribution based on estimated
parameters.
See Also
--------
predict
Notes
-----
This function delegates to the predict method to handle exog and
exog_precision, which in turn makes any required transformations.
Due to the behavior of ``scipy.stats.distributions objects``, the
returned random number generator must be called with ``gen.rvs(n)``
where ``n`` is the number of observations in the data set used
to fit the model. If any other value is used for ``n``, misleading
results will be produced.
"""
from scipy import stats
args = self.get_distribution_params(exog=exog,
exog_precision=exog_precision,
transform=transform)
args = (np.asarray(arg) for arg in args)
distr = stats.beta(*args)
return distr
[docs] def bootstrap(self, *args, **kwargs):
raise NotImplementedError
class BetaResultsWrapper(lm.RegressionResultsWrapper):
pass
wrap.populate_wrapper(BetaResultsWrapper,
BetaResults)