Source code for statsmodels.othermod.betareg
"""
Beta regression for modeling rates and proportions.
References
----------
GrĂ¼n, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression
in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in
Economics and Statistics, 2011.
Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer?
Maximum-likelihood regression with beta-distributed dependent variables."
Psychological methods 11.1 (2006): 54.
"""
import numpy as np
from scipy.special import gammaln as lgamma
import patsy
import statsmodels.base.wrapper as wrap
import statsmodels.regression.linear_model as lm
from statsmodels.tools.decorators import cache_readonly
from statsmodels.base.model import (
GenericLikelihoodModel, GenericLikelihoodModelResults, _LLRMixin)
from statsmodels.genmod import families
_init_example = """
Beta regression with default of logit-link for exog and log-link
for precision.
>>> mod = BetaModel(endog, exog)
>>> rslt = mod.fit()
>>> print(rslt.summary())
We can also specify a formula and a specific structure and use the
identity-link for precision.
>>> from sm.families.links import identity
>>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
>>> mod = BetaModel.from_formula('iyield ~ C(batch, Treatment(10)) + temp',
... dat, exog_precision=Z,
... link_precision=identity())
In the case of proportion-data, we may think that the precision depends on
the number of measurements. E.g for sequence data, on the number of
sequence reads covering a site:
>>> Z = patsy.dmatrix('~ coverage', df)
>>> formula = 'methylation ~ disease + age + gender + coverage'
>>> mod = BetaModel.from_formula(formula, df, Z)
>>> rslt = mod.fit()
"""
[docs]
class BetaModel(GenericLikelihoodModel):
__doc__ = """Beta Regression.
The Model is parameterized by mean and precision. Both can depend on
explanatory variables through link functions.
Parameters
----------
endog : array_like
1d array of endogenous response variable.
exog : array_like
A nobs x k array where `nobs` is the number of observations and `k`
is the number of regressors. An intercept is not included by default
and should be added by the user (models specified using a formula
include an intercept by default). See `statsmodels.tools.add_constant`.
exog_precision : array_like
2d array of variables for the precision.
link : link
Any link in sm.families.links for mean, should have range in
interval [0, 1]. Default is logit-link.
link_precision : link
Any link in sm.families.links for precision, should have
range in positive line. Default is log-link.
**kwds : extra keywords
Keyword options that will be handled by super classes.
Not all general keywords will be supported in this class.
Notes
-----
Status: experimental, new in 0.13.
Core results are verified, but api can change and some extra results
specific to Beta regression are missing.
Examples
--------
{example}
See Also
--------
:ref:`links`
""".format(example=_init_example)
def __init__(self, endog, exog, exog_precision=None,
link=families.links.Logit(),
link_precision=families.links.Log(), **kwds):
etmp = np.array(endog)
assert np.all((0 < etmp) & (etmp < 1))
if exog_precision is None:
extra_names = ['precision']
exog_precision = np.ones((len(endog), 1), dtype='f')
else:
extra_names = ['precision-%s' % zc for zc in
(exog_precision.columns
if hasattr(exog_precision, 'columns')
else range(1, exog_precision.shape[1] + 1))]
kwds['extra_params_names'] = extra_names
super().__init__(endog, exog, exog_precision=exog_precision, **kwds)
self.link = link
self.link_precision = link_precision
# not needed, handled by super:
# self.exog_precision = exog_precision
# inherited df do not account for precision params
self.nobs = self.endog.shape[0]
self.k_extra = 1
self.df_model = self.nparams - 2
self.df_resid = self.nobs - self.nparams
assert len(self.exog_precision) == len(self.endog)
self.hess_type = "oim"
if 'exog_precision' not in self._init_keys:
self._init_keys.extend(['exog_precision'])
self._init_keys.extend(['link', 'link_precision'])
self._null_drop_keys = ['exog_precision']
del kwds['extra_params_names']
self._check_kwargs(kwds)
self.results_class = BetaResults
self.results_class_wrapper = BetaResultsWrapper
[docs]
@classmethod
def from_formula(cls, formula, data, exog_precision_formula=None,
*args, **kwargs):
if exog_precision_formula is not None:
if 'subset' in kwargs:
d = data.ix[kwargs['subset']]
Z = patsy.dmatrix(exog_precision_formula, d)
else:
Z = patsy.dmatrix(exog_precision_formula, data)
kwargs['exog_precision'] = Z
return super().from_formula(formula, data, *args, **kwargs)
def _get_exogs(self):
return (self.exog, self.exog_precision)
[docs]
def predict(self, params, exog=None, exog_precision=None, which="mean"):
"""Predict values for mean or precision
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for precision parameter.
which : str
- "mean" : mean, conditional expectation E(endog | exog)
- "precision" : predicted precision
- "linear" : linear predictor for the mean function
- "linear-precision" : linear predictor for the precision parameter
Returns
-------
ndarray, predicted values
"""
# compatibility with old names and misspelling
if which == "linpred":
which = "linear"
if which in ["linpred_precision", "linear_precision"]:
which = "linear-precision"
k_mean = self.exog.shape[1]
if which in ["mean", "linear"]:
if exog is None:
exog = self.exog
params_mean = params[:k_mean]
# Zparams = params[k_mean:]
linpred = np.dot(exog, params_mean)
if which == "mean":
mu = self.link.inverse(linpred)
res = mu
else:
res = linpred
elif which in ["precision", "linear-precision"]:
if exog_precision is None:
exog_precision = self.exog_precision
params_prec = params[k_mean:]
linpred_prec = np.dot(exog_precision, params_prec)
if which == "precision":
phi = self.link_precision.inverse(linpred_prec)
res = phi
else:
res = linpred_prec
elif which == "var":
res = self._predict_var(
params,
exog=exog,
exog_precision=exog_precision
)
else:
raise ValueError('which = %s is not available' % which)
return res
def _predict_precision(self, params, exog_precision=None):
"""Predict values for precision function for given exog_precision.
Parameters
----------
params : array_like
The model parameters.
exog_precision : array_like
Array of predictor variables for precision.
Returns
-------
Predicted precision.
"""
if exog_precision is None:
exog_precision = self.exog_precision
k_mean = self.exog.shape[1]
params_precision = params[k_mean:]
linpred_prec = np.dot(exog_precision, params_precision)
phi = self.link_precision.inverse(linpred_prec)
return phi
def _predict_var(self, params, exog=None, exog_precision=None):
"""predict values for conditional variance V(endog | exog)
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for precision.
Returns
-------
Predicted conditional variance.
"""
mean = self.predict(params, exog=exog)
precision = self._predict_precision(params,
exog_precision=exog_precision)
var_endog = mean * (1 - mean) / (1 + precision)
return var_endog
[docs]
def loglikeobs(self, params):
"""
Loglikelihood for observations of the Beta regressionmodel.
Parameters
----------
params : ndarray
The parameters of the model, coefficients for linear predictors
of the mean and of the precision function.
Returns
-------
loglike : ndarray
The log likelihood for each observation of the model evaluated
at `params`.
"""
return self._llobs(self.endog, self.exog, self.exog_precision, params)
def _llobs(self, endog, exog, exog_precision, params):
"""
Loglikelihood for observations with data arguments.
Parameters
----------
endog : ndarray
1d array of endogenous variable.
exog : ndarray
2d array of explanatory variables.
exog_precision : ndarray
2d array of explanatory variables for precision.
params : ndarray
The parameters of the model, coefficients for linear predictors
of the mean and of the precision function.
Returns
-------
loglike : ndarray
The log likelihood for each observation of the model evaluated
at `params`.
"""
y, X, Z = endog, exog, exog_precision
nz = Z.shape[1]
params_mean = params[:-nz]
params_prec = params[-nz:]
linpred = np.dot(X, params_mean)
linpred_prec = np.dot(Z, params_prec)
mu = self.link.inverse(linpred)
phi = self.link_precision.inverse(linpred_prec)
eps_lb = 1e-200
alpha = np.clip(mu * phi, eps_lb, np.inf)
beta = np.clip((1 - mu) * phi, eps_lb, np.inf)
ll = (lgamma(phi) - lgamma(alpha)
- lgamma(beta)
+ (mu * phi - 1) * np.log(y)
+ (((1 - mu) * phi) - 1) * np.log(1 - y))
return ll
[docs]
def score(self, params):
"""
Returns the score vector of the log-likelihood.
http://www.tandfonline.com/doi/pdf/10.1080/00949650903389993
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
Returns
-------
score : ndarray
First derivative of loglikelihood function.
"""
sf1, sf2 = self.score_factor(params)
d1 = np.dot(sf1, self.exog)
d2 = np.dot(sf2, self.exog_precision)
return np.concatenate((d1, d2))
def _score_check(self, params):
"""Inherited score with finite differences
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
Returns
-------
score based on numerical derivatives
"""
return super().score(params)
[docs]
def score_factor(self, params, endog=None):
"""Derivative of loglikelihood function w.r.t. linear predictors.
This needs to be multiplied with the exog to obtain the score_obs.
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
Returns
-------
score_factor : ndarray, 2-D
A 2d weight vector used in the calculation of the score_obs.
Notes
-----
The score_obs can be obtained from score_factor ``sf`` using
- d1 = sf[:, :1] * exog
- d2 = sf[:, 1:2] * exog_precision
"""
from scipy import special
digamma = special.psi
y = self.endog if endog is None else endog
X, Z = self.exog, self.exog_precision
nz = Z.shape[1]
Xparams = params[:-nz]
Zparams = params[-nz:]
# NO LINKS
mu = self.link.inverse(np.dot(X, Xparams))
phi = self.link_precision.inverse(np.dot(Z, Zparams))
eps_lb = 1e-200 # lower bound for evaluating digamma, avoids -inf
alpha = np.clip(mu * phi, eps_lb, np.inf)
beta = np.clip((1 - mu) * phi, eps_lb, np.inf)
ystar = np.log(y / (1. - y))
dig_beta = digamma(beta)
mustar = digamma(alpha) - dig_beta
yt = np.log(1 - y)
mut = dig_beta - digamma(phi)
t = 1. / self.link.deriv(mu)
h = 1. / self.link_precision.deriv(phi)
#
sf1 = phi * t * (ystar - mustar)
sf2 = h * (mu * (ystar - mustar) + yt - mut)
return (sf1, sf2)
[docs]
def score_hessian_factor(self, params, return_hessian=False,
observed=True):
"""Derivatives of loglikelihood function w.r.t. linear predictors.
This calculates score and hessian factors at the same time, because
there is a large overlap in calculations.
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
return_hessian : bool
If False, then only score_factors are returned
If True, the both score and hessian factors are returned
observed : bool
If True, then the observed Hessian is returned (default).
If False, then the expected information matrix is returned.
Returns
-------
score_factor : ndarray, 2-D
A 2d weight vector used in the calculation of the score_obs.
(-jbb, -jbg, -jgg) : tuple
A tuple with 3 hessian factors, corresponding to the upper
triangle of the Hessian matrix.
TODO: check why there are minus
"""
from scipy import special
digamma = special.psi
y, X, Z = self.endog, self.exog, self.exog_precision
nz = Z.shape[1]
Xparams = params[:-nz]
Zparams = params[-nz:]
# NO LINKS
mu = self.link.inverse(np.dot(X, Xparams))
phi = self.link_precision.inverse(np.dot(Z, Zparams))
# We need to prevent mu = 0 and (1-mu) = 0 in digamma call
eps_lb = 1e-200 # lower bound for evaluating digamma, avoids -inf
alpha = np.clip(mu * phi, eps_lb, np.inf)
beta = np.clip((1 - mu) * phi, eps_lb, np.inf)
ystar = np.log(y / (1. - y))
dig_beta = digamma(beta)
mustar = digamma(alpha) - dig_beta
yt = np.log(1 - y)
mut = dig_beta - digamma(phi)
t = 1. / self.link.deriv(mu)
h = 1. / self.link_precision.deriv(phi)
ymu_star = (ystar - mustar)
sf1 = phi * t * ymu_star
sf2 = h * (mu * ymu_star + yt - mut)
if return_hessian:
trigamma = lambda x: special.polygamma(1, x) # noqa
trig_beta = trigamma(beta)
var_star = trigamma(alpha) + trig_beta
var_t = trig_beta - trigamma(phi)
c = - trig_beta
s = self.link.deriv2(mu)
q = self.link_precision.deriv2(phi)
jbb = (phi * t) * var_star
if observed:
jbb += s * t**2 * ymu_star
jbb *= t * phi
jbg = phi * t * h * (mu * var_star + c)
if observed:
jbg -= ymu_star * t * h
jgg = h**2 * (mu**2 * var_star + 2 * mu * c + var_t)
if observed:
jgg += (mu * ymu_star + yt - mut) * q * h**3 # **3 ?
return (sf1, sf2), (-jbb, -jbg, -jgg)
else:
return (sf1, sf2)
[docs]
def score_obs(self, params):
"""
Score, first derivative of the loglikelihood for each observation.
Parameters
----------
params : ndarray
Parameter at which score is evaluated.
Returns
-------
score_obs : ndarray, 2d
The first derivative of the loglikelihood function evaluated at
params for each observation.
"""
sf1, sf2 = self.score_factor(params)
# elementwise product for each row (observation)
d1 = sf1[:, None] * self.exog
d2 = sf2[:, None] * self.exog_precision
return np.column_stack((d1, d2))
[docs]
def hessian(self, params, observed=None):
"""Hessian, second derivative of loglikelihood function
Parameters
----------
params : ndarray
Parameter at which Hessian is evaluated.
observed : bool
If True, then the observed Hessian is returned (default).
If False, then the expected information matrix is returned.
Returns
-------
hessian : ndarray
Hessian, i.e. observed information, or expected information matrix.
"""
if self.hess_type == "eim":
observed = False
else:
observed = True
_, hf = self.score_hessian_factor(params, return_hessian=True,
observed=observed)
hf11, hf12, hf22 = hf
# elementwise product for each row (observation)
d11 = (self.exog.T * hf11).dot(self.exog)
d12 = (self.exog.T * hf12).dot(self.exog_precision)
d22 = (self.exog_precision.T * hf22).dot(self.exog_precision)
return np.block([[d11, d12], [d12.T, d22]])
[docs]
def hessian_factor(self, params, observed=True):
"""Derivatives of loglikelihood function w.r.t. linear predictors.
"""
_, hf = self.score_hessian_factor(params, return_hessian=True,
observed=observed)
return hf
def _start_params(self, niter=2, return_intermediate=False):
"""find starting values
Parameters
----------
niter : int
Number of iterations of WLS approximation
return_intermediate : bool
If False (default), then only the preliminary parameter estimate
will be returned.
If True, then also the two results instances of the WLS estimate
for mean parameters and for the precision parameters will be
returned.
Returns
-------
sp : ndarray
start parameters for the optimization
res_m2 : results instance (optional)
Results instance for the WLS regression of the mean function.
res_p2 : results instance (optional)
Results instance for the WLS regression of the precision function.
Notes
-----
This calculates a few iteration of weighted least squares. This is not
a full scoring algorithm.
"""
# WLS of the mean equation uses the implied weights (inverse variance),
# WLS for the precision equations uses weights that only take
# account of the link transformation of the precision endog.
from statsmodels.regression.linear_model import OLS, WLS
res_m = OLS(self.link(self.endog), self.exog).fit()
fitted = self.link.inverse(res_m.fittedvalues)
resid = self.endog - fitted
prec_i = fitted * (1 - fitted) / np.maximum(np.abs(resid), 1e-2)**2 - 1
res_p = OLS(self.link_precision(prec_i), self.exog_precision).fit()
prec_fitted = self.link_precision.inverse(res_p.fittedvalues)
# sp = np.concatenate((res_m.params, res_p.params))
for _ in range(niter):
y_var_inv = (1 + prec_fitted) / (fitted * (1 - fitted))
# y_var = fitted * (1 - fitted) / (1 + prec_fitted)
ylink_var_inv = y_var_inv / self.link.deriv(fitted)**2
res_m2 = WLS(self.link(self.endog), self.exog,
weights=ylink_var_inv).fit()
fitted = self.link.inverse(res_m2.fittedvalues)
resid2 = self.endog - fitted
prec_i2 = (fitted * (1 - fitted) /
np.maximum(np.abs(resid2), 1e-2)**2 - 1)
w_p = 1. / self.link_precision.deriv(prec_fitted)**2
res_p2 = WLS(self.link_precision(prec_i2), self.exog_precision,
weights=w_p).fit()
prec_fitted = self.link_precision.inverse(res_p2.fittedvalues)
sp2 = np.concatenate((res_m2.params, res_p2.params))
if return_intermediate:
return sp2, res_m2, res_p2
return sp2
[docs]
def fit(self, start_params=None, maxiter=1000, disp=False,
method='bfgs', **kwds):
"""
Fit the model by maximum likelihood.
Parameters
----------
start_params : array-like
A vector of starting values for the regression
coefficients. If None, a default is chosen.
maxiter : integer
The maximum number of iterations
disp : bool
Show convergence stats.
method : str
The optimization method to use.
kwds :
Keyword arguments for the optimizer.
Returns
-------
BetaResults instance.
"""
if start_params is None:
start_params = self._start_params()
# # http://www.ime.usp.br/~sferrari/beta.pdf suggests starting phi
# # on page 8
if "cov_type" in kwds:
# this is a workaround because we cannot tell super to use eim
if kwds["cov_type"].lower() == "eim":
self.hess_type = "eim"
del kwds["cov_type"]
else:
self.hess_type = "oim"
res = super().fit(start_params=start_params,
maxiter=maxiter, method=method,
disp=disp, **kwds)
if not isinstance(res, BetaResultsWrapper):
# currently GenericLikelihoodModel doe not add wrapper
res = BetaResultsWrapper(res)
return res
def _deriv_mean_dparams(self, params):
"""
Derivative of the expected endog with respect to the parameters.
not verified yet
Parameters
----------
params : ndarray
parameter at which score is evaluated
Returns
-------
The value of the derivative of the expected endog with respect
to the parameter vector.
"""
link = self.link
lin_pred = self.predict(params, which="linear")
idl = link.inverse_deriv(lin_pred)
dmat = self.exog * idl[:, None]
return np.column_stack((dmat, np.zeros(self.exog_precision.shape)))
def _deriv_score_obs_dendog(self, params):
"""derivative of score_obs w.r.t. endog
Parameters
----------
params : ndarray
parameter at which score is evaluated
Returns
-------
derivative : ndarray_2d
The derivative of the score_obs with respect to endog.
"""
from statsmodels.tools.numdiff import _approx_fprime_cs_scalar
def f(y):
if y.ndim == 2 and y.shape[1] == 1:
y = y[:, 0]
sf = self.score_factor(params, endog=y)
return np.column_stack(sf)
dsf = _approx_fprime_cs_scalar(self.endog[:, None], f)
# deriv is 2d vector
d1 = dsf[:, :1] * self.exog
d2 = dsf[:, 1:2] * self.exog_precision
return np.column_stack((d1, d2))
[docs]
def get_distribution_params(self, params, exog=None, exog_precision=None):
"""
Return distribution parameters converted from model prediction.
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for mean.
Returns
-------
(alpha, beta) : tuple of ndarrays
Parameters for the scipy distribution to evaluate predictive
distribution.
"""
mean = self.predict(params, exog=exog)
precision = self.predict(params, exog_precision=exog_precision,
which="precision")
return precision * mean, precision * (1 - mean)
[docs]
def get_distribution(self, params, exog=None, exog_precision=None):
"""
Return a instance of the predictive distribution.
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for mean.
Returns
-------
Instance of a scipy frozen distribution based on estimated
parameters.
See Also
--------
predict
Notes
-----
This function delegates to the predict method to handle exog and
exog_precision, which in turn makes any required transformations.
Due to the behavior of ``scipy.stats.distributions objects``, the
returned random number generator must be called with ``gen.rvs(n)``
where ``n`` is the number of observations in the data set used
to fit the model. If any other value is used for ``n``, misleading
results will be produced.
"""
from scipy import stats
args = self.get_distribution_params(params, exog=exog,
exog_precision=exog_precision)
distr = stats.beta(*args)
return distr
[docs]
class BetaResults(GenericLikelihoodModelResults, _LLRMixin):
"""Results class for Beta regression
This class inherits from GenericLikelihoodModelResults and not all
inherited methods might be appropriate in this case.
"""
# GenericLikeihoodmodel doesn't define fittedvalues, residuals and similar
@cache_readonly
def fittedvalues(self):
"""In-sample predicted mean, conditional expectation."""
return self.model.predict(self.params)
@cache_readonly
def fitted_precision(self):
"""In-sample predicted precision"""
return self.model.predict(self.params, which="precision")
@cache_readonly
def resid(self):
"""Response residual"""
return self.model.endog - self.fittedvalues
@cache_readonly
def resid_pearson(self):
"""Pearson standardize residual"""
std = np.sqrt(self.model.predict(self.params, which="var"))
return self.resid / std
@cache_readonly
def prsquared(self):
"""Cox-Snell Likelihood-Ratio pseudo-R-squared.
1 - exp((llnull - .llf) * (2 / nobs))
"""
return self.pseudo_rsquared(kind="lr")
[docs]
def get_distribution_params(self, exog=None, exog_precision=None,
transform=True):
"""
Return distribution parameters converted from model prediction.
Parameters
----------
params : array_like
The model parameters.
exog : array_like
Array of predictor variables for mean.
transform : bool
If transform is True and formulas have been used, then predictor
``exog`` is passed through the formula processing. Default is True.
Returns
-------
(alpha, beta) : tuple of ndarrays
Parameters for the scipy distribution to evaluate predictive
distribution.
"""
mean = self.predict(exog=exog, transform=transform)
precision = self.predict(exog_precision=exog_precision,
which="precision", transform=transform)
return precision * mean, precision * (1 - mean)
[docs]
def get_distribution(self, exog=None, exog_precision=None, transform=True):
"""
Return a instance of the predictive distribution.
Parameters
----------
exog : array_like
Array of predictor variables for mean.
exog_precision : array_like
Array of predictor variables for mean.
transform : bool
If transform is True and formulas have been used, then predictor
``exog`` is passed through the formula processing. Default is True.
Returns
-------
Instance of a scipy frozen distribution based on estimated
parameters.
See Also
--------
predict
Notes
-----
This function delegates to the predict method to handle exog and
exog_precision, which in turn makes any required transformations.
Due to the behavior of ``scipy.stats.distributions objects``, the
returned random number generator must be called with ``gen.rvs(n)``
where ``n`` is the number of observations in the data set used
to fit the model. If any other value is used for ``n``, misleading
results will be produced.
"""
from scipy import stats
args = self.get_distribution_params(exog=exog,
exog_precision=exog_precision,
transform=transform)
args = (np.asarray(arg) for arg in args)
distr = stats.beta(*args)
return distr
[docs]
def get_influence(self):
"""
Get an instance of MLEInfluence with influence and outlier measures
Returns
-------
infl : MLEInfluence instance
The instance has methods to calculate the main influence and
outlier measures as attributes.
See Also
--------
statsmodels.stats.outliers_influence.MLEInfluence
Notes
-----
Support for mutli-link and multi-exog models is still experimental
in MLEInfluence. Interface and some definitions might still change.
Note: Difference to R betareg: Betareg has the same general leverage
as this model. However, they use a linear approximation hat matrix
to scale and studentize influence and residual statistics.
MLEInfluence uses the generalized leverage as hat_matrix_diag.
Additionally, MLEInfluence uses pearson residuals for residual
analusis.
References
----------
todo
"""
from statsmodels.stats.outliers_influence import MLEInfluence
return MLEInfluence(self)
class BetaResultsWrapper(lm.RegressionResultsWrapper):
pass
wrap.populate_wrapper(BetaResultsWrapper,
BetaResults)
Last update:
Oct 03, 2024