from statsmodels.compat.python import lzip
import datetime
from functools import reduce
import re
import textwrap
import numpy as np
import pandas as pd
from .table import SimpleTable
from .tableformatting import fmt_latex, fmt_txt
[docs]class Summary(object):
def __init__(self):
self.tables = []
self.settings = []
self.extra_txt = []
self.title = None
self._merge_latex = False
def __str__(self):
return self.as_text()
def __repr__(self):
return str(type(self)) + '\n"""\n' + self.__str__() + '\n"""'
def _repr_html_(self):
"""Display as HTML in IPython notebook."""
return self.as_html()
[docs] def add_df(self, df, index=True, header=True, float_format='%.4f',
align='r'):
"""
Add the contents of a DataFrame to summary table
Parameters
----------
df : DataFrame
header : bool
Reproduce the DataFrame column labels in summary table
index : bool
Reproduce the DataFrame row labels in summary table
float_format : str
Formatting to float data columns
align : str
Data alignment (l/c/r)
"""
settings = {'index': index, 'header': header,
'float_format': float_format, 'align': align}
self.tables.append(df)
self.settings.append(settings)
[docs] def add_array(self, array, align='r', float_format="%.4f"):
"""Add the contents of a Numpy array to summary table
Parameters
----------
array : numpy array (2D)
float_format : str
Formatting to array if type is float
align : str
Data alignment (l/c/r)
"""
table = pd.DataFrame(array)
self.add_df(table, index=False, header=False,
float_format=float_format, align=align)
[docs] def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
"""Add the contents of a Dict to summary table
Parameters
----------
d : dict
Keys and values are automatically coerced to strings with str().
Users are encouraged to format them before using add_dict.
ncols : int
Number of columns of the output table
align : str
Data alignment (l/c/r)
float_format : str
Formatting to float data columns
"""
keys = [_formatter(x, float_format) for x in d.keys()]
vals = [_formatter(x, float_format) for x in d.values()]
data = np.array(lzip(keys, vals))
if data.shape[0] % ncols != 0:
pad = ncols - (data.shape[0] % ncols)
data = np.vstack([data, np.array(pad * [['', '']])])
data = np.split(data, ncols)
data = reduce(lambda x, y: np.hstack([x, y]), data)
self.add_array(data, align=align)
[docs] def add_text(self, string):
"""Append a note to the bottom of the summary table. In ASCII tables,
the note will be wrapped to table width. Notes are not indendented.
"""
self.extra_txt.append(string)
[docs] def add_title(self, title=None, results=None):
"""Insert a title on top of the summary table. If a string is provided
in the title argument, that string is printed. If no title string is
provided but a results instance is provided, statsmodels attempts
to construct a useful title automatically.
"""
if isinstance(title, str):
self.title = title
else:
if results is not None:
model = results.model.__class__.__name__
if model in _model_types:
model = _model_types[model]
self.title = 'Results: ' + model
else:
self.title = ''
[docs] def add_base(self, results, alpha=0.05, float_format="%.4f", title=None,
xname=None, yname=None):
"""Try to construct a basic summary instance.
Parameters
----------
results : Model results instance
alpha : float
significance level for the confidence intervals (optional)
float_format: str
Float formatting for summary of parameters (optional)
title : str
Title of the summary table (optional)
xname : list[str] of length equal to the number of parameters
Names of the independent variables (optional)
yname : str
Name of the dependent variable (optional)
"""
param = summary_params(results, alpha=alpha, use_t=results.use_t)
info = summary_model(results)
if xname is not None:
param.index = xname
if yname is not None:
info['Dependent Variable:'] = yname
self.add_dict(info, align='l')
self.add_df(param, float_format=float_format)
self.add_title(title=title, results=results)
[docs] def as_text(self):
"""Generate ASCII Summary Table
"""
tables = self.tables
settings = self.settings
title = self.title
extra_txt = self.extra_txt
pad_col, pad_index, widest = _measure_tables(tables, settings)
rule_equal = widest * '='
simple_tables = _simple_tables(tables, settings, pad_col, pad_index)
tab = [x.as_text() for x in simple_tables]
tab = '\n'.join(tab)
tab = tab.split('\n')
tab[0] = rule_equal
tab.append(rule_equal)
tab = '\n'.join(tab)
if title is not None:
title = title
if len(title) < widest:
title = ' ' * int(widest / 2 - len(title) / 2) + title
else:
title = ''
txt = [textwrap.wrap(x, widest) for x in extra_txt]
txt = ['\n'.join(x) for x in txt]
txt = '\n'.join(txt)
out = '\n'.join([title, tab, txt])
return out
[docs] def as_html(self):
"""Generate HTML Summary Table
"""
tables = self.tables
settings = self.settings
simple_tables = _simple_tables(tables, settings)
tab = [x.as_html() for x in simple_tables]
tab = '\n'.join(tab)
return tab
[docs] def as_latex(self, label=''):
"""Generate LaTeX Summary Table
Parameters
----------
label : str
Label of the summary table that can be referenced
in a latex document (optional)
"""
tables = self.tables
settings = self.settings
title = self.title
if title is not None:
title = '\\caption{' + title + '}'
else:
title = '\\caption{}'
label = '\\label{' + label + '}'
simple_tables = _simple_tables(tables, settings)
tab = [x.as_latex_tabular() for x in simple_tables]
tab = '\n\n'.join(tab)
to_replace = ('\\\\hline\\n\\\\hline\\n\\\\'
'end{tabular}\\n\\\\begin{tabular}{.*}\\n')
if self._merge_latex:
# create single tabular object for summary_col
tab = re.sub(to_replace, r'\\midrule\n', tab)
out = '\\begin{table}', title, label, tab, '\\end{table}'
out = '\n'.join(out)
return out
def _measure_tables(tables, settings):
"""Compare width of ascii tables in a list and calculate padding values.
We add space to each col_sep to get us as close as possible to the
width of the largest table. Then, we add a few spaces to the first
column to pad the rest.
"""
simple_tables = _simple_tables(tables, settings)
tab = [x.as_text() for x in simple_tables]
length = [len(x.splitlines()[0]) for x in tab]
len_max = max(length)
pad_sep = []
pad_index = []
for i in range(len(tab)):
nsep = max(tables[i].shape[1] - 1, 1)
pad = int((len_max - length[i]) / nsep)
pad_sep.append(pad)
len_new = length[i] + nsep * pad
pad_index.append(len_max - len_new)
return pad_sep, pad_index, max(length)
# Useful stuff # TODO: be more specific
_model_types = {'OLS': 'Ordinary least squares',
'GLS': 'Generalized least squares',
'GLSAR': 'Generalized least squares with AR(p)',
'WLS': 'Weighted least squares',
'RLM': 'Robust linear model',
'NBin': 'Negative binomial model',
'GLM': 'Generalized linear model'
}
def summary_model(results):
"""
Create a dict with information about the model
"""
def time_now(*args, **kwds):
now = datetime.datetime.now()
return now.strftime('%Y-%m-%d %H:%M')
info = {}
info['Model:'] = lambda x: x.model.__class__.__name__
info['Model Family:'] = lambda x: x.family.__class.__name__
info['Link Function:'] = lambda x: x.family.link.__class__.__name__
info['Dependent Variable:'] = lambda x: x.model.endog_names
info['Date:'] = time_now
info['No. Observations:'] = lambda x: "%#6d" % x.nobs
info['Df Model:'] = lambda x: "%#6d" % x.df_model
info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
info['Converged:'] = lambda x: x.mle_retvals['converged']
info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
info['Method:'] = lambda x: x.method
info['Norm:'] = lambda x: x.fit_options['norm']
info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
info['Cov. Type:'] = lambda x: x.fit_options['cov']
rsquared_type = '' if results.k_constant else ' (uncentered)'
info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared
info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj # noqa:E501
info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
info['AIC:'] = lambda x: "%8.4f" % x.aic
info['BIC:'] = lambda x: "%8.4f" % x.bic
info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
info['Scale:'] = lambda x: "%#8.5g" % x.scale
out = {}
for key, func in info.items():
try:
out[key] = func(results)
except (AttributeError, KeyError, NotImplementedError):
# NOTE: some models do not have loglike defined (RLM),
# so raise NotImplementedError
pass
return out
def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True,
skip_header=False, float_format="%.4f"):
"""create a summary table of parameters from results instance
Parameters
----------
res : results instance
some required information is directly taken from the result
instance
yname : {str, None}
optional name for the endogenous variable, default is "y"
xname : {list[str], None}
optional names for the exogenous variables, default is "var_xx"
alpha : float
significance level for the confidence intervals
use_t : bool
indicator whether the p-values are based on the Student-t
distribution (if True) or on the normal distribution (if False)
skip_header : bool
If false (default), then the header row is added. If true, then no
header row is added.
float_format : str
float formatting options (e.g. ".3g")
Returns
-------
params_table : SimpleTable instance
"""
if isinstance(results, tuple):
results, params, bse, tvalues, pvalues, conf_int = results
else:
params = results.params
bse = results.bse
tvalues = results.tvalues
pvalues = results.pvalues
conf_int = results.conf_int(alpha)
data = np.array([params, bse, tvalues, pvalues]).T
data = np.hstack([data, conf_int])
data = pd.DataFrame(data)
if use_t:
data.columns = ['Coef.', 'Std.Err.', 't', 'P>|t|',
'[' + str(alpha / 2), str(1 - alpha / 2) + ']']
else:
data.columns = ['Coef.', 'Std.Err.', 'z', 'P>|z|',
'[' + str(alpha / 2), str(1 - alpha / 2) + ']']
if not xname:
try:
data.index = results.model.data.param_names
except AttributeError:
data.index = results.model.exog_names
else:
data.index = xname
return data
# Vertical summary instance for multiple models
def _col_params(result, float_format='%.4f', stars=True):
"""Stack coefficients and standard errors in single column
"""
# Extract parameters
res = summary_params(result)
# Format float
for col in res.columns[:2]:
res[col] = res[col].apply(lambda x: float_format % x)
# Std.Errors in parentheses
res.iloc[:, 1] = '(' + res.iloc[:, 1] + ')'
# Significance stars
if stars:
idx = res.iloc[:, 3] < .1
res.loc[idx, res.columns[0]] = res.loc[idx, res.columns[0]] + '*'
idx = res.iloc[:, 3] < .05
res.loc[idx, res.columns[0]] = res.loc[idx, res.columns[0]] + '*'
idx = res.iloc[:, 3] < .01
res.loc[idx, res.columns[0]] = res.loc[idx, res.columns[0]] + '*'
# Stack Coefs and Std.Errors
res = res.iloc[:, :2]
res = res.stack()
rsquared = getattr(result, 'rsquared', np.nan)
rsquared_adj = getattr(result, 'rsquared_adj', np.nan)
r2 = pd.Series({('R-squared', ""): rsquared,
('R-squared Adj.', ""): rsquared_adj})
if r2.notnull().any():
r2 = r2.apply(lambda x: float_format % x)
res = pd.concat([res, r2], axis=0)
res = pd.DataFrame(res)
res.columns = [str(result.model.endog_names)]
return res
def _col_info(result, info_dict=None):
"""Stack model info in a column
"""
if info_dict is None:
info_dict = {}
out = []
index = []
for i in info_dict:
if isinstance(info_dict[i], dict):
# this is a specific model info_dict, but not for this result...
continue
try:
out.append(info_dict[i](result))
except AttributeError:
out.append('')
index.append(i)
out = pd.DataFrame({str(result.model.endog_names): out}, index=index)
return out
def _make_unique(list_of_names):
if len(set(list_of_names)) == len(list_of_names):
return list_of_names
# pandas does not like it if multiple columns have the same names
from collections import defaultdict
name_counter = defaultdict(str)
header = []
for _name in list_of_names:
name_counter[_name] += "I"
header.append(_name + " " + name_counter[_name])
return header
def summary_col(results, float_format='%.4f', model_names=(), stars=False,
info_dict=None, regressor_order=(), drop_omitted=False):
"""
Summarize multiple results instances side-by-side (coefs and SEs)
Parameters
----------
results : statsmodels results instance or list of result instances
float_format : str, optional
float format for coefficients and standard errors
Default : '%.4f'
model_names : list[str], optional
Must have same length as the number of results. If the names are not
unique, a roman number will be appended to all model names
stars : bool
print significance stars
info_dict : dict, default None
dict of functions to be applied to results instances to retrieve
model info. To use specific information for different models, add a
(nested) info_dict with model name as the key.
Example: `info_dict = {"N":lambda x:(x.nobs), "R2": ..., "OLS":{
"R2":...}}` would only show `R2` for OLS regression models, but
additionally `N` for all other results.
Default : None (use the info_dict specified in
result.default_model_infos, if this property exists)
regressor_order : list[str], optional
list of names of the regressors in the desired order. All regressors
not specified will be appended to the end of the list.
drop_omitted : bool, optional
Includes regressors that are not specified in regressor_order. If
False, regressors not specified will be appended to end of the list.
If True, only regressors in regressor_order will be included.
"""
if not isinstance(results, list):
results = [results]
cols = [_col_params(x, stars=stars, float_format=float_format) for x in
results]
# Unique column names (pandas has problems merging otherwise)
if model_names:
colnames = _make_unique(model_names)
else:
colnames = _make_unique([x.columns[0] for x in cols])
for i in range(len(cols)):
cols[i].columns = [colnames[i]]
def merg(x, y):
return x.merge(y, how='outer', right_index=True,
left_index=True)
summ = reduce(merg, cols)
if regressor_order:
varnames = summ.index.get_level_values(0).tolist()
vc = pd.Series(varnames).value_counts()
varnames = vc.loc[vc == 2].index.tolist()
ordered = [x for x in regressor_order if x in varnames]
unordered = [x for x in varnames if x not in regressor_order]
new_order = ordered + unordered
other = [x for x in summ.index.get_level_values(0)
if x not in new_order]
new_order += other
if drop_omitted:
for uo in unordered:
new_order.remove(uo)
summ = summ.loc[new_order]
idx = []
index = summ.index.get_level_values(0)
for i in range(0, index.shape[0], 2):
idx.append(index[i])
if (i + 1) < index.shape[0] and (index[i] == index[i + 1]):
idx.append("")
else:
idx.append(index[i + 1])
summ.index = idx
# add infos about the models.
if info_dict:
cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
info_dict)) for x in results]
else:
cols = [_col_info(x, getattr(x, "default_model_infos", None)) for x in
results]
# use unique column names, otherwise the merge will not succeed
for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
df.columns = [name]
def merg(x, y):
return x.merge(y, how='outer', right_index=True,
left_index=True)
info = reduce(merg, cols)
dat = pd.DataFrame(np.vstack([summ, info])) # pd.concat better, but error
dat.columns = summ.columns
dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
summ = dat
summ = summ.fillna('')
smry = Summary()
smry._merge_latex = True
smry.add_df(summ, header=True, align='l')
smry.add_text('Standard errors in parentheses.')
if stars:
smry.add_text('* p<.1, ** p<.05, ***p<.01')
return smry
def _formatter(element, float_format='%.4f'):
try:
out = float_format % element
except (ValueError, TypeError):
out = str(element)
return out.strip()
def _df_to_simpletable(df, align='r', float_format="%.4f", header=True,
index=True, table_dec_above='-', table_dec_below=None,
header_dec_below='-', pad_col=0, pad_index=0):
dat = df.copy()
dat = dat.applymap(lambda x: _formatter(x, float_format))
if header:
headers = [str(x) for x in dat.columns.tolist()]
else:
headers = None
if index:
stubs = [str(x) + int(pad_index) * ' ' for x in dat.index.tolist()]
else:
dat.iloc[:, 0] = [str(x) + int(pad_index) * ' '
for x in dat.iloc[:, 0]]
stubs = None
st = SimpleTable(np.array(dat), headers=headers, stubs=stubs,
ltx_fmt=fmt_latex, txt_fmt=fmt_txt)
st.output_formats['latex']['data_aligns'] = align
st.output_formats['latex']['header_align'] = align
st.output_formats['txt']['data_aligns'] = align
st.output_formats['txt']['table_dec_above'] = table_dec_above
st.output_formats['txt']['table_dec_below'] = table_dec_below
st.output_formats['txt']['header_dec_below'] = header_dec_below
st.output_formats['txt']['colsep'] = ' ' * int(pad_col + 1)
return st
def _simple_tables(tables, settings, pad_col=None, pad_index=None):
simple_tables = []
float_format = settings[0]['float_format'] if settings else '%.4f'
if pad_col is None:
pad_col = [0] * len(tables)
if pad_index is None:
pad_index = [0] * len(tables)
for i, v in enumerate(tables):
index = settings[i]['index']
header = settings[i]['header']
align = settings[i]['align']
simple_tables.append(_df_to_simpletable(v, align=align,
float_format=float_format,
header=header, index=index,
pad_col=pad_col[i],
pad_index=pad_index[i]))
return simple_tables