from statsmodels.compat.pandas import Appender, is_numeric_dtype
from statsmodels.compat.python import lmap, lrange
from statsmodels.compat.scipy import SP_LT_19
from typing import Sequence, Union
import warnings
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_categorical_dtype
from scipy import stats
from statsmodels.iolib.table import SimpleTable
from statsmodels.stats.stattools import jarque_bera
from statsmodels.tools.decorators import cache_readonly
from statsmodels.tools.docstring import Docstring, Parameter
from statsmodels.tools.validation import (
array_like,
bool_like,
float_like,
int_like,
)
DEPRECATION_MSG = """/
``Describe`` has been deprecated in favor of ``Description`` and it's
simplified functional version, ``describe``. ``Describe`` will be removed
after 0.13.
"""
PERCENTILES = (1, 5, 10, 25, 50, 75, 90, 95, 99)
QUANTILES = np.array(PERCENTILES) / 100.0
def pd_ptp(df):
return df.max() - df.min()
def pd_percentiles(df):
    return df.quantile(QUANTILES)
PANDAS = {
"obs": lambda df: df.count(),
"mean": lambda df: df.mean(),
"std": lambda df: df.std(),
"max": lambda df: df.max(),
"min": lambda df: df.min(),
"mode": lambda df: df.mode(),
"ptp": pd_ptp,
"var": lambda df: df.var(),
"skew": lambda df: df.skewness(),
"uss": lambda df: (df ** 2).sum(),
"kurtosis": lambda df: df.kurtosis(),
"percentiles": pd_percentiles,
}
def nancount(x, axis=0):
return (1 - np.isnan(x)).sum(axis=axis)
def nanptp(arr, axis=0):
return np.nanmax(arr, axis=axis) - np.nanmin(arr, axis=axis)
def nanuss(arr, axis=0):
return np.nansum(arr ** 2, axis=axis)
def nanpercentile(arr, axis=0):
return np.nanpercentile(arr, PERCENTILES, axis=axis)
def nankurtosis(arr, axis=0):
return stats.kurtosis(arr, axis=axis, nan_policy="omit")
def nanskewness(arr, axis=0):
return stats.skew(arr, axis=axis, nan_policy="omit")
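# NaN-aware counterparts of the pandas reducers above, keyed by statistic name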
MISSING = {
"obs": nancount,
"mean": np.nanmean,
"std": np.nanstd,
"max": np.nanmax,
"min": np.nanmin,
"ptp": nanptp,
"var": np.nanvar,
"skew": nanskewness,
"uss": nanuss,
"kurtosis": nankurtosis,
"percentiles": nanpercentile,
}
def _kurtosis(a):
    """
    Wrapper for scipy.stats.kurtosis that returns nan instead of raising a
    ValueError.
    """
try:
res = stats.kurtosis(a)
except ValueError:
res = np.nan
return res
def _skew(a):
    """
    Wrapper for scipy.stats.skew that returns nan instead of raising a
    ValueError.
    """
try:
res = stats.skew(a)
except ValueError:
res = np.nan
return res
def sign_test(samp, mu0=0):
"""
    Sign test
Parameters
----------
samp : array_like
1d array. The sample for which you want to perform the sign test.
mu0 : float
See Notes for the definition of the sign test. mu0 is 0 by
default, but it is common to set it to the median.
Returns
-------
    M : float
        The sign test statistic, (N(+) - N(-)) / 2.
    p-value : float
        The two-sided p-value computed from the binomial distribution.
Notes
-----
    The sign test returns
M = (N(+) - N(-))/2
where N(+) is the number of values above `mu0`, N(-) is the number of
values below. Values equal to `mu0` are discarded.
    The p-value for M is calculated using the binomial distribution
    and can be interpreted the same as for a t-test. The test statistic
    min(N(+), N(-)) is distributed Binom(n_trials, 0.5), where n_trials
    equals N(+) + N(-).
See Also
--------
scipy.stats.wilcoxon
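    Examples
    --------
    A minimal sketch on made-up data (the values are illustrative):

    >>> import numpy as np
    >>> from statsmodels.stats.descriptivestats import sign_test
    >>> x = np.array([1.0, 2.0, 3.0, 3.5, 4.0, 5.5])
    >>> M, p = sign_test(x, mu0=3.0)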
"""
samp = np.asarray(samp)
pos = np.sum(samp > mu0)
neg = np.sum(samp < mu0)
M = (pos - neg) / 2.0
try:
p = stats.binomtest(min(pos, neg), pos + neg, 0.5).pvalue
except AttributeError:
# Remove after min SciPy >= 1.7
p = stats.binom_test(min(pos, neg), pos + neg, 0.5)
return M, p
NUMERIC_STATISTICS = (
"nobs",
"missing",
"mean",
"std_err",
"ci",
"std",
"iqr",
"iqr_normal",
"mad",
"mad_normal",
"coef_var",
"range",
"max",
"min",
"skew",
"kurtosis",
"jarque_bera",
"mode",
"median",
"percentiles",
)
CATEGORICAL_STATISTICS = ("nobs", "missing", "distinct", "top", "freq")
_additional = [
stat for stat in CATEGORICAL_STATISTICS if stat not in NUMERIC_STATISTICS
]
DEFAULT_STATISTICS = NUMERIC_STATISTICS + tuple(_additional)
class Description:
"""
Extended descriptive statistics for data
Parameters
----------
data : array_like
Data to describe. Must be convertible to a pandas DataFrame.
stats : Sequence[str], optional
Statistics to include. If not provided the full set of statistics is
computed. This list may evolve across versions to reflect best
practices. Supported options are:
"nobs", "missing", "mean", "std_err", "ci", "ci", "std", "iqr",
"iqr_normal", "mad", "mad_normal", "coef_var", "range", "max",
"min", "skew", "kurtosis", "jarque_bera", "mode", "freq",
"median", "percentiles", "distinct", "top", and "freq". See Notes for
details.
numeric : bool, default True
Whether to include numeric columns in the descriptive statistics.
categorical : bool, default True
Whether to include categorical columns in the descriptive statistics.
alpha : float, default 0.05
A number between 0 and 1 representing the size used to compute the
confidence interval, which has coverage 1 - alpha.
use_t : bool, default False
Use the Student's t distribution to construct confidence intervals.
percentiles : sequence[float]
        A sequence of distinct floating point values, all strictly between
        0 and 100. The default percentiles are 1, 5, 10, 25, 50, 75, 90,
        95, and 99.
    ntop : int, default 5
        The number of top categorical labels to report.
Attributes
----------
numeric_statistics
The list of supported statistics for numeric data
categorical_statistics
The list of supported statistics for categorical data
default_statistics
The default list of statistics
See Also
--------
pandas.DataFrame.describe
Basic descriptive statistics
describe
A simplified version that returns a DataFrame
Notes
-----
The selectable statistics include:
* "nobs" - Number of observations
* "missing" - Number of missing observations
* "mean" - Mean
* "std_err" - Standard Error of the mean assuming no correlation
* "ci" - Confidence interval with coverage (1 - alpha) using the normal or
t. This option creates two entries in any tables: lower_ci and upper_ci.
* "std" - Standard Deviation
* "iqr" - Interquartile range
* "iqr_normal" - Interquartile range relative to a Normal
* "mad" - Mean absolute deviation
* "mad_normal" - Mean absolute deviation relative to a Normal
* "coef_var" - Coefficient of variation
* "range" - Range between the maximum and the minimum
* "max" - The maximum
* "min" - The minimum
* "skew" - The skewness defined as the standardized 3rd central moment
* "kurtosis" - The kurtosis defined as the standardized 4th central moment
* "jarque_bera" - The Jarque-Bera test statistic for normality based on
the skewness and kurtosis. This option creates two entries, jarque_bera
      and jarque_bera_pval.
* "mode" - The mode of the data. This option creates two entries in all tables,
mode and mode_freq which is the empirical frequency of the modal value.
* "median" - The median of the data.
* "percentiles" - The percentiles. Values included depend on the input value of
``percentiles``.
* "distinct" - The number of distinct categories in a categorical.
* "top" - The mode common categories. Labeled top_n for n in 1, 2, ..., ``ntop``.
* "freq" - The frequency of the common categories. Labeled freq_n for n in 1,
2, ..., ``ntop``.
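    Examples
    --------
    A brief sketch on simulated data (the column name is illustrative):

    >>> import numpy as np
    >>> import pandas as pd
    >>> from statsmodels.stats.descriptivestats import Description
    >>> rng = np.random.default_rng(0)
    >>> df = pd.DataFrame({"a": rng.standard_normal(100)})
    >>> Description(df, stats=["nobs", "mean", "std"]).frame  # doctest: +SKIP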
"""
_int_fmt = ["nobs", "missing", "distinct"]
numeric_statistics = NUMERIC_STATISTICS
categorical_statistics = CATEGORICAL_STATISTICS
default_statistics = DEFAULT_STATISTICS
def __init__(
self,
data: Union[np.ndarray, pd.Series, pd.DataFrame],
stats: Sequence[str] = None,
*,
numeric: bool = True,
categorical: bool = True,
alpha: float = 0.05,
use_t: bool = False,
percentiles: Sequence[Union[int, float]] = PERCENTILES,
        ntop: int = 5,
):
data_arr = data
if not isinstance(data, (pd.Series, pd.DataFrame)):
data_arr = array_like(data, "data", maxdim=2)
if data_arr.ndim == 1:
data = pd.Series(data)
numeric = bool_like(numeric, "numeric")
categorical = bool_like(categorical, "categorical")
include = []
col_types = ""
if numeric:
include.append(np.number)
col_types = "numeric"
if categorical:
include.append("category")
col_types += "and " if col_types != "" else ""
col_types += "categorical"
if not numeric and not categorical:
raise ValueError(
"At least one of numeric and categorical must be True"
)
self._data = pd.DataFrame(data).select_dtypes(include)
if self._data.shape[1] == 0:
raise ValueError(
f"Selecting {col_types} results in an empty DataFrame"
)
self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
self._is_cat_like = [
is_categorical_dtype(dt) for dt in self._data.dtypes
]
if stats is not None:
undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
if undef:
raise ValueError(
f"{', '.join(undef)} are not known statistics"
)
self._stats = (
list(DEFAULT_STATISTICS) if stats is None else list(stats)
)
self._ntop = int_like(ntop, "ntop")
self._compute_top = "top" in self._stats
self._compute_freq = "freq" in self._stats
if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
raise ValueError("top must be a non-negative integer")
# Expand special stats
replacements = {
"mode": ["mode", "mode_freq"],
"ci": ["upper_ci", "lower_ci"],
"jarque_bera": ["jarque_bera", "jarque_bera_pval"],
"top": [f"top_{i}" for i in range(1, self._ntop + 1)],
"freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
}
for key in replacements:
if key in self._stats:
idx = self._stats.index(key)
self._stats = (
self._stats[:idx]
+ replacements[key]
+ self._stats[idx + 1 :]
)
self._percentiles = array_like(
percentiles, "percentiles", maxdim=1, dtype="d"
)
self._percentiles = np.sort(self._percentiles)
if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
raise ValueError("percentiles must be distinct")
if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
raise ValueError("percentiles must be strictly between 0 and 100")
self._alpha = float_like(alpha, "alpha")
if not 0 < alpha < 1:
raise ValueError("alpha must be strictly between 0 and 1")
self._use_t = bool_like(use_t, "use_t")
def _reorder(self, df: pd.DataFrame) -> pd.DataFrame:
return df.loc[[s for s in self._stats if s in df.index]]
@cache_readonly
def frame(self) -> pd.DataFrame:
"""
Descriptive statistics for both numeric and categorical data
Returns
-------
DataFrame
The statistics
"""
numeric = self.numeric
categorical = self.categorical
if categorical.shape[1] == 0:
return numeric
elif numeric.shape[1] == 0:
return categorical
df = pd.concat([numeric, categorical], axis=1)
return self._reorder(df[self._data.columns])
@cache_readonly
def numeric(self) -> pd.DataFrame:
"""
Descriptive statistics for numeric data
Returns
-------
DataFrame
The statistics of the numeric columns
"""
df: pd.DataFrame = self._data.loc[:, self._is_numeric]
cols = df.columns
_, k = df.shape
std = df.std()
count = df.count()
mean = df.mean()
mad = (df - mean).abs().mean()
std_err = std.copy()
std_err.loc[count > 0] /= count.loc[count > 0] ** 0.5
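        # Two-sided critical value for the (1 - alpha) confidence interval,
        # from the t distribution when use_t=True and the normal otherwise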
if self._use_t:
q = stats.t(count - 1).ppf(1.0 - self._alpha / 2)
else:
q = stats.norm.ppf(1.0 - self._alpha / 2)
def _mode(ser):
if SP_LT_19:
mode_res = stats.mode(ser.dropna())
else:
mode_res = stats.mode(ser.dropna(), keepdims=True)
# Changes in SciPy 1.10
if np.isscalar(mode_res[0]):
return float(mode_res[0]), mode_res[1]
if mode_res[0].shape[0] > 0:
return [float(val) for val in mode_res]
return np.nan, np.nan
mode_values = df.apply(_mode).T
if mode_values.size > 0:
if isinstance(mode_values, pd.DataFrame):
# pandas 1.0 or later
mode = np.asarray(mode_values[0], dtype=float)
mode_counts = np.asarray(mode_values[1], dtype=np.int64)
else:
# pandas before 1.0 returns a Series of 2-elem list
mode = []
mode_counts = []
for idx in mode_values.index:
val = mode_values.loc[idx]
mode.append(val[0])
mode_counts.append(val[1])
mode = np.atleast_1d(mode)
mode_counts = np.atleast_1d(mode_counts)
else:
mode = mode_counts = np.empty(0)
loc = count > 0
mode_freq = np.full(mode.shape[0], np.nan)
mode_freq[loc] = mode_counts[loc] / count.loc[loc]
# TODO: Workaround for pandas AbstractMethodError in extension
# types. Remove when quantile is supported for these
_df = df
try:
from pandas.api.types import is_extension_array_dtype
_df = df.copy()
for col in df:
if is_extension_array_dtype(df[col].dtype):
_df[col] = _df[col].astype(object).fillna(np.nan)
except ImportError:
pass
if df.shape[1] > 0:
iqr = _df.quantile(0.75) - _df.quantile(0.25)
else:
iqr = mean
def _safe_jarque_bera(c):
a = np.asarray(c)
if a.shape[0] < 2:
return (np.nan,) * 4
return jarque_bera(a)
jb = df.apply(
lambda x: list(_safe_jarque_bera(x.dropna())), result_type="expand"
).T
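        # Guard against division by zero in the coefficient of variation by
        # masking zero means as NaN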
nan_mean = mean.copy()
nan_mean.loc[nan_mean == 0] = np.nan
coef_var = std / nan_mean
results = {
"nobs": pd.Series(
np.ones(k, dtype=np.int64) * df.shape[0], index=cols
),
"missing": df.shape[0] - count,
"mean": mean,
"std_err": std_err,
"upper_ci": mean + q * std_err,
"lower_ci": mean - q * std_err,
"std": std,
"iqr": iqr,
"mad": mad,
"coef_var": coef_var,
"range": pd_ptp(df),
"max": df.max(),
"min": df.min(),
"skew": jb[2],
"kurtosis": jb[3],
"iqr_normal": iqr / np.diff(stats.norm.ppf([0.25, 0.75])),
"mad_normal": mad / np.sqrt(2 / np.pi),
"jarque_bera": jb[0],
"jarque_bera_pval": jb[1],
"mode": pd.Series(mode, index=cols),
"mode_freq": pd.Series(mode_freq, index=cols),
"median": df.median(),
}
final = {k: v for k, v in results.items() if k in self._stats}
results_df = pd.DataFrame(
list(final.values()), columns=cols, index=list(final.keys())
)
if "percentiles" not in self._stats:
return results_df
# Pandas before 1.0 cannot handle empty DF
if df.shape[1] > 0:
# TODO: Remove when extension types support quantile
perc = _df.quantile(self._percentiles / 100).astype(float)
else:
perc = pd.DataFrame(index=self._percentiles / 100, dtype=float)
if np.all(np.floor(100 * perc.index) == (100 * perc.index)):
perc.index = [f"{int(100 * idx)}%" for idx in perc.index]
else:
dupe = True
scale = 100
index = perc.index
while dupe:
scale *= 10
idx = np.floor(scale * perc.index)
if np.all(np.diff(idx) > 0):
dupe = False
index = np.floor(scale * index) / (scale / 100)
fmt = f"0.{len(str(scale//100))-1}f"
output = f"{{0:{fmt}}}%"
perc.index = [output.format(val) for val in index]
# Add in the names of the percentiles to the output
self._stats = self._stats + perc.index.tolist()
return self._reorder(pd.concat([results_df, perc], axis=0))
@cache_readonly
def categorical(self) -> pd.DataFrame:
"""
Descriptive statistics for categorical data
Returns
-------
DataFrame
The statistics of the categorical columns
"""
        df = self._data.loc[:, self._is_cat_like]
k = df.shape[1]
cols = df.columns
vc = {col: df[col].value_counts(normalize=True) for col in df}
distinct = pd.Series(
{col: vc[col].shape[0] for col in vc}, dtype=np.int64
)
top = {}
freq = {}
for col in vc:
single = vc[col]
if single.shape[0] >= self._ntop:
top[col] = single.index[: self._ntop]
                freq[col] = np.asarray(single.iloc[: self._ntop])
else:
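                # Fewer than ntop distinct values: pad labels with None and
                # frequencies with NaN so every column has ntop entries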
val = list(single.index)
val += [None] * (self._ntop - len(val))
top[col] = val
freq_val = list(single)
freq_val += [np.nan] * (self._ntop - len(freq_val))
freq[col] = np.asarray(freq_val)
index = [f"top_{i}" for i in range(1, self._ntop + 1)]
top_df = pd.DataFrame(top, dtype="object", index=index, columns=cols)
index = [f"freq_{i}" for i in range(1, self._ntop + 1)]
freq_df = pd.DataFrame(freq, dtype="object", index=index, columns=cols)
results = {
"nobs": pd.Series(
np.ones(k, dtype=np.int64) * df.shape[0], index=cols
),
"missing": df.shape[0] - df.count(),
"distinct": distinct,
}
final = {k: v for k, v in results.items() if k in self._stats}
results_df = pd.DataFrame(
list(final.values()),
columns=cols,
index=list(final.keys()),
dtype="object",
)
if self._compute_top:
results_df = pd.concat([results_df, top_df], axis=0)
if self._compute_freq:
results_df = pd.concat([results_df, freq_df], axis=0)
return self._reorder(results_df)
    def summary(self) -> SimpleTable:
"""
Summary table of the descriptive statistics
Returns
-------
SimpleTable
A table instance supporting export to text, csv and LaTeX
"""
df = self.frame.astype(object)
df = df.fillna("")
cols = [str(col) for col in df.columns]
stubs = [str(idx) for idx in df.index]
data = []
for _, row in df.iterrows():
data.append([v for v in row])
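        # Render integer-valued entries without a decimal point; format
        # everything else with 4 significant digits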
def _formatter(v):
if isinstance(v, str):
return v
elif v // 1 == v:
return str(int(v))
return f"{v:0.4g}"
return SimpleTable(
data,
header=cols,
stubs=stubs,
title="Descriptive Statistics",
txt_fmt={"data_fmts": {0: "%s", 1: _formatter}},
datatypes=[1] * len(data),
)
def __str__(self) -> str:
return str(self.summary().as_text())
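# Reuse the ``Description`` docstring for the functional ``describe``,
# swapping in a DataFrame Returns block and trimmed Attributes/See Also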
ds = Docstring(Description.__doc__)
ds.replace_block(
"Returns", Parameter(None, "DataFrame", ["Descriptive statistics"])
)
ds.replace_block("Attributes", [])
ds.replace_block(
"See Also",
[
(
[("pandas.DataFrame.describe", None)],
["Basic descriptive statistics"],
),
(
[("Description", None)],
["Descriptive statistics class with additional output options"],
),
],
)
@Appender(str(ds))
def describe(
data: Union[np.ndarray, pd.Series, pd.DataFrame],
stats: Sequence[str] = None,
*,
numeric: bool = True,
categorical: bool = True,
alpha: float = 0.05,
use_t: bool = False,
percentiles: Sequence[Union[int, float]] = PERCENTILES,
    ntop: int = 5,
) -> pd.DataFrame:
return Description(
data,
stats,
numeric=numeric,
categorical=categorical,
alpha=alpha,
use_t=use_t,
percentiles=percentiles,
ntop=ntop,
).frame
class Describe(object):
"""
Calculates descriptive statistics for data.
.. deprecated:: 0.12
Use ``Description`` or ``describe`` instead
Defaults to a basic set of statistics, "all" can be specified, or a list
can be given.
Parameters
----------
dataset : array_like
2D dataset for descriptive statistics.
"""
def __init__(self, dataset):
warnings.warn(DEPRECATION_MSG, FutureWarning)
self.dataset = dataset
# better if this is initially a list to define order, or use an
# ordered dict. First position is the function
# Second position is the tuple/list of column names/numbers
# third is are the results in order of the columns
self.univariate = dict(
obs=[len, None, None],
mean=[np.mean, None, None],
std=[np.std, None, None],
min=[np.min, None, None],
max=[np.max, None, None],
ptp=[np.ptp, None, None],
var=[np.var, None, None],
mode_val=[self._mode_val, None, None],
mode_bin=[self._mode_bin, None, None],
median=[np.median, None, None],
skew=[stats.skew, None, None],
uss=[lambda x: np.sum(np.asarray(x) ** 2, axis=0), None, None],
kurtosis=[stats.kurtosis, None, None],
percentiles=[self._percentiles, None, None],
# BUG: not single value
# sign_test_M = [self.sign_test_m, None, None],
# sign_test_P = [self.sign_test_p, None, None]
)
# TODO: Basic stats for strings
# self.strings = dict(
# unique = [np.unique, None, None],
# number_uniq = [len(
# most = [
# least = [
# TODO: Multivariate
# self.multivariate = dict(
# corrcoef(x[, y, rowvar, bias]),
# cov(m[, y, rowvar, bias]),
# histogram2d(x, y[, bins, range, normed, weights])
# )
self._arraytype = None
self._columns_list = None
def _percentiles(self, x):
p = [
stats.scoreatpercentile(x, per)
for per in (1, 5, 10, 25, 50, 75, 90, 95, 99)
]
return p
def _mode_val(self, x):
try:
return stats.mode(x, keepdims=True)[0][0]
except TypeError:
return stats.mode(x)[0][0]
def _mode_bin(self, x):
try:
return stats.mode(x, keepdims=True)[1][0]
except TypeError:
return stats.mode(x)[1][0]
    def _array_typer(self):
        """Determine whether the dataset is a homogeneous or structured array."""
        if not (self.dataset.dtype.names):
            # homogeneous dtype array
            self._arraytype = "homog"
        elif self.dataset.dtype.names:
            # structured or rec array
            self._arraytype = "struct"
        else:
            assert self._arraytype == "struct" or self._arraytype == "homog"
    def _is_dtype_like(self, col):
        """
        Check whether self.dataset[col][0] behaves like a string or a
        number; modeled on `numpy.lib._iotools._is_string_like`.
        """
def string_like():
# TODO: not sure what the result is if the first item is some
# type of missing value
try:
self.dataset[col][0] + ""
except (TypeError, ValueError):
return False
return True
def number_like():
try:
self.dataset[col][0] + 1.0
except (TypeError, ValueError):
return False
return True
if number_like() and not string_like():
return "number"
elif not number_like() and string_like():
return "string"
else:
            assert number_like() or string_like(), (
                "Not sure of dtype " + str(self.dataset[col][0])
            )
# @property
def summary(self, stats="basic", columns="all", orientation="auto"):
"""
Return a summary of descriptive statistics.
Parameters
----------
        stats : list or str
            The desired statistics. Accepts 'basic', 'all', or a list.
            'basic' = ('obs', 'mean', 'std', 'min', 'max')
            'all' = ('obs', 'mean', 'std', 'min', 'max', 'ptp', 'var',
                     'mode_val', 'mode_bin', 'median', 'skew', 'uss',
                     'kurtosis', 'percentiles')
        columns : list or str
            The columns/variables to report the statistics on; default is
            'all'. If an object with named columns is given, you may
            specify the column names.
"""
        # NOTE
        # standard array: Specify column numbers (NEED TO TEST)
        # percentiles currently broken
        # mode requires mode_val and mode_bin separately
if self._arraytype is None:
self._array_typer()
if stats == "basic":
stats = ("obs", "mean", "std", "min", "max")
elif stats == "all":
# stats = self.univariate.keys()
# dict does not keep an order, use full list instead
stats = [
"obs",
"mean",
"std",
"min",
"max",
"ptp",
"var",
"mode_val",
"mode_bin",
"median",
"uss",
"skew",
"kurtosis",
"percentiles",
]
else:
for astat in stats:
pass
# assert astat in self.univariate
# hack around percentiles multiple output
# bad naming
        import scipy.stats

        # BUG: in the commented-out version below every lambda captured the
        # same loop variable, so all entries used per=99:
        # perdict = dict(('perc_%2d' % per, [lambda x:
        #     scipy.stats.scoreatpercentile(x, per), None, None])
        #     for per in (1, 5, 10, 25, 50, 75, 90, 95, 99))
def _fun(per):
return lambda x: scipy.stats.scoreatpercentile(x, per)
perdict = dict(
("perc_%02d" % per, [_fun(per), None, None])
for per in (1, 5, 10, 25, 50, 75, 90, 95, 99)
)
if "percentiles" in stats:
self.univariate.update(perdict)
idx = stats.index("percentiles")
stats[idx : idx + 1] = sorted(perdict.keys())
# JP: this does not allow a change in sequence, sequence in stats is
# ignored
# this is just an if condition
if any(
[
aitem[1]
for aitem in self.univariate.items()
if aitem[0] in stats
]
):
if columns == "all":
self._columns_list = []
if self._arraytype == "sctruct":
self._columns_list = self.dataset.dtype.names
# self._columns_list = [col for col in
# self.dataset.dtype.names if
# (self._is_dtype_like(col)=='number')]
else:
self._columns_list = lrange(self.dataset.shape[1])
else:
self._columns_list = columns
if self._arraytype == "sctruct":
for col in self._columns_list:
assert col in self.dataset.dtype.names
else:
assert self._is_dtype_like(self.dataset) == "number"
columstypes = self.dataset.dtype
# TODO: do we need to make sure they dtype is float64 ?
for astat in stats:
calc = self.univariate[astat]
if self._arraytype == "sctruct":
calc[1] = self._columns_list
calc[2] = [
calc[0](self.dataset[col])
for col in self._columns_list
if (self._is_dtype_like(col) == "number")
]
# calc[2].append([len(np.unique(self.dataset[col])) for col
# in self._columns_list if
# self._is_dtype_like(col)=='string']
else:
calc[1] = ["Col " + str(col) for col in self._columns_list]
calc[2] = [
calc[0](self.dataset[:, col])
for col in self._columns_list
]
return self.print_summary(stats, orientation=orientation)
else:
return self.print_summary(stats, orientation=orientation)
def print_summary(self, stats, orientation="auto"):
        # TODO: need to specify a table formatting for the numbers, using default
title = "Summary Statistics"
header = stats
stubs = self.univariate["obs"][1]
data = [
[self.univariate[astat][2][col] for astat in stats]
for col in range(len(self.univariate["obs"][2]))
]
if (orientation == "varcols") or (
orientation == "auto" and len(stubs) < len(header)
):
# swap rows and columns
data = lmap(lambda *row: list(row), *data)
header, stubs = stubs, header
part_fmt = dict(data_fmts=["%#8.4g"] * (len(header) - 1))
table = SimpleTable(data, header, stubs, title=title, txt_fmt=part_fmt)
return table
@Appender(sign_test.__doc__) # i.e. module-level sign_test
def sign_test(self, samp, mu0=0):
return sign_test(samp, mu0)
    # TODO: There must be a better way, but formatting the stats of a function
    # that returns 2 values is a problem.
# def sign_test_m(samp,mu0=0):
# return self.sign_test(samp,mu0)[0]
# def sign_test_p(samp,mu0=0):
# return self.sign_test(samp,mu0)[1]