# -*- coding: utf-8 -*-
"""Anova k-sample comparison without and with trimming
Created on Sun Jun 09 23:51:34 2013
Author: Josef Perktold
"""
import numbers
import numpy as np
# the trimboth and trim_mean are taken from scipy.stats.stats
# and enhanced by axis
[docs]def trimboth(a, proportiontocut, axis=0):
"""
Slices off a proportion of items from both ends of an array.
Slices off the passed proportion of items from both ends of the passed
array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
rightmost 10% of scores). You must pre-sort the array if you want
'proper' trimming. Slices off less if proportion results in a
non-integer slice index (i.e., conservatively slices off
`proportiontocut`).
Parameters
----------
a : array_like
Data to trim.
proportiontocut : float or int
Proportion of total data set to trim of each end.
axis : int or None
Axis along which the observations are trimmed. The default is to trim
along axis=0. If axis is None then the array will be flattened before
trimming.
Returns
-------
out : ndarray
Trimmed version of array `a`.
Examples
--------
>>> from scipy import stats
>>> a = np.arange(20)
>>> b = stats.trimboth(a, 0.1)
>>> b.shape
(16,)
"""
a = np.asarray(a)
if axis is None:
a = a.ravel()
axis = 0
nobs = a.shape[axis]
lowercut = int(proportiontocut * nobs)
uppercut = nobs - lowercut
if (lowercut >= uppercut):
raise ValueError("Proportion too big.")
sl = [slice(None)] * a.ndim
sl[axis] = slice(lowercut, uppercut)
return a[tuple(sl)]
[docs]def trim_mean(a, proportiontocut, axis=0):
"""
Return mean of array after trimming observations from both lower and upper
tails.
If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of
scores. Slices off LESS if proportion results in a non-integer slice
index (i.e., conservatively slices off `proportiontocut` ).
Parameters
----------
a : array_like
Input array
proportiontocut : float
Fraction to cut off of both tails of the observations
axis : int or None
Axis along which the trimmed means are computed. The default is axis=0.
If axis is None then the trimmed mean will be computed for the
flattened array.
Returns
-------
trim_mean : ndarray
Mean of trimmed array.
"""
newa = trimboth(np.sort(a, axis), proportiontocut, axis=axis)
return np.mean(newa, axis=axis)
[docs]class TrimmedMean(object):
"""
class for trimmed and winsorized one sample statistics
axis is None, i.e. ravelling, is not supported
"""
def __init__(self, data, fraction, is_sorted=False, axis=0):
self.data = np.asarray(data)
# TODO: add pandas handling, maybe not if this stays internal
self.axis = axis
self.fraction = fraction
self.nobs = nobs = self.data.shape[axis]
self.lowercut = lowercut = int(fraction * nobs)
self.uppercut = uppercut = nobs - lowercut
if (lowercut >= uppercut):
raise ValueError("Proportion too big.")
self.nobs_reduced = nobs - 2 * lowercut
self.sl = [slice(None)] * self.data.ndim
self.sl[axis] = slice(self.lowercut, self.uppercut)
# numpy requires now tuple for indexing, not list
self.sl = tuple(self.sl)
if not is_sorted:
self.data_sorted = np.sort(self.data, axis=axis)
else:
self.data_sorted = self.data
# this only works for axis=0
self.lowerbound = np.take(self.data_sorted, lowercut, axis=axis)
self.upperbound = np.take(self.data_sorted, uppercut - 1, axis=axis)
# self.lowerbound = self.data_sorted[lowercut]
# self.upperbound = self.data_sorted[uppercut - 1]
@property
def data_trimmed(self):
# returns a view
return self.data_sorted[self.sl]
@property # cache
def data_winsorized(self):
"""winsorized data
"""
lb = np.expand_dims(self.lowerbound, self.axis)
ub = np.expand_dims(self.upperbound, self.axis)
return np.clip(self.data_sorted, lb, ub)
@property
def mean_trimmed(self):
"""mean of trimmed data
"""
return np.mean(self.data_sorted[tuple(self.sl)], self.axis)
@property
def mean_winsorized(self):
"""mean of winsorized data
"""
return np.mean(self.data_winsorized, self.axis)
@property
def var_winsorized(self):
"""variance of winsorized data
"""
# hardcoded ddof = 1
return np.var(self.data_winsorized, ddof=1, axis=self.axis)
@property
def std_mean_trimmed(self):
"""standard error of trimmed mean
"""
se = np.sqrt(self.var_winsorized / self.nobs_reduced)
# trimming creates correlation across trimmed observations
# trimming is based on order statistics of the data
# wilcox 2012, p.61
se *= np.sqrt(self.nobs / self.nobs_reduced)
return se
@property
def std_mean_winsorized(self):
"""standard error of winsorized mean
"""
# the following matches Wilcox, WRS2
std_ = np.sqrt(self.var_winsorized / self.nobs)
std_ *= (self.nobs - 1) / (self.nobs_reduced - 1)
# old version
# tm = self
# formula from an old SAS manual page, simplified
# std_ = np.sqrt(tm.var_winsorized / (tm.nobs_reduced - 1) *
# (tm.nobs - 1.) / tm.nobs)
return std_
[docs] def ttest_mean(self, value=0, transform='trimmed',
alternative='two-sided'):
"""
One sample ttest for trimmed or Winsorized mean
Parameters
----------
value : float
Value of the mean under the Null hypothesis
transform : {'trimmed', 'winsorized'}
Specified whether the mean test is based on trimmed or winsorized
data.
alternative : {'two-sided', 'larger', 'smaller'}
Notes
-----
p-value is based on the approximate t-distribution of the test
statistic. The approximation is valid if the underlying distribution
is symmetric.
"""
import statsmodels.stats.weightstats as smws
df = self.nobs_reduced - 1
if transform == 'trimmed':
mean_ = self.mean_trimmed
std_ = self.std_mean_trimmed
elif transform == 'winsorized':
mean_ = self.mean_winsorized
std_ = self.std_mean_winsorized
else:
raise ValueError("transform can only be 'trimmed' or 'winsorized'")
res = smws._tstat_generic(mean_, 0, std_,
df, alternative=alternative, diff=value)
return res + (df,)
[docs] def reset_fraction(self, frac):
"""create a TrimmedMean instance with a new trimming fraction
This reuses the sorted array from the current instance.
"""
tm = TrimmedMean(self.data_sorted, frac, is_sorted=True,
axis=self.axis)
tm.data = self.data
# TODO: this will not work if there is processing of meta-information
# in __init__,
# for example storing a pandas DataFrame or Series index
return tm