M-Estimators for Robust Linear Modeling¶

%matplotlib inline

from __future__ import print_function
from statsmodels.compat import lmap
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

import statsmodels.api as sm

An M-estimator minimizes the function

$Q(e_i, \rho) = \sum_i~\rho \left (\frac{e_i}{s}\right )$

where $\rho$ is a symmetric function of the residuals

The effect of $\rho$ is to reduce the influence of outliers
$s$ is an estimate of scale.
The robust estimates $\hat{\beta}$ are computed by the iteratively re-weighted least squares algorithm

We have several choices available for the weighting functions to be used

norms = sm.robust.norms

def plot_weights(support, weights_func, xlabels, xticks):
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    ax.plot(support, weights_func(support))
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels, fontsize=16)
    ax.set_ylim(-.1, 1.1)
    return ax

Andrew's Wave¶

help(norms.AndrewWave.weights)

Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Andrew's wave weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = sin(z/a)/(z/a)     for \|z\| <= a*pi
    
        weights(z) = 0                  for \|z\| > a*pi

a = 1.339
support = np.linspace(-np.pi*a, np.pi*a, 100)
andrew = norms.AndrewWave(a=a)
plot_weights(support, andrew.weights, ['$-\pi*a$', '0', '$\pi*a$'], [-np.pi*a, 0, np.pi*a]);

Hampel's 17A¶

help(norms.Hampel.weights)

Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Hampel weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = 1                            for \|z\| <= a
    
        weights(z) = a/\|z\|                        for a < \|z\| <= b
    
        weights(z) = a*(c - \|z\|)/(\|z\|*(c-b))      for b < \|z\| <= c
    
        weights(z) = 0                            for \|z\| > c

c = 8
support = np.linspace(-3*c, 3*c, 1000)
hampel = norms.Hampel(a=2., b=4., c=c)
plot_weights(support, hampel.weights, ['3*c', '0', '3*c'], [-3*c, 0, 3*c]);

Huber's t¶

help(norms.HuberT.weights)

Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Huber's t weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = 1          for \|z\| <= t
    
        weights(z) = t/\|z\|      for \|z\| > t

t = 1.345
support = np.linspace(-3*t, 3*t, 1000)
huber = norms.HuberT(t=t)
plot_weights(support, huber.weights, ['-3*t', '0', '3*t'], [-3*t, 0, 3*t]);

Least Squares¶

help(norms.LeastSquares.weights)

Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    The least squares estimator weighting function for the IRLS algorithm.
    
    The psi function scaled by the input z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = np.ones(z.shape)

support = np.linspace(-3, 3, 1000)
lst_sq = norms.LeastSquares()
plot_weights(support, lst_sq.weights, ['-3', '0', '3'], [-3, 0, 3]);

Ramsay's Ea¶

help(norms.RamsayE.weights)

Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Ramsay's Ea weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = exp(-a*\|z\|)

a = .3
support = np.linspace(-3*a, 3*a, 1000)
ramsay = norms.RamsayE(a=a)
plot_weights(support, ramsay.weights, ['-3*a', '0', '3*a'], [-3*a, 0, 3*a]);

Trimmed Mean¶

help(norms.TrimmedMean.weights)

Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Least trimmed mean weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = 1             for \|z\| <= c
    
        weights(z) = 0             for \|z\| > c

c = 2
support = np.linspace(-3*c, 3*c, 1000)
trimmed = norms.TrimmedMean(c=c)
plot_weights(support, trimmed.weights, ['-3*c', '0', '3*c'], [-3*c, 0, 3*c]);

Tukey's Biweight¶

help(norms.TukeyBiweight.weights)

Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Tukey's biweight weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        psi(z) = (1 - (z/c)**2)**2          for \|z\| <= R
    
        psi(z) = 0                          for \|z\| > R

c = 4.685
support = np.linspace(-3*c, 3*c, 1000)
tukey = norms.TukeyBiweight(c=c)
plot_weights(support, tukey.weights, ['-3*c', '0', '3*c'], [-3*c, 0, 3*c]);

Scale Estimators¶

Robust estimates of the location

x = np.array([1, 2, 3, 4, 500])

The mean is not a robust estimator of location

x.mean()

102.0

The median, on the other hand, is a robust estimator with a breakdown point of 50%

np.median(x)

3.0

Analagously for the scale
The standard deviation is not robust

x.std()

199.00251254695254

Median Absolute Deviation

$median_i |X_i - median_j(X_j)|)$

Standardized Median Absolute Deviation is a consistent estimator for $\hat{\sigma}$

$\hat{\sigma}=K \cdot MAD$

where $K$ depends on the distribution. For the normal distribution for example,

$K = \Phi^{-1}(.75)$

stats.norm.ppf(.75)

0.67448975019608171

print(x)

[  1   2   3   4 500]

sm.robust.scale.mad(x)

1.482602218505602

np.array([1,2,3,4,5.]).std()

1.4142135623730951

The default for Robust Linear Models is MAD
another popular choice is Huber's proposal 2

np.random.seed(12345)
fat_tails = stats.t(6).rvs(40)

kde = sm.nonparametric.KDEUnivariate(fat_tails)
kde.fit()
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax.plot(kde.support, kde.density);

print(fat_tails.mean(), fat_tails.std())

0.0688231044811 1.34716332297

print(stats.norm.fit(fat_tails))

(0.068823104481087499, 1.3471633229698652)

print(stats.t.fit(fat_tails, f0=6))

(6, 0.039009187170278181, 1.0564230978488927)

huber = sm.robust.scale.Huber()
loc, scale = huber(fat_tails)
print(loc, scale)

0.04048984333271795 1.1557140047569665

sm.robust.mad(fat_tails)

1.1153350011654151

sm.robust.mad(fat_tails, c=stats.t(6).ppf(.75))

1.0483916565928972

sm.robust.scale.mad(fat_tails)

1.1153350011654151

Duncan's Occupational Prestige data - M-estimation for outliers¶

from statsmodels.graphics.api import abline_plot
from statsmodels.formula.api import ols, rlm

prestige = sm.datasets.get_rdataset("Duncan", "car", cache=True).data

print(prestige.head(10))

            type  income  education  prestige
accountant  prof      62         86        82
pilot       prof      72         76        83
architect   prof      75         92        90
author      prof      55         90        76
chemist     prof      64         86        90
minister    prof      21         84        87
professor   prof      64         93        93
dentist     prof      80        100        90
reporter      wc      67         87        52
engineer    prof      72         86        88

fig = plt.figure(figsize=(12,12))
ax1 = fig.add_subplot(211, xlabel='Income', ylabel='Prestige')
ax1.scatter(prestige.income, prestige.prestige)
xy_outlier = prestige.ix['minister'][['income','prestige']]
ax1.annotate('Minister', xy_outlier, xy_outlier+1, fontsize=16)
ax2 = fig.add_subplot(212, xlabel='Education',
                           ylabel='Prestige')
ax2.scatter(prestige.education, prestige.prestige);

ols_model = ols('prestige ~ income + education', prestige).fit()
print(ols_model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:               prestige   R-squared:                       0.828
Model:                            OLS   Adj. R-squared:                  0.820
Method:                 Least Squares   F-statistic:                     101.2
Date:                Tue, 28 Feb 2017   Prob (F-statistic):           8.65e-17
Time:                        21:34:18   Log-Likelihood:                -178.98
No. Observations:                  45   AIC:                             364.0
Df Residuals:                      42   BIC:                             369.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -6.0647      4.272     -1.420      0.163     -14.686       2.556
income         0.5987      0.120      5.003      0.000       0.357       0.840
education      0.5458      0.098      5.555      0.000       0.348       0.744
==============================================================================
Omnibus:                        1.279   Durbin-Watson:                   1.458
Prob(Omnibus):                  0.528   Jarque-Bera (JB):                0.520
Skew:                           0.155   Prob(JB):                        0.771
Kurtosis:                       3.426   Cond. No.                         163.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

infl = ols_model.get_influence()
student = infl.summary_frame()['student_resid']
print(student)

accountant    0.303900
pilot         0.340920
architect     0.072256
author        0.000711
chemist       0.826578
                ...   
soda.clerk   -0.883095
watchman     -0.513502
janitor      -0.079890
policeman     0.078847
waiter       -0.475972
Name: student_resid, dtype: float64

print(student.ix[np.abs(student) > 2])

minister      3.134519
reporter     -2.397022
contractor    2.043805
Name: student_resid, dtype: float64

print(infl.summary_frame().ix['minister'])

dfb_Intercept      0.144937
dfb_income        -1.220939
dfb_education      1.263019
cooks_d            0.566380
dffits             1.433935
dffits_internal    1.303510
hat_diag           0.173058
standard_resid     2.849416
student_resid      3.134519
Name: minister, dtype: float64

sidak = ols_model.outlier_test('sidak')
sidak.sort('unadj_p', inplace=True)
print(sidak)

                 student_resid   unadj_p  sidak(p)
minister              3.134519  0.003177  0.133421
reporter             -2.397022  0.021170  0.618213
contractor            2.043805  0.047433  0.887721
insurance.agent      -1.930919  0.060428  0.939485
machinist             1.887047  0.066248  0.954247
...                        ...       ...       ...
policeman             0.078847  0.937538  1.000000
architect             0.072256  0.942750  1.000000
teacher               0.050510  0.959961  1.000000
taxi.driver           0.023322  0.981507  1.000000
author                0.000711  0.999436  1.000000

[45 rows x 3 columns]

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app

fdr = ols_model.outlier_test('fdr_bh')
fdr.sort('unadj_p', inplace=True)
print(fdr)

                 student_resid   unadj_p  fdr_bh(p)
minister              3.134519  0.003177   0.142974
reporter             -2.397022  0.021170   0.476332
contractor            2.043805  0.047433   0.596233
insurance.agent      -1.930919  0.060428   0.596233
machinist             1.887047  0.066248   0.596233
...                        ...       ...        ...
policeman             0.078847  0.937538   0.999436
architect             0.072256  0.942750   0.999436
teacher               0.050510  0.959961   0.999436
taxi.driver           0.023322  0.981507   0.999436
author                0.000711  0.999436   0.999436

[45 rows x 3 columns]

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app

rlm_model = rlm('prestige ~ income + education', prestige).fit()
print(rlm_model.summary())

                    Robust linear Model Regression Results                    
==============================================================================
Dep. Variable:               prestige   No. Observations:                   45
Model:                            RLM   Df Residuals:                       42
Method:                          IRLS   Df Model:                            2
Norm:                          HuberT                                         
Scale Est.:                       mad                                         
Cov Type:                          H1                                         
Date:                Tue, 28 Feb 2017                                         
Time:                        21:34:18                                         
No. Iterations:                    18                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -7.1107      3.879     -1.833      0.067     -14.713       0.492
income         0.7015      0.109      6.456      0.000       0.489       0.914
education      0.4854      0.089      5.441      0.000       0.311       0.660
==============================================================================

If the model instance has been used for another fit with different fit
parameters, then the fit options might not be the correct ones anymore .

print(rlm_model.weights)

accountant    1.0
pilot         1.0
architect     1.0
author        1.0
chemist       1.0
             ... 
soda.clerk    1.0
watchman      1.0
janitor       1.0
policeman     1.0
waiter        1.0
dtype: float64

Hertzprung Russell data for Star Cluster CYG 0B1 - Leverage Points¶

Data is on the luminosity and temperature of 47 stars in the direction of Cygnus.

dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data

from matplotlib.patches import Ellipse
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='log(Temp)', ylabel='log(Light)', title='Hertzsprung-Russell Diagram of Star Cluster CYG OB1')
ax.scatter(*dta.values.T)
# highlight outliers
e = Ellipse((3.5, 6), .2, 1, alpha=.25, color='r')
ax.add_patch(e);
ax.annotate('Red giants', xy=(3.6, 6), xytext=(3.8, 6),
            arrowprops=dict(facecolor='black', shrink=0.05, width=2),
            horizontalalignment='left', verticalalignment='bottom',
            clip_on=True, # clip to the axes bounding box
            fontsize=16,
     )
# annotate these with their index
for i,row in dta.ix[dta['log.Te'] < 3.8].iterrows():
    ax.annotate(i, row, row + .01, fontsize=14)
xlim, ylim = ax.get_xlim(), ax.get_ylim()

from IPython.display import Image
Image(filename='star_diagram.png')

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-49-6b65229361a7> in <module>()
      1 from IPython.display import Image
----> 2 Image(filename='star_diagram.png')

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/display.py in __init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata)
    756         self.unconfined = unconfined
    757         self.metadata = metadata
--> 758         super(Image, self).__init__(data=data, url=url, filename=filename)
    759 
    760         if retina:

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/display.py in __init__(self, data, url, filename)
    392         self.filename = None if filename is None else unicode_type(filename)
    393 
--> 394         self.reload()
    395         self._check_data()
    396 

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/display.py in reload(self)
    778         """Reload the raw data from file or URL."""
    779         if self.embed:
--> 780             super(Image,self).reload()
    781             if self.retina:
    782                 self._retina_shape()

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/display.py in reload(self)
    410         """Reload the raw data from file or URL."""
    411         if self.filename is not None:
--> 412             with open(self.filename, self._read_flags) as f:
    413                 self.data = f.read()
    414         elif self.url is not None:

FileNotFoundError: [Errno 2] No such file or directory: 'star_diagram.png'

y = dta['log.light']
X = sm.add_constant(dta['log.Te'], prepend=True)
ols_model = sm.OLS(y, X).fit()
abline_plot(model_results=ols_model, ax=ax)

rlm_mod = sm.RLM(y, X, sm.robust.norms.TrimmedMean(.5)).fit()
abline_plot(model_results=rlm_mod, ax=ax, color='red')

Why? Because M-estimators are not robust to leverage points.

infl = ols_model.get_influence()

h_bar = 2*(ols_model.df_model + 1 )/ols_model.nobs
hat_diag = infl.summary_frame()['hat_diag']
hat_diag.ix[hat_diag > h_bar]

10    0.194103
19    0.194103
29    0.198344
33    0.194103
Name: hat_diag, dtype: float64

sidak2 = ols_model.outlier_test('sidak')
sidak2.sort('unadj_p', inplace=True)
print(sidak2)

    student_resid   unadj_p  sidak(p)
16      -2.049393  0.046415  0.892872
13      -2.035329  0.047868  0.900286
33       1.905847  0.063216  0.953543
18      -1.575505  0.122304  0.997826
1        1.522185  0.135118  0.998911
..            ...       ...       ...
2       -0.182093  0.856346  1.000000
23      -0.156014  0.876736  1.000000
27      -0.147406  0.883485  1.000000
24       0.065195  0.948314  1.000000
45       0.045675  0.963776  1.000000

[47 rows x 3 columns]

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app

fdr2 = ols_model.outlier_test('fdr_bh')
fdr2.sort('unadj_p', inplace=True)
print(fdr2)

    student_resid   unadj_p  fdr_bh(p)
16      -2.049393  0.046415   0.764747
13      -2.035329  0.047868   0.764747
33       1.905847  0.063216   0.764747
18      -1.575505  0.122304   0.764747
1        1.522185  0.135118   0.764747
..            ...       ...        ...
2       -0.182093  0.856346   0.922751
23      -0.156014  0.876736   0.922751
27      -0.147406  0.883485   0.922751
24       0.065195  0.948314   0.963776
45       0.045675  0.963776   0.963776

[47 rows x 3 columns]

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app

Let's delete that line

del ax.lines[-1]

weights = np.ones(len(X))
weights[X[X['log.Te'] < 3.8].index.values - 1] = 0
wls_model = sm.WLS(y, X, weights=weights).fit()
abline_plot(model_results=wls_model, ax=ax, color='green')

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-57-e3ec53a40864> in <module>()
      2 weights[X[X['log.Te'] < 3.8].index.values - 1] = 0
      3 wls_model = sm.WLS(y, X, weights=weights).fit()
----> 4 abline_plot(model_results=wls_model, ax=ax, color='green')

/private/tmp/statsmodels/statsmodels/graphics/regressionplots.py in abline_plot(intercept, slope, horiz, vert, model_results, ax, **kwargs)
    689 
    690     data_y = [x[0]*slope+intercept, x[1]*slope+intercept]
--> 691     ax.set_xlim(x)
    692     #ax.set_ylim(y)
    693 

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/matplotlib/axes/_base.py in set_xlim(self, left, right, emit, auto, **kw)
   2910 
   2911         if emit:
-> 2912             self.callbacks.process('xlim_changed', self)
   2913             # Call all of the other x-axes that are shared with this one
   2914             for other in self._shared_x_axes.get_siblings(self):

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/matplotlib/cbook.py in process(self, s, *args, **kwargs)
    547             for cid, proxy in list(six.iteritems(self.callbacks[s])):
    548                 try:
--> 549                     proxy(*args, **kwargs)
    550                 except ReferenceError:
    551                     self._remove_proxy(proxy)

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/matplotlib/cbook.py in __call__(self, *args, **kwargs)
    414             mtd = self.func
    415         # invoke the callable and return the result
--> 416         return mtd(*args, **kwargs)
    417 
    418     def __eq__(self, other):

/private/tmp/statsmodels/statsmodels/graphics/regressionplots.py in update_datalim(self, ax)
    701             children = ax.get_children()
    702             abline = [children[i] for i in range(len(children))
--> 703                       if isinstance(children[i], ABLine2D)][0]
    704             x = ax.get_xlim()
    705             y = [x[0]*slope+intercept, x[1]*slope+intercept]

IndexError: list index out of range

MM estimators are good for this type of problem, unfortunately, we don't yet have these yet.
It's being worked on, but it gives a good excuse to look at the R cell magics in the notebook.

yy = y.values[:,None]
xx = X['log.Te'].values[:,None]

%load_ext rpy2.ipython

%R library(robustbase)
%Rpush yy xx
%R mod <- lmrob(yy ~ xx);
%R params <- mod$coefficients;
%Rpull params

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-59-a5161b228cba> in <module>()
----> 1 get_ipython().magic('load_ext rpy2.ipython')
      2 
      3 get_ipython().magic('R library(robustbase)')
      4 get_ipython().magic('Rpush yy xx')
      5 get_ipython().magic('R mod <- lmrob(yy ~ xx);')

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/interactiveshell.py in magic(self, arg_s)
   2156         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2157         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2158         return self.run_line_magic(magic_name, magic_arg_s)
   2159 
   2160     #-------------------------------------------------------------------------

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_line_magic(self, magic_name, line)
   2077                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2078             with self.builtin_trap:
-> 2079                 result = fn(*args,**kwargs)
   2080             return result
   2081 

<decorator-gen-62> in load_ext(self, module_str)

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/magics/extension.py in load_ext(self, module_str)
     35         if not module_str:
     36             raise UsageError('Missing module name.')
---> 37         res = self.shell.extension_manager.load_extension(module_str)
     38 
     39         if res == 'already loaded':

/private/tmp/statsmodels/.env/lib/python3.6/site-packages/IPython/core/extensions.py in load_extension(self, module_str)
     81             if module_str not in sys.modules:
     82                 with prepended_to_syspath(self.ipython_extension_dir):
---> 83                     __import__(module_str)
     84             mod = sys.modules[module_str]
     85             if self._call_load_ipython_extension(mod):

ModuleNotFoundError: No module named 'rpy2'

%R print(mod)

ERROR:root:Line magic function `%R` not found.

print(params)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-61-73ac4b936803> in <module>()
----> 1 print(params)

NameError: name 'params' is not defined

abline_plot(intercept=params[0], slope=params[1], ax=ax, color='green')

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-62-e1a9f35b3320> in <module>()
----> 1 abline_plot(intercept=params[0], slope=params[1], ax=ax, color='green')

NameError: name 'params' is not defined

Exercise: Breakdown points of M-estimator¶

np.random.seed(12345)
nobs = 200
beta_true = np.array([3, 1, 2.5, 3, -4])
X = np.random.uniform(-20,20, size=(nobs, len(beta_true)-1))
# stack a constant in front
X = sm.add_constant(X, prepend=True) # np.c_[np.ones(nobs), X]
mc_iter = 500
contaminate = .25 # percentage of response variables to contaminate

all_betas = []
for i in range(mc_iter):
    y = np.dot(X, beta_true) + np.random.normal(size=200)
    random_idx = np.random.randint(0, nobs, size=int(contaminate * nobs))
    y[random_idx] = np.random.uniform(-750, 750)
    beta_hat = sm.RLM(y, X).fit().params
    all_betas.append(beta_hat)

all_betas = np.asarray(all_betas)
se_loss = lambda x : np.linalg.norm(x, ord=2)**2
se_beta = lmap(se_loss, all_betas - beta_true)

Squared error loss¶

np.array(se_beta).mean()

0.44502948730686354

all_betas.mean(0)

array([ 2.99711706,  0.99898147,  2.49909344,  2.99712918, -3.99626521])

beta_true

array([ 3. ,  1. ,  2.5,  3. , -4. ])

se_loss(all_betas.mean(0) - beta_true)

3.2360913286763295e-05