Missing Data

All of the models can handle missing data. For performance reasons, the default is not to do any checking for missing data. If, however, you would like missing data to be handled internally, you can do so by using the missing keyword argument. The default, missing='none', is to do nothing, as the following example shows:

In [1]: import statsmodels.api as sm

In [2]: data = sm.datasets.longley.load()

In [3]: data.exog = sm.add_constant(data.exog)

# add in some missing data (requires numpy: import numpy as np)
In [4]: missing_idx = np.array([False] * len(data.endog))

In [5]: missing_idx[[4, 10, 15]] = True

In [6]: data.endog[missing_idx] = np.nan

In [7]: ols_model = sm.OLS(data.endog, data.exog)

In [8]: ols_fit = ols_model.fit()

In [9]: print(ols_fit.params)
const     NaN
GNPDEFL   NaN
GNP       NaN
UNEMP     NaN
ARMED     NaN
POP       NaN
YEAR      NaN
dtype: float64

This silently fails and all of the model parameters are NaN, which is probably not what you expected. If you are not sure whether or not you have missing data, you can use missing='raise'. This will raise a MissingDataError during model instantiation if missing data is present, so that you know something is wrong in your input data.

In [10]: ols_model = sm.OLS(data.endog, data.exog, missing='raise')
---------------------------------------------------------------------------
MissingDataError                          Traceback (most recent call last)
Cell In[10], line 1
----> 1 ols_model = sm.OLS(data.endog, data.exog, missing='raise')

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/regression/linear_model.py:921, in OLS.__init__(self, endog, exog, missing, hasconst, **kwargs)
    918     msg = ("Weights are not supported in OLS and will be ignored"
    919            "An exception will be raised in the next version.")
    920     warnings.warn(msg, ValueWarning)
--> 921 super().__init__(endog, exog, missing=missing,
    922                           hasconst=hasconst, **kwargs)
    923 if "weights" in self._init_keys:
    924     self._init_keys.remove("weights")

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/regression/linear_model.py:746, in WLS.__init__(self, endog, exog, weights, missing, hasconst, **kwargs)
    744 else:
    745     weights = weights.squeeze()
--> 746 super().__init__(endog, exog, missing=missing,
    747                           weights=weights, hasconst=hasconst, **kwargs)
    748 nobs = self.exog.shape[0]
    749 weights = self.weights

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/regression/linear_model.py:200, in RegressionModel.__init__(self, endog, exog, **kwargs)
    199 def __init__(self, endog, exog, **kwargs):
--> 200     super().__init__(endog, exog, **kwargs)
    201     self.pinv_wexog: Float64Array | None = None
    202     self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/base/model.py:275, in LikelihoodModel.__init__(self, endog, exog, **kwargs)
    274 def __init__(self, endog, exog=None, **kwargs):
--> 275     super().__init__(endog, exog, **kwargs)
    276     self.initialize()

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/base/model.py:97, in Model.__init__(self, endog, exog, **kwargs)
     95 missing = kwargs.pop('missing', 'none')
     96 hasconst = kwargs.pop('hasconst', None)
---> 97 self.data = self._handle_data(endog, exog, missing, hasconst,
     98                               **kwargs)
     99 self.k_constant = self.data.k_constant
    100 self.exog = self.data.exog

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/base/model.py:137, in Model._handle_data(self, endog, exog, missing, hasconst, **kwargs)
    136 def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 137     data = handle_data(endog, exog, missing, hasconst, **kwargs)
    138     # kwargs arrays could have changed, easier to just attach here
    139     for key in kwargs:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/base/data.py:702, in handle_data(endog, exog, missing, hasconst, **kwargs)
    699     exog = np.asarray(exog)
    701 klass = handle_data_class_factory(endog, exog)
--> 702 return klass(endog, exog=exog, missing=missing, hasconst=hasconst, **kwargs)

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/base/data.py:76, in ModelData.__init__(self, endog, exog, missing, hasconst, **kwargs)
     74     self.formula = kwargs.pop("formula")
     75 if missing != "none":
---> 76     arrays, nan_idx = self.handle_missing(endog, exog, missing, **kwargs)
     77     self.missing_row_idx = nan_idx
     78     self.__dict__.update(arrays)  # attach all the data arrays

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/statsmodels/base/data.py:297, in ModelData.handle_missing(cls, endog, exog, missing, **kwargs)
    294     return combined, []
    296 elif missing == "raise":
--> 297     raise MissingDataError("NaNs were encountered in the data")
    299 elif missing == "drop":
    300     nan_mask = ~nan_mask

MissingDataError: NaNs were encountered in the data

If you want statsmodels to handle the missing data by dropping the observations that contain NaNs, use missing='drop'.

In [11]: ols_model = sm.OLS(data.endog, data.exog, missing='drop')

We are considering adding a configuration framework so that you can set the option with a global setting.


Last update: Dec 16, 2024