{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Discrete Choice Models Overview" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:41.986225Z", "iopub.status.busy": "2022-11-02T17:10:41.984993Z", "iopub.status.idle": "2022-11-02T17:10:42.872671Z", "shell.execute_reply": "2022-11-02T17:10:42.871955Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "import numpy as np\n", "import statsmodels.api as sm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data\n", "\n", "Load data from Spector and Mazzeo (1980). Examples follow Greene's Econometric Analysis Ch. 21 (5th Edition)." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:42.878206Z", "iopub.status.busy": "2022-11-02T17:10:42.876915Z", "iopub.status.idle": "2022-11-02T17:10:42.889707Z", "shell.execute_reply": "2022-11-02T17:10:42.889161Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "spector_data = sm.datasets.spector.load()\n", "spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Inspect the data:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:42.894296Z", "iopub.status.busy": "2022-11-02T17:10:42.893174Z", "iopub.status.idle": "2022-11-02T17:10:42.902676Z", "shell.execute_reply": "2022-11-02T17:10:42.902127Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " GPA TUCE PSI const\n", "0 2.66 20.0 0.0 1.0\n", "1 2.89 22.0 0.0 1.0\n", "2 3.28 24.0 0.0 1.0\n", "3 2.92 12.0 0.0 1.0\n", "4 4.00 21.0 0.0 1.0\n", "0 0.0\n", "1 0.0\n", "2 0.0\n", "3 0.0\n", "4 1.0\n", "Name: GRADE, dtype: float64\n" ] } ], "source": [ "print(spector_data.exog.head())\n", "print(spector_data.endog.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Linear Probability Model (OLS)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:42.907128Z", "iopub.status.busy": "2022-11-02T17:10:42.906026Z", "iopub.status.idle": "2022-11-02T17:10:42.914493Z", "shell.execute_reply": "2022-11-02T17:10:42.913936Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parameters: GPA 0.463852\n", "TUCE 0.010495\n", "PSI 0.378555\n", "dtype: float64\n" ] } ], "source": [ "lpm_mod = sm.OLS(spector_data.endog, spector_data.exog)\n", "lpm_res = lpm_mod.fit()\n", "print(\"Parameters: \", lpm_res.params[:-1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Logit Model" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:42.919564Z", "iopub.status.busy": "2022-11-02T17:10:42.918461Z", "iopub.status.idle": "2022-11-02T17:10:42.934591Z", "shell.execute_reply": "2022-11-02T17:10:42.933976Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parameters: GPA 2.826113\n", "TUCE 0.095158\n", "PSI 2.378688\n", "const -13.021347\n", "dtype: float64\n" ] } ], "source": [ "logit_mod = sm.Logit(spector_data.endog, spector_data.exog)\n", "logit_res = logit_mod.fit(disp=0)\n", "print(\"Parameters: \", logit_res.params)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Marginal Effects" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:42.939181Z", "iopub.status.busy": "2022-11-02T17:10:42.938073Z", "iopub.status.idle": "2022-11-02T17:10:42.946761Z", "shell.execute_reply": "2022-11-02T17:10:42.946176Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Logit Marginal Effects \n", "=====================================\n", "Dep. Variable: GRADE\n", "Method: dydx\n", "At: overall\n", "==============================================================================\n", " dy/dx std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "GPA 0.3626 0.109 3.313 0.001 0.148 0.577\n", "TUCE 0.0122 0.018 0.686 0.493 -0.023 0.047\n", "PSI 0.3052 0.092 3.304 0.001 0.124 0.486\n", "==============================================================================\n" ] } ], "source": [ "margeff = logit_res.get_margeff()\n", "print(margeff.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As in all the discrete data models presented below, we can print a nice summary of results:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:42.951244Z", "iopub.status.busy": "2022-11-02T17:10:42.950142Z", "iopub.status.idle": "2022-11-02T17:10:42.977509Z", "shell.execute_reply": "2022-11-02T17:10:42.976853Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: GRADE No. Observations: 32\n", "Model: Logit Df Residuals: 28\n", "Method: MLE Df Model: 3\n", "Date: Wed, 02 Nov 2022 Pseudo R-squ.: 0.3740\n", "Time: 17:10:42 Log-Likelihood: -12.890\n", "converged: True LL-Null: -20.592\n", "Covariance Type: nonrobust LLR p-value: 0.001502\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "GPA 2.8261 1.263 2.238 0.025 0.351 5.301\n", "TUCE 0.0952 0.142 0.672 0.501 -0.182 0.373\n", "PSI 2.3787 1.065 2.234 0.025 0.292 4.465\n", "const -13.0213 4.931 -2.641 0.008 -22.687 -3.356\n", "==============================================================================\n" ] } ], "source": [ "print(logit_res.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Probit Model " ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:42.981278Z", "iopub.status.busy": "2022-11-02T17:10:42.981042Z", "iopub.status.idle": "2022-11-02T17:10:42.997910Z", "shell.execute_reply": "2022-11-02T17:10:42.997318Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.400588\n", " Iterations 6\n", "Parameters: GPA 1.625810\n", "TUCE 0.051729\n", "PSI 1.426332\n", "const -7.452320\n", "dtype: float64\n", "Marginal effects: \n", " Probit Marginal Effects \n", "=====================================\n", "Dep. Variable: GRADE\n", "Method: dydx\n", "At: overall\n", "==============================================================================\n", " dy/dx std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "GPA 0.3608 0.113 3.182 0.001 0.139 0.583\n", "TUCE 0.0115 0.018 0.624 0.533 -0.025 0.048\n", "PSI 0.3165 0.090 3.508 0.000 0.140 0.493\n", "==============================================================================\n" ] } ], "source": [ "probit_mod = sm.Probit(spector_data.endog, spector_data.exog)\n", "probit_res = probit_mod.fit()\n", "probit_margeff = probit_res.get_margeff()\n", "print(\"Parameters: \", probit_res.params)\n", "print(\"Marginal effects: \")\n", "print(probit_margeff.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multinomial Logit" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load data from the American National Election Studies:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:43.002598Z", "iopub.status.busy": "2022-11-02T17:10:43.001474Z", "iopub.status.idle": "2022-11-02T17:10:43.020479Z", "shell.execute_reply": "2022-11-02T17:10:43.019859Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "anes_data = sm.datasets.anes96.load()\n", "anes_exog = anes_data.exog\n", "anes_exog = sm.add_constant(anes_exog)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Inspect the data:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:43.025421Z", "iopub.status.busy": "2022-11-02T17:10:43.024266Z", "iopub.status.idle": "2022-11-02T17:10:43.033509Z", "shell.execute_reply": "2022-11-02T17:10:43.032952Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " logpopul selfLR age educ income\n", "0 -2.302585 7.0 36.0 3.0 1.0\n", "1 5.247550 3.0 20.0 4.0 1.0\n", "2 3.437208 2.0 24.0 6.0 1.0\n", "3 4.420045 3.0 28.0 6.0 1.0\n", "4 6.461624 5.0 68.0 6.0 1.0\n", "0 6.0\n", "1 1.0\n", "2 1.0\n", "3 1.0\n", "4 0.0\n", "Name: PID, dtype: float64\n" ] } ], "source": [ "print(anes_data.exog.head())\n", "print(anes_data.endog.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit MNL model:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:43.038096Z", "iopub.status.busy": "2022-11-02T17:10:43.036965Z", "iopub.status.idle": "2022-11-02T17:10:43.145734Z", "shell.execute_reply": "2022-11-02T17:10:43.145126Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 1.548647\n", " Iterations 7\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 0 1 2 3 4 5\n", "const -0.373402 -2.250913 -3.665584 -7.613843 -7.060478 -12.105751\n", "logpopul -0.011536 -0.088751 -0.105967 -0.091557 -0.093285 -0.140881\n", "selfLR 0.297714 0.391669 0.573451 1.278772 1.346962 2.070080\n", "age -0.024945 -0.022898 -0.014851 -0.008681 -0.017904 -0.009433\n", "educ 0.082491 0.181043 -0.007152 0.199828 0.216939 0.321926\n", "income 0.005197 0.047874 0.057575 0.084498 0.080958 0.108894\n" ] } ], "source": [ "mlogit_mod = sm.MNLogit(anes_data.endog, anes_exog)\n", "mlogit_res = mlogit_mod.fit()\n", "print(mlogit_res.params)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Poisson\n", "\n", "Load the Rand data. Note that this example is similar to Cameron and Trivedi's `Microeconometrics` Table 20.5, but it is slightly different because of minor changes in the data. " ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:43.149191Z", "iopub.status.busy": "2022-11-02T17:10:43.148787Z", "iopub.status.idle": "2022-11-02T17:10:43.199644Z", "shell.execute_reply": "2022-11-02T17:10:43.199001Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "rand_data = sm.datasets.randhie.load()\n", "rand_exog = rand_data.exog\n", "rand_exog = sm.add_constant(rand_exog, prepend=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit Poisson model: " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:43.203560Z", "iopub.status.busy": "2022-11-02T17:10:43.203101Z", "iopub.status.idle": "2022-11-02T17:10:43.347012Z", "shell.execute_reply": "2022-11-02T17:10:43.346256Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 3.091609\n", " Iterations 6\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Poisson Regression Results \n", "==============================================================================\n", "Dep. Variable: mdvis No. Observations: 20190\n", "Model: Poisson Df Residuals: 20180\n", "Method: MLE Df Model: 9\n", "Date: Wed, 02 Nov 2022 Pseudo R-squ.: 0.06343\n", "Time: 17:10:43 Log-Likelihood: -62420.\n", "converged: True LL-Null: -66647.\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "lncoins -0.0525 0.003 -18.216 0.000 -0.058 -0.047\n", "idp -0.2471 0.011 -23.272 0.000 -0.268 -0.226\n", "lpi 0.0353 0.002 19.302 0.000 0.032 0.039\n", "fmde -0.0346 0.002 -21.439 0.000 -0.038 -0.031\n", "physlm 0.2717 0.012 22.200 0.000 0.248 0.296\n", "disea 0.0339 0.001 60.098 0.000 0.033 0.035\n", "hlthg -0.0126 0.009 -1.366 0.172 -0.031 0.005\n", "hlthf 0.0541 0.015 3.531 0.000 0.024 0.084\n", "hlthp 0.2061 0.026 7.843 0.000 0.155 0.258\n", "const 0.7004 0.011 62.741 0.000 0.678 0.722\n", "==============================================================================" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "poisson_mod = sm.Poisson(rand_data.endog, rand_exog)\n", "poisson_res = poisson_mod.fit(method=\"newton\")\n", "print(poisson_res.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Negative Binomial\n", "\n", "The negative binomial model gives slightly different results. " ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:43.351168Z", "iopub.status.busy": "2022-11-02T17:10:43.350786Z", "iopub.status.idle": "2022-11-02T17:10:44.013179Z", "shell.execute_reply": "2022-11-02T17:10:44.012429Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/hostedtoolcache/Python/3.10.8/x64/lib/python3.10/site-packages/statsmodels/base/model.py:604: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n", " warnings.warn(\"Maximum Likelihood optimization failed to \"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " NegativeBinomial Regression Results \n", "==============================================================================\n", "Dep. Variable: mdvis No. Observations: 20190\n", "Model: NegativeBinomial Df Residuals: 20180\n", "Method: MLE Df Model: 9\n", "Date: Wed, 02 Nov 2022 Pseudo R-squ.: 0.01845\n", "Time: 17:10:43 Log-Likelihood: -43384.\n", "converged: False LL-Null: -44199.\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "lncoins -0.0579 0.006 -9.515 0.000 -0.070 -0.046\n", "idp -0.2678 0.023 -11.802 0.000 -0.312 -0.223\n", "lpi 0.0412 0.004 9.938 0.000 0.033 0.049\n", "fmde -0.0381 0.003 -11.216 0.000 -0.045 -0.031\n", "physlm 0.2691 0.030 8.985 0.000 0.210 0.328\n", "disea 0.0382 0.001 26.080 0.000 0.035 0.041\n", "hlthg -0.0441 0.020 -2.201 0.028 -0.083 -0.005\n", "hlthf 0.0173 0.036 0.478 0.632 -0.054 0.088\n", "hlthp 0.1782 0.074 2.399 0.016 0.033 0.324\n", "const 0.6635 0.025 26.786 0.000 0.615 0.712\n", "alpha 1.2930 0.019 69.477 0.000 1.256 1.329\n", "==============================================================================\n" ] } ], "source": [ "mod_nbin = sm.NegativeBinomial(rand_data.endog, rand_exog)\n", "res_nbin = mod_nbin.fit(disp=False)\n", "print(res_nbin.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Alternative solvers\n", "\n", "The default method for fitting discrete data MLE models is Newton-Raphson. You can use other solvers by using the ``method`` argument: " ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2022-11-02T17:10:44.017351Z", "iopub.status.busy": "2022-11-02T17:10:44.016868Z", "iopub.status.idle": "2022-11-02T17:10:44.400642Z", "shell.execute_reply": "2022-11-02T17:10:44.399896Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 1.548647\n", " Iterations: 111\n", " Function evaluations: 117\n", " Gradient evaluations: 117\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " MNLogit Regression Results \n", "==============================================================================\n", "Dep. Variable: PID No. Observations: 944\n", "Model: MNLogit Df Residuals: 908\n", "Method: MLE Df Model: 30\n", "Date: Wed, 02 Nov 2022 Pseudo R-squ.: 0.1648\n", "Time: 17:10:44 Log-Likelihood: -1461.9\n", "converged: True LL-Null: -1750.3\n", "Covariance Type: nonrobust LLR p-value: 1.822e-102\n", "==============================================================================\n", " PID=1 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -0.3734 0.630 -0.593 0.553 -1.608 0.861\n", "logpopul -0.0115 0.034 -0.337 0.736 -0.079 0.056\n", "selfLR 0.2977 0.094 3.180 0.001 0.114 0.481\n", "age -0.0249 0.007 -3.823 0.000 -0.038 -0.012\n", "educ 0.0825 0.074 1.121 0.262 -0.062 0.227\n", "income 0.0052 0.018 0.295 0.768 -0.029 0.040\n", "------------------------------------------------------------------------------\n", " PID=2 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -2.2509 0.763 -2.949 0.003 -3.747 -0.755\n", "logpopul -0.0888 0.039 -2.266 0.023 -0.166 -0.012\n", "selfLR 0.3917 0.108 3.619 0.000 0.180 0.604\n", "age -0.0229 0.008 -2.893 0.004 -0.038 -0.007\n", "educ 0.1810 0.085 2.123 0.034 0.014 0.348\n", "income 0.0479 0.022 2.149 0.032 0.004 0.092\n", "------------------------------------------------------------------------------\n", " PID=3 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -3.6656 1.157 -3.169 0.002 -5.932 -1.399\n", "logpopul -0.1060 0.057 -1.858 0.063 -0.218 0.006\n", "selfLR 0.5734 0.159 3.617 0.000 0.263 0.884\n", "age -0.0149 0.011 -1.311 0.190 -0.037 0.007\n", "educ -0.0072 0.126 -0.057 0.955 -0.255 0.240\n", "income 0.0576 0.034 1.713 0.087 -0.008 0.123\n", "------------------------------------------------------------------------------\n", " PID=4 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -7.6139 0.958 -7.951 0.000 -9.491 -5.737\n", "logpopul -0.0916 0.044 -2.091 0.037 -0.177 -0.006\n", "selfLR 1.2788 0.129 9.921 0.000 1.026 1.531\n", "age -0.0087 0.008 -1.031 0.302 -0.025 0.008\n", "educ 0.1998 0.094 2.123 0.034 0.015 0.384\n", "income 0.0845 0.026 3.226 0.001 0.033 0.136\n", "------------------------------------------------------------------------------\n", " PID=5 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -7.0605 0.844 -8.362 0.000 -8.715 -5.406\n", "logpopul -0.0933 0.039 -2.371 0.018 -0.170 -0.016\n", "selfLR 1.3470 0.117 11.494 0.000 1.117 1.577\n", "age -0.0179 0.008 -2.352 0.019 -0.033 -0.003\n", "educ 0.2169 0.085 2.552 0.011 0.050 0.384\n", "income 0.0810 0.023 3.524 0.000 0.036 0.126\n", "------------------------------------------------------------------------------\n", " PID=6 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -12.1058 1.060 -11.421 0.000 -14.183 -10.028\n", "logpopul -0.1409 0.042 -3.343 0.001 -0.223 -0.058\n", "selfLR 2.0701 0.143 14.435 0.000 1.789 2.351\n", "age -0.0094 0.008 -1.160 0.246 -0.025 0.007\n", "educ 0.3219 0.091 3.534 0.000 0.143 0.500\n", "income 0.1089 0.025 4.304 0.000 0.059 0.158\n", "==============================================================================\n" ] } ], "source": [ "mlogit_res = mlogit_mod.fit(method=\"bfgs\", maxiter=250)\n", "print(mlogit_res.summary())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 4 }