Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add comparative dataset analysis to paper #117

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: all format test install download upload docker documentation data clean build paper clean-paper
.PHONY: all format test install download upload docker documentation data clean build paper clean-paper presentations

all: data test

Expand Down Expand Up @@ -50,14 +50,21 @@ build:
publish:
twine upload dist/*

paper: paper/main.pdf
paper: paper/woodruff_ghenis_2024_enhanced_cps.pdf

paper/main.pdf: $(wildcard paper/sections/**/*.tex) $(wildcard paper/bibliography/*.bib) paper/main.tex paper/macros.tex
paper/woodruff_ghenis_2024_enhanced_cps.pdf: $(wildcard paper/sections/**/*.tex) $(wildcard paper/bibliography/*.bib) paper/main.tex paper/macros.tex
cd paper && \
BIBINPUTS=./bibliography pdflatex main && \
BIBINPUTS=./bibliography bibtex main && \
pdflatex main && \
pdflatex main
pdflatex -jobname=woodruff_ghenis_2024_enhanced_cps main && \
pdflatex -jobname=woodruff_ghenis_2024_enhanced_cps main

clean-paper:
rm -f paper/*.aux paper/*.bbl paper/*.blg paper/*.log paper/*.out paper/*.toc paper/main.pdf paper/sections/**/*.aux
rm -f paper/*.aux paper/*.bbl paper/*.blg paper/*.log paper/*.out paper/*.toc paper/*.pdf paper/sections/**/*.aux

presentations: presentations/nta_2024_11/nta_2024_slides.pdf

presentations/nta_2024_11/nta_2024_slides.pdf: presentations/nta_2024_11/main.tex
cd presentations/nta_2024_11 && \
pdflatex -jobname=nta_2024_slides main && \
pdflatex -jobname=nta_2024_slides main
3 changes: 3 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- bump: minor
added:
- Paper sections on inequality results, weights, demographic analysis, and PolicyEngine integration.
32 changes: 28 additions & 4 deletions docs/results.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ValueError",
"evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m cps \u001b[38;5;241m=\u001b[39m Microsimulation()\n\u001b[1;32m 6\u001b[0m ecps \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124menhanced_cps_2024\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 7\u001b[0m puf \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39mPUF_2024)\n",
"File \u001b[0;32m~/miniconda3/envs/policyengine/lib/python3.11/site-packages/policyengine_us/system.py:150\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 150\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 152\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, year_start\n\u001b[1;32m 154\u001b[0m )\n\u001b[1;32m 155\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[0;32m~/miniconda3/envs/policyengine/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:164\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace)\u001b[0m\n\u001b[1;32m 160\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 161\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 162\u001b[0m )\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m dataset(require\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 165\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 167\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 168\u001b[0m )\n",
"File \u001b[0;32m~/miniconda3/envs/policyengine/lib/python3.11/site-packages/policyengine_core/data/dataset.py:66\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 66\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n",
"File \u001b[0;32m~/miniconda3/envs/policyengine/lib/python3.11/site-packages/policyengine_core/data/dataset.py:308\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url)\u001b[0m\n\u001b[1;32m 306\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 307\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 308\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 309\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 310\u001b[0m )\n\u001b[1;32m 311\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n",
"\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0."
]
}
],
"source": [
"from policyengine_us import Microsimulation\n",
"from policyengine_us_data import PUF_2024\n",
Expand Down Expand Up @@ -7270,8 +7286,16 @@
" title=\"Weight\",\n",
" type=\"log\",\n",
" ),\n",
")"
")\n",
"format_fig(fig).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -7290,7 +7314,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
"version": "3.11.9"
}
},
"nbformat": 4,
Expand Down
3 changes: 3 additions & 0 deletions paper/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
*.cb
*.cb2
.*.lb
*.nav
*.snm
*.vrb

## Generated if empty string is given at "Please type another file name for output:"
.pdf
Expand Down
25 changes: 25 additions & 0 deletions paper/bibliography/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,28 @@ @techreport{woodruff2023survey
note = {Demonstrates superiority of machine learning approaches over traditional methods for survey enhancement through comprehensive benchmarking},
url = {https://github.com/policyengine/survey-enhance/blob/main/docs/paper/project_paper.pdf}
}

@techreport{khitatrakun2023race,
title = {A New Approach for Estimating the Impact of Tax Policies by Race and Ethnicity},
author = {Khitatrakun, Surachai and Mermin, Gordon and Page, Benjamin and Rohaly, Jeffrey},
institution = {Tax Policy Center},
year = {2023},
month = {2},
url = {https://taxpolicycenter.org/sites/default/files/publication/164920/a_new_approach_for_estimating_the_impact_of_tax_policies_by_race_and_ethnicity.pdf}
}

@misc{itep2024race,
title = {ITEP's Approach to Modeling Taxes by Race and Ethnicity},
author = {{Institute on Taxation and Economic Policy}},
year = {2024},
url = {https://itep.org/itep-tax-model/iteps-approach-to-modeling-taxes-by-race-and-ethnicity/}
}

@article{cbo2024race,
title = {CBO Presents Initial Estimates of Taxpayers by Race and Ethnicity},
author = {Heller, Rebecca and Mok, Shannon and Pearce, James},
journal = {Congressional Budget Office Blog},
year = {2024},
month = {5},
url = {https://www.cbo.gov/publication/60171}
}
Binary file added paper/figures/logos/blue.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/figures/logos/white.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/figures/policyengine_policy.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/figures/policyengine_results.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/figures/quantile_loss.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed paper/main.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion paper/main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{geometry}
\usepackage{microtype}
\usepackage[disable]{microtype}
\usepackage{xcolor}

% Set citation style in preamble
Expand Down
2 changes: 2 additions & 0 deletions paper/sections/methodology.tex
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ \section{Methodology}\label{sec:methodology}
\end{figure}

\input{sections/methodology/overview}
\input{sections/methodology/policyengine}
\input{sections/methodology/demographic_analysis}
\input{sections/methodology/demographic_variables}
\input{sections/methodology/puf_preprocessing}
\input{sections/methodology/aging}
Expand Down
24 changes: 24 additions & 0 deletions paper/sections/methodology/demographic_analysis.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
\subsection{Demographic Analysis}

A key advantage of building from CPS microdata is the ability to analyze policies by demographics not available in tax returns. While organizations using tax return data as their base must develop complex methods to impute race and ethnicity, our approach provides these characteristics directly from the survey data.

The Internal Revenue Service does not collect information on race or ethnicity from tax filers. Other microsimulation models have addressed this limitation through various imputation approaches:

\begin{itemize}
\item The Congressional Budget Office statistically matches tax returns to survey records using income and limited demographic characteristics, then validates against linked Census-IRS data \citep{cbo2024race}
\item The Tax Policy Center creates multiple copies of each tax unit record, then uses an algorithm to reweight these copies to match aggregate race and ethnicity statistics from survey data \citep{khitatrakun2023race}
\item The Institute on Taxation and Economic Policy assigns each tax record probabilities of different racial and ethnic identities based on characteristics like income, marital status, state, and homeownership \citep{itep2024race}
\end{itemize}

These approaches require complex statistical methods and face inherent limitations in accuracy, particularly when analyzing subgroups or policy impacts that may vary by demographic characteristics not used in the imputation process.

In contrast, our approach provides race and ethnicity variables directly from the CPS without requiring complex imputation. This offers several advantages:

\begin{itemize}
\item Race and ethnicity are observed rather than imputed, avoiding potential biases from statistical matching
\item Demographic information is available at the individual level, not just for tax unit heads
\item The same enhancement methodology can be applied to analyze other demographic characteristics like disability status and educational attainment
\item Interactions between demographics can be analyzed naturally (e.g., poverty impacts by both race and age)
\end{itemize}

This capability enables more reliable analysis of how tax and benefit policies affect different demographic groups. For example, using the enhanced CPS we can directly examine how the Earned Income Tax Credit's benefits vary by race and ethnicity, or analyze the distributional effects of Child Tax Credit reforms across both income levels and demographic categories.
80 changes: 80 additions & 0 deletions paper/sections/methodology/policyengine.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
\subsection{PolicyEngine Integration and Access}

The enhanced dataset is designed to integrate seamlessly with PolicyEngine, an open-source tax-benefit microsimulation platform available both as a Python package and a web application at \url{policyengine.org}. The platform provides comprehensive tools for analyzing tax and benefit reforms through both programmatic and web interfaces.

\subsubsection{Web Interface}

PolicyEngine's web interface at \url{policyengine.org/us} allows users to:
\begin{itemize}
\item Modify thousands of policy parameters across federal and state tax and benefit programs
\item Analyze reforms' impacts on:
\begin{itemize}
\item Government budgets (federal taxes, benefits, and state/local taxes)
\item Income distribution (gains and losses across the income spectrum)
\item Poverty (by age, race/ethnicity, and sex using the SPM)
\item Inequality (various metrics)
\item Labor supply (with customizable elasticities)
\end{itemize}
\item Generate natural language summaries of policy impacts using Claude 3.5 Sonnet
\item Calculate household-specific impacts by entering detailed information
\item View marginal tax rates under current law and reforms
\end{itemize}

\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{figures/policyengine_policy.png}
\caption{PolicyEngine's policy editor interface, showing modification of the top marginal tax rate.}
\label{fig:policyengine_policy}
\end{figure}

\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{figures/policyengine_results.png}
\caption{Example distributional analysis from PolicyEngine showing population impacts by income decile.}
\label{fig:policyengine_results}
\end{figure}

\subsubsection{Python Package}

The Python package provides programmatic access with just a few lines of code:

\begin{verbatim}
from policyengine_us import Microsimulation

# Load enhanced CPS dataset
sim = Microsimulation(dataset="enhanced_cps_2024")

# Analyze a tax reform
reform = {
"gov.irs.tax_rate.single": {
2024: [
{"threshold": 400_000, "rate": 0.396}
]
}
}
reformed = Microsimulation(reform=reform)

# Calculate revenue impact
baseline_revenue = sim.calculate("income_tax").sum()
reform_revenue = reformed.calculate("income_tax").sum()
revenue_impact = reform_revenue - baseline_revenue

# Analyze impacts by group
income_deciles = sim.calculate("income_decile")
for decile in range(1, 11):
mask = income_deciles == decile
impact = reformed.calculate(
"household_net_income",
mask=mask
).mean() - sim.calculate(
"household_net_income",
mask=mask
).mean()
print(f"Decile {decile}: ${impact:,.0f}")
\end{verbatim}

\subsubsection{International Applications}

The same enhancement methodology and software infrastructure powers PolicyEngine UK, which incorporates additional data sources including the Living Costs and Food Survey (for consumption) and Wealth and Assets Survey. This demonstrates the approach's adaptability to different national contexts and data environments.

While designed for seamless integration with PolicyEngine, the enhanced dataset is also available as a standalone HDF5 file that can be used with other microsimulation frameworks, promoting broader research applications while maintaining the benefits of integrated analysis tools.
Loading
Loading