Skip to content

Commit

Permalink
Merge pull request #202 from jdebacker/flex_data
Browse files Browse the repository at this point in the history
Allow for more flexible microdata inputs
  • Loading branch information
jdebacker authored Jun 10, 2024
2 parents 8b9fba3 + 782931e commit 1cd87f4
Show file tree
Hide file tree
Showing 11 changed files with 312 additions and 97 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
shell: bash -l {0}
working-directory: ./
run: |
pytest --cov=./ --cov-report=xml
pytest -m 'not requires_pufcsv and not requires_tmdcsv' --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
if: matrix.os == 'ubuntu-latest' && contains(github.repository, 'PSLmodels/Tax-Brain')
uses: codecov/codecov-action@v4
Expand Down
10 changes: 10 additions & 0 deletions RELEASES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Tax-Brain Release History


## 2024-06-10 Release 2.7.1

Last Merged Pull Request: [#202](https://github.com/PSLmodels/Tax-Brain/pull/202)

Changes in this release:

* Use of the [Tax Micro Data (TMD)](https://github.com/PSLmodels/tax-microdata-benchmarking) file: [#202](https://github.com/PSLmodels/Tax-Brain/pull/202)


## 2024-04-25 Release 2.7.0

Last Merged Pull Request: [#196](https://github.com/PSLmodels/Tax-Brain/pull/196)
Expand Down
42 changes: 36 additions & 6 deletions cs-config/cs_config/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
postprocess,
nth_year_results,
retrieve_puf,
retrieve_tmd,
)
from .outputs import create_layout, aggregate_plot
from taxbrain import TaxBrain, report
Expand All @@ -25,6 +26,9 @@
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/tmd.20210720.csv.gz"
)

CUR_PATH = os.path.abspath(os.path.dirname(__file__))

Expand Down Expand Up @@ -108,7 +112,6 @@ def run_model(meta_params_dict, adjustment):
behavior_mods = cs2tc.convert_behavior_adjustment(adjustment["behavior"])
user_mods = {"policy": policy_mods, "behavior": behavior_mods}
start_year = int(meta_params.year)
use_cps = meta_params.data_source == "CPS"
if meta_params.data_source == "PUF":
puf_df = retrieve_puf(
PUF_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
Expand All @@ -117,22 +120,45 @@ def run_model(meta_params_dict, adjustment):
if not isinstance(puf_df, pd.DataFrame):
raise TypeError("'puf_df' must be a Pandas DataFrame.")
fuzz = True
use_cps = False
sampling_frac = 0.05
sampling_seed = 2222
full_sample = puf_df
data_start_year = taxcalc.Records.PUFCSV_YEAR
weights = taxcalc.Records.PUF_WEIGHTS_FILENAME
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
if meta_params.data_source == "CPS":
elif meta_params.data_source == "TMD":
tmd_df = retrieve_tmd(
TMD_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
if tmd_df is not None:
if not isinstance(tmd_df, pd.DataFrame):
raise TypeError("'tmd_df' must be a Pandas DataFrame.")
fuzz = True
sampling_frac = 0.05
sampling_seed = 2222
full_sample = tmd_df
data_start_year = taxcalc.Records.TMDCSV_YEAR
weights = taxcalc.Records.TMD_WEIGHTS_FILENAME
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "CPS":
fuzz = False
use_cps = True
input_path = os.path.join(TCDIR, "cps.csv.gz")
# full_sample = read_egg_csv(cpspath) # pragma: no cover
sampling_frac = 0.03
sampling_seed = 180
full_sample = pd.read_csv(input_path)
data_start_year = taxcalc.Records.CPSCSV_YEAR
weights = taxcalc.Records.CPS_WEIGHTS_FILENAME
else:
raise ValueError(
f"Data source '{meta_params.data_source}' is not supported."
)

if meta_params.use_full_sample:
sample = full_sample
Expand All @@ -146,8 +172,12 @@ def run_model(meta_params_dict, adjustment):
tb = TaxBrain(
start_year,
end_year,
microdata=sample,
use_cps=use_cps,
microdata={
"data": sample,
"start_year": data_start_year,
"growfactors": None,
"weights": weights,
},
reform=policy_mods,
behavior=behavior_mods,
)
Expand Down
43 changes: 42 additions & 1 deletion cs-config/cs_config/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)

TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/tmd.20210720.csv.gz"
)


def random_seed(user_mods, year):
"""
Expand Down Expand Up @@ -376,7 +380,7 @@ def retrieve_puf(
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
):
"""
Function for retrieving the PUF from the OSPC S3 bucket
Function for retrieving the PUF from the S3 bucket
"""
s3_reader_installed = S3FileSystem is not None
has_credentials = (
Expand Down Expand Up @@ -405,3 +409,40 @@ def retrieve_puf(
f"s3_reader_installed={s3_reader_installed})"
)
return None


def retrieve_tmd(
    tmd_s3_file_location=TMD_S3_FILE_LOCATION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
):
    """
    Function for retrieving the TMD file as a Pandas DataFrame.

    Sources are tried in this order:
      1. the S3 location, when a location is given, both credentials are
         supplied, and the S3 reader (``S3FileSystem``) is installed;
      2. a local ``tmd.csv.gz`` in the current working directory;
      3. a local ``tmd.csv`` in the current working directory.

    Parameters
    ----------
    tmd_s3_file_location: S3 path of the TMD file
    aws_access_key_id: AWS access key used to open the S3 connection
    aws_secret_access_key: AWS secret key used to open the S3 connection

    Returns
    -------
    tmd_df: Pandas DataFrame with the TMD data, or None (after issuing a
        warning) when no source is available
    """
    s3_reader_installed = S3FileSystem is not None
    has_credentials = (
        aws_access_key_id is not None and aws_secret_access_key is not None
    )
    if tmd_s3_file_location and has_credentials and s3_reader_installed:
        print("Reading tmd from S3 bucket.", tmd_s3_file_location)
        # Connect with the credentials passed to this function -- the same
        # values that were validated above -- rather than the module-level
        # defaults, so that caller-supplied keys are actually honored.
        fs = S3FileSystem(
            key=aws_access_key_id,
            secret=aws_secret_access_key,
        )
        with fs.open(tmd_s3_file_location) as f:
            # NOTE(review): no explicit compression is passed for the
            # open handle; confirm pandas decompresses a ".gz" object here.
            tmd_df = pd.read_csv(f)
        return tmd_df
    elif Path("tmd.csv.gz").exists():
        print("Reading tmd from tmd.csv.gz.")
        return pd.read_csv("tmd.csv.gz", compression="gzip")
    elif Path("tmd.csv").exists():
        print("Reading tmd from tmd.csv.")
        return pd.read_csv("tmd.csv")
    else:
        warnings.warn(
            f"TMD file not available (tmd_location={tmd_s3_file_location}, "
            f"has_credentials={has_credentials}, "
            f"s3_reader_installed={s3_reader_installed})"
        )
        return None
7 changes: 7 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[pytest]
testpaths =
taxbrain
cs-config/cs_config/tests
markers =
requires_pufcsv
requires_tmdcsv
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

with open("README.md", "r") as f:
long_description = f.read()
version = "2.7.0"
version = "2.7.1"
setuptools.setup(
name="taxbrain",
version=version,
Expand Down
16 changes: 4 additions & 12 deletions taxbrain/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def cli_core(
startyear,
endyear,
data,
usecps,
reform,
behavior,
assump,
Expand Down Expand Up @@ -91,12 +90,14 @@ def cli_core(
start_year=startyear,
end_year=endyear,
microdata=data,
use_cps=usecps,
reform=reform,
behavior=behavior,
assump=assump,
base_policy=baseline,
verbose=True,
corp_revenue=None,
corp_incidence_assumptions=None,
verbose=False,
stacked=False,
)
tb.run()

Expand Down Expand Up @@ -156,15 +157,6 @@ def cli_main():
),
default=None,
)
parser.add_argument(
"--usecps",
help=(
"If this argument is present, the CPS file included in "
"Tax-Calculator will be used for the analysis."
),
default=False,
action="store_true",
),
parser.add_argument(
"--reform",
help=("--reform should be a path to a JSON file."),
Expand Down
Loading

0 comments on commit 1cd87f4

Please sign in to comment.