Merge pull request #202 from jdebacker/flex_data

Allow for more flexible microdata inputs
PSLmodels · Jun 10, 2024 · 1cd87f4 · 1cd87f4
2 parents 8b9fba3 + 782931e
commit 1cd87f4
Show file tree

Hide file tree

Showing 11 changed files with 312 additions and 97 deletions.
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -53,7 +53,7 @@ jobs:
         shell: bash -l {0}
         working-directory: ./
         run: |
-          pytest --cov=./ --cov-report=xml
+          pytest -m 'not requires_pufcsv and not requires_tmdcsv' --cov=./ --cov-report=xml
       - name: Upload coverage to Codecov
         if: matrix.os == 'ubuntu-latest' && contains(github.repository, 'PSLmodels/Tax-Brain')
         uses: codecov/codecov-action@v4

diff --git a/RELEASES.md b/RELEASES.md
@@ -1,5 +1,15 @@
 # Tax-Brain Release History
 
+
+## 2024-06-10 Release 2.7.1
+
+Last Merged Pull Request: [#202](https://github.com/PSLmodels/Tax-Brain/pull/202)
+
+Changes in this release:
+
+* Use of the [Tax Micro Data (TMD)](https://github.com/PSLmodels/tax-microdata-benchmarking) file: [#202](https://github.com/PSLmodels/Tax-Brain/pull/202)
+
+
 ## 2024-04-25 Release 2.7.0
 
 Last Merged Pull Request: [#196](https://github.com/PSLmodels/Tax-Brain/pull/196)

diff --git a/cs-config/cs_config/functions.py b/cs-config/cs_config/functions.py
@@ -12,6 +12,7 @@
     postprocess,
     nth_year_results,
     retrieve_puf,
+    retrieve_tmd,
 )
 from .outputs import create_layout, aggregate_plot
 from taxbrain import TaxBrain, report
@@ -25,6 +26,9 @@
 PUF_S3_FILE_LOCATION = os.environ.get(
     "PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
 )
+TMD_S3_FILE_LOCATION = os.environ.get(
+    "TMD_S3_LOCATION", "s3://ospc-data-files/tmd.20210720.csv.gz"
+)
 
 CUR_PATH = os.path.abspath(os.path.dirname(__file__))
 
@@ -108,7 +112,6 @@ def run_model(meta_params_dict, adjustment):
     behavior_mods = cs2tc.convert_behavior_adjustment(adjustment["behavior"])
     user_mods = {"policy": policy_mods, "behavior": behavior_mods}
     start_year = int(meta_params.year)
-    use_cps = meta_params.data_source == "CPS"
     if meta_params.data_source == "PUF":
         puf_df = retrieve_puf(
             PUF_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
@@ -117,22 +120,45 @@ def run_model(meta_params_dict, adjustment):
             if not isinstance(puf_df, pd.DataFrame):
                 raise TypeError("'puf_df' must be a Pandas DataFrame.")
             fuzz = True
-            use_cps = False
             sampling_frac = 0.05
             sampling_seed = 2222
             full_sample = puf_df
+            data_start_year = taxcalc.Records.PUFCSV_YEAR
+            weights = taxcalc.Records.PUF_WEIGHTS_FILENAME
         else:
             # Access keys are not available. Default to the CPS.
             print("Defaulting to the CPS")
             meta_params.adjust({"data_source": "CPS"})
-    if meta_params.data_source == "CPS":
+    elif meta_params.data_source == "TMD":
+        tmd_df = retrieve_tmd(
+            TMD_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
+        )
+        if tmd_df is not None:
+            if not isinstance(tmd_df, pd.DataFrame):
+                raise TypeError("'tmd_df' must be a Pandas DataFrame.")
+            fuzz = True
+            sampling_frac = 0.05
+            sampling_seed = 2222
+            full_sample = tmd_df
+            data_start_year = taxcalc.Records.TMDCSV_YEAR
+            weights = taxcalc.Records.TMD_WEIGHTS_FILENAME
+        else:
+            # Access keys are not available. Default to the CPS.
+            print("Defaulting to the CPS")
+            meta_params.adjust({"data_source": "CPS"})
+    elif meta_params.data_source == "CPS":
         fuzz = False
-        use_cps = True
         input_path = os.path.join(TCDIR, "cps.csv.gz")
         # full_sample = read_egg_csv(cpspath)  # pragma: no cover
         sampling_frac = 0.03
         sampling_seed = 180
         full_sample = pd.read_csv(input_path)
+        data_start_year = taxcalc.Records.CPSCSV_YEAR
+        weights = taxcalc.Records.CPS_WEIGHTS_FILENAME
+    else:
+        raise ValueError(
+            f"Data source '{meta_params.data_source}' is not supported."
+        )
 
     if meta_params.use_full_sample:
         sample = full_sample
@@ -146,8 +172,12 @@ def run_model(meta_params_dict, adjustment):
     tb = TaxBrain(
         start_year,
         end_year,
-        microdata=sample,
-        use_cps=use_cps,
+        microdata={
+            "data": sample,
+            "start_year": data_start_year,
+            "growfactors": None,
+            "weights": weights,
+        },
         reform=policy_mods,
         behavior=behavior_mods,
     )

diff --git a/cs-config/cs_config/helpers.py b/cs-config/cs_config/helpers.py
@@ -58,6 +58,10 @@
     "PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
 )
 
+TMD_S3_FILE_LOCATION = os.environ.get(
+    "TMD_S3_LOCATION", "s3://ospc-data-files/tmd.20210720.csv.gz"
+)
+
 
 def random_seed(user_mods, year):
     """
@@ -376,7 +380,7 @@ def retrieve_puf(
     aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
 ):
     """
-    Function for retrieving the PUF from the OSPC S3 bucket
+    Function for retrieving the PUF from the S3 bucket
     """
     s3_reader_installed = S3FileSystem is not None
     has_credentials = (
@@ -405,3 +409,40 @@ def retrieve_puf(
             f"s3_reader_installed={s3_reader_installed})"
         )
         return None
+
+
+def retrieve_tmd(
+    tmd_s3_file_location=TMD_S3_FILE_LOCATION,
+    aws_access_key_id=AWS_ACCESS_KEY_ID,
+    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+):
+    """
+    Retrieve the TMD file as a Pandas DataFrame.
+
+    Sources are tried in this order:
+      1. ``tmd_s3_file_location`` on S3, when a location, both AWS
+         credentials, and ``S3FileSystem`` are all available
+      2. ``tmd.csv.gz`` in the current working directory
+      3. ``tmd.csv`` in the current working directory
+
+    Args:
+        tmd_s3_file_location (str): S3 location of the TMD file
+        aws_access_key_id (str): AWS access key used to read from S3
+        aws_secret_access_key (str): AWS secret key used to read from S3
+
+    Returns:
+        Pandas DataFrame with the TMD data, or None (after emitting a
+        warning) when none of the sources is available
+    """
+    s3_reader_installed = S3FileSystem is not None
+    has_credentials = (
+        aws_access_key_id is not None and aws_secret_access_key is not None
+    )
+    if tmd_s3_file_location and has_credentials and s3_reader_installed:
+        print("Reading tmd from S3 bucket.", tmd_s3_file_location)
+        # Authenticate with the credentials passed by the caller -- the
+        # same values has_credentials was computed from -- rather than
+        # the module-level defaults.
+        fs = S3FileSystem(
+            key=aws_access_key_id,
+            secret=aws_secret_access_key,
+        )
+        with fs.open(tmd_s3_file_location) as f:
+            # NOTE(review): pandas only infers compression from path-like
+            # inputs; confirm a gzipped S3 object is decompressed when
+            # read through this file handle.
+            tmd_df = pd.read_csv(f)
+        return tmd_df
+    elif Path("tmd.csv.gz").exists():
+        print("Reading tmd from tmd.csv.gz.")
+        return pd.read_csv("tmd.csv.gz", compression="gzip")
+    elif Path("tmd.csv").exists():
+        print("Reading tmd from tmd.csv.")
+        return pd.read_csv("tmd.csv")
+    else:
+        # Callers treat None as "TMD unavailable" and pick a fallback.
+        warnings.warn(
+            f"TMD file not available (tmd_location={tmd_s3_file_location}, "
+            f"has_credentials={has_credentials}, "
+            f"s3_reader_installed={s3_reader_installed})"
+        )
+        return None
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,7 @@
+[pytest]
+testpaths =
+    taxbrain
+    cs-config/cs_config/tests
+markers =
+    requires_pufcsv
+    requires_tmdcsv
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 
 with open("README.md", "r") as f:
     long_description = f.read()
-version = "2.7.0"
+version = "2.7.1"
 setuptools.setup(
     name="taxbrain",
     version=version,

diff --git a/taxbrain/cli.py b/taxbrain/cli.py
@@ -47,7 +47,6 @@ def cli_core(
     startyear,
     endyear,
     data,
-    usecps,
     reform,
     behavior,
     assump,
@@ -91,12 +90,14 @@ def cli_core(
         start_year=startyear,
         end_year=endyear,
         microdata=data,
-        use_cps=usecps,
         reform=reform,
         behavior=behavior,
         assump=assump,
         base_policy=baseline,
-        verbose=True,
+        corp_revenue=None,
+        corp_incidence_assumptions=None,
+        verbose=False,
+        stacked=False,
     )
     tb.run()
 
@@ -156,15 +157,6 @@ def cli_main():
         ),
         default=None,
     )
-    parser.add_argument(
-        "--usecps",
-        help=(
-            "If this argument is present, the CPS file included in "
-            "Tax-Calculator will be used for the analysis."
-        ),
-        default=False,
-        action="store_true",
-    ),
     parser.add_argument(
         "--reform",
         help=("--reform should be a path to a JSON file."),