From 643649a90cf2cdc4992d65e8b3c74b6d3f4f774e Mon Sep 17 00:00:00 2001 From: martinholmer Date: Mon, 2 Oct 2017 11:40:45 -0400 Subject: [PATCH 01/11] Rename dropq as tbi (taxbrain interface) --- setup.py | 2 +- taxcalc/__init__.py | 2 +- taxcalc/dropq/__init__.py | 4 -- taxcalc/tbi/__init__.py | 4 ++ taxcalc/{dropq/dropq.py => tbi/tbi.py} | 39 ++++++++++--------- .../dropq_utils.py => tbi/tbi_utils.py} | 16 ++++---- taxcalc/tests/{test_dropq.py => test_tbi.py} | 15 ++++--- taxcalc/tests/test_utils.py | 2 +- 8 files changed, 42 insertions(+), 42 deletions(-) delete mode 100644 taxcalc/dropq/__init__.py create mode 100644 taxcalc/tbi/__init__.py rename taxcalc/{dropq/dropq.py => tbi/tbi.py} (88%) rename taxcalc/{dropq/dropq_utils.py => tbi/tbi_utils.py} (98%) rename taxcalc/tests/{test_dropq.py => test_tbi.py} (94%) diff --git a/setup.py b/setup.py index 1b1195c8a..a075625f1 100755 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'cmdclass': cmdclass, 'license': 'MIT', 'packages': ['taxcalc', 'taxcalc.filings', 'taxcalc.filings.forms', - 'taxcalc.dropq', 'taxcalc.cli'], + 'taxcalc.tbi', 'taxcalc.cli'], 'include_package_data': True, 'name': 'taxcalc', 'install_requires': ['numpy', 'pandas'], diff --git a/taxcalc/__init__.py b/taxcalc/__init__.py index 7791680bc..f27d23013 100755 --- a/taxcalc/__init__.py +++ b/taxcalc/__init__.py @@ -9,7 +9,7 @@ from taxcalc.taxcalcio import * from taxcalc.utils import * from taxcalc.macro_elasticity import * -from taxcalc.dropq import * +from taxcalc.tbi import * from taxcalc.cli import * from taxcalc._version import get_versions diff --git a/taxcalc/dropq/__init__.py b/taxcalc/dropq/__init__.py deleted file mode 100644 index f116dc920..000000000 --- a/taxcalc/dropq/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from taxcalc.dropq.dropq import (run_nth_year_tax_calc_model, - run_nth_year_gdp_elast_model, - create_json_table, - reform_warnings_errors) diff --git a/taxcalc/tbi/__init__.py b/taxcalc/tbi/__init__.py new file mode 100644 index 
000000000..0ec6d5297 --- /dev/null +++ b/taxcalc/tbi/__init__.py @@ -0,0 +1,4 @@ +from taxcalc.tbi.tbi import (run_nth_year_tax_calc_model, + run_nth_year_gdp_elast_model, + create_json_table, + reform_warnings_errors) diff --git a/taxcalc/dropq/dropq.py b/taxcalc/tbi/tbi.py similarity index 88% rename from taxcalc/dropq/dropq.py rename to taxcalc/tbi/tbi.py index a3816a6b4..84e9ae49b 100644 --- a/taxcalc/dropq/dropq.py +++ b/taxcalc/tbi/tbi.py @@ -1,24 +1,25 @@ """ -The dropq functions are used by TaxBrain to call Tax-Calculator in order -to maintain the privacy of the IRS-SOI PUF data being used by TaxBrain. -This is done by "fuzzing" reform results for several randomly selected +The tbi functions are used by TaxBrain to call Tax-Calculator in order +to do distributed processing of TaxBrain runs and in order to maintain +the privacy of the IRS-SOI PUF data being used by TaxBrain. Maintaining +privacy is done by "fuzzing" reform results for several randomly selected filing units in each table cell. The filing units randomly selected differ for each policy reform and the "fuzzing" involves replacing the post-reform tax results for the selected units with their pre-reform tax results. 
""" # CODING-STYLE CHECKS: -# pep8 --ignore=E402 dropq.py -# pylint --disable=locally-disabled dropq.py +# pep8 --ignore=E402 tbi.py +# pylint --disable=locally-disabled tbi.py from __future__ import print_function import time import numpy as np import pandas as pd -from taxcalc.dropq.dropq_utils import (dropq_calculate, - random_seed, - dropq_summary, - AGGR_ROW_NAMES) +from taxcalc.tbi.tbi_utils import (calculate, + random_seed, + summary, + AGGR_ROW_NAMES) from taxcalc import (results, DIST_TABLE_LABELS, proportional_change_gdp, Growdiff, Growfactors, Policy) @@ -93,10 +94,10 @@ def run_nth_year_tax_calc_model(year_n, start_year, start_time = time.time() # create calc1 and calc2 calculated for year_n and mask - (calc1, calc2, mask) = dropq_calculate(year_n, start_year, - taxrec_df, user_mods, - behavior_allowed=True, - mask_computed=True) + (calc1, calc2, mask) = calculate(year_n, start_year, + taxrec_df, user_mods, + behavior_allowed=True, + mask_computed=True) # extract raw results from calc1 and calc2 rawres1 = results(calc1.records) @@ -107,8 +108,8 @@ def run_nth_year_tax_calc_model(year_n, start_year, print('seed={}'.format(seed)) np.random.seed(seed) # pylint: disable=no-member - # construct dropq summary results from raw results - summ = dropq_summary(rawres1, rawres2, mask) + # construct TaxBrain summary results from raw results + summ = summary(rawres1, rawres2, mask) elapsed_time = time.time() - start_time print('elapsed time for this run: ', elapsed_time) @@ -167,10 +168,10 @@ def run_nth_year_gdp_elast_model(year_n, start_year, 'gdp_elasticity': {'value': }. 
""" # create calc1 and calc2 calculated for year_n - (calc1, calc2, _) = dropq_calculate(year_n, start_year, - taxrec_df, user_mods, - behavior_allowed=False, - mask_computed=False) + (calc1, calc2, _) = calculate(year_n, start_year, + taxrec_df, user_mods, + behavior_allowed=False, + mask_computed=False) # compute GDP effect given assumed gdp elasticity gdp_elasticity = user_mods['gdp_elasticity']['value'] diff --git a/taxcalc/dropq/dropq_utils.py b/taxcalc/tbi/tbi_utils.py similarity index 98% rename from taxcalc/dropq/dropq_utils.py rename to taxcalc/tbi/tbi_utils.py index 4ceed6ab0..efdd0b132 100644 --- a/taxcalc/dropq/dropq_utils.py +++ b/taxcalc/tbi/tbi_utils.py @@ -1,9 +1,9 @@ """ -Private utility functions used only by public functions in the dropq.py file. +Private utility functions used only by public functions in the tbi.py file. """ # CODING-STYLE CHECKS: -# pep8 --ignore=E402 dropq_utils.py -# pylint --disable=locally-disabled dropq_utils.py +# pep8 --ignore=E402 tbi_utils.py +# pylint --disable=locally-disabled tbi_utils.py import copy import hashlib @@ -51,11 +51,11 @@ def check_user_mods(user_mods): raise ValueError('user_mods has extra keys: {}'.format(extra_keys)) -def dropq_calculate(year_n, start_year, - taxrec_df, user_mods, - behavior_allowed, mask_computed): +def calculate(year_n, start_year, + taxrec_df, user_mods, + behavior_allowed, mask_computed): """ - The dropq_calculate function assumes specified user_mods is + The calculate function assumes specified user_mods is a dictionary returned by the Calculator.read_json_parameter_files() function with an extra key:value pair that is specified as 'gdp_elasticity': {'value': }. 
@@ -301,7 +301,7 @@ def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz): AGGR_ROW_NAMES = ['ind_tax', 'payroll_tax', 'combined_tax'] -def dropq_summary(df1, df2, mask): +def summary(df1, df2, mask): """ df1 contains raw results for baseline plan df2 contains raw results for reform plan diff --git a/taxcalc/tests/test_dropq.py b/taxcalc/tests/test_tbi.py similarity index 94% rename from taxcalc/tests/test_dropq.py rename to taxcalc/tests/test_tbi.py index 85b696d4d..6bfb082c7 100644 --- a/taxcalc/tests/test_dropq.py +++ b/taxcalc/tests/test_tbi.py @@ -1,12 +1,11 @@ """ -test_dropq.py uses only PUF input data because the dropq algorithm -is designed to work exclusively with private IRS-SOI PUF input data. +Test functions in taxcalc/tbi directory using both puf.csv and cps.csv input. """ import numpy as np import pandas as pd import pytest -from taxcalc.dropq.dropq_utils import * -from taxcalc.dropq import * +from taxcalc.tbi.tbi_utils import * +from taxcalc.tbi import * from taxcalc import (Policy, Records, Calculator, multiyear_diagnostic_table, results) @@ -190,12 +189,12 @@ def test_with_pufcsv(puf_fullsample): tax_data, usermods, return_json=True) total = resdict['aggr_2'] - dropq_reform_revenue = float(total['combined_tax_9']) * 1e-9 - # assert that dropq revenue is similar to the fullsample calculation - diff = abs(fulls_reform_revenue - dropq_reform_revenue) + tbi_reform_revenue = float(total['combined_tax_9']) * 1e-9 + # assert that tbi revenue is similar to the fullsample calculation + diff = abs(fulls_reform_revenue - tbi_reform_revenue) proportional_diff = diff / fulls_reform_revenue frmt = 'f,d,adiff,pdiff= {:.4f} {:.4f} {:.4f} {}' - print(frmt.format(fulls_reform_revenue, dropq_reform_revenue, + print(frmt.format(fulls_reform_revenue, tbi_reform_revenue, diff, proportional_diff)) assert proportional_diff < 0.0001 # one-hundredth of one percent # assert 1 == 2 # uncomment to force test failure with above print out diff --git 
a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py index e5f5d0897..d930a6173 100644 --- a/taxcalc/tests/test_utils.py +++ b/taxcalc/tests/test_utils.py @@ -256,7 +256,7 @@ def test_diff_count_precision(): """ Estimate bootstrap standard error and confidence interval for count statistics ('tax_cut' and 'tax_inc') in difference table generated - using puf.csv input data taking no account of dropq fuzzing and + using puf.csv input data taking no account of tbi privacy fuzzing and assuming all filing units in each bin have the same weight. These assumptions imply that the estimates produced here are likely to over-estimate the precision of the count statistics. From 9575e2c522742fee71e60c459e465882fa420bf4 Mon Sep 17 00:00:00 2001 From: martinholmer Date: Mon, 2 Oct 2017 13:04:47 -0400 Subject: [PATCH 02/11] Revise Read-the-Docs public_api.rst for dropq-to-tbi rename --- read-the-docs/source/public_api.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/read-the-docs/source/public_api.rst b/read-the-docs/source/public_api.rst index 2144f88e5..972690213 100644 --- a/read-the-docs/source/public_api.rst +++ b/read-the-docs/source/public_api.rst @@ -52,12 +52,6 @@ taxcalc.decorators .. automodule:: taxcalc.decorators :members: -taxcalc.dropq.dropq -------------------- - -.. automodule:: taxcalc.dropq.dropq - :members: - taxcalc.functions ----------------- @@ -108,6 +102,12 @@ taxcalc.TaxCalcIO .. autoclass:: taxcalc.TaxCalcIO :members: +taxcalc.tbi.tbi +------------------- + +.. 
automodule:: taxcalc.tbi.tbi + :members: + taxcalc.utils ------------- From 60897ab5d7ce209cf01c98f3200cfcca265f1ff7 Mon Sep 17 00:00:00 2001 From: martinholmer Date: Mon, 2 Oct 2017 13:09:15 -0400 Subject: [PATCH 03/11] Eliminate embedded gdp_elasticity dict in user_mods dict --- taxcalc/macro_elasticity.py | 2 +- taxcalc/tbi/tbi.py | 16 ++++++---------- taxcalc/tbi/tbi_utils.py | 22 +++++++++------------- taxcalc/tests/test_tbi.py | 17 ++++++++++------- 4 files changed, 26 insertions(+), 31 deletions(-) diff --git a/taxcalc/macro_elasticity.py b/taxcalc/macro_elasticity.py index b54f20e66..bd462650d 100644 --- a/taxcalc/macro_elasticity.py +++ b/taxcalc/macro_elasticity.py @@ -3,7 +3,7 @@ """ -def proportional_change_gdp(calc1, calc2, elasticity=0.0): +def proportional_change_gdp(calc1, calc2, elasticity): ''' This function harnesses econometric estimates of the historic relationship between tax policy and the macroeconomy to predict the effect of tax diff --git a/taxcalc/tbi/tbi.py b/taxcalc/tbi/tbi.py index 84e9ae49b..d1855bbbc 100644 --- a/taxcalc/tbi/tbi.py +++ b/taxcalc/tbi/tbi.py @@ -85,10 +85,8 @@ def run_nth_year_tax_calc_model(year_n, start_year, taxrec_df, user_mods, return_json=True): """ - The run_nth_year_tax_calc_model function assumes user_mods is a - dictionary returned by the Calculator.read_json_parameter_files() - function with an extra key:value pair that is specified as - 'gdp_elasticity': {'value': }. + The run_nth_year_tax_calc_model function assumes user_mods is a dictionary + returned by the Calculator.read_json_param_objects() function. 
""" # pylint: disable=too-many-locals start_time = time.time() @@ -160,12 +158,11 @@ def append_year(pdf): def run_nth_year_gdp_elast_model(year_n, start_year, taxrec_df, user_mods, + gdp_elasticity, return_json=True): """ - The run_nth_year_gdp_elast_model function assumes user_mods is a - dictionary returned by the Calculator.read_json_parameter_files() - function with an extra key:value pair that is specified as - 'gdp_elasticity': {'value': }. + The run_nth_year_gdp_elast_model function assumes user_mods is a dictionary + returned by the Calculator.read_json_param_objects() function. """ # create calc1 and calc2 calculated for year_n (calc1, calc2, _) = calculate(year_n, start_year, @@ -173,8 +170,7 @@ def run_nth_year_gdp_elast_model(year_n, start_year, behavior_allowed=False, mask_computed=False) - # compute GDP effect given assumed gdp elasticity - gdp_elasticity = user_mods['gdp_elasticity']['value'] + # compute GDP effect given specified gdp_elasticity gdp_effect = proportional_change_gdp(calc1, calc2, gdp_elasticity) # return gdp_effect results diff --git a/taxcalc/tbi/tbi_utils.py b/taxcalc/tbi/tbi_utils.py index efdd0b132..8ddcb7aee 100644 --- a/taxcalc/tbi/tbi_utils.py +++ b/taxcalc/tbi/tbi_utils.py @@ -41,28 +41,24 @@ def check_user_mods(user_mods): raise ValueError('user_mods is not a dictionary') actual_keys = set(list(user_mods.keys())) expected_keys = set(['policy', 'consumption', 'behavior', - 'growdiff_baseline', 'growdiff_response', - 'gdp_elasticity']) - missing_keys = expected_keys - actual_keys - if len(missing_keys) > 0: - raise ValueError('user_mods has missing keys: {}'.format(missing_keys)) - extra_keys = actual_keys - expected_keys - if len(extra_keys) > 0: - raise ValueError('user_mods has extra keys: {}'.format(extra_keys)) + 'growdiff_baseline', 'growdiff_response']) + if actual_keys != expected_keys: + msg = 'actual user_mod keys not equal to expected keys\n' + msg += ' actual: {}'.format(actual_keys) + msg += ' expect: 
{}'.format(expected_keys) + raise ValueError(msg) def calculate(year_n, start_year, taxrec_df, user_mods, behavior_allowed, mask_computed): """ - The calculate function assumes specified user_mods is - a dictionary returned by the Calculator.read_json_parameter_files() - function with an extra key:value pair that is specified as - 'gdp_elasticity': {'value': }. + The calculate function assumes the specified user_mods is a dictionary + returned by the Calculator.read_json_param_objects() function. The function returns (calc1, calc2, mask) where calc1 is pre-reform Calculator object calculated for year_n, calc2 is post-reform Calculator object calculated for year_n, and - mask is boolean array if compute_mask=True or None otherwise + mask is boolean array if mask_computeed=True or None otherwise """ # pylint: disable=too-many-arguments,too-many-locals,too-many-statements diff --git a/taxcalc/tests/test_tbi.py b/taxcalc/tests/test_tbi.py index 6bfb082c7..d77ddd261 100644 --- a/taxcalc/tests/test_tbi.py +++ b/taxcalc/tests/test_tbi.py @@ -27,8 +27,6 @@ }, 'growdiff_response': { }, - 'gdp_elasticity': { - } } @@ -62,12 +60,18 @@ def test_check_user_mods_errors(): @pytest.mark.requires_pufcsv def test_run_nth_year_value_errors(puf_subsample): usermods = USER_MODS + # test for growdiff_response not allowed error usermods['growdiff_response'] = {2018: {'_AINTS': [0.02]}} with pytest.raises(ValueError): - run_nth_year_gdp_elast_model(1, 2013, puf_subsample, usermods, False) + run_nth_year_gdp_elast_model(1, 2013, puf_subsample, + usermods, gdp_elasticity=0.36, + return_json=False) usermods['growdiff_response'] = dict() + # test for behavior not allowed error with pytest.raises(ValueError): - run_nth_year_gdp_elast_model(1, 2013, puf_subsample, usermods, False) + run_nth_year_gdp_elast_model(1, 2013, puf_subsample, + usermods, gdp_elasticity=0.36, + return_json=False) @pytest.mark.requires_pufcsv @@ -98,8 +102,8 @@ def test_run_tax_calc_model(puf_subsample, resjson): def 
test_run_gdp_elast_model(puf_subsample, resjson): usermods = USER_MODS usermods['behavior'] = dict() - usermods['gdp_elasticity'] = {'value': 0.36} - res = run_nth_year_gdp_elast_model(2, 2016, puf_subsample, usermods, + res = run_nth_year_gdp_elast_model(2, 2016, puf_subsample, + usermods, gdp_elasticity=0.36, return_json=resjson) if resjson: assert isinstance(res, dict) @@ -165,7 +169,6 @@ def test_with_pufcsv(puf_fullsample): usermods['behavior'] = {} usermods['growdiff_baseline'] = {} usermods['growdiff_response'] = {} - usermods['gdp_elasticity'] = {} seed = random_seed(usermods) assert seed == 1574318062 # create a Policy object (pol) containing reform policy parameters From 8c6a71a6e1271aaee3b46470809e86e58e130000 Mon Sep 17 00:00:00 2001 From: martinholmer Date: Mon, 2 Oct 2017 18:24:11 -0400 Subject: [PATCH 04/11] Refactor run_nth_year*model functions to use puf or cps --- taxcalc/records.py | 6 +- taxcalc/taxcalcio.py | 4 +- taxcalc/tbi/tbi.py | 47 +++++++++++----- taxcalc/tbi/tbi_utils.py | 114 ++++++++++++++++++++++++++------------ taxcalc/tests/test_tbi.py | 38 ++++++++----- 5 files changed, 142 insertions(+), 67 deletions(-) diff --git a/taxcalc/records.py b/taxcalc/records.py index 02021befc..1f4a0fa6c 100644 --- a/taxcalc/records.py +++ b/taxcalc/records.py @@ -173,7 +173,7 @@ def __init__(self, @staticmethod def cps_constructor(data=None, exact_calculations=False, - growfactors=Growfactors()): + gfactors=Growfactors()): """ Static method returns a Records object instantiated with CPS input data. 
This works in a analogous way to Records(), which @@ -188,7 +188,7 @@ def cps_constructor(data=None, data = os.path.join(Records.CUR_PATH, 'cps.csv.gz') return Records(data=data, exact_calculations=exact_calculations, - gfactors=growfactors, + gfactors=gfactors, weights=Records.CPS_WEIGHTS_FILENAME, adjust_ratios=Records.CPS_RATIOS_FILENAME, start_year=CPSCSV_YEAR) @@ -213,7 +213,7 @@ def increment_year(self): Also, does extrapolation, reweighting, adjusting for new current year. """ self._current_year += 1 - # apply variable extrapolation growfactors + # apply variable extrapolation grow factors if self.gfactors is not None: self._blowup(self.current_year) # apply variable adjustment ratios diff --git a/taxcalc/taxcalcio.py b/taxcalc/taxcalcio.py index ade6c2917..82d94f576 100644 --- a/taxcalc/taxcalcio.py +++ b/taxcalc/taxcalcio.py @@ -228,11 +228,11 @@ def init(self, input_data, tax_year, reform, assump, if aging_input_data: if self.cps_input_data: recs = Records.cps_constructor( - growfactors=gfactors_ref, + gfactors=gfactors_ref, exact_calculations=exact_calculations ) recs_clp = Records.cps_constructor( - growfactors=gfactors_clp, + gfactors=gfactors_clp, exact_calculations=exact_calculations ) else: # if not cps_input_data diff --git a/taxcalc/tbi/tbi.py b/taxcalc/tbi/tbi.py index d1855bbbc..3485169bf 100644 --- a/taxcalc/tbi/tbi.py +++ b/taxcalc/tbi/tbi.py @@ -46,7 +46,7 @@ def reform_warnings_errors(user_mods): """ The reform_warnings_errors function assumes user_mods is a dictionary - returned by the Calculator.read_json_parameter_files() function. + returned by the Calculator.read_json_param_objects() function. 
This function returns a dictionary containing two STR:STR pairs: {'warnings': '', 'errors': ''} @@ -82,20 +82,27 @@ def reform_warnings_errors(user_mods): def run_nth_year_tax_calc_model(year_n, start_year, - taxrec_df, user_mods, + use_puf_not_cps, + use_full_sample, + user_mods, return_json=True): """ The run_nth_year_tax_calc_model function assumes user_mods is a dictionary - returned by the Calculator.read_json_param_objects() function. + returned by the Calculator.read_json_param_objects() function. + Setting use_puf_not_cps=True implies use puf.csv input file; + otherwise, use cps.csv input file. + Setting use_full_sample=False implies use sub-sample of input file; + otherwise, use the complete sample. """ - # pylint: disable=too-many-locals + # pylint: disable=too-many-arguments,too-many-locals + start_time = time.time() # create calc1 and calc2 calculated for year_n and mask (calc1, calc2, mask) = calculate(year_n, start_year, - taxrec_df, user_mods, - behavior_allowed=True, - mask_computed=True) + use_puf_not_cps, use_full_sample, + user_mods, + behavior_allowed=True) # extract raw results from calc1 and calc2 rawres1 = results(calc1.records) @@ -109,9 +116,6 @@ def run_nth_year_tax_calc_model(year_n, start_year, # construct TaxBrain summary results from raw results summ = summary(rawres1, rawres2, mask) - elapsed_time = time.time() - start_time - print('elapsed time for this run: ', elapsed_time) - def append_year(pdf): """ append_year embedded function revises all column names in pdf @@ -124,6 +128,8 @@ def append_year(pdf): res = dict() for tbl in summ: res[tbl] = append_year(summ[tbl]) + elapsed_time = time.time() - start_time + print('elapsed time for this run: ', elapsed_time) return res # optionally construct JSON results tables for year n @@ -153,22 +159,33 @@ def append_year(pdf): res[tbl] = create_json_table(summ[tbl], row_names=info[tbl]['row_names'], column_types=info[tbl]['col_types']) + elapsed_time = time.time() - start_time + print('elapsed 
time for this run: ', elapsed_time) return res def run_nth_year_gdp_elast_model(year_n, start_year, - taxrec_df, user_mods, + use_puf_not_cps, + use_full_sample, + user_mods, gdp_elasticity, return_json=True): """ The run_nth_year_gdp_elast_model function assumes user_mods is a dictionary - returned by the Calculator.read_json_param_objects() function. + returned by the Calculator.read_json_param_objects() function. + Setting use_puf_not_cps=True implies use puf.csv input file; + otherwise, use cps.csv input file. + Setting use_full_sample=False implies use sub-sample of input file; + otherwise, use the complete sample. """ + # pylint: disable=too-many-arguments + # create calc1 and calc2 calculated for year_n (calc1, calc2, _) = calculate(year_n, start_year, - taxrec_df, user_mods, - behavior_allowed=False, - mask_computed=False) + use_puf_not_cps, + use_full_sample, + user_mods, + behavior_allowed=False) # compute GDP effect given specified gdp_elasticity gdp_effect = proportional_change_gdp(calc1, calc2, gdp_elasticity) diff --git a/taxcalc/tbi/tbi_utils.py b/taxcalc/tbi/tbi_utils.py index 8ddcb7aee..a78d3f877 100644 --- a/taxcalc/tbi/tbi_utils.py +++ b/taxcalc/tbi/tbi_utils.py @@ -5,6 +5,7 @@ # pep8 --ignore=E402 tbi_utils.py # pylint --disable=locally-disabled tbi_utils.py +import os import copy import hashlib import numpy as np @@ -44,23 +45,28 @@ def check_user_mods(user_mods): 'growdiff_baseline', 'growdiff_response']) if actual_keys != expected_keys: msg = 'actual user_mod keys not equal to expected keys\n' - msg += ' actual: {}'.format(actual_keys) + msg += ' actual: {}\n'.format(actual_keys) msg += ' expect: {}'.format(expected_keys) raise ValueError(msg) def calculate(year_n, start_year, - taxrec_df, user_mods, - behavior_allowed, mask_computed): + use_puf_not_cps, + use_full_sample, + user_mods, + behavior_allowed): """ The calculate function assumes the specified user_mods is a dictionary returned by the Calculator.read_json_param_objects() function. 
The function returns (calc1, calc2, mask) where calc1 is pre-reform Calculator object calculated for year_n, calc2 is post-reform Calculator object calculated for year_n, and - mask is boolean array if mask_computeed=True or None otherwise + mask is boolean array marking records with reform-induced iitax diffs + Set behavior_allowed to False when generating static results or + set behavior_allowed to True when generating dynamic results. """ - # pylint: disable=too-many-arguments,too-many-locals,too-many-statements + # pylint: disable=too-many-arguments,too-many-locals + # pylint: disable=too-many-branches,too-many-statements check_years(start_year, year_n) check_user_mods(user_mods) @@ -85,9 +91,34 @@ def calculate(year_n, start_year, growdiff_baseline.apply_to(growfactors_post) growdiff_response.apply_to(growfactors_post) - # create pre-reform Calculator instance using PUF input data & weights - recs1 = Records(data=copy.deepcopy(taxrec_df), - gfactors=growfactors_pre) + # create sample pd.DataFrame from specified input file and sampling scheme + tbi_path = os.path.abspath(os.path.dirname(__file__)) + if use_puf_not_cps: + input_path = os.path.join(tbi_path, '..', '..', 'puf.csv.gz') + if not os.path.isfile(input_path): + input_path = os.path.join(tbi_path, '..', '..', 'puf.csv') + sampling_frac = 0.05 + sampling_seed = 180 + else: + input_path = os.path.join(tbi_path, '..', 'cps.csv.gz') + sampling_frac = 0.05 + sampling_seed = 180 + full_sample = pd.read_csv(input_path) + if use_full_sample: + sample = full_sample + else: + sample = full_sample.sample( # pylint: disable=no-member + frac=sampling_frac, + random_state=sampling_seed + ) + + # create pre-reform Calculator instance + if use_puf_not_cps: + recs1 = Records(data=copy.deepcopy(sample), + gfactors=growfactors_pre) + else: + recs1 = Records.cps_constructor(data=copy.deepcopy(sample), + gfactors=growfactors_pre) policy1 = Policy(gfactors=growfactors_pre) calc1 = Calculator(policy=policy1, records=recs1, 
consumption=consump) while calc1.current_year < start_year: @@ -95,11 +126,11 @@ def calculate(year_n, start_year, calc1.calc_all() assert calc1.current_year == start_year - # optionally compute mask - if mask_computed: - # create pre-reform Calculator instance with extra income using - # PUF input data & weights - recs1p = Records(data=copy.deepcopy(taxrec_df), + # compute mask array + res1 = results(calc1.records) + if use_puf_not_cps: + # create pre-reform Calculator instance with extra income + recs1p = Records(data=copy.deepcopy(sample), gfactors=growfactors_pre) # add one dollar to the income of each filing unit to determine # which filing units undergo a resulting change in tax liability @@ -116,13 +147,14 @@ def calculate(year_n, start_year, # compute mask showing which of the calc1 and calc1p results differ; # mask is true if a filing unit's income tax liability changed after # a dollar was added to the filing unit's wage and salary income - res1 = results(calc1.records) res1p = results(calc1p.records) mask = np.logical_not( # pylint: disable=no-member np.isclose(res1.iitax, res1p.iitax, atol=0.001, rtol=0.0) ) - else: - mask = None + assert np.any(mask) + else: # if use_puf_not_cps is False + # indicate that no fuzzing of reform results is required + mask = np.zeros(res1.shape[0], dtype=np.int8) # specify Behavior instance behv = Behavior() @@ -139,9 +171,13 @@ def calculate(year_n, start_year, msg = 'A behavior RESPONSE IS NOT ALLOWED' raise ValueError(msg) - # create post-reform Calculator instance using PUF input data & weights - recs2 = Records(data=copy.deepcopy(taxrec_df), - gfactors=growfactors_post) + # create post-reform Calculator instance + if use_puf_not_cps: + recs2 = Records(data=copy.deepcopy(sample), + gfactors=growfactors_post) + else: + recs2 = Records.cps_constructor(data=copy.deepcopy(sample), + gfactors=growfactors_post) policy2 = Policy(gfactors=growfactors_post) policy_reform = user_mods['policy'] 
policy2.implement_reform(policy_reform) @@ -202,7 +238,7 @@ def random_seed_from_subdict(subdict): return seed % np.iinfo(np.uint32).max # pylint: disable=no-member -NUM_TO_FUZZ = 3 +NUM_TO_FUZZ = 3 # when using dropq algorithm on puf.csv results def chooser(agg): @@ -223,7 +259,7 @@ def chooser(agg): msg = ('Not enough differences in income tax when adding ' 'one dollar for chunk with name: {}') raise ValueError(msg.format(agg.name)) - # mark the records chosen to be fuzzed + # mark the records chosen to be fuzzed (ans=0) ans = [1] * len(agg) for idx in choices: ans[idx] = 0 @@ -232,7 +268,7 @@ def chooser(agg): def fuzz_df2_records(df1, df2, mask): """ - Modify df2 by adding random fuzz for data privacy. + Possibly modify df2 results by adding random fuzz for data privacy. Parameters ---------- @@ -243,11 +279,12 @@ def fuzz_df2_records(df1, df2, mask): contains results for the reform plan mask: boolean numpy array - contains info about whether or not each row might be fuzzed + contains info about whether or not units have reform-induced tax diffs + (if mask contains all False values, then no results fuzzing is done) Returns ------- - fuzzed df2: Pandas DataFrame + possibly fuzzed df2: Pandas DataFrame Notes ----- @@ -257,7 +294,7 @@ def fuzz_df2_records(df1, df2, mask): involves overwriting df2 columns in cols_to_fuzz with df1 values. """ # nested function that does the fuzzing - def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz): + def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing): """ Fuzz some df2 records in each bin defined by bin_type and imeasure. 
The fuzzed records have their post-reform tax results (in df2) @@ -272,25 +309,34 @@ def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz): else: df2 = add_quantile_bins(df2, imeasure, 1) gdf2 = df2.groupby('bins') - df2['nofuzz'] = gdf2['mask'].transform(chooser) + if do_fuzzing: + df2['nofuzz'] = gdf2['mask'].transform(chooser) + else: # never do any results fuzzing + df2['nofuzz'] = np.ones(df2.shape[0], dtype=np.int8) for col in cols_to_fuzz: df2[col + suffix] = (df2[col] * df2['nofuzz'] - df1[col] * df2['nofuzz'] + df1[col]) # main logic of fuzz_df2_records + do_fuzzing = np.any(mask) skips = set(['num_returns_ItemDed', 'num_returns_StandardDed', 'num_returns_AMT', 's006']) - columns_to_fuzz = (set(DIST_TABLE_COLUMNS) | set(STATS_COLUMNS)) - skips + columns_to_fuzz = (set(DIST_TABLE_COLUMNS) | + set(STATS_COLUMNS)) - skips df2['mask'] = mask - # always use expanded income in df1 baseline to groupby into bins df2['expanded_income_baseline'] = df1['expanded_income'] - fuzz(df1, df2, 'dec', 'expanded_income_baseline', '_xdec', columns_to_fuzz) - fuzz(df1, df2, 'bin', 'expanded_income_baseline', '_xbin', columns_to_fuzz) - fuzz(df1, df2, 'agg', 'expanded_income_baseline', '_agg', columns_to_fuzz) + fuzz(df1, df2, 'dec', 'expanded_income_baseline', '_xdec', + columns_to_fuzz, do_fuzzing) + fuzz(df1, df2, 'bin', 'expanded_income_baseline', '_xbin', + columns_to_fuzz, do_fuzzing) + fuzz(df1, df2, 'agg', 'expanded_income_baseline', '_agg', + columns_to_fuzz, do_fuzzing) df2['c00100_baseline'] = df1['c00100'] # c00100 is AGI - fuzz(df1, df2, 'dec', 'c00100_baseline', '_adec', columns_to_fuzz) - fuzz(df1, df2, 'bin', 'c00100_baseline', '_abin', columns_to_fuzz) + fuzz(df1, df2, 'dec', 'c00100_baseline', '_adec', + columns_to_fuzz, do_fuzzing) + fuzz(df1, df2, 'bin', 'c00100_baseline', '_abin', + columns_to_fuzz, do_fuzzing) return df2 @@ -301,7 +347,7 @@ def summary(df1, df2, mask): """ df1 contains raw results for baseline plan df2 contains raw results for 
reform plan - mask is the boolean array specifying which records might be fuzzed + mask is the boolean array specifying records with reform-induced tax diffs returns dictionary of summary results DataFrames """ # pylint: disable=too-many-statements,too-many-locals diff --git a/taxcalc/tests/test_tbi.py b/taxcalc/tests/test_tbi.py index d77ddd261..6ed0f618b 100644 --- a/taxcalc/tests/test_tbi.py +++ b/taxcalc/tests/test_tbi.py @@ -58,26 +58,35 @@ def test_check_user_mods_errors(): @pytest.mark.requires_pufcsv -def test_run_nth_year_value_errors(puf_subsample): +def test_run_nth_year_value_errors(): usermods = USER_MODS # test for growdiff_response not allowed error usermods['growdiff_response'] = {2018: {'_AINTS': [0.02]}} with pytest.raises(ValueError): - run_nth_year_gdp_elast_model(1, 2013, puf_subsample, - usermods, gdp_elasticity=0.36, + run_nth_year_gdp_elast_model(1, 2013, + use_puf_not_cps=True, + use_full_sample=False, + user_mods=usermods, + gdp_elasticity=0.36, return_json=False) usermods['growdiff_response'] = dict() # test for behavior not allowed error with pytest.raises(ValueError): - run_nth_year_gdp_elast_model(1, 2013, puf_subsample, - usermods, gdp_elasticity=0.36, + run_nth_year_gdp_elast_model(1, 2013, + use_puf_not_cps=True, + use_full_sample=False, + user_mods=usermods, + gdp_elasticity=0.36, return_json=False) @pytest.mark.requires_pufcsv @pytest.mark.parametrize('resjson', [True, False]) -def test_run_tax_calc_model(puf_subsample, resjson): - res = run_nth_year_tax_calc_model(2, 2016, puf_subsample, USER_MODS, +def test_run_tax_calc_model(resjson): + res = run_nth_year_tax_calc_model(2, 2016, + use_puf_not_cps=resjson, + use_full_sample=False, + user_mods=USER_MODS, return_json=resjson) assert isinstance(res, dict) dump = False # set to True in order to dump returned results and fail test @@ -99,11 +108,14 @@ def test_run_tax_calc_model(puf_subsample, resjson): @pytest.mark.requires_pufcsv @pytest.mark.parametrize('resjson', [True, False]) 
-def test_run_gdp_elast_model(puf_subsample, resjson): +def test_run_gdp_elast_model(resjson): usermods = USER_MODS usermods['behavior'] = dict() - res = run_nth_year_gdp_elast_model(2, 2016, puf_subsample, - usermods, gdp_elasticity=0.36, + res = run_nth_year_gdp_elast_model(2, 2016, + use_puf_not_cps=True, + use_full_sample=False, + user_mods=usermods, + gdp_elasticity=0.36, return_json=resjson) if resjson: assert isinstance(res, dict) @@ -185,11 +197,11 @@ def test_with_pufcsv(puf_fullsample): taxes_fullsample = adt.loc["Combined Liability ($b)"] assert taxes_fullsample is not None fulls_reform_revenue = float(taxes_fullsample.loc[analysis_year]) - # create a Public Use File object - tax_data = puf_fullsample # call run_nth_year_tax_calc_model function resdict = run_nth_year_tax_calc_model(year_n, start_year, - tax_data, usermods, + use_puf_not_cps=True, + use_full_sample=True, + user_mods=usermods, return_json=True) total = resdict['aggr_2'] tbi_reform_revenue = float(total['combined_tax_9']) * 1e-9 From a16bb185b3da64ea6ba39f415326dde4188fd522 Mon Sep 17 00:00:00 2001 From: martinholmer Date: Mon, 2 Oct 2017 18:46:21 -0400 Subject: [PATCH 05/11] Update RELEASES.md info --- RELEASES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RELEASES.md b/RELEASES.md index abd9df8e2..ce20d76a5 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -16,6 +16,9 @@ Release 0.12.0 on 2017-??-?? 
- Remove arrays_not_lists argument from read_json_param_objects [[#1568](https://github.com/open-source-economics/Tax-Calculator/pull/1568) by Martin Holmer] +- Rename dropq as tbi (taxbrain interface) and refactor run_nth_year_*_model functions + [[#1577](https://github.com/open-source-economics/Tax-Calculator/pull/1577) + by Martin Holmer] **New Features** - Add Calculator.reform_documentation that generates plain text documentation of a reform From 6433b1f59e9d65b7c41fba3fc0b0564af941a29b Mon Sep 17 00:00:00 2001 From: martinholmer Date: Mon, 2 Oct 2017 18:46:53 -0400 Subject: [PATCH 06/11] Change dropq to tbi in .coveragerc file --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index eb6d5b460..3eb7ba134 100644 --- a/.coveragerc +++ b/.coveragerc @@ -4,6 +4,6 @@ omit = taxcalc/functions.py taxcalc/*.json taxcalc/cli/* - taxcalc/dropq/* + taxcalc/tbi/* taxcalc/tests/* taxcalc/validation/* From cfbf8fa3ff6b4e9ba7acab06f5b8a8637e91c84f Mon Sep 17 00:00:00 2001 From: martinholmer Date: Tue, 3 Oct 2017 08:00:04 -0400 Subject: [PATCH 07/11] Rename tbi_utils.py functions to indicate their function --- taxcalc/tbi/tbi_utils.py | 50 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/taxcalc/tbi/tbi_utils.py b/taxcalc/tbi/tbi_utils.py index a78d3f877..cabcf5bdc 100644 --- a/taxcalc/tbi/tbi_utils.py +++ b/taxcalc/tbi/tbi_utils.py @@ -266,9 +266,10 @@ def chooser(agg): return ans -def fuzz_df2_records(df1, df2, mask): +def create_results_columns(df1, df2, mask): """ - Possibly modify df2 results by adding random fuzz for data privacy. + Create columns in df2 results dataframe and possibly + modify df2 results by adding random fuzz for data privacy. 
Parameters ---------- @@ -284,20 +285,22 @@ def fuzz_df2_records(df1, df2, mask): Returns ------- - possibly fuzzed df2: Pandas DataFrame + expanded and possibly fuzzed df2: Pandas DataFrame Notes ----- - This function groups both DataFrames based on the web application's + When doing the fuzzing for puf.csv results, this + function groups both DataFrames based on the web application's income groupings (both decile and income bins), and then randomly selects NUM_TO_FUZZ records to fuzz within each bin. The fuzzing involves overwriting df2 columns in cols_to_fuzz with df1 values. """ # nested function that does the fuzzing - def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing): + def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing): """ - Fuzz some df2 records in each bin defined by bin_type and imeasure. - The fuzzed records have their post-reform tax results (in df2) + Create additional df2 columns. If do_fuzzing is True, also + fuzz some df2 records in each bin defined by bin_type and imeasure + with the fuzzed records having their post-reform tax results (in df2) set to their pre-reform tax results (in df1). 
""" # pylint: disable=too-many-arguments @@ -316,27 +319,28 @@ def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing): for col in cols_to_fuzz: df2[col + suffix] = (df2[col] * df2['nofuzz'] - df1[col] * df2['nofuzz'] + df1[col]) - # main logic of fuzz_df2_records - do_fuzzing = np.any(mask) + # main logic of create_results_columns function skips = set(['num_returns_ItemDed', 'num_returns_StandardDed', 'num_returns_AMT', 's006']) - columns_to_fuzz = (set(DIST_TABLE_COLUMNS) | - set(STATS_COLUMNS)) - skips - df2['mask'] = mask + columns_to_create = (set(DIST_TABLE_COLUMNS) | + set(STATS_COLUMNS)) - skips + do_fuzzing = np.any(mask) + if do_fuzzing: + df2['mask'] = mask df2['expanded_income_baseline'] = df1['expanded_income'] - fuzz(df1, df2, 'dec', 'expanded_income_baseline', '_xdec', - columns_to_fuzz, do_fuzzing) - fuzz(df1, df2, 'bin', 'expanded_income_baseline', '_xbin', - columns_to_fuzz, do_fuzzing) - fuzz(df1, df2, 'agg', 'expanded_income_baseline', '_agg', - columns_to_fuzz, do_fuzzing) + create(df1, df2, 'dec', 'expanded_income_baseline', '_xdec', + columns_to_create, do_fuzzing) + create(df1, df2, 'bin', 'expanded_income_baseline', '_xbin', + columns_to_create, do_fuzzing) + create(df1, df2, 'agg', 'expanded_income_baseline', '_agg', + columns_to_create, do_fuzzing) df2['c00100_baseline'] = df1['c00100'] # c00100 is AGI - fuzz(df1, df2, 'dec', 'c00100_baseline', '_adec', - columns_to_fuzz, do_fuzzing) - fuzz(df1, df2, 'bin', 'c00100_baseline', '_abin', - columns_to_fuzz, do_fuzzing) + create(df1, df2, 'dec', 'c00100_baseline', '_adec', + columns_to_create, do_fuzzing) + create(df1, df2, 'bin', 'c00100_baseline', '_abin', + columns_to_create, do_fuzzing) return df2 @@ -352,7 +356,7 @@ def summary(df1, df2, mask): """ # pylint: disable=too-many-statements,too-many-locals - df2 = fuzz_df2_records(df1, df2, mask) + df2 = create_results_columns(df1, df2, mask) summ = dict() From 16a79dfde25d38685bf15302d94625386d16d3f4 Mon Sep 17 00:00:00 2001 
From: martinholmer Date: Tue, 3 Oct 2017 11:14:28 -0400 Subject: [PATCH 08/11] Fix puf input_path and time input read --- taxcalc/tbi/tbi_utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/taxcalc/tbi/tbi_utils.py b/taxcalc/tbi/tbi_utils.py index cabcf5bdc..4cd05c6cd 100644 --- a/taxcalc/tbi/tbi_utils.py +++ b/taxcalc/tbi/tbi_utils.py @@ -5,7 +5,9 @@ # pep8 --ignore=E402 tbi_utils.py # pylint --disable=locally-disabled tbi_utils.py +from __future__ import print_function import os +import time import copy import hashlib import numpy as np @@ -92,17 +94,20 @@ def calculate(year_n, start_year, growdiff_response.apply_to(growfactors_post) # create sample pd.DataFrame from specified input file and sampling scheme + stime = time.time() tbi_path = os.path.abspath(os.path.dirname(__file__)) if use_puf_not_cps: - input_path = os.path.join(tbi_path, '..', '..', 'puf.csv.gz') + # first try TaxBrain deployment path + input_path = 'puf.csv.gz' if not os.path.isfile(input_path): + # otherwise try local Tax-Calculator deployment path input_path = os.path.join(tbi_path, '..', '..', 'puf.csv') sampling_frac = 0.05 sampling_seed = 180 else: input_path = os.path.join(tbi_path, '..', 'cps.csv.gz') - sampling_frac = 0.05 - sampling_seed = 180 + sampling_frac = 0.05 # TODO: using same as for puf for now + sampling_seed = 180 # TODO: using same as for puf for now full_sample = pd.read_csv(input_path) if use_full_sample: sample = full_sample @@ -111,6 +116,10 @@ def calculate(year_n, start_year, frac=sampling_frac, random_state=sampling_seed ) + if use_puf_not_cps: + print('puf-read-time= {:.1f}'.format(time.time() - stime)) + else: + print('cps-read-time= {:.1f}'.format(time.time() - stime)) # create pre-reform Calculator instance if use_puf_not_cps: From 8abb6aea917afb3d4ff3798df4133dcec8b8e0bf Mon Sep 17 00:00:00 2001 From: martinholmer Date: Tue, 3 Oct 2017 11:18:39 -0400 Subject: [PATCH 09/11] Rename create_json_table as create_dict_table --- 
taxcalc/tbi/__init__.py | 2 +- taxcalc/tbi/tbi.py | 24 ++++++++++++------------ taxcalc/tests/test_tbi.py | 32 ++++++++++++++++---------------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/taxcalc/tbi/__init__.py b/taxcalc/tbi/__init__.py index 0ec6d5297..b46660300 100644 --- a/taxcalc/tbi/__init__.py +++ b/taxcalc/tbi/__init__.py @@ -1,4 +1,4 @@ from taxcalc.tbi.tbi import (run_nth_year_tax_calc_model, run_nth_year_gdp_elast_model, - create_json_table, + create_dict_table, reform_warnings_errors) diff --git a/taxcalc/tbi/tbi.py b/taxcalc/tbi/tbi.py index 3485169bf..31422e491 100644 --- a/taxcalc/tbi/tbi.py +++ b/taxcalc/tbi/tbi.py @@ -85,7 +85,7 @@ def run_nth_year_tax_calc_model(year_n, start_year, use_puf_not_cps, use_full_sample, user_mods, - return_json=True): + return_dict=True): """ The run_nth_year_tax_calc_model function assumes user_mods is a dictionary returned by the Calculator.read_json_param_objects() function. @@ -124,12 +124,12 @@ def append_year(pdf): return pdf # optionally return non-JSON results - if not return_json: + if not return_dict: res = dict() for tbl in summ: res[tbl] = append_year(summ[tbl]) elapsed_time = time.time() - start_time - print('elapsed time for this run: ', elapsed_time) + print('elapsed time for this run: {:.1f}'.format(elapsed_time)) return res # optionally construct JSON results tables for year n @@ -152,15 +152,15 @@ def append_year(pdf): res = dict() for tbl in summ: if 'aggr' in tbl: - res_table = create_json_table(summ[tbl], + res_table = create_dict_table(summ[tbl], row_names=info[tbl]['row_names']) res[tbl] = dict((k, v[0]) for k, v in res_table.items()) else: - res[tbl] = create_json_table(summ[tbl], + res[tbl] = create_dict_table(summ[tbl], row_names=info[tbl]['row_names'], column_types=info[tbl]['col_types']) elapsed_time = time.time() - start_time - print('elapsed time for this run: ', elapsed_time) + print('elapsed time for this run: {:.1f}'.format(elapsed_time)) return res @@ -169,7 +169,7 
@@ def run_nth_year_gdp_elast_model(year_n, start_year, use_full_sample, user_mods, gdp_elasticity, - return_json=True): + return_dict=True): """ The run_nth_year_gdp_elast_model function assumes user_mods is a dictionary returned by the Calculator.read_json_param_objects() function. @@ -191,11 +191,11 @@ def run_nth_year_gdp_elast_model(year_n, start_year, gdp_effect = proportional_change_gdp(calc1, calc2, gdp_elasticity) # return gdp_effect results - if return_json: + if return_dict: gdp_df = pd.DataFrame(data=[gdp_effect], columns=['col0']) gdp_elast_names_n = [x + '_' + str(year_n) for x in GDP_ELAST_ROW_NAMES] - gdp_elast_total = create_json_table(gdp_df, + gdp_elast_total = create_dict_table(gdp_df, row_names=gdp_elast_names_n, num_decimals=5) gdp_elast_total = dict((k, v[0]) for k, v in gdp_elast_total.items()) @@ -204,10 +204,10 @@ def run_nth_year_gdp_elast_model(year_n, start_year, return gdp_effect -def create_json_table(dframe, row_names=None, column_types=None, +def create_dict_table(dframe, row_names=None, column_types=None, num_decimals=2): """ - Create and return dictionary with JSON-like contents from specified dframe. + Create and return dictionary with JSON-like content from specified dframe. """ # embedded formatted_string function def formatted_string(val, _type, num_decimals): @@ -230,7 +230,7 @@ def formatted_string(val, _type, num_decimals): except ValueError: # try making it a string - good luck! 
return str(val) - # high-level create_json_table function logic + # high-level create_dict_table function logic out = dict() if row_names is None: row_names = [str(x) for x in list(dframe.index)] diff --git a/taxcalc/tests/test_tbi.py b/taxcalc/tests/test_tbi.py index 6ed0f618b..c84be8360 100644 --- a/taxcalc/tests/test_tbi.py +++ b/taxcalc/tests/test_tbi.py @@ -68,7 +68,7 @@ def test_run_nth_year_value_errors(): use_full_sample=False, user_mods=usermods, gdp_elasticity=0.36, - return_json=False) + return_dict=False) usermods['growdiff_response'] = dict() # test for behavior not allowed error with pytest.raises(ValueError): @@ -77,26 +77,26 @@ def test_run_nth_year_value_errors(): use_full_sample=False, user_mods=usermods, gdp_elasticity=0.36, - return_json=False) + return_dict=False) @pytest.mark.requires_pufcsv -@pytest.mark.parametrize('resjson', [True, False]) -def test_run_tax_calc_model(resjson): +@pytest.mark.parametrize('resdict', [True, False]) +def test_run_tax_calc_model(resdict): res = run_nth_year_tax_calc_model(2, 2016, - use_puf_not_cps=resjson, + use_puf_not_cps=resdict, use_full_sample=False, user_mods=USER_MODS, - return_json=resjson) + return_dict=resdict) assert isinstance(res, dict) dump = False # set to True in order to dump returned results and fail test for tbl in sorted(res.keys()): - if resjson: + if resdict: assert isinstance(res[tbl], dict) else: assert isinstance(res[tbl], pd.DataFrame) if dump: - if resjson: + if resdict: cols = sorted(res[tbl].keys()) else: cols = sorted(list(res[tbl])) @@ -107,8 +107,8 @@ def test_run_tax_calc_model(resjson): @pytest.mark.requires_pufcsv -@pytest.mark.parametrize('resjson', [True, False]) -def test_run_gdp_elast_model(resjson): +@pytest.mark.parametrize('resdict', [True, False]) +def test_run_gdp_elast_model(resdict): usermods = USER_MODS usermods['behavior'] = dict() res = run_nth_year_gdp_elast_model(2, 2016, @@ -116,8 +116,8 @@ def test_run_gdp_elast_model(resjson): use_full_sample=False, 
user_mods=usermods, gdp_elasticity=0.36, - return_json=resjson) - if resjson: + return_dict=resdict) + if resdict: assert isinstance(res, dict) else: assert isinstance(res, float) @@ -149,11 +149,11 @@ def test_chooser_error(): chooser(dframe['zeros']) -def test_create_json_table(): +def test_create_dict_table(): # test correct usage dframe = pd.DataFrame(data=[[1., 2, 3], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c']) - ans = create_json_table(dframe) + ans = create_dict_table(dframe) exp = {'0': ['1.00', '2', '3'], '1': ['4.00', '5', '6'], '2': ['7.00', '8', '9']} @@ -162,7 +162,7 @@ def test_create_json_table(): dframe = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c'], dtype='i2') with pytest.raises(NotImplementedError): - create_json_table(dframe) + create_dict_table(dframe) @pytest.mark.requires_pufcsv @@ -202,7 +202,7 @@ def test_with_pufcsv(puf_fullsample): use_puf_not_cps=True, use_full_sample=True, user_mods=usermods, - return_json=True) + return_dict=True) total = resdict['aggr_2'] tbi_reform_revenue = float(total['combined_tax_9']) * 1e-9 # assert that tbi revenue is similar to the fullsample calculation From 5c60bc1ed88f31b49ecf6e36a2518922e20d0e5d Mon Sep 17 00:00:00 2001 From: martinholmer Date: Tue, 3 Oct 2017 12:56:25 -0400 Subject: [PATCH 10/11] Fix cps input logic --- taxcalc/tbi/tbi_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/taxcalc/tbi/tbi_utils.py b/taxcalc/tbi/tbi_utils.py index 4cd05c6cd..306611e32 100644 --- a/taxcalc/tbi/tbi_utils.py +++ b/taxcalc/tbi/tbi_utils.py @@ -17,7 +17,7 @@ from taxcalc.utils import (add_income_bins, add_quantile_bins, results, create_difference_table, create_distribution_table, STATS_COLUMNS, DIST_TABLE_COLUMNS, - WEBAPP_INCOME_BINS) + WEBAPP_INCOME_BINS, read_egg_csv) def check_years(start_year, year_n): @@ -104,11 +104,17 @@ def calculate(year_n, start_year, input_path = os.path.join(tbi_path, '..', '..', 'puf.csv') sampling_frac = 0.05 
sampling_seed = 180 - else: + else: # if using cps input not puf input + # first try Tax-Calculator code path input_path = os.path.join(tbi_path, '..', 'cps.csv.gz') + if not os.path.isfile(input_path): + # otherwise try taxcalc package path + input_path = None + full_sample = read_egg_csv('cps.csv.gz') # pragma: no cover sampling_frac = 0.05 # TODO: using same as for puf for now sampling_seed = 180 # TODO: using same as for puf for now - full_sample = pd.read_csv(input_path) + if input_path: + full_sample = pd.read_csv(input_path) if use_full_sample: sample = full_sample else: From 85610849966912770b74b48730d4124ea8050b0f Mon Sep 17 00:00:00 2001 From: martinholmer Date: Tue, 10 Oct 2017 04:36:27 -0400 Subject: [PATCH 11/11] Strengthen test in test_macro_elasticity.py --- taxcalc/tests/test_macro_elasticity.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/taxcalc/tests/test_macro_elasticity.py b/taxcalc/tests/test_macro_elasticity.py index b5dd930da..b4b15ed28 100644 --- a/taxcalc/tests/test_macro_elasticity.py +++ b/taxcalc/tests/test_macro_elasticity.py @@ -9,5 +9,9 @@ def test_proportional_change_gdp(cps_subsample): reform = {2013: {'_II_em': [0.0]}} # reform increases taxes and MTRs pol2.implement_reform(reform) calc2 = Calculator(policy=pol2, records=rec2) - gdp_diff = proportional_change_gdp(calc1, calc2, elasticity=0.36) - assert gdp_diff < 0. # higher MTRs imply negative GDP effect + calc1.advance_to_year(2014) + calc2.advance_to_year(2014) + gdp_pchg = 100.0 * proportional_change_gdp(calc1, calc2, elasticity=0.36) + exp_pchg = -0.6 # higher MTRs imply negative expected GDP percent change + abs_diff_pchg = abs(gdp_pchg - exp_pchg) + assert abs_diff_pchg < 0.05