From b26d3598184ad3b79250a40d633ea833439dde7c Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Thu, 1 Sep 2022 14:14:53 -0400
Subject: [PATCH 01/10] Remove unused persistent poverty from score (#1835)

---
 .../data_pipeline/etl/score/etl_score.py           | 13 -------------
 .../data_pipeline/tests/score/test_calculation.py  |  1 -
 2 files changed, 14 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index ad6941d0d..62a5006d2 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -42,7 +42,6 @@ def __init__(self):
         self.doe_energy_burden_df: pd.DataFrame
         self.national_risk_index_df: pd.DataFrame
         self.geocorr_urban_rural_df: pd.DataFrame
-        self.persistent_poverty_df: pd.DataFrame
         self.census_decennial_df: pd.DataFrame
         self.census_2010_df: pd.DataFrame
         self.national_tract_df: pd.DataFrame
@@ -159,16 +158,6 @@ def extract(self) -> None:
             low_memory=False,
         )
 
-        # Load persistent poverty
-        persistent_poverty_csv = (
-            constants.DATA_PATH / "dataset" / "persistent_poverty" / "usa.csv"
-        )
-        self.persistent_poverty_df = pd.read_csv(
-            persistent_poverty_csv,
-            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
-            low_memory=False,
-        )
-
         # Load decennial census data
         census_decennial_csv = (
             constants.DATA_PATH
@@ -359,7 +348,6 @@ def _prepare_initial_df(self) -> pd.DataFrame:
             self.doe_energy_burden_df,
             self.ejscreen_df,
             self.geocorr_urban_rural_df,
-            self.persistent_poverty_df,
             self.national_risk_index_df,
             self.census_acs_median_incomes_df,
             self.census_decennial_df,
@@ -484,7 +472,6 @@ def _prepare_initial_df(self) -> pd.DataFrame:
         non_numeric_columns = [
             self.GEOID_TRACT_FIELD_NAME,
-            field_names.PERSISTENT_POVERTY_FIELD,
             field_names.TRACT_ELIGIBLE_FOR_NONNATURAL_THRESHOLD,
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
         ]
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_calculation.py b/data/data-pipeline/data_pipeline/tests/score/test_calculation.py
index 783474e4a..d241918cd 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_calculation.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_calculation.py
@@ -28,7 +28,6 @@ def full_percentile_column_name(self):
         return self.percentile_column_name
 
 
-### TODO: we need to blow this out for all eight categories
 def _check_percentile_against_threshold(df, config: PercentileTestConfig):
     """Note - for the purpose of testing, this fills with False"""
     is_minimum_flagged_ok = (

From ec79f878df45c20c1a81daedf0996b39cb7a54cf Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Thu, 1 Sep 2022 17:47:30 -0400
Subject: [PATCH 02/10] Test a few datasets for overlap in the final score
 (#1835)

---
 .../data_pipeline/tests/score/fixtures.py    | 87 +++++++++++++++++++
 .../data_pipeline/tests/score/test_output.py | 52 ++++++++++-
 2 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
index 096a3e072..bd117ba5b 100644
--- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py
+++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
@@ -2,6 +2,9 @@
 import pytest
 from data_pipeline.config import settings
 from data_pipeline.score import field_names
+from data_pipeline.etl.score import constants
+
+GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD
 
 
 @pytest.fixture(scope="session")
 def final_score_df():
     return pd.read_csv(
         settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
         dtype={field_names.GEOID_TRACT_FIELD: str},
         low_memory=False,
     )
+
+
+@pytest.fixture(scope="session")
+def census_df():
+    census_csv = constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
+    return pd.read_csv(
+        census_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture(scope="session")
+def ejscreen_df():
+    ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
+    return pd.read_csv(
+        ejscreen_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture(scope="session")
+def hud_housing_df():
+    hud_housing_csv = (
+        constants.DATA_PATH / "dataset" / "hud_housing" / "usa.csv"
+    )
+    return pd.read_csv(
+        hud_housing_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture(scope="session")
+def cdc_places_df():
+    cdc_places_csv = constants.DATA_PATH / "dataset" / "cdc_places" / "usa.csv"
+    return pd.read_csv(
+        cdc_places_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture(scope="session")
+def census_acs_median_incomes_df():
+    census_acs_median_incomes_csv = (
+        constants.DATA_PATH
+        / "dataset"
+        / "census_acs_median_income_2019"
+        / "usa.csv"
+    )
+    return pd.read_csv(
+        census_acs_median_incomes_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture(scope="session")
+def cdc_life_expectancy_df():
+    cdc_life_expectancy_csv = (
+        constants.DATA_PATH / "dataset" / "cdc_life_expectancy" / "usa.csv"
+    )
+    return pd.read_csv(
+        cdc_life_expectancy_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture(scope="session")
+def doe_energy_burden_df():
+    doe_energy_burden_csv = (
+        constants.DATA_PATH / "dataset" / "doe_energy_burden" / "usa.csv"
+    )
+    return pd.read_csv(
+        doe_energy_burden_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+# TODO: The datasets that are loaded from data_pipeline/etl/score/etl_score.py:131
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 70e95be4d..54e27652a 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -1,12 +1,21 @@
 # flake8: noqa: W0613,W0611,F811
+# pylint: disable=unused-import
+import inspect
 from dataclasses import dataclass
 from typing import List
 import pytest
 import pandas as pd
+import numpy as np
 from data_pipeline.score import field_names
-from .fixtures import final_score_df  # pylint: disable=unused-import
+from .fixtures import (
+    final_score_df,
+    ejscreen_df,
+    hud_housing_df,
+    cdc_places_df,
+)
 
 pytestmark = pytest.mark.smoketest
+GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD
 
 
 def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
@@ -203,3 +212,44 @@ def test_donut_hole_addition_to_score_n(final_score_df):
     assert (
         new_donuts > 0
     ), "FYI: The adjacency index is doing nothing. Consider removing it?"
+
+
+def test_data_sources(
+    final_score_df, hud_housing_df, ejscreen_df, cdc_places_df
+):
+    data_sources = {
+        key: value for key, value in locals().items() if key != "final_score_df"
+    }
+
+    for data_source_name, data_source in data_sources.items():
+        final = "_final"
+        df: pd.DataFrame = final_score_df.merge(
+            data_source,
+            on=GEOID_TRACT_FIELD_NAME,
+            indicator="MERGE",
+            suffixes=(final, f"_{data_source_name}"),
+            how="left",
+        )
+        data_source_columns = [
+            f"{col}_{data_source_name}"
+            for col in data_source.columns
+            if (col != GEOID_TRACT_FIELD_NAME and col in final_score_df.columns)
+        ]
+        final_columns = [
+            f"{col}{final}"
+            for col in data_source.columns
+            if (col != GEOID_TRACT_FIELD_NAME and col in final_score_df.columns)
+        ]
+        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
+        df = df[df.MERGE == "both"]
+        assert (
+            final_columns
+        ), "No columns from data source show up in final score"
+        for final_column, data_source_column in zip(
+            data_source_columns, final_columns
+        ):
+            assert np.allclose(
+                df[final_column],
+                df[data_source_column],
+                equal_nan=True,
+            ), f"Column {final_column} not equal between {data_source_name} and final score"

From d90292baf082df5ae5e65cd61784da52bd5fcc9a Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Fri, 2 Sep 2022 11:16:14 -0400
Subject: [PATCH 03/10] Add remaining data sources (#1853)

---
 .../data_pipeline/tests/score/fixtures.py    | 125 ++++++++++++++++--
 .../data_pipeline/tests/score/test_output.py |  67 ++++++++--
 2 files changed, 174 insertions(+), 18 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
index bd117ba5b..2b1c7fc68 100644
--- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py
+++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
@@ -11,12 +11,12 @@ def final_score_df():
     return pd.read_csv(
         settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
-        dtype={field_names.GEOID_TRACT_FIELD: str},
+        dtype={GEOID_TRACT_FIELD_NAME: str},
         low_memory=False,
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def census_df():
     census_csv = constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
     return pd.read_csv(
         census_csv,
         dtype={GEOID_TRACT_FIELD_NAME: "string"},
         low_memory=False,
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def ejscreen_df():
     ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
     return pd.read_csv(
         ejscreen_csv,
         dtype={GEOID_TRACT_FIELD_NAME: "string"},
         low_memory=False,
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def hud_housing_df():
     hud_housing_csv = (
         constants.DATA_PATH / "dataset" / "hud_housing" / "usa.csv"
     )
     return pd.read_csv(
         hud_housing_csv,
         dtype={GEOID_TRACT_FIELD_NAME: "string"},
         low_memory=False,
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def cdc_places_df():
     cdc_places_csv = constants.DATA_PATH / "dataset" / "cdc_places" / "usa.csv"
     return pd.read_csv(
         cdc_places_csv,
         dtype={GEOID_TRACT_FIELD_NAME: "string"},
         low_memory=False,
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def census_acs_median_incomes_df():
     census_acs_median_incomes_csv = (
         constants.DATA_PATH
         / "dataset"
         / "census_acs_median_income_2019"
         / "usa.csv"
     )
     return pd.read_csv(
         census_acs_median_incomes_csv,
         dtype={GEOID_TRACT_FIELD_NAME: "string"},
         low_memory=False,
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def cdc_life_expectancy_df():
     cdc_life_expectancy_csv = (
         constants.DATA_PATH / "dataset" / "cdc_life_expectancy" / "usa.csv"
     )
     return pd.read_csv(
         cdc_life_expectancy_csv,
         dtype={GEOID_TRACT_FIELD_NAME: "string"},
         low_memory=False,
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def doe_energy_burden_df():
     doe_energy_burden_csv = (
         constants.DATA_PATH / "dataset" / "doe_energy_burden" / "usa.csv"
     )
     return pd.read_csv(
         doe_energy_burden_csv,
         dtype={GEOID_TRACT_FIELD_NAME: "string"},
         low_memory=False,
     )
-# TODO: The datasets that are loaded from data_pipeline/etl/score/etl_score.py:131
+@pytest.fixture()
+def national_risk_index_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "national_risk_index" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def dot_travel_disadvantage_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "travel_composite" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def fsf_fire_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "fsf_wildfire_risk" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def fsf_flood_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "fsf_flood_risk" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def nature_deprived_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "nlcd_nature_deprived" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def eamlis_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "eamlis" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def fuds_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "us_army_fuds" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def geocorr_urban_rural_df():
+    geocorr_urban_rural_csv = (
+        constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
+    )
+    return pd.read_csv(
+        geocorr_urban_rural_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def census_decennial_df():
+    census_decennial_csv = (
+        constants.DATA_PATH / "dataset" / "census_decennial_2010" / "usa.csv"
+    )
+    return pd.read_csv(
+        census_decennial_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def census_2010_df():
+    census_2010_csv = (
+        constants.DATA_PATH / "dataset" / "census_acs_2010" / "usa.csv"
+    )
+    return pd.read_csv(
+        census_2010_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def hrs_df():
+    hrs_csv = constants.DATA_PATH / "dataset" / "historic_redlining" / "usa.csv"
+
+    return pd.read_csv(
+        hrs_csv,
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+    )
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 54e27652a..0ad06b616 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -1,7 +1,8 @@
-# flake8: noqa: W0613,W0611,F811
-# pylint: disable=unused-import
+# flake8: noqa: W0613,W0611,F811,
+# pylint: disable=unused-import,R0913
 import inspect
 from dataclasses import dataclass
+from re import I
 from typing import List
 import pytest
 import pandas as pd
@@ -11,9 +12,24 @@ from .fixtures import (
     final_score_df,
     ejscreen_df,
     hud_housing_df,
+    census_df,
     cdc_places_df,
+    census_acs_median_incomes_df,
+    cdc_life_expectancy_df,
+    doe_energy_burden_df,
+    national_risk_index_df,
+    dot_travel_disadvantage_df,
+    fsf_fire_df,
+    nature_deprived_df,
+    eamlis_df,
+    fuds_df,
+    geocorr_urban_rural_df,
+    census_decennial_df,
+    census_2010_df,
+    hrs_df,
 )
 
 pytestmark = pytest.mark.smoketest
 GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD
@@ -215,7 +231,24 @@ def test_donut_hole_addition_to_score_n(final_score_df):
 def test_data_sources(
-    final_score_df, hud_housing_df, ejscreen_df, cdc_places_df
+    final_score_df,
+    hud_housing_df,
+    ejscreen_df,
+    census_df,
+    cdc_places_df,
+    census_acs_median_incomes_df,
+    cdc_life_expectancy_df,
+    doe_energy_burden_df,
+    national_risk_index_df,
+    dot_travel_disadvantage_df,
+    fsf_fire_df,
+    nature_deprived_df,
+    eamlis_df,
+    fuds_df,
+    geocorr_urban_rural_df,
+    census_decennial_df,
+    census_2010_df,
+    hrs_df,
 ):
     data_sources = {
         key: value for key, value in locals().items() if key != "final_score_df"
     }
@@ -244,12 +277,28 @@ def test_data_sources(
         df = df[df.MERGE == "both"]
         assert (
             final_columns
-        ), "No columns from data source show up in final score"
+        ), f"No columns from data source show up in final score in source {data_source_name}"
+
+        # Compare every column for equality, using close equality for numerics and
+        # `equals` equality for non-numeric columns
         for final_column, data_source_column in zip(
             data_source_columns, final_columns
         ):
-            assert np.allclose(
-                df[final_column],
-                df[data_source_column],
-                equal_nan=True,
-            ), f"Column {final_column} not equal between {data_source_name} and final score"
+            error_message = (
+                f"Column {final_column} not equal "
+                f"between {data_source_name} and final score"
+            )
+            if df[final_column].dtype in [
+                np.dtype(object),
+                np.dtype(bool),
+                np.dtype(str),
+            ]:
+                assert df[final_column].equals(
+                    df[data_source_column]
+                ), error_message
+            else:
+                assert np.allclose(
+                    df[final_column],
+                    df[data_source_column],
+                    equal_nan=True,
+                ), error_message

From c0c20647c50ed59f1cbdc971e12d725488a87b8e Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Fri, 2 Sep 2022 11:24:57 -0400
Subject: [PATCH 04/10] Apply code-review feedback (#1835)

---
 .../data_pipeline/tests/score/test_output.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 0ad06b616..3d8b743ae 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -263,16 +263,11 @@ def test_data_sources(
             suffixes=(final, f"_{data_source_name}"),
             how="left",
         )
-        data_source_columns = [
-            f"{col}_{data_source_name}"
-            for col in data_source.columns
-            if (col != GEOID_TRACT_FIELD_NAME and col in final_score_df.columns)
-        ]
-        final_columns = [
-            f"{col}{final}"
-            for col in data_source.columns
-            if (col != GEOID_TRACT_FIELD_NAME and col in final_score_df.columns)
-        ]
+        core_cols = data_source.columns.intersection(
+            final_score_df.columns
+        ).drop(GEOID_TRACT_FIELD_NAME)
+        data_source_columns = [f"{col}_{data_source_name}" for col in core_cols]
+        final_columns = [f"{col}{final}" for col in core_cols]
         assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
         df = df[df.MERGE == "both"]
         assert (

From 9cd5eb685843c0d82da3e7f087c5f726bb6aea7d Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Fri, 2 Sep 2022 11:31:37 -0400
Subject: [PATCH 05/10] Rearrange a little for readability (#1835)

---
 .../data_pipeline/tests/score/test_output.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 3d8b743ae..b79b5c711 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -255,7 +255,7 @@ def test_data_sources(
     }
 
     for data_source_name, data_source in data_sources.items():
-        final = "_final"
+        final = "final"
         df: pd.DataFrame = final_score_df.merge(
             data_source,
             on=GEOID_TRACT_FIELD_NAME,
@@ -263,13 +263,18 @@ def test_data_sources(
             suffixes=(final, f"_{data_source_name}"),
             how="left",
         )
+
+        # Make sure we have NAs for any tracts in the final data that aren't
+        # covered in the data source
+        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
+        df = df[df.MERGE == "both"]
+
+        # Make our lists of columns for later comparison
         core_cols = data_source.columns.intersection(
             final_score_df.columns
         ).drop(GEOID_TRACT_FIELD_NAME)
         data_source_columns = [f"{col}_{data_source_name}" for col in core_cols]
-        final_columns = [f"{col}{final}" for col in core_cols]
-        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
-        df = df[df.MERGE == "both"]
+        final_columns = [f"{col}_{final}" for col in core_cols]
         assert (
             final_columns
         ), f"No columns from data source show up in final score in source {data_source_name}"

From 98a02c496ea42974e127dc9243986fd1fd1f9f4c Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Fri, 2 Sep 2022 11:44:47 -0400
Subject: [PATCH 06/10] Add tract test (#1835)

---
 .../data_pipeline/tests/score/fixtures.py    | 12 +++++++++
 .../data_pipeline/tests/score/test_output.py | 27 ++++++++++++++-----
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
index 2b1c7fc68..64d80bfad 100644
--- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py
+++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
@@ -205,3 +205,15 @@ def hrs_df():
         dtype={GEOID_TRACT_FIELD_NAME: "string"},
         low_memory=False,
     )
+
+
+@pytest.fixture()
+def national_tract_df():
+    national_tract_csv = constants.DATA_CENSUS_CSV_FILE_PATH
+    return pd.read_csv(
+        national_tract_csv,
+        names=[GEOID_TRACT_FIELD_NAME],
+        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        low_memory=False,
+        header=None,
+    )
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index b79b5c711..6f73553eb 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -27,6 +27,7 @@
     census_decennial_df,
     census_2010_df,
     hrs_df,
+    national_tract_df,
 )
 
 
@@ -255,7 +256,7 @@ def test_data_sources(
     }
 
     for data_source_name, data_source in data_sources.items():
-        final = "final"
+        final = "final_"
         df: pd.DataFrame = final_score_df.merge(
             data_source,
             on=GEOID_TRACT_FIELD_NAME,
@@ -264,21 +265,21 @@ def test_data_sources(
             how="left",
         )
 
-        # Make sure we have NAs for any tracts in the final data that aren't
-        # covered in the data source
-        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
-        df = df[df.MERGE == "both"]
-
         # Make our lists of columns for later comparison
         core_cols = data_source.columns.intersection(
             final_score_df.columns
         ).drop(GEOID_TRACT_FIELD_NAME)
         data_source_columns = [f"{col}_{data_source_name}" for col in core_cols]
-        final_columns = [f"{col}_{final}" for col in core_cols]
+        final_columns = [f"{col}{final}" for col in core_cols]
         assert (
             final_columns
         ), f"No columns from data source show up in final score in source {data_source_name}"
 
+        # Make sure we have NAs for any tracts in the final data that aren't
+        # covered in the data source
+        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
+        df = df[df.MERGE == "both"]
+
         # Compare every column for equality, using close equality for numerics and
         # `equals` equality for non-numeric columns
         for final_column, data_source_column in zip(
@@ -302,3 +303,15 @@ def test_data_sources(
                 df[data_source_column],
                 equal_nan=True,
             ), error_message
+
+
+def test_output_tracts(final_score_df, national_tract_df):
+    df = final_score_df.merge(
+        national_tract_df,
+        on=GEOID_TRACT_FIELD_NAME,
+        how="outer",
+        indicator="MERGE",
+    )
+    counts = df.value_counts("MERGE")
+    assert counts.loc["left_only"] == 0
+    assert counts.loc["right_only"] == 0

From 0d919fd79ae40aa4162fc7aa8316d26a7ec6a0e1 Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Fri, 2 Sep 2022 13:09:17 -0400
Subject: [PATCH 07/10] Add test for score values (#1835)

---
 data/data-pipeline/data_pipeline/tests/score/test_output.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 6f73553eb..51e30fb73 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -315,3 +315,7 @@ def test_output_tracts(final_score_df, national_tract_df):
     counts = df.value_counts("MERGE")
     assert counts.loc["left_only"] == 0
     assert counts.loc["right_only"] == 0
+
+
+def test_all_tracts_have_scores(final_score_df):
+    assert not final_score_df[field_names.SCORE_N_COMMUNITIES].isna().any()

From 358a1fd90381b661867b15f48852ef571a4acb99 Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Fri, 2 Sep 2022 13:51:27 -0400
Subject: [PATCH 08/10] Check for unmatched source tracts (#1835)

---
 .../data_pipeline/tests/score/test_output.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 51e30fb73..777b6ea8c 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -33,6 +33,7 @@
 
 pytestmark = pytest.mark.smoketest
 GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD
+UNMATCHED_TRACT_THRESHOLD = 1000
 
 
 def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
@@ -262,7 +263,7 @@ def test_data_sources(
             on=GEOID_TRACT_FIELD_NAME,
             indicator="MERGE",
             suffixes=(final, f"_{data_source_name}"),
-            how="left",
+            how="outer",
         )
 
         # Make our lists of columns for later comparison
@@ -278,6 +279,11 @@ def test_data_sources(
         # Make sure we have NAs for any tracts in the final data that aren't
         # covered in the data source
         assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
+
+        # Make sure the data source doesn't have a ton of unmatched tracts, implying it
+        # has moved to 2020 tracts
+        assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACT_THRESHOLD
+
         df = df[df.MERGE == "both"]
 
         # Compare every column for equality, using close equality for numerics and

From 17669c99a9e8d346bb05e03fb523affadf1cffd0 Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Tue, 6 Sep 2022 11:11:46 -0400
Subject: [PATCH 09/10] Cleanup numeric code to plaintext (#1835)

---
 data/data-pipeline/data_pipeline/tests/score/test_output.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 777b6ea8c..4dfc773e5 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -1,8 +1,6 @@
 # flake8: noqa: W0613,W0611,F811,
-# pylint: disable=unused-import,R0913
-import inspect
+# pylint: disable=unused-import,too-many-arguments
 from dataclasses import dataclass
-from re import I
 from typing import List
 import pytest
 import pandas as pd

From 4459a77c15da5da54f121b788a153a228bd08ffa Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Tue, 6 Sep 2022 13:14:47 -0400
Subject: [PATCH 10/10] Make import more obvious (#1835)

---
 .../data_pipeline/tests/score/fixtures.py    | 46 +++++++++----------
 .../data_pipeline/tests/score/test_output.py |  8 ++--
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
index 64d80bfad..805c77262 100644
--- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py
+++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
@@ -1,17 +1,15 @@
 import pandas as pd
 import pytest
 from data_pipeline.config import settings
-from data_pipeline.score import field_names
+from data_pipeline.score.field_names import GEOID_TRACT_FIELD
 from data_pipeline.etl.score import constants
 
-GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD
-
 
 @pytest.fixture(scope="session")
 def final_score_df():
     return pd.read_csv(
         settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
-        dtype={GEOID_TRACT_FIELD_NAME: str},
+        dtype={GEOID_TRACT_FIELD: str},
         low_memory=False,
     )
@@ -21,7 +19,7 @@ def census_df():
     census_csv = constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
     return pd.read_csv(
         census_csv,
-        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        dtype={GEOID_TRACT_FIELD: "string"},
         low_memory=False,
     )
@@ -31,7 +29,7 @@ def ejscreen_df():
     ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
     return pd.read_csv(
         ejscreen_csv,
-        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        dtype={GEOID_TRACT_FIELD: "string"},
         low_memory=False,
     )
@@ -43,7 +41,7 @@ def hud_housing_df():
     )
     return pd.read_csv(
         hud_housing_csv,
-        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        dtype={GEOID_TRACT_FIELD: "string"},
         low_memory=False,
     )
@@ -53,7 +51,7 @@ def cdc_places_df():
     cdc_places_csv = constants.DATA_PATH / "dataset" / "cdc_places" / "usa.csv"
     return pd.read_csv(
         cdc_places_csv,
-        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        dtype={GEOID_TRACT_FIELD: "string"},
         low_memory=False,
     )
@@ -68,7 +66,7 @@ def census_acs_median_incomes_df():
     )
     return pd.read_csv(
         census_acs_median_incomes_csv,
-        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        dtype={GEOID_TRACT_FIELD: "string"},
         low_memory=False,
     )
@@ -80,7 +78,7 @@ def cdc_life_expectancy_df():
     )
     return pd.read_csv(
         cdc_life_expectancy_csv,
-        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        dtype={GEOID_TRACT_FIELD: "string"},
         low_memory=False,
     )
@@ -92,7 +90,7 @@ def doe_energy_burden_df():
     )
     return pd.read_csv(
         doe_energy_burden_csv,
-        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        dtype={GEOID_TRACT_FIELD: "string"},
         low_memory=False,
     )
@@ -101,7 +99,7 @@ def doe_energy_burden_df():
 def national_risk_index_df():
     return pd.read_csv(
         constants.DATA_PATH / "dataset" / "national_risk_index" / "usa.csv",
-        dtype={GEOID_TRACT_FIELD_NAME: "string"},
+        dtype={GEOID_TRACT_FIELD: "string"},
         low_memory=False,
     )
@@ -110,7 +108,7 @@ def national_risk_index_df():
 def dot_travel_disadvantage_df():
     return pd.read_csv(
"usa.csv", - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -119,7 +117,7 @@ def dot_travel_disadvantage_df(): def fsf_fire_df(): return pd.read_csv( constants.DATA_PATH / "dataset" / "fsf_wildfire_risk" / "usa.csv", - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -128,7 +126,7 @@ def fsf_fire_df(): def fsf_flood_df(): return pd.read_csv( constants.DATA_PATH / "dataset" / "fsf_flood_risk" / "usa.csv", - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -137,7 +135,7 @@ def fsf_flood_df(): def nature_deprived_df(): return pd.read_csv( constants.DATA_PATH / "dataset" / "nlcd_nature_deprived" / "usa.csv", - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -146,7 +144,7 @@ def nature_deprived_df(): def eamlis_df(): return pd.read_csv( constants.DATA_PATH / "dataset" / "eamlis" / "usa.csv", - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -155,7 +153,7 @@ def eamlis_df(): def fuds_df(): return pd.read_csv( constants.DATA_PATH / "dataset" / "us_army_fuds" / "usa.csv", - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -167,7 +165,7 @@ def geocorr_urban_rural_df(): ) return pd.read_csv( geocorr_urban_rural_csv, - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -179,7 +177,7 @@ def census_decennial_df(): ) return pd.read_csv( census_decennial_csv, - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -191,7 +189,7 @@ def census_2010_df(): ) return pd.read_csv( census_2010_csv, - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -202,7 +200,7 @@ def hrs_df(): return pd.read_csv( hrs_csv, - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) @@ -212,8 +210,8 @@ def national_tract_df(): national_tract_csv = constants.DATA_CENSUS_CSV_FILE_PATH return pd.read_csv( national_tract_csv, - names=[GEOID_TRACT_FIELD_NAME], - dtype={GEOID_TRACT_FIELD_NAME: "string"}, + names=[GEOID_TRACT_FIELD], + dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, header=None, ) diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index 4dfc773e5..0945fb9e9 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -6,6 +6,7 @@ import pandas as pd import numpy as np from data_pipeline.score import field_names +from data_pipeline.score.field_names import GEOID_TRACT_FIELD from .fixtures import ( final_score_df, ejscreen_df, @@ -30,7 +31,6 @@ pytestmark = pytest.mark.smoketest -GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD UNMATCHED_TRACK_THRESHOLD = 1000 @@ -258,7 +258,7 @@ def test_data_sources( final = "final_" df: pd.DataFrame = final_score_df.merge( data_source, - on=GEOID_TRACT_FIELD_NAME, + on=GEOID_TRACT_FIELD, indicator="MERGE", suffixes=(final, f"_{data_source_name}"), how="outer", @@ -267,7 +267,7 @@ def test_data_sources( # Make our lists of columns for later comparison core_cols = data_source.columns.intersection( final_score_df.columns - ).drop(GEOID_TRACT_FIELD_NAME) + ).drop(GEOID_TRACT_FIELD) 
         data_source_columns = [f"{col}_{data_source_name}" for col in core_cols]
         final_columns = [f"{col}{final}" for col in core_cols]
         assert (
@@ -312,7 +312,7 @@ def test_output_tracts(final_score_df, national_tract_df):
     df = final_score_df.merge(
         national_tract_df,
-        on=GEOID_TRACT_FIELD_NAME,
+        on=GEOID_TRACT_FIELD,
         how="outer",
         indicator="MERGE",
     )
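
A note for readers on the merge pattern this series converges on: pandas'
indicator= argument tags every row of a merge as "left_only", "right_only",
or "both", which is what lets test_data_sources and test_output_tracts tell
"tract missing from one side" apart from "tract present but with a mismatched
value". The sketch below is a minimal, self-contained illustration of that
pattern; the frames, tract IDs, and the "Energy burden" column are invented
for the example and are not taken from the pipeline.

    import numpy as np
    import pandas as pd

    final = pd.DataFrame(
        {
            "GEOID10_TRACT": ["01001020100", "01001020200"],
            "Energy burden": [0.3, np.nan],
        }
    )
    source = pd.DataFrame(
        {
            "GEOID10_TRACT": ["01001020100", "01001020200"],
            "Energy burden": [0.3, np.nan],
        }
    )

    # Overlapping non-key columns pick up the suffixes; the MERGE column
    # records which side(s) each tract came from.
    df = final.merge(
        source,
        on="GEOID10_TRACT",
        how="outer",
        indicator="MERGE",
        suffixes=("_final", "_source"),
    )

    # Every tract matched here, so no left_only/right_only rows remain.
    assert (df.MERGE == "both").all()

    # equal_nan=True treats a NaN shared by both sides as equal,
    # which a plain == comparison would not.
    assert np.allclose(
        df["Energy burden_final"], df["Energy burden_source"], equal_nan=True
    )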