From 0808e8a1708b09c609e74eb90a44ce4855cc22eb Mon Sep 17 00:00:00 2001 From: matt bowen Date: Tue, 2 Aug 2022 16:24:38 -0400 Subject: [PATCH] Run YAML load on all subclasses (#1518) --- data/data-pipeline/data_pipeline/etl/base.py | 79 ++++++++++--------- .../etl/score/config/datasets.yml | 16 +++- .../sources/child_opportunity_index/etl.py | 1 - .../etl/sources/doe_energy_burden/etl.py | 2 - .../etl/sources/national_risk_index/etl.py | 3 - 5 files changed, 56 insertions(+), 45 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 21f008ccc..d93721910 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -98,48 +98,51 @@ class ExtractTransformLoad: # It is used on the "load" base class method output_df: pd.DataFrame = None + def __init_subclass__(cls) -> None: + cls.DATASET_CONFIG = cls.yaml_config_load() + @classmethod def yaml_config_load(cls) -> dict: """Generate config dictionary and set instance variables from YAML dataset.""" - - # check if the class instance has score YAML definitions - datasets_config = load_yaml_dict_from_file( - cls.DATASET_CONFIG / "datasets.yml", - DatasetsConfig, - ) - - # get the config for this dataset - try: - dataset_config = next( - item - for item in datasets_config.get("datasets") - if item["module_name"] == cls.NAME + if cls.NAME is not None: + # check if the class instance has score YAML definitions + datasets_config = load_yaml_dict_from_file( + cls.DATASET_CONFIG / "datasets.yml", + DatasetsConfig, ) - except StopIteration: - # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope. 
- logger.error( - f"Exception encountered while extracting dataset config for dataset {cls.NAME}" - ) - sys.exit() - - # set some of the basic fields - cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[ - "input_geoid_tract_field_name" - ] - - # get the columns to write on the CSV - # and set the constants - cls.COLUMNS_TO_KEEP = [ - cls.GEOID_TRACT_FIELD_NAME, # always index with geoid tract id - ] - for field in dataset_config["load_fields"]: - cls.COLUMNS_TO_KEEP.append(field["long_name"]) - - # set the constants for the class - setattr(cls, field["df_field_name"], field["long_name"]) - - # return the config dict - return dataset_config + + # get the config for this dataset + try: + dataset_config = next( + item + for item in datasets_config.get("datasets") + if item["module_name"] == cls.NAME + ) + except StopIteration: + # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope. + logger.error( + f"Exception encountered while extracting dataset config for dataset {cls.NAME}" + ) + sys.exit() + + # set some of the basic fields + cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[ + "input_geoid_tract_field_name" + ] + + # get the columns to write on the CSV + # and set the constants + cls.COLUMNS_TO_KEEP = [ + cls.GEOID_TRACT_FIELD_NAME, # always index with geoid tract id + ] + for field in dataset_config["load_fields"]: + cls.COLUMNS_TO_KEEP.append(field["long_name"]) + + # set the constants for the class + setattr(cls, field["df_field_name"], field["long_name"]) + + # return the config dict + return dataset_config # This is a classmethod so it can be used by `get_data_frame` without # needing to create an instance of the class. This is a use case in `etl_score`. 
diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml index db0c53c21..a53018d8a 100644 --- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml +++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml @@ -129,4 +129,18 @@ datasets: include_in_csv: true include_in_excel: true column_position: 1 - \ No newline at end of file + - long_name: "Example ETL" + short_name: "Example" + module_name: "example_dataset" + description: "An example dataset for documentation" + input_geoid_tract_field_name: "GEOID10_TRACT" + load_fields: + - short_name: "EXAMPLE_FIELD" + df_field_name: "Input Field 1" + long_name: "Example Field 1" + field_type: float + include_in_tiles: true + include_in_csv: true + include_in_excel: true + column_position: 1 + diff --git a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py index 6f55458c9..beace4205 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py @@ -38,7 +38,6 @@ class ChildOpportunityIndex(ExtractTransformLoad): READING_FIELD: str def __init__(self): - self.DATASET_CONFIG = super().yaml_config_load() self.SOURCE_URL = ( "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-" "3a0ededa30a0?format=csv" ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py index 6250aaff9..0f67c4023 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py @@ -19,8 +19,6 @@ class DOEEnergyBurden(ExtractTransformLoad): REVISED_ENERGY_BURDEN_FIELD_NAME: str def __init__(self): - self.DATASET_CONFIG = 
super().yaml_config_load() - self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "doe_energy_burden" ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index 5b14d79b8..0b7ff12eb 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -33,9 +33,6 @@ class NationalRiskIndexETL(ExtractTransformLoad): AGRIVALUE_LOWER_BOUND = 408000 def __init__(self): - # load YAML config - self.DATASET_CONFIG = super().yaml_config_load() - # define the full path for the input CSV file self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"