Skip to content

Commit

Permalink
Run YAML load on all subclasses (#1518)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Aug 10, 2022
1 parent e19f31b commit 0808e8a
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 45 deletions.
79 changes: 41 additions & 38 deletions data/data-pipeline/data_pipeline/etl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,48 +98,51 @@ class ExtractTransformLoad:
# It is used on the "load" base class method
output_df: pd.DataFrame = None

def __init_subclass__(cls) -> None:
cls.DATASET_CONFIG = cls.yaml_config_load()

@classmethod
def yaml_config_load(cls) -> dict:
"""Generate config dictionary and set instance variables from YAML dataset."""

# check if the class instance has score YAML definitions
datasets_config = load_yaml_dict_from_file(
cls.DATASET_CONFIG / "datasets.yml",
DatasetsConfig,
)

# get the config for this dataset
try:
dataset_config = next(
item
for item in datasets_config.get("datasets")
if item["module_name"] == cls.NAME
if cls.NAME is not None:
# check if the class instance has score YAML definitions
datasets_config = load_yaml_dict_from_file(
cls.DATASET_CONFIG / "datasets.yml",
DatasetsConfig,
)
except StopIteration:
# Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
logger.error(
f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
)
sys.exit()

# set some of the basic fields
cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
"input_geoid_tract_field_name"
]

# get the columns to write on the CSV
# and set the constants
cls.COLUMNS_TO_KEEP = [
cls.GEOID_TRACT_FIELD_NAME, # always index with geoid tract id
]
for field in dataset_config["load_fields"]:
cls.COLUMNS_TO_KEEP.append(field["long_name"])

# set the constants for the class
setattr(cls, field["df_field_name"], field["long_name"])

# return the config dict
return dataset_config

# get the config for this dataset
try:
dataset_config = next(
item
for item in datasets_config.get("datasets")
if item["module_name"] == cls.NAME
)
except StopIteration:
# Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
logger.error(
f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
)
sys.exit()

# set some of the basic fields
cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
"input_geoid_tract_field_name"
]

# get the columns to write on the CSV
# and set the constants
cls.COLUMNS_TO_KEEP = [
cls.GEOID_TRACT_FIELD_NAME, # always index with geoid tract id
]
for field in dataset_config["load_fields"]:
cls.COLUMNS_TO_KEEP.append(field["long_name"])

# set the constants for the class
setattr(cls, field["df_field_name"], field["long_name"])

# return the config dict
return dataset_config

# This is a classmethod so it can be used by `get_data_frame` without
# needing to create an instance of the class. This is a use case in `etl_score`.
Expand Down
16 changes: 15 additions & 1 deletion data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,18 @@ datasets:
include_in_csv: true
include_in_excel: true
column_position: 1

- long_name: "Example ETL"
short_name: "Example"
module_name: "example_dataset"
description: "An example dataset for documentation"
input_geoid_tract_field_name: "GEOID10_TRACT"
load_fields:
- short_name: "EXAMPLE_FIELD"
df_field_name: "Input Field 1"
long_name: "Example Field 1"
field_type: float
include_in_tiles: true
include_in_csv: true
include_in_excel: true
column_position: 1

Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ class ChildOpportunityIndex(ExtractTransformLoad):
READING_FIELD: str

def __init__(self):
self.DATASET_CONFIG = super().yaml_config_load()
self.SOURCE_URL = (
"https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
"3a0ededa30a0?format=csv"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
REVISED_ENERGY_BURDEN_FIELD_NAME: str

def __init__(self):
self.DATASET_CONFIG = super().yaml_config_load()

self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "doe_energy_burden"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
AGRIVALUE_LOWER_BOUND = 408000

def __init__(self):
# load YAML config
self.DATASET_CONFIG = super().yaml_config_load()

# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"

Expand Down

0 comments on commit 0808e8a

Please sign in to comment.