From fcdebb2cd39503167593f71c8d63e2af19668740 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Thu, 9 Jan 2025 09:59:00 -0500 Subject: [PATCH 1/5] add censusfips metadata --- src/pudl/metadata/sources.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/pudl/metadata/sources.py b/src/pudl/metadata/sources.py index 3e0309fd0..880304a29 100644 --- a/src/pudl/metadata/sources.py +++ b/src/pudl/metadata/sources.py @@ -24,6 +24,21 @@ "license_pudl": LICENSES["cc-by-4.0"], "contributors": [CONTRIBUTORS["catalyst-cooperative"]], }, + "censusfips": { + "title": "Population Estimates FIPS Codes", + "path": "https://www.census.gov/geographies/reference-files/2023/demo/popest/2023-fips.html", + "description": "Reference files for Federal Information Processing Series (FIPS) Geographic Codes.", + "working_partitions": {"years": sorted(set(range(2011, 2024)))}, + "keywords": sorted( + { + "fips", + "census", + } + ), + "license_raw": LICENSES["us-govt"], + "license_pudl": LICENSES["cc-by-4.0"], + "contributors": [CONTRIBUTORS["catalyst-cooperative"]], + }, "eia176": { "title": "EIA Form 176 -- Annual Report of Natural and Supplemental Gas Supply and Disposition", "path": "https://www.eia.gov/naturalgas/ngqs/", From 3b14928a4522056e963c1fcf6f3b2db6a54441d6 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Thu, 9 Jan 2025 14:50:08 -0500 Subject: [PATCH 2/5] change fips dataset name --- src/pudl/metadata/sources.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/pudl/metadata/sources.py b/src/pudl/metadata/sources.py index 9368dac12..1227c66fc 100644 --- a/src/pudl/metadata/sources.py +++ b/src/pudl/metadata/sources.py @@ -24,10 +24,13 @@ "license_pudl": LICENSES["cc-by-4.0"], "contributors": [CONTRIBUTORS["catalyst-cooperative"]], }, - "censusfips": { - "title": "Population Estimates FIPS Codes", + "censuspep": { + "title": "Population Estimates Program's (PEP) Federal Information Processing Series (FIPS) Codes", "path": "https://www.census.gov/geographies/reference-files/2023/demo/popest/2023-fips.html", - "description": "Reference files for Federal Information Processing Series (FIPS) Geographic Codes.", + "description": ( + "Reference files for Federal Information Processing Series (FIPS) Geographic Codes. " + "These FIPS Codes are a subset of a broader Population Estimates dataset." + ), "working_partitions": {"years": sorted(set(range(2011, 2024)))}, "keywords": sorted( { From e38b4220c5f4f98ed61b979fd465ab85acedf9db Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Tue, 14 Jan 2025 13:09:33 -0700 Subject: [PATCH 3/5] Update src/pudl/metadata/sources.py Co-authored-by: E. Belfer <37471869+e-belfer@users.noreply.github.com> --- src/pudl/metadata/sources.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pudl/metadata/sources.py b/src/pudl/metadata/sources.py index 1227c66fc..7404561d7 100644 --- a/src/pudl/metadata/sources.py +++ b/src/pudl/metadata/sources.py @@ -36,6 +36,9 @@ { "fips", "census", + "county", + "state", + "geography", } ), "license_raw": LICENSES["us-govt"], From cfa402ecec62effe36a7eb84b7e8d24ffcfa1a5e Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Tue, 14 Jan 2025 15:03:53 -0500 Subject: [PATCH 4/5] Add extractor - currently using sandbox doi --- src/pudl/extract/__init__.py | 1 + src/pudl/extract/censuspep.py | 29 +++++++++++++++++++ src/pudl/metadata/sources.py | 2 +- src/pudl/package_data/censuspep/__init__.py | 1 + .../censuspep/column_maps/__init__.py | 1 + .../censuspep/column_maps/geocodes.csv | 8 +++++ src/pudl/package_data/censuspep/file_map.csv | 2 ++ src/pudl/package_data/censuspep/page_map.csv | 2 ++ .../package_data/censuspep/skipfooter.csv | 2 ++ src/pudl/package_data/censuspep/skiprows.csv | 2 ++ src/pudl/workspace/datastore.py | 1 + 11 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 src/pudl/extract/censuspep.py create mode 100644 src/pudl/package_data/censuspep/__init__.py create mode 100644 src/pudl/package_data/censuspep/column_maps/__init__.py create mode 100644 src/pudl/package_data/censuspep/column_maps/geocodes.csv create mode 100644 src/pudl/package_data/censuspep/file_map.csv create mode 100644 src/pudl/package_data/censuspep/page_map.csv create mode 100644 src/pudl/package_data/censuspep/skipfooter.csv create mode 100644 src/pudl/package_data/censuspep/skiprows.csv diff --git a/src/pudl/extract/__init__.py b/src/pudl/extract/__init__.py index ebecf594b..59f159d7d 100644 --- a/src/pudl/extract/__init__.py +++ b/src/pudl/extract/__init__.py @@ -9,6 +9,7 @@ """ from . import ( + censuspep, eia176, eia191, eia757a, diff --git a/src/pudl/extract/censuspep.py b/src/pudl/extract/censuspep.py new file mode 100644 index 000000000..029788d85 --- /dev/null +++ b/src/pudl/extract/censuspep.py @@ -0,0 +1,29 @@ +"""Retrieve data from Census PEP spreadsheets.""" + +import pudl +import pudl.logging_helpers +from pudl.extract import excel + +logger = pudl.logging_helpers.get_logger(__name__) + + +class Extractor(excel.ExcelExtractor): + """Extractor for the excel dataset Census PEP FIPS Codes.""" + + def __init__(self, *args, **kwargs): + """Initialize the module. + + Args: + ds (:class:datastore.Datastore): Initialized datastore. + """ + self.METADATA = excel.ExcelMetadata("censuspep") + self.cols_added = [] + super().__init__(*args, **kwargs) + + def process_raw(self, df, page, **partition): + """Apply necessary pre-processing to the dataframe.""" + df = df.rename(columns=self._metadata.get_column_map(page, **partition)) + if "report_year" not in df.columns: + df["report_year"] = list(partition.values())[0] + self.cols_added = ["report_year"] + return df diff --git a/src/pudl/metadata/sources.py b/src/pudl/metadata/sources.py index 916921138..5df368ebd 100644 --- a/src/pudl/metadata/sources.py +++ b/src/pudl/metadata/sources.py @@ -31,7 +31,7 @@ "Reference files for Federal Information Processing Series (FIPS) Geographic Codes. " "These FIPS Codes are a subset of a broader Population Estimates dataset." ), - "working_partitions": {"years": sorted(set(range(2011, 2024)))}, + "working_partitions": {"years": [2023]}, "keywords": sorted( { "fips", diff --git a/src/pudl/package_data/censuspep/__init__.py b/src/pudl/package_data/censuspep/__init__.py new file mode 100644 index 000000000..e740653c6 --- /dev/null +++ b/src/pudl/package_data/censuspep/__init__.py @@ -0,0 +1 @@ +"""Excel spreadsheet extraction maps for EIA 860.""" diff --git a/src/pudl/package_data/censuspep/column_maps/__init__.py b/src/pudl/package_data/censuspep/column_maps/__init__.py new file mode 100644 index 000000000..eff9fda13 --- /dev/null +++ b/src/pudl/package_data/censuspep/column_maps/__init__.py @@ -0,0 +1 @@ +"""Metadata linking semantic meaning of EIA 860 spreadsheet columns across years.""" diff --git a/src/pudl/package_data/censuspep/column_maps/geocodes.csv b/src/pudl/package_data/censuspep/column_maps/geocodes.csv new file mode 100644 index 000000000..e766a4771 --- /dev/null +++ b/src/pudl/package_data/censuspep/column_maps/geocodes.csv @@ -0,0 +1,8 @@ +year_index,2023 +fips_level,summary_level +state_id_fips,state_fips_code +county_id_fips,county_fips_code +county_subdivision_fips,county_subdivision_fips_code +place_fips,place_fips_code +consolidated_city_fips,consolidated_city_fips_code +area_name,area_name diff --git a/src/pudl/package_data/censuspep/file_map.csv b/src/pudl/package_data/censuspep/file_map.csv new file mode 100644 index 000000000..1a27650e7 --- /dev/null +++ b/src/pudl/package_data/censuspep/file_map.csv @@ -0,0 +1,2 @@ +page,2023 +geocodes,all-geocodes-v2023.xlsx diff --git a/src/pudl/package_data/censuspep/page_map.csv b/src/pudl/package_data/censuspep/page_map.csv new file mode 100644 index 000000000..44d5998f8 --- /dev/null +++ b/src/pudl/package_data/censuspep/page_map.csv @@ -0,0 +1,2 @@ +year_index,2023 +geocodes,0 diff --git a/src/pudl/package_data/censuspep/skipfooter.csv b/src/pudl/package_data/censuspep/skipfooter.csv new file mode 100644 index 000000000..44d5998f8 --- /dev/null +++ b/src/pudl/package_data/censuspep/skipfooter.csv @@ -0,0 +1,2 @@ +year_index,2023 +geocodes,0 diff --git a/src/pudl/package_data/censuspep/skiprows.csv b/src/pudl/package_data/censuspep/skiprows.csv new file mode 100644 index 000000000..9cca9debb --- /dev/null +++ b/src/pudl/package_data/censuspep/skiprows.csv @@ -0,0 +1,2 @@ +year_index,2023 +geocodes,4 diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index 725d098ad..4cc1f6684 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -187,6 +187,7 @@ class ZenodoDoiSettings(BaseSettings): """Digital Object Identifiers pointing to currently used Zenodo archives.""" censusdp1tract: ZenodoDoi = "10.5281/zenodo.4127049" + censuspep: ZenodoDoi = "10.5281/zenodo.14624611" # "10.5072/zenodo.151369" eia176: ZenodoDoi = "10.5281/zenodo.14589676" eia191: ZenodoDoi = "10.5281/zenodo.10607837" eia757a: ZenodoDoi = "10.5281/zenodo.10607839" From 932ab34631d9bd4ab61c514c3a33a14f17a7cf53 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 17 Jan 2025 08:20:11 -0500 Subject: [PATCH 5/5] remove the extraction step from this pr --- src/pudl/extract/__init__.py | 1 - src/pudl/extract/censuspep.py | 29 ------------------- src/pudl/package_data/censuspep/__init__.py | 1 - .../censuspep/column_maps/__init__.py | 1 - .../censuspep/column_maps/geocodes.csv | 8 ----- src/pudl/package_data/censuspep/file_map.csv | 2 -- src/pudl/package_data/censuspep/page_map.csv | 2 -- .../package_data/censuspep/skipfooter.csv | 2 -- src/pudl/package_data/censuspep/skiprows.csv | 2 -- src/pudl/workspace/datastore.py | 1 - 10 files changed, 49 deletions(-) delete mode 100644 src/pudl/extract/censuspep.py delete mode 100644 src/pudl/package_data/censuspep/__init__.py delete mode 100644 src/pudl/package_data/censuspep/column_maps/__init__.py delete mode 100644 src/pudl/package_data/censuspep/column_maps/geocodes.csv delete mode 100644 src/pudl/package_data/censuspep/file_map.csv delete mode 100644 src/pudl/package_data/censuspep/page_map.csv delete mode 100644 src/pudl/package_data/censuspep/skipfooter.csv delete mode 100644 src/pudl/package_data/censuspep/skiprows.csv diff --git a/src/pudl/extract/__init__.py b/src/pudl/extract/__init__.py index 59f159d7d..ebecf594b 100644 --- a/src/pudl/extract/__init__.py +++ b/src/pudl/extract/__init__.py @@ -9,7 +9,6 @@ """ from . import ( - censuspep, eia176, eia191, eia757a, diff --git a/src/pudl/extract/censuspep.py b/src/pudl/extract/censuspep.py deleted file mode 100644 index 029788d85..000000000 --- a/src/pudl/extract/censuspep.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Retrieve data from Census PEP spreadsheets.""" - -import pudl -import pudl.logging_helpers -from pudl.extract import excel - -logger = pudl.logging_helpers.get_logger(__name__) - - -class Extractor(excel.ExcelExtractor): - """Extractor for the excel dataset Census PEP FIPS Codes.""" - - def __init__(self, *args, **kwargs): - """Initialize the module. - - Args: - ds (:class:datastore.Datastore): Initialized datastore. - """ - self.METADATA = excel.ExcelMetadata("censuspep") - self.cols_added = [] - super().__init__(*args, **kwargs) - - def process_raw(self, df, page, **partition): - """Apply necessary pre-processing to the dataframe.""" - df = df.rename(columns=self._metadata.get_column_map(page, **partition)) - if "report_year" not in df.columns: - df["report_year"] = list(partition.values())[0] - self.cols_added = ["report_year"] - return df diff --git a/src/pudl/package_data/censuspep/__init__.py b/src/pudl/package_data/censuspep/__init__.py deleted file mode 100644 index e740653c6..000000000 --- a/src/pudl/package_data/censuspep/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Excel spreadsheet extraction maps for EIA 860.""" diff --git a/src/pudl/package_data/censuspep/column_maps/__init__.py b/src/pudl/package_data/censuspep/column_maps/__init__.py deleted file mode 100644 index eff9fda13..000000000 --- a/src/pudl/package_data/censuspep/column_maps/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Metadata linking semantic meaning of EIA 860 spreadsheet columns across years.""" diff --git a/src/pudl/package_data/censuspep/column_maps/geocodes.csv b/src/pudl/package_data/censuspep/column_maps/geocodes.csv deleted file mode 100644 index e766a4771..000000000 --- a/src/pudl/package_data/censuspep/column_maps/geocodes.csv +++ /dev/null @@ -1,8 +0,0 @@ -year_index,2023 -fips_level,summary_level -state_id_fips,state_fips_code -county_id_fips,county_fips_code -county_subdivision_fips,county_subdivision_fips_code -place_fips,place_fips_code -consolidated_city_fips,consolidated_city_fips_code -area_name,area_name diff --git a/src/pudl/package_data/censuspep/file_map.csv b/src/pudl/package_data/censuspep/file_map.csv deleted file mode 100644 index 1a27650e7..000000000 --- a/src/pudl/package_data/censuspep/file_map.csv +++ /dev/null @@ -1,2 +0,0 @@ -page,2023 -geocodes,all-geocodes-v2023.xlsx diff --git a/src/pudl/package_data/censuspep/page_map.csv b/src/pudl/package_data/censuspep/page_map.csv deleted file mode 100644 index 44d5998f8..000000000 --- a/src/pudl/package_data/censuspep/page_map.csv +++ /dev/null @@ -1,2 +0,0 @@ -year_index,2023 -geocodes,0 diff --git a/src/pudl/package_data/censuspep/skipfooter.csv b/src/pudl/package_data/censuspep/skipfooter.csv deleted file mode 100644 index 44d5998f8..000000000 --- a/src/pudl/package_data/censuspep/skipfooter.csv +++ /dev/null @@ -1,2 +0,0 @@ -year_index,2023 -geocodes,0 diff --git a/src/pudl/package_data/censuspep/skiprows.csv b/src/pudl/package_data/censuspep/skiprows.csv deleted file mode 100644 index 9cca9debb..000000000 --- a/src/pudl/package_data/censuspep/skiprows.csv +++ /dev/null @@ -1,2 +0,0 @@ -year_index,2023 -geocodes,4 diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index 4cc1f6684..725d098ad 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -187,7 +187,6 @@ class ZenodoDoiSettings(BaseSettings): """Digital Object Identifiers pointing to currently used Zenodo archives.""" censusdp1tract: ZenodoDoi = "10.5281/zenodo.4127049" - censuspep: ZenodoDoi = "10.5281/zenodo.14624611" # "10.5072/zenodo.151369" eia176: ZenodoDoi = "10.5281/zenodo.14589676" eia191: ZenodoDoi = "10.5281/zenodo.10607837" eia757a: ZenodoDoi = "10.5281/zenodo.10607839"