From 7eaf63e8a30e9e57d52e8a3b4eb085dba5a692f6 Mon Sep 17 00:00:00 2001 From: Cecilie Seim <68303562+tilen1976@users.noreply.github.com> Date: Mon, 29 Jan 2024 10:21:01 +0100 Subject: [PATCH] Implement derivation of SSB-uniquely formatted dates (#137) * Create class SsbDateFormat for SSB special formats with start and end months for each instance of SsbDateFormat type. Add return type SsbDateFormat on function categorize_period_string. Add function convert_ssb_period for handling SsbDateFormat objects, returning a string with year and month. Change methods contains_data_from and contains_data_until to handle SsbDateFormat. Add testcases in test_dapla_dataset_path_info. * Add doctests for convert_ssb_period. * Add doctests for SSB date formats to categorize_period_string * Add docstring for SSB date format to contains_data_from and contains_data_until * Correct half-year months and type hint supported_date_formats --- .../backend/dapla_dataset_path_info.py | 183 +++++++++++++++++- tests/backend/test_dapla_dataset_path_info.py | 30 +++ 2 files changed, 211 insertions(+), 2 deletions(-) diff --git a/src/datadoc/backend/dapla_dataset_path_info.py b/src/datadoc/backend/dapla_dataset_path_info.py index c9aea105..e6b397d8 100644 --- a/src/datadoc/backend/dapla_dataset_path_info.py +++ b/src/datadoc/backend/dapla_dataset_path_info.py @@ -49,15 +49,120 @@ class IsoDateFormat: timeframe="week", ) -SUPPORTED_DATE_FORMATS = [ + +@dataclass +class SsbDateFormat: + """An date format with relevant patterns for SSB special date formats.""" + + name: str + regex_pattern: str + arrow_pattern: str + time_frame: dict + + +SSB_BIMESTER = SsbDateFormat( + name="SSB_BIMESTER", + regex_pattern=r"\d{4}[B]\d{1}$", + arrow_pattern="YYYYMM", + time_frame={ + "B1": { + "start": "01", + "end": "02", + }, + "B2": { + "start": "03", + "end": "04", + }, + "B3": { + "start": "05", + "end": "06", + }, + "B4": { + "start": "07", + "end": "08", + }, + "B5": { + "start": "09", + "end": "10", + }, + "B6": { + "start": "11", +
"end": "12", + }, + }, +) + +SSB_QUARTERLY = SsbDateFormat( + name="SSB_QUARTERLY", + regex_pattern=r"\d{4}[Q]\d{1}$", + arrow_pattern="YYYYMM", + time_frame={ + "Q1": { + "start": "01", + "end": "03", + }, + "Q2": { + "start": "04", + "end": "06", + }, + "Q3": { + "start": "07", + "end": "09", + }, + "Q4": { + "start": "10", + "end": "12", + }, + }, +) +SSB_TRIANNUAL = SsbDateFormat( + name="SSB_TRIANNUAL", + regex_pattern=r"\d{4}[T]\d{1}$", + arrow_pattern="YYYYMM", + time_frame={ + "T1": { + "start": "01", + "end": "04", + }, + "T2": { + "start": "05", + "end": "08", + }, + "T3": { + "start": "09", + "end": "12", + }, + }, +) +SSB_HALF_YEAR = SsbDateFormat( + name="SSB_HALF_YEAR", + regex_pattern=r"\d{4}[H]\d{1}$", + arrow_pattern="YYYYMM", + time_frame={ + "H1": { + "start": "01", + "end": "06", + }, + "H2": { + "start": "07", + "end": "12", + }, + }, +) + +SUPPORTED_DATE_FORMATS: list[IsoDateFormat | SsbDateFormat] = [ ISO_YEAR, ISO_YEAR_MONTH, ISO_YEAR_MONTH_DAY, ISO_YEAR_WEEK, + SSB_BIMESTER, + SSB_QUARTERLY, + SSB_TRIANNUAL, + SSB_HALF_YEAR, ] -def categorize_period_string(period: str) -> IsoDateFormat: +def categorize_period_string(period: str) -> IsoDateFormat | SsbDateFormat: """Categorize a period string into one of the supported date formats. If the period string is not recognized, a NotImplementedError is raised. @@ -71,6 +176,22 @@ def categorize_period_string(period: str) -> IsoDateFormat: >>> date_format.name ISO_YEAR_WEEK + >>> date_format = categorize_period_string('2022B1') + >>> date_format.name + SSB_BIMESTER + + >>> date_format = categorize_period_string('1980Q3') + >>> date_format.name + SSB_QUARTERLY + + >>> date_format = categorize_period_string('1954T2') + >>> date_format.name + SSB_TRIANNUAL + + >>> date_format = categorize_period_string('1876H1') + >>> date_format.name + SSB_HALF_YEAR + >>> categorize_period_string('unknown format') Traceback (most recent call last): ... 
@@ -86,6 +207,50 @@ def categorize_period_string(period: str) -> IsoDateFormat: ) +def convert_ssb_period( + period_string: str, + period_type: str, + date_format: SsbDateFormat, +) -> str: + """Convert ssb-format for bimester, quarterly, triannual and half year to start and end months. + + Usage-examples: + >>> ssb_bimester_period_start = convert_ssb_period("2022B1","start",SSB_BIMESTER) + >>> ssb_bimester_period_start + 202201 + + >>> ssb_bimester_period_end = convert_ssb_period("2022B1","end",SSB_BIMESTER) + >>> ssb_bimester_period_end + 202202 + + >>> ssb_quarterly_period_start = convert_ssb_period("2015Q3","start",SSB_QUARTERLY) + >>> ssb_quarterly_period_start + 201507 + + >>> ssb_quarterly_period_end = convert_ssb_period("2015Q3","end",SSB_QUARTERLY) + >>> ssb_quarterly_period_end + 201509 + + >>> ssb_triannual_period_start = convert_ssb_period("1998T2","start",SSB_TRIANNUAL) + >>> ssb_triannual_period_start + 199805 + + >>> ssb_quarterly_period_end = convert_ssb_period("1998T2","end",SSB_TRIANNUAL) + >>> ssb_quarterly_period_end + 199808 + + >>> ssb_half_year_period_start = convert_ssb_period("1898H2","start",SSB_HALF_YEAR) + >>> ssb_half_year_period_start + 189807 + + >>> ssb_half_year_period_end = convert_ssb_period("1898H2","end",SSB_HALF_YEAR) + >>> ssb_half_year_period_end + 189812 + + """ + return period_string[:4] + date_format.time_frame[period_string[-2:]][period_type] + + class DaplaDatasetPathInfo: """Extract info from a path following SSB's dataset naming convention.""" @@ -132,6 +297,15 @@ def _extract_period_strings(dataset_name_sections: list[str]) -> list[str]: def contains_data_from(self) -> datetime.date: """The earliest date from which data in the dataset is relevant for.""" date_format = categorize_period_string(self.first_period_string) + if isinstance(date_format, SsbDateFormat): + """If dateformat is SSB date format return start month of ssb period.""" + period = convert_ssb_period( + self.first_period_string, + "start", + 
date_format, + ) + return arrow.get(period, date_format.arrow_pattern).floor("month").date() + return ( arrow.get(self.first_period_string, date_format.arrow_pattern) .floor(date_format.timeframe) @@ -143,6 +317,11 @@ def contains_data_until(self) -> datetime.date: """The latest date until which data in the dataset is relevant for.""" period_string = self.second_period_string or self.first_period_string date_format = categorize_period_string(period_string) + if isinstance(date_format, SsbDateFormat): + """If dateformat is SSB date format return end month of ssb period.""" + period = convert_ssb_period(period_string, "end", date_format) + return arrow.get(period, date_format.arrow_pattern).ceil("month").date() + return ( arrow.get(period_string, date_format.arrow_pattern) .ceil(date_format.timeframe) diff --git a/tests/backend/test_dapla_dataset_path_info.py b/tests/backend/test_dapla_dataset_path_info.py index a882ac09..b49047c8 100644 --- a/tests/backend/test_dapla_dataset_path_info.py +++ b/tests/backend/test_dapla_dataset_path_info.py @@ -51,6 +51,36 @@ class DatasetPathTestCase: expected_contains_data_from=datetime.date(1981, 12, 21), expected_contains_data_until=datetime.date(1981, 12, 27), ), + DatasetPathTestCase( + path="personinntekt_p2022H1_v1.parquet", + expected_contains_data_from=datetime.date(2022, 1, 1), + expected_contains_data_until=datetime.date(2022, 6, 30), + ), + DatasetPathTestCase( + path="nybilreg_p2022T1_v1.parquet", + expected_contains_data_from=datetime.date(2022, 1, 1), + expected_contains_data_until=datetime.date(2022, 4, 30), + ), + DatasetPathTestCase( + path="varehandel_p2018Q1_p2018Q4_v1.parquet", + expected_contains_data_from=datetime.date(2018, 1, 1), + expected_contains_data_until=datetime.date(2018, 12, 31), + ), + DatasetPathTestCase( + path="pensjon_p2018Q1_v1.parquet", + expected_contains_data_from=datetime.date(2018, 1, 1), + expected_contains_data_until=datetime.date(2018, 3, 31), + ), + DatasetPathTestCase( + 
path="skipsanloep_p2021B2_v1.parquet", + expected_contains_data_from=datetime.date(2021, 3, 1), + expected_contains_data_until=datetime.date(2021, 4, 30), + ), + DatasetPathTestCase( + path="skipsanloep_p2022B1_v1.parquet", + expected_contains_data_from=datetime.date(2022, 1, 1), + expected_contains_data_until=datetime.date(2022, 2, 28), + ), ]