From 52db8835451b5a62044ab070893803e4cac61353 Mon Sep 17 00:00:00 2001 From: Cecilie Seim <68303562+tilen1976@users.noreply.github.com> Date: Thu, 1 Feb 2024 12:40:56 +0100 Subject: [PATCH] Refactor date format classes (#146) * Test set-up for new date format class * IsoDateFormat class inherits from DateFormat - methods get_floor and get_ceil return date - tests ok * Add test for new date classes * Handle KeyError in SsbDateFormat class - return None * Add doctests and docstring * Remove comment * Remove unnecessary doctests * Correct name and structure * Fix parameters in test * Make DateFormat abstract with abstract methods * Formatting --- .../backend/dapla_dataset_path_info.py | 219 ++++++++---------- tests/backend/test_dapla_dataset_path_info.py | 56 +++++ 2 files changed, 154 insertions(+), 121 deletions(-) diff --git a/src/datadoc/backend/dapla_dataset_path_info.py b/src/datadoc/backend/dapla_dataset_path_info.py index 34293e30..0f5ee7fb 100644 --- a/src/datadoc/backend/dapla_dataset_path_info.py +++ b/src/datadoc/backend/dapla_dataset_path_info.py @@ -3,6 +3,8 @@ import pathlib import re +from abc import ABC +from abc import abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING from typing import Final @@ -16,17 +18,61 @@ if TYPE_CHECKING: import datetime import os + from datetime import date @dataclass -class IsoDateFormat: - """An ISO date format with relevant patterns.""" +class DateFormat(ABC): + """A super class for date formats.""" name: str regex_pattern: str arrow_pattern: str timeframe: Literal["year", "month", "day", "week"] + @abstractmethod + def get_floor(self, period_string: str) -> date | None: + """Return first date of timeframe period.""" + + @abstractmethod + def get_ceil(self, period_string: str) -> date | None: + """Return last date of timeframe period.""" + + +@dataclass +class IsoDateFormat(DateFormat): + """A subclass of DateFormat with relevant patterns for ISO dates.""" + + def get_floor(self, 
period_string: str) -> date | None: + """Return first date of timeframe period. + + >>> ISO_YEAR_MONTH.get_floor("1980-08") + datetime.date(1980, 8, 1) + + >>> ISO_YEAR.get_floor("2021") + datetime.date(2021, 1, 1) + + >>> SSB_BIMESTER.get_floor("2003B4") + datetime.date(2003, 7, 1) + + """ + return arrow.get(period_string, self.arrow_pattern).floor(self.timeframe).date() + + def get_ceil(self, period_string: str) -> date | None: + """Return last date of timeframe period. + + >>> ISO_YEAR.get_ceil("1921") + datetime.date(1921, 12, 31) + + >>> ISO_YEAR_MONTH.get_ceil("2021-05") + datetime.date(2021, 5, 31) + + >>> SSB_HALF_YEAR.get_ceil("2024H1") + datetime.date(2024, 6, 30) + + """ + return arrow.get(period_string, self.arrow_pattern).ceil(self.timeframe).date() + ISO_YEAR = IsoDateFormat( name="ISO_YEAR", @@ -55,20 +101,52 @@ class IsoDateFormat: @dataclass -class SsbDateFormat: - """An date format with relevant patterns for SSB special date formats.""" +class SsbDateFormat(DateFormat): + """A subclass of DateFormat with relevant patterns for SSB unique dates.""" - name: str - regex_pattern: str - arrow_pattern: str - time_frame: dict + ssb_dates: dict + + def get_floor(self, period_string: str) -> date | None: + """Convert SSB format to date-string and return first date. + + If the SSB format does not exist, return None + + >>> SSB_BIMESTER.get_floor("2003B8") + None + + """ + try: + year = period_string[:4] + month = self.ssb_dates[period_string[-2:]]["start"] + period = year + month + return arrow.get(period, self.arrow_pattern).floor(self.timeframe).date() + except KeyError: + return None + + def get_ceil(self, period_string: str) -> date | None: + """Convert SSB format to date-string and return last date. 
+ + If the SSB format does not exist, return None + + >>> SSB_TRIANNUAL.get_ceil("1999T11") + None + + """ + try: + year = period_string[:4] + month = self.ssb_dates[period_string[-2:]]["end"] + period = year + month + return arrow.get(period, self.arrow_pattern).ceil(self.timeframe).date() + except KeyError: + return None SSB_BIMESTER = SsbDateFormat( name="SSB_BIMESTER", regex_pattern=r"\d{4}[B]\d{1}$", arrow_pattern="YYYYMM", - time_frame={ + timeframe="month", + ssb_dates={ "B1": { "start": "01", "end": "02", @@ -100,7 +178,8 @@ class SsbDateFormat: name="SSB_QUARTERLY", regex_pattern=r"\d{4}[Q]\d{1}$", arrow_pattern="YYYYMM", - time_frame={ + timeframe="month", + ssb_dates={ "Q1": { "start": "01", "end": "03", @@ -119,11 +198,13 @@ class SsbDateFormat: }, }, ) + SSB_TRIANNUAL = SsbDateFormat( name="SSB_TRIANNUAL", regex_pattern=r"\d{4}[T]\d{1}$", arrow_pattern="YYYYMM", - time_frame={ + timeframe="month", + ssb_dates={ "T1": { "start": "01", "end": "04", @@ -142,7 +223,8 @@ class SsbDateFormat: name="SSB_HALF_YEAR", regex_pattern=r"\d{4}[H]\d{1}$", arrow_pattern="YYYYMM", - time_frame={ + timeframe="month", + ssb_dates={ "H1": { "start": "01", "end": "06", @@ -172,34 +254,14 @@ def categorize_period_string(period: str) -> IsoDateFormat | SsbDateFormat: If the period string is not recognized, a NotImplementedError is raised. 
Examples: - >>> date_format = categorize_period_string('2022') - >>> date_format.name - ISO_YEAR - >>> date_format = categorize_period_string('2022-W01') >>> date_format.name ISO_YEAR_WEEK - >>> date_format = categorize_period_string('2022B1') - >>> date_format.name - SSB_BIMESTER - - >>> date_format = categorize_period_string('1980Q3') - >>> date_format.name - SSB_QUARTERLY - >>> date_format = categorize_period_string('1954T2') >>> date_format.name SSB_TRIANNUAL - >>> date_format = categorize_period_string('1876H1') - >>> date_format.name - SSB_HALF_YEAR - - >>> date_format = categorize_period_string('1876H5') # Not valid SSB date format, number is out of range - >>> date_format.name - SSB_HALF_YEAR - >>> categorize_period_string('unknown format') Traceback (most recent call last): ... @@ -215,61 +277,6 @@ def categorize_period_string(period: str) -> IsoDateFormat | SsbDateFormat: ) -def convert_ssb_period( - period_string: str, - period_type: str, - date_format: SsbDateFormat, -) -> str | None: - """Convert ssb-format for bimester, quarterly, triannual and half year to start and end months. - - If invalid SSB key, the method returns None. 
- - Usage-examples: - >>> ssb_bimester_period_start = convert_ssb_period("2022B1","start",SSB_BIMESTER) - >>> ssb_bimester_period_start - 202201 - - >>> ssb_bimester_period_end = convert_ssb_period("2022B1","end",SSB_BIMESTER) - >>> ssb_bimester_period_end - 202202 - - >>> ssb_quarterly_period_start = convert_ssb_period("2015Q3","start",SSB_QUARTERLY) - >>> ssb_quarterly_period_start - 201507 - - >>> ssb_quarterly_period_end = convert_ssb_period("2015Q3","end",SSB_QUARTERLY) - >>> ssb_quarterly_period_end - 201509 - - >>> ssb_triannual_period_start = convert_ssb_period("1998T2","start",SSB_TRIANNUAL) - >>> ssb_triannual_period_start - 199805 - - >>> ssb_quarterly_period_end = convert_ssb_period("1998T2","end",SSB_TRIANNUAL) - >>> ssb_quarterly_period_end - 199808 - - >>> ssb_half_year_period_start = convert_ssb_period("1898H2","start",SSB_HALF_YEAR) - >>> ssb_half_year_period_start - 189807 - - >>> ssb_half_year_period_end = convert_ssb_period("1898H2","end",SSB_HALF_YEAR) - >>> ssb_half_year_period_end - 189812 - - >>> ssb_invalid_key = convert_ssb_period("2018Q5","start",SSB_QUARTERLY) - >>> ssb_invalid_key - None - - """ - try: - return ( - period_string[:4] + date_format.time_frame[period_string[-2:]][period_type] - ) - except KeyError: - return None - - class DaplaDatasetPathInfo: """Extract info from a path following SSB's dataset naming convention.""" @@ -321,32 +328,12 @@ def _extract_period_string_from_index(self, index: int) -> str | None: def contains_data_from(self) -> datetime.date | None: """The earliest date from which data in the dataset is relevant for.""" period_string = self._extract_period_string_from_index(0) - if ( - not period_string - or len(self._period_strings) > 1 - and period_string > self._period_strings[1] + if not period_string or ( + len(self._period_strings) > 1 and period_string > self._period_strings[1] ): return None date_format = categorize_period_string(period_string) - - if isinstance(date_format, SsbDateFormat): - """If 
dateformat is SSB date format return start month of ssb period.""" - period = convert_ssb_period( - period_string, - "start", - date_format, - ) - if period is not None: - return ( - arrow.get(period, date_format.arrow_pattern).floor("month").date() - ) - return None - - return ( - arrow.get(period_string, date_format.arrow_pattern) - .floor(date_format.timeframe) - .date() - ) + return date_format.get_floor(period_string) @property def contains_data_until(self) -> datetime.date | None: @@ -361,17 +348,7 @@ def contains_data_until(self) -> datetime.date | None: ): return None date_format = categorize_period_string(period_string) - if isinstance(date_format, SsbDateFormat): - """If dateformat is SSB date format return end month of ssb period.""" - period = convert_ssb_period(period_string, "end", date_format) - if period is not None: - return arrow.get(period, date_format.arrow_pattern).ceil("month").date() - return None - return ( - arrow.get(period_string, date_format.arrow_pattern) - .ceil(date_format.timeframe) - .date() - ) + return date_format.get_ceil(period_string) @property def dataset_state( diff --git a/tests/backend/test_dapla_dataset_path_info.py b/tests/backend/test_dapla_dataset_path_info.py index 8ed12e79..e644e406 100644 --- a/tests/backend/test_dapla_dataset_path_info.py +++ b/tests/backend/test_dapla_dataset_path_info.py @@ -8,6 +8,10 @@ import pytest +from datadoc.backend.dapla_dataset_path_info import ISO_YEAR +from datadoc.backend.dapla_dataset_path_info import ISO_YEAR_MONTH +from datadoc.backend.dapla_dataset_path_info import ISO_YEAR_MONTH_DAY +from datadoc.backend.dapla_dataset_path_info import SSB_BIMESTER from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo from datadoc.enums import DatasetState from tests.utils import TEST_PARQUET_FILEPATH @@ -222,3 +226,55 @@ def test_extract_period_info_date_until_invalid_pathname( ) -> None: dataset = DaplaDatasetPathInfo(dataset_path_name) assert dataset.contains_data_until is 
None + + +@pytest.mark.parametrize( + ("date_format", "period"), + [ + (ISO_YEAR, "1980"), + (ISO_YEAR_MONTH, "1888-11"), + (ISO_YEAR_MONTH_DAY, "2203-01-24"), + (SSB_BIMESTER, "1963B3"), + ], +) +def test_date_format_return_date_object_period_start(date_format, period): + assert isinstance(date_format.get_floor(period), datetime.date) + + +@pytest.mark.parametrize( + ("date_format", "period"), + [ + (ISO_YEAR, "1980"), + (ISO_YEAR_MONTH, "1888-11"), + (ISO_YEAR_MONTH_DAY, "2203-01-24"), + (SSB_BIMESTER, "1963B3"), + ], +) +def test_date_format_return_date_object_period_end(date_format, period): + assert isinstance(date_format.get_ceil(period), datetime.date) + + +@pytest.mark.parametrize( + ("date_format", "period", "expected"), + [ + (ISO_YEAR, "1980", datetime.date(1980, 1, 1)), + (ISO_YEAR_MONTH, "1888-11", datetime.date(1888, 11, 1)), + (ISO_YEAR_MONTH_DAY, "2203-01-24", datetime.date(2203, 1, 24)), + (SSB_BIMESTER, "1963B3", datetime.date(1963, 5, 1)), + ], +) +def test_date_format_correct_from_date(date_format, period, expected: datetime.date): + assert date_format.get_floor(period) == expected + + +@pytest.mark.parametrize( + ("date_format", "period", "expected"), + [ + (ISO_YEAR, "1980", datetime.date(1980, 12, 31)), + (ISO_YEAR_MONTH, "1888-11", datetime.date(1888, 11, 30)), + (ISO_YEAR_MONTH_DAY, "2203-01-24", datetime.date(2203, 1, 24)), + (SSB_BIMESTER, "1963B3", datetime.date(1963, 6, 30)), + ], +) +def test_date_format_correct_end_date(date_format, period, expected): + assert date_format.get_ceil(period) == expected