Skip to content

Commit

Permalink
Dataclass to store date format information
Browse files Browse the repository at this point in the history
  • Loading branch information
mmwinther committed Jan 25, 2024
1 parent b84e995 commit b2b6b5f
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 92 deletions.
145 changes: 80 additions & 65 deletions src/datadoc/backend/dapla_dataset_path_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,62 +4,66 @@
import contextlib
import pathlib
import re
from datetime import datetime
from datetime import timezone
from enum import Enum
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Literal

import arrow


class SupportedDateFormats(Enum):
    """The date formats supported by the naming convention.

    Each member's value is the textual template of the period string it
    stands for; ``n`` marks a digit position in the SSB-specific formats.
    """

    ISO_YEAR = "YYYY"  # Year only
    ISO_YEAR_MONTH = "YYYY-MM"  # Year and month
    ISO_YEAR_MONTH_DAY = "YYYY-MM-DD"  # Full calendar date
    SSB_YEAR_SEMESTER = "YYYY-Hn"  # Year and semester marker "H" plus one digit
    SSB_YEAR_TRIMESTER = "YYYY-Tn"  # Year and trimester marker "T" plus one digit
    SSB_YEAR_QUARTER = "YYYY-Qn"  # Year and quarter marker "Q" plus one digit
    SSB_YEAR_BIMESTER = "YYYY-Bn"  # Year and bimester marker "B" plus one digit
    SSB_YEAR_WEEK = "YYYY-Wnn"  # Year and week marker "W" plus two digits
    UNKNOWN = "UNKNOWN"  # Period string that matches none of the formats above


def categorize_period_string(period: str) -> SupportedDateFormats: # noqa: PLR0911
if TYPE_CHECKING:
import datetime


@dataclass(frozen=True)
class IsoDateFormat:
    """An ISO date format with the patterns needed to recognise and parse it.

    Instances are frozen because they are shared as module-level constants:
    an accidental attribute assignment would otherwise silently change how
    every later period string is parsed.
    """

    # Identifier of the format, e.g. "ISO_YEAR".
    name: str
    # Regular expression a period string must match to be of this format.
    regex_pattern: str
    # Format string handed to ``arrow.get`` when parsing the period string.
    arrow_pattern: str
    # Arrow timeframe used with ``floor``/``ceil`` to find the period's bounds.
    timeframe: Literal["year", "month", "day", "week"]


# Year only, e.g. "2022".
ISO_YEAR = IsoDateFormat(
    name="ISO_YEAR",
    regex_pattern=r"^\d{4}$",
    arrow_pattern="YYYY",
    timeframe="year",
)
# Year and month, e.g. "2022-10".
ISO_YEAR_MONTH = IsoDateFormat(
    name="ISO_YEAR_MONTH",
    regex_pattern=r"^\d{4}\-\d{2}$",
    arrow_pattern="YYYY-MM",
    timeframe="month",
)
# Full calendar date, e.g. "2022-10-01".
ISO_YEAR_MONTH_DAY = IsoDateFormat(
    name="ISO_YEAR_MONTH_DAY",
    regex_pattern=r"^\d{4}\-\d{2}\-\d{2}$",
    arrow_pattern="YYYY-MM-DD",
    timeframe="day",
)
# ISO week date, e.g. "2022-W01".
ISO_YEAR_WEEK = IsoDateFormat(
    name="ISO_YEAR_WEEK",
    regex_pattern=r"^\d{4}\-W\d{2}$",
    # "W" is arrow's parsing token for a whole ISO week date ("2019-W17").
    # "YYYY-Wnn" is not a usable arrow format: "YYYY" and "W" are both read
    # as tokens and "nn" as literal text, so no week string can match it.
    arrow_pattern="W",
    timeframe="week",
)

# Every format a period string is checked against; categorize_period_string
# walks this list in reverse order.
SUPPORTED_DATE_FORMATS = [
    ISO_YEAR,
    ISO_YEAR_MONTH,
    ISO_YEAR_MONTH_DAY,
    ISO_YEAR_WEEK,
]


def categorize_period_string(period: str) -> IsoDateFormat | None:
"""A naive string validator."""
match RegexEqualCompiler(period):
case r"\d{4}\-H\d":
return SupportedDateFormats.SSB_YEAR_SEMESTER
case r"\d{4}\-T\d":
return SupportedDateFormats.SSB_YEAR_TRIMESTER
case r"\d{4}\-Q\d":
return SupportedDateFormats.SSB_YEAR_QUARTER
case r"\d{4}\-B\d":
return SupportedDateFormats.SSB_YEAR_BIMESTER
case r"\d{4}\-W\d\d":
return SupportedDateFormats.SSB_YEAR_WEEK
case r"\d{4}\-\d{2}\-\d{2}":
return SupportedDateFormats.ISO_YEAR_MONTH_DAY
case r"\d{4}\-\d{2}":
return SupportedDateFormats.ISO_YEAR_MONTH
case r"\d{4}":
return SupportedDateFormats.ISO_YEAR
case _:
return SupportedDateFormats.UNKNOWN


class RegexEqualCompiler(str):
    """String wrapper whose ``==`` searches a regex pattern in the wrapped text.

    Comparing an instance with a pattern runs ``re.search`` for that pattern
    over the stored subject string instead of comparing the strings.
    """

    __slots__ = ["subject_string"]

    def __init__(self, subject_string: str) -> None:
        """Remember the text that patterns will be searched in."""
        self.subject_string = subject_string

    def __eq__(self, pattern: object) -> bool:
        """Return True when ``pattern`` is found anywhere in the subject string."""
        found = re.search(str(pattern), self.subject_string)
        return found is not None
for date_format in reversed(SUPPORTED_DATE_FORMATS):
if re.match(date_format.regex_pattern, period):
return date_format

return None


class DaplaDatasetPathInfo:
Expand All @@ -71,8 +75,7 @@ def __init__(self, dataset_path: str) -> None:
self.dataset_name_sections = self.dataset_path.stem.split("_")
_period_strings = self._extract_period_strings(self.dataset_name_sections)
self.first_period_string = _period_strings[0]
self.second_period_string = _period_strings[0]
self.date_format = categorize_period_string(self.first_period_string)
self.second_period_string: str | None = None

with contextlib.suppress(IndexError):
self.second_period_string = _period_strings[1]
Expand All @@ -97,18 +100,30 @@ def _extract_period_strings(self, dataset_name_sections: list[str]) -> list[str]
@property
def contains_data_from(self) -> datetime.date:
"""The earliest date from which data in the dataset is relevant for."""
match (self.date_format):
case SupportedDateFormats.ISO_YEAR:
return (
arrow.get(self.first_period_string, self.date_format.value)
.floor("year")
.date()
)
case _:
return datetime.now(timezone.utc).astimezone()
if date_format := categorize_period_string(self.first_period_string):
return (
arrow.get(self.first_period_string, date_format.arrow_pattern)
.floor(date_format.timeframe)
.date()
)

msg = f"Period format {self.first_period_string} is not supported"
raise NotImplementedError(
msg,
)

@property
def contains_data_until(self) -> datetime.date:
"""The latest date until which data in the dataset is relevant for."""
year = self.second_period_string
return arrow.get(year, "YYYY").ceil("year").date()
period_string = self.second_period_string or self.first_period_string
if date_format := categorize_period_string(self.first_period_string):
return (
arrow.get(period_string, date_format.arrow_pattern)
.ceil(date_format.timeframe)
.date()
)

msg = f"Period format {period_string} is not supported"
raise NotImplementedError(
msg,
)
29 changes: 2 additions & 27 deletions tests/backend/test_dapla_dataset_path_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import pytest

from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo
from datadoc.backend.dapla_dataset_path_info import SupportedDateFormats
from datadoc.backend.dapla_dataset_path_info import categorize_period_string


@dataclass
Expand All @@ -15,57 +13,49 @@ class DatasetPathTestCase:
path: str
expected_contains_data_from: datetime.date
expected_contains_data_until: datetime.date
expected_date_format: SupportedDateFormats


TEST_CASES = [
DatasetPathTestCase(
path="grensehandel_imputert_p2022-B1_v1.parquet",
expected_contains_data_from=datetime.date(2022, 1, 1),
expected_contains_data_until=datetime.date(2022, 2, 28),
expected_date_format=SupportedDateFormats.SSB_YEAR_BIMESTER,
),
DatasetPathTestCase(
path="grensehandel_imputert_p2022-B1_p2022-B2_v1.parquet",
expected_contains_data_from=datetime.date(2022, 1, 1),
expected_contains_data_until=datetime.date(2022, 4, 30),
expected_date_format=SupportedDateFormats.SSB_YEAR_BIMESTER,
),
DatasetPathTestCase(
path="grensehandel_imputert_p2022-10-01_p2022-12-31_v1.parquet",
expected_contains_data_from=datetime.date(2022, 10, 1),
expected_contains_data_until=datetime.date(2022, 12, 31),
expected_date_format=SupportedDateFormats.ISO_YEAR_MONTH_DAY,
),
DatasetPathTestCase(
path="grensehandel_imputert_p2022-10_p2022-12_v1.parquet",
expected_contains_data_from=datetime.date(2022, 10, 1),
expected_contains_data_until=datetime.date(2022, 12, 31),
expected_date_format=SupportedDateFormats.ISO_YEAR_MONTH,
),
DatasetPathTestCase(
path="flygende_objekter_p2019_v1.parquet",
expected_contains_data_from=datetime.date(2019, 1, 1),
expected_contains_data_until=datetime.date(2019, 12, 31),
expected_date_format=SupportedDateFormats.ISO_YEAR,
),
DatasetPathTestCase(
path="framskrevne-befolkningsendringer_p2019_p2050_v1.parquet",
expected_contains_data_from=datetime.date(2019, 1, 1),
expected_contains_data_until=datetime.date(2050, 12, 31),
expected_date_format=SupportedDateFormats.ISO_YEAR,
),
DatasetPathTestCase(
path="ufo_observasjoner_p2019_p2020_v1.parquet",
expected_contains_data_from=datetime.date(2019, 1, 1),
expected_contains_data_until=datetime.date(2020, 12, 31),
expected_date_format=SupportedDateFormats.ISO_YEAR,
),
]


@pytest.fixture(
ids=[f"{tc.expected_date_format.name}-{tc.path}" for tc in TEST_CASES],
ids=[tc.path for tc in TEST_CASES],
params=TEST_CASES,
)
def test_data(request: pytest.FixtureRequest) -> DatasetPathTestCase:
Expand Down Expand Up @@ -94,23 +84,8 @@ def test_extract_period_info_date_from(
assert dataset_path.contains_data_from == expected_contains_data_from


""" def test_extract_period_info_date_until(
def test_extract_period_info_date_until(
dataset_path: DaplaDatasetPathInfo,
expected_contains_data_until: datetime.date,
):
assert dataset_path.contains_data_until == expected_contains_data_until
"""


@pytest.mark.parametrize(
    ("period", "expected"),
    [
        ("2022", SupportedDateFormats.ISO_YEAR),
        ("2022-10", SupportedDateFormats.ISO_YEAR_MONTH),
        ("2022-10-10", SupportedDateFormats.ISO_YEAR_MONTH_DAY),
        ("2022-H1", SupportedDateFormats.SSB_YEAR_SEMESTER),
        # A string matching no known pattern falls back to UNKNOWN.
        ("DEFAULT_ON_FAIL", SupportedDateFormats.UNKNOWN),
    ],
)
def test_categorize_period_string(period: str, expected: SupportedDateFormats):
    """Check that each period string is categorized as the expected format."""
    assert expected == categorize_period_string(period)

0 comments on commit b2b6b5f

Please sign in to comment.