Skip to content

Commit

Permalink
Temp save - code not working
Browse files Browse the repository at this point in the history
  • Loading branch information
jonolehagemo committed Jan 25, 2024
1 parent a231811 commit 71453c0
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 38 deletions.
100 changes: 64 additions & 36 deletions src/datadoc/backend/dapla_dataset_path_info.py
Original file line number Diff line number Diff line change
@@ -1,82 +1,110 @@
"""Extract info from a path following SSB's dataset naming convention."""
from __future__ import annotations

import contextlib
import pathlib
import re
from datetime import datetime, timezone
from enum import Enum
from enum import auto
from typing import TYPE_CHECKING

import arrow

if TYPE_CHECKING:
import datetime
import contextlib
import pathlib
import re


class SupportedDateFormats(Enum):
    """The date formats supported by the naming convention.

    Each member's value is the textual layout of the period string it
    represents. Every member is defined exactly once; reusing a member name
    inside an ``Enum`` body raises ``TypeError`` at class creation.
    """

    ISO_YEAR = "YYYY"  # String format YYYY
    ISO_YEAR_MONTH = "YYYY-MM"  # String format YYYY-MM
    ISO_YEAR_MONTH_DAY = "YYYY-MM-DD"  # String format YYYY-MM-DD
    SSB_YEAR_SEMESTER = "YYYY-Hn"  # String format YYYY-Hn
    SSB_YEAR_TRIMESTER = "YYYY-Tn"  # String format YYYY-Tn
    SSB_YEAR_QUARTER = "YYYY-Qn"  # String format YYYY-Qn
    SSB_YEAR_BIMESTER = "YYYY-Bn"  # String format YYYY-Bn
    SSB_YEAR_WEEK = "YYYY-Wnn"  # String format YYYY-Wnn
    UNKNOWN = "UNKNOWN"  # Period string that matches none of the formats above


class RegexEqual(str):
    """Helper class for structural pattern matching using regex.

    Wrap the string to test; comparing the wrapper with ``==`` against a
    regex pattern then performs ``re.search`` instead of a string comparison.
    """

    def __eq__(self, pattern: str) -> bool:
        """Return True when ``pattern`` is found anywhere in this string."""
        return bool(re.search(pattern, self))


def categorize_period_string(period: str) -> SupportedDateFormats:
    """Classify a period string by the date format it is written in.

    The whole string must match one of the supported layouts; a string that
    merely contains a matching fragment (for example ``"12345"``) is not
    accepted. The hyphen in front of the H/T/Q/B/W period letter is optional,
    so both ``"2022-Q1"`` and ``"2022Q1"`` are recognised.

    Args:
        period: The period string without the leading ``p``, e.g. ``"2022-10"``.

    Returns:
        The matching SupportedDateFormats member, or
        SupportedDateFormats.UNKNOWN when no layout matches.
    """
    known_layouts = (
        (r"\d{4}-?H\d", SupportedDateFormats.SSB_YEAR_SEMESTER),
        (r"\d{4}-?T\d", SupportedDateFormats.SSB_YEAR_TRIMESTER),
        (r"\d{4}-?Q\d", SupportedDateFormats.SSB_YEAR_QUARTER),
        (r"\d{4}-?B\d", SupportedDateFormats.SSB_YEAR_BIMESTER),
        (r"\d{4}-?W\d{2}", SupportedDateFormats.SSB_YEAR_WEEK),
        (r"\d{4}-\d{2}-\d{2}", SupportedDateFormats.ISO_YEAR_MONTH_DAY),
        (r"\d{4}-\d{2}", SupportedDateFormats.ISO_YEAR_MONTH),
        (r"\d{4}", SupportedDateFormats.ISO_YEAR),
    )
    for layout, date_format in known_layouts:
        # fullmatch anchors both ends, so the layouts cannot shadow each other.
        if re.fullmatch(layout, period):
            return date_format
    return SupportedDateFormats.UNKNOWN


class RegexEqualCompiler(str):
    """String wrapper whose ``==`` tests a regex pattern against the string.

    The wrapped value is the *subject* text, not a pattern, so it is never
    compiled as a regex: arbitrary text such as ``"2022("`` can be wrapped
    without raising ``re.error``. Only ``==`` is overridden; every other
    operation behaves as for a plain ``str``.
    """

    def __eq__(self, pattern: object) -> bool:
        """Return True when ``pattern`` is found anywhere in this string.

        Args:
            pattern: The regex to look for; it is converted with ``str()``
                before being passed to ``re.search``.
        """
        return bool(re.search(str(pattern), self))


class DaplaDatasetPathInfo:
    """Extract info from a path following SSB's dataset naming convention."""

    # A name section is a period when it is "p" + a year, optionally followed
    # by -MM, -MM-DD or a H/T/Q/B/W period (with or without a hyphen).
    date_format_regex = re.compile(
        r"^p\d{4}(?:-\d{2}-\d{2}|-\d{2}|-?[QTHWB]\d{1,2})?$",
    )

    def __init__(self, dataset_path: str) -> None:
        """Read info from a path following SSB's naming convention.

        Args:
            dataset_path: Path to the dataset file; only the file stem is
                inspected.

        Raises:
            IndexError: If no section of the file name is a period string.
        """
        self.dataset_path = pathlib.Path(dataset_path)
        self.dataset_name_sections = self.dataset_path.stem.split("_")
        _period_strings = self._extract_period_strings(self.dataset_name_sections)
        self.first_period_string = _period_strings[0]
        # With a single period in the name it is both the start and the end.
        self.second_period_string: str = (
            _period_strings[1] if len(_period_strings) > 1 else _period_strings[0]
        )
        self.date_format = categorize_period_string(self.first_period_string)

    def _categorize_period_string(self, period: str) -> SupportedDateFormats:
        """Classify a period string; delegates to categorize_period_string."""
        return categorize_period_string(period)

    def _extract_period_strings(self, dataset_name_sections: list[str]) -> list[str]:
        """Extract period strings from dataset name sections.

        Iterates over the dataset name sections and returns the ones that
        match the period regex, with the leading ``p`` stripped.
        """
        return [
            section[1:]
            for section in dataset_name_sections
            if self.date_format_regex.match(section) is not None
        ]

    @staticmethod
    def _period_bounds(period: str) -> tuple[datetime.date, datetime.date]:
        """Return the first and last calendar date covered by a period string.

        Raises:
            ValueError: If the period number, month or day is out of range.
        """
        # Local imports: the module-level name ``datetime`` is ambiguous in
        # this file, so the date types are bound explicitly here.
        import calendar
        import datetime as dt

        def month_span(year: int, first_month: int, length: int) -> tuple:
            last_month = first_month + length - 1
            last_day = calendar.monthrange(year, last_month)[1]
            return dt.date(year, first_month, 1), dt.date(year, last_month, last_day)

        date_format = categorize_period_string(period)
        if date_format is SupportedDateFormats.ISO_YEAR_MONTH_DAY:
            day = dt.date.fromisoformat(period)
            return day, day
        if date_format is SupportedDateFormats.ISO_YEAR_MONTH:
            return month_span(int(period[:4]), int(period[5:7]), 1)
        if date_format is SupportedDateFormats.ISO_YEAR:
            return month_span(int(period[:4]), 1, 12)

        ssb_period = re.fullmatch(r"(\d{4})-?([HTQBW])(\d{1,2})", period)
        if date_format is SupportedDateFormats.UNKNOWN or ssb_period is None:
            # Fallback for unrecognised periods: the current local date.
            today = dt.datetime.now(dt.timezone.utc).astimezone().date()
            return today, today

        year = int(ssb_period[1])
        unit = ssb_period[2]
        number = int(ssb_period[3])
        if unit == "W":
            # ISO weeks run Monday (1) through Sunday (7).
            return (
                dt.date.fromisocalendar(year, number, 1),
                dt.date.fromisocalendar(year, number, 7),
            )
        months_per_period = {"H": 6, "T": 4, "Q": 3, "B": 2}[unit]
        if not 1 <= number <= 12 // months_per_period:
            msg = f"Period number out of range in {period!r}"
            raise ValueError(msg)
        first_month = (number - 1) * months_per_period + 1
        return month_span(year, first_month, months_per_period)

    @property
    def contains_data_from(self) -> datetime.date:
        """The earliest date from which data in the dataset is relevant for."""
        return self._period_bounds(self.first_period_string)[0]

    @property
    def contains_data_until(self) -> datetime.date:
        """The latest date until which data in the dataset is relevant for."""
        return self._period_bounds(self.second_period_string)[1]
44 changes: 42 additions & 2 deletions tests/backend/test_dapla_dataset_path_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,56 @@
import pytest

from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo
from datadoc.backend.dapla_dataset_path_info import SupportedDateFormats
from datadoc.backend.dapla_dataset_path_info import categorize_period_string


# utanningsnivaa_p2022-10-01_v1.parquet - status this date
# grensehandel_imputert_p2022-10_p2022-12_v1.parquet - for 3 months
# omsetning_p2020W15_v1.parquet - for one week
@pytest.fixture(
params=[
{
"dataset_path": "grensehandel_imputert_p2022-B1_v1.parquet",
"expected_contains_data_from": datetime.date(2022, 1, 1),
"expected_contains_data_until": datetime.date(2022, 2, 28),
"expected_date_format": SupportedDateFormats.SSB_YEAR_BIMESTER,
},
{
"dataset_path": "grensehandel_imputert_p2022-B1_p2022-B2_v1.parquet",
"expected_contains_data_from": datetime.date(2022, 1, 1),
"expected_contains_data_until": datetime.date(2022, 4, 30),
"expected_date_format": SupportedDateFormats.SSB_YEAR_BIMESTER,
},
{
"dataset_path": "grensehandel_imputert_p2022-10-01_p2022-12-31_v1.parquet",
"expected_contains_data_from": datetime.date(2022, 10, 1),
"expected_contains_data_until": datetime.date(2022, 12, 31),
"expected_date_format": SupportedDateFormats.ISO_YEAR_MONTH_DAY,
},
{
"dataset_path": "grensehandel_imputert_p2022-10_p2022-12_v1.parquet",
"expected_contains_data_from": datetime.date(2022, 1, 10),
"expected_contains_data_from": datetime.date(2022, 10, 1),
"expected_contains_data_until": datetime.date(2022, 12, 31),
"expected_date_format": SupportedDateFormats.ISO_YEAR_MONTH,
},
{
"dataset_path": "flygende_objekter_p2019_v1.parquet",
"expected_contains_data_from": datetime.date(2019, 1, 1),
"expected_contains_data_until": datetime.date(2019, 12, 31),
"expected_date_format": SupportedDateFormats.ISO_YEAR,
},
{
"dataset_path": "framskrevne-befolkningsendringer_p2019_p2050_v1.parquet",
"expected_contains_data_from": datetime.date(2019, 1, 1),
"expected_contains_data_until": datetime.date(2050, 12, 31),
"expected_date_format": SupportedDateFormats.ISO_YEAR,
},
{
"dataset_path": "ufo_observasjoner_p2019_p2020_v1.parquet",
"expected_contains_data_from": datetime.date(2019, 1, 1),
"expected_contains_data_until": datetime.date(2020, 12, 31),
"expected_date_format": SupportedDateFormats.ISO_YEAR,
},
],
)
Expand Down Expand Up @@ -58,8 +82,24 @@ def test_extract_period_info_date_from(
assert dataset_path.contains_data_from == expected_contains_data_from


def test_extract_period_info_date_until(
    dataset_path: DaplaDatasetPathInfo,
    expected_contains_data_until: datetime.date,
):
    """The end of the period in the file name is reported as the last date."""
    assert dataset_path.contains_data_until == expected_contains_data_until


@pytest.mark.parametrize(
    ("period", "expected"),
    [
        ("2022", SupportedDateFormats.ISO_YEAR),
        ("2022-10", SupportedDateFormats.ISO_YEAR_MONTH),
        ("2022-10-10", SupportedDateFormats.ISO_YEAR_MONTH_DAY),
        ("2022-H1", SupportedDateFormats.SSB_YEAR_SEMESTER),
        ("2022-T1", SupportedDateFormats.SSB_YEAR_TRIMESTER),
        ("2022-Q1", SupportedDateFormats.SSB_YEAR_QUARTER),
        ("2022-B1", SupportedDateFormats.SSB_YEAR_BIMESTER),
        ("2022-W01", SupportedDateFormats.SSB_YEAR_WEEK),
        ("DEFAULT_ON_FAIL", SupportedDateFormats.UNKNOWN),
    ],
)
def test_categorize_period_string(period: str, expected: SupportedDateFormats):
    """Every supported layout, plus an unrecognised string, is categorized."""
    assert expected == categorize_period_string(period)

0 comments on commit 71453c0

Please sign in to comment.