Skip to content

Commit

Permalink
Implement derivation of SSB-uniquely formatted dates (#137)
Browse files Browse the repository at this point in the history
* Create class SsbDateFormat for SSB special date formats, with start and end months defined for each SsbDateFormat instance. Add SsbDateFormat to the return type of categorize_period_string. Add function convert_ssb_period, which takes an SsbDateFormat object and returns a string with year and month. Change methods contains_data_from and contains_data_until to handle SsbDateFormat. Add test cases in test_dapla_dataset_path_info.

* Add doctests for convert_ssb_period.

* Add doctests for SSB date formats to categorize_period_string

* Add docstrings for the SSB date format to contains_data_from and contains_data_until

* Correct half-year months and type hint supported_date_formats
  • Loading branch information
tilen1976 authored Jan 29, 2024
1 parent 24f5f48 commit 7eaf63e
Show file tree
Hide file tree
Showing 2 changed files with 211 additions and 2 deletions.
183 changes: 181 additions & 2 deletions src/datadoc/backend/dapla_dataset_path_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,120 @@ class IsoDateFormat:
timeframe="week",
)

SUPPORTED_DATE_FORMATS = [

@dataclass
class SsbDateFormat:
    """A date format with relevant patterns for SSB special date formats.

    Attributes:
        name: Identifier of the format, e.g. "SSB_QUARTERLY".
        regex_pattern: Regular expression matched by period strings written
            in this format, e.g. "2022Q1".
        arrow_pattern: Arrow token pattern used to parse the period once it
            has been converted to a year and month, e.g. "YYYYMM".
        time_frame: Maps each period code (e.g. "Q1") to a dict holding its
            "start" and "end" months as two-digit strings.
    """

    name: str
    regex_pattern: str
    arrow_pattern: str
    time_frame: dict[str, dict[str, str]]


# Bimester: six two-month periods per year, written "<year>B<1-6>", e.g. "2022B1".
SSB_BIMESTER = SsbDateFormat(
    name="SSB_BIMESTER",
    # Only B1-B6 have an entry in time_frame, so only those digits may match.
    # Accepting any digit would let e.g. "2022B9" be categorized as a bimester
    # and only fail later with a KeyError when the period is converted to months.
    regex_pattern=r"\d{4}B[1-6]$",
    arrow_pattern="YYYYMM",
    time_frame={
        "B1": {"start": "01", "end": "02"},
        "B2": {"start": "03", "end": "04"},
        "B3": {"start": "05", "end": "06"},
        "B4": {"start": "07", "end": "08"},
        "B5": {"start": "09", "end": "10"},
        "B6": {"start": "11", "end": "12"},
    },
)

# Quarter: four three-month periods per year, written "<year>Q<1-4>", e.g. "2018Q1".
SSB_QUARTERLY = SsbDateFormat(
    name="SSB_QUARTERLY",
    # Only Q1-Q4 have an entry in time_frame, so only those digits may match.
    # Accepting any digit would let e.g. "2018Q5" be categorized as a quarter
    # and only fail later with a KeyError when the period is converted to months.
    regex_pattern=r"\d{4}Q[1-4]$",
    arrow_pattern="YYYYMM",
    time_frame={
        "Q1": {"start": "01", "end": "03"},
        "Q2": {"start": "04", "end": "06"},
        "Q3": {"start": "07", "end": "09"},
        "Q4": {"start": "10", "end": "12"},
    },
)
# Triannual: three four-month periods per year, written "<year>T<1-3>", e.g. "2022T1".
SSB_TRIANNUAL = SsbDateFormat(
    name="SSB_TRIANNUAL",
    # Only T1-T3 have an entry in time_frame, so only those digits may match.
    # Accepting any digit would let e.g. "2022T4" be categorized as triannual
    # and only fail later with a KeyError when the period is converted to months.
    regex_pattern=r"\d{4}T[1-3]$",
    arrow_pattern="YYYYMM",
    time_frame={
        "T1": {"start": "01", "end": "04"},
        "T2": {"start": "05", "end": "08"},
        "T3": {"start": "09", "end": "12"},
    },
)
# Half year: two six-month periods per year, written "<year>H<1-2>", e.g. "2022H1".
SSB_HALF_YEAR = SsbDateFormat(
    name="SSB_HALF_YEAR",
    # Only H1 and H2 have an entry in time_frame, so only those digits may match.
    # Accepting any digit would let e.g. "2022H3" be categorized as a half year
    # and only fail later with a KeyError when the period is converted to months.
    regex_pattern=r"\d{4}H[1-2]$",
    arrow_pattern="YYYYMM",
    time_frame={
        "H1": {"start": "01", "end": "06"},
        "H2": {"start": "07", "end": "12"},
    },
)

# All date formats this module knows about: the ISO formats first, followed by
# the SSB-specific sub-year formats (bimester, quarter, triannual, half year).
SUPPORTED_DATE_FORMATS: list[IsoDateFormat | SsbDateFormat] = [
    ISO_YEAR,
    ISO_YEAR_MONTH,
    ISO_YEAR_MONTH_DAY,
    ISO_YEAR_WEEK,
    SSB_BIMESTER,
    SSB_QUARTERLY,
    SSB_TRIANNUAL,
    SSB_HALF_YEAR,
]


def categorize_period_string(period: str) -> IsoDateFormat:
def categorize_period_string(period: str) -> IsoDateFormat | SsbDateFormat:
"""Categorize a period string into one of the supported date formats.
If the period string is not recognized, a NotImplementedError is raised.
Expand All @@ -71,6 +176,22 @@ def categorize_period_string(period: str) -> IsoDateFormat:
>>> date_format.name
ISO_YEAR_WEEK
>>> date_format = categorize_period_string('2022B1')
>>> date_format.name
SSB_BIMESTER
>>> date_format = categorize_period_string('1980Q3')
>>> date_format.name
SSB_QUARTERLY
>>> date_format = categorize_period_string('1954T2')
>>> date_format.name
SSB_TRIANNUAL
>>> date_format = categorize_period_string('1876H1')
>>> date_format.name
SSB_HALF_YEAR
>>> categorize_period_string('unknown format')
Traceback (most recent call last):
...
Expand All @@ -86,6 +207,50 @@ def categorize_period_string(period: str) -> IsoDateFormat:
)


def convert_ssb_period(
    period_string: str,
    period_type: str,
    date_format: SsbDateFormat,
) -> str:
    """Convert an SSB period string to the year and month where it starts or ends.

    Handles the SSB formats for bimester, quarterly, triannual and half year.

    Args:
        period_string: Period in SSB format: a four-digit year followed by a
            two-character period code, e.g. "2022B1".
        period_type: Which edge of the period to return, "start" or "end".
        date_format: The SSB date format whose ``time_frame`` maps period
            codes to their start and end months.

    Returns:
        The year followed by the two-digit month, e.g. "202201".

    Raises:
        KeyError: If the period code of ``period_string`` or ``period_type``
            is not defined in ``date_format.time_frame``.

    Usage-examples:
    >>> ssb_bimester_period_start = convert_ssb_period("2022B1","start",SSB_BIMESTER)
    >>> ssb_bimester_period_start
    '202201'
    >>> ssb_bimester_period_end = convert_ssb_period("2022B1","end",SSB_BIMESTER)
    >>> ssb_bimester_period_end
    '202202'
    >>> ssb_quarterly_period_start = convert_ssb_period("2015Q3","start",SSB_QUARTERLY)
    >>> ssb_quarterly_period_start
    '201507'
    >>> ssb_quarterly_period_end = convert_ssb_period("2015Q3","end",SSB_QUARTERLY)
    >>> ssb_quarterly_period_end
    '201509'
    >>> ssb_triannual_period_start = convert_ssb_period("1998T2","start",SSB_TRIANNUAL)
    >>> ssb_triannual_period_start
    '199805'
    >>> ssb_triannual_period_end = convert_ssb_period("1998T2","end",SSB_TRIANNUAL)
    >>> ssb_triannual_period_end
    '199808'
    >>> ssb_half_year_period_start = convert_ssb_period("1898H2","start",SSB_HALF_YEAR)
    >>> ssb_half_year_period_start
    '189807'
    >>> ssb_half_year_period_end = convert_ssb_period("1898H2","end",SSB_HALF_YEAR)
    >>> ssb_half_year_period_end
    '189812'
    """
    year = period_string[:4]
    # The last two characters are the period code, e.g. "B1" in "2022B1".
    period_code = period_string[-2:]
    try:
        month = date_format.time_frame[period_code][period_type]
    except KeyError as err:
        # Same exception type as the bare lookup, but say which input was
        # rejected instead of surfacing only the missing dictionary key.
        msg = (
            f"Cannot convert period {period_string!r}: no {period_type!r} month "
            f"is defined for {period_code!r} in date format {date_format.name}"
        )
        raise KeyError(msg) from err
    return year + month


class DaplaDatasetPathInfo:
"""Extract info from a path following SSB's dataset naming convention."""

Expand Down Expand Up @@ -132,6 +297,15 @@ def _extract_period_strings(dataset_name_sections: list[str]) -> list[str]:
def contains_data_from(self) -> datetime.date:
"""The earliest date from which data in the dataset is relevant for."""
date_format = categorize_period_string(self.first_period_string)
if isinstance(date_format, SsbDateFormat):
"""If dateformat is SSB date format return start month of ssb period."""
period = convert_ssb_period(
self.first_period_string,
"start",
date_format,
)
return arrow.get(period, date_format.arrow_pattern).floor("month").date()

return (
arrow.get(self.first_period_string, date_format.arrow_pattern)
.floor(date_format.timeframe)
Expand All @@ -143,6 +317,11 @@ def contains_data_until(self) -> datetime.date:
"""The latest date until which data in the dataset is relevant for."""
period_string = self.second_period_string or self.first_period_string
date_format = categorize_period_string(period_string)
if isinstance(date_format, SsbDateFormat):
"""If dateformat is SSB date format return end month of ssb period."""
period = convert_ssb_period(period_string, "end", date_format)
return arrow.get(period, date_format.arrow_pattern).ceil("month").date()

return (
arrow.get(period_string, date_format.arrow_pattern)
.ceil(date_format.timeframe)
Expand Down
30 changes: 30 additions & 0 deletions tests/backend/test_dapla_dataset_path_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,36 @@ class DatasetPathTestCase:
expected_contains_data_from=datetime.date(1981, 12, 21),
expected_contains_data_until=datetime.date(1981, 12, 27),
),
DatasetPathTestCase(
path="personinntekt_p2022H1_v1.parquet",
expected_contains_data_from=datetime.date(2022, 1, 1),
expected_contains_data_until=datetime.date(2022, 6, 30),
),
DatasetPathTestCase(
path="nybilreg_p2022T1_v1.parquet",
expected_contains_data_from=datetime.date(2022, 1, 1),
expected_contains_data_until=datetime.date(2022, 4, 30),
),
DatasetPathTestCase(
path="varehandel_p2018Q1_p2018Q4_v1.parquet",
expected_contains_data_from=datetime.date(2018, 1, 1),
expected_contains_data_until=datetime.date(2018, 12, 31),
),
DatasetPathTestCase(
path="pensjon_p2018Q1_v1.parquet",
expected_contains_data_from=datetime.date(2018, 1, 1),
expected_contains_data_until=datetime.date(2018, 3, 31),
),
DatasetPathTestCase(
path="skipsanloep_p2021B2_v1.parquet",
expected_contains_data_from=datetime.date(2021, 3, 1),
expected_contains_data_until=datetime.date(2021, 4, 30),
),
DatasetPathTestCase(
path="skipsanloep_p2022B1_v1.parquet",
expected_contains_data_from=datetime.date(2022, 1, 1),
expected_contains_data_until=datetime.date(2022, 2, 28),
),
]


Expand Down

0 comments on commit 7eaf63e

Please sign in to comment.