Commit 80788df

Move extract methods to DaplaDatasetPathInfo (#143)
mmwinther authored Feb 1, 2024
1 parent e4ec9de commit 80788df
Showing 5 changed files with 124 additions and 107 deletions.
61 changes: 61 additions & 0 deletions src/datadoc/backend/dapla_dataset_path_info.py
@@ -5,10 +5,14 @@
import re
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Final
from typing import Literal

import arrow

from datadoc.enums import DatasetState
from datadoc.enums import SupportedLanguages

if TYPE_CHECKING:
    import datetime
    import os
@@ -368,3 +372,60 @@ def contains_data_until(self) -> datetime.date | None:
            .ceil(date_format.timeframe)
            .date()
        )
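
The hunk context above shows the tail of `contains_data_until`, which rounds a parsed period up to the end of its timeframe via arrow. A standalone sketch of that rounding, using a yearly period as an assumed example:

```python
import arrow

# Round a yearly period up to its last day, mirroring .ceil(date_format.timeframe) above.
print(arrow.get("2021", "YYYY").ceil("year").date())  # 2021-12-31
```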

    @property
    def dataset_state(
        self,
    ) -> DatasetState | None:
        """Extract the dataset state from the path.

        Examples:
            >>> DaplaDatasetPathInfo('klargjorte_data/person_data_v1.parquet').dataset_state
            <DatasetState.PROCESSED_DATA: 'PROCESSED_DATA'>
            >>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state
            <DatasetState.OUTPUT_DATA: 'OUTPUT_DATA'>
            >>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state
            None
        """
        dataset_path_parts = set(self.dataset_path.parts)
        for s in DatasetState:
            # We assume that files are saved in the Norwegian language as specified by SSB.
            norwegian_dataset_state_path_part = s.get_value_for_language(
                SupportedLanguages.NORSK_BOKMÅL,
            ).lower()
            norwegian_dataset_state_path_part_variations = {
                norwegian_dataset_state_path_part.replace(" ", x) for x in ["-", "_"]
            }
            # Match on any of the variations anywhere in the path.
            if norwegian_dataset_state_path_part_variations.intersection(
                dataset_path_parts,
            ):
                return s

        return None

    @property
    def dataset_version(
        self,
    ) -> str | None:
        """Extract version information if it exists in the filename.

        Examples:
            >>> DaplaDatasetPathInfo('person_data_v1.parquet').dataset_version
            '1'
            >>> DaplaDatasetPathInfo('person_data_v20.parquet').dataset_version
            '20'
            >>> DaplaDatasetPathInfo('person_data.parquet').dataset_version
            None
        """
        minimum_elements_in_file_name: Final[int] = 2
        minimum_characters_in_version_string: Final[int] = 2
        if len(self.dataset_name_sections) >= minimum_elements_in_file_name:
            last_filename_element = str(self.dataset_name_sections[-1])
            if (
                len(last_filename_element) >= minimum_characters_in_version_string
                and last_filename_element[0:1] == "v"
                and last_filename_element[1:].isdigit()
            ):
                return last_filename_element[1:]
        return None
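
As a quick illustration of the two rules just added, here is a minimal standalone sketch; `NORWEGIAN_STATE_LABELS` is a hypothetical stand-in for the `DatasetState` enum and its `get_value_for_language` lookup, and name sections are assumed underscore-separated:

```python
from pathlib import PurePath

# Hypothetical stand-in for DatasetState's Norwegian labels (lowercased).
NORWEGIAN_STATE_LABELS = {
    "PROCESSED_DATA": "klargjorte data",
    "INPUT_DATA": "inndata",
}


def guess_state(path: str) -> str | None:
    """Match '-'/'_' variations of each Norwegian label against the path parts."""
    parts = set(PurePath(path).parts)
    for state, label in NORWEGIAN_STATE_LABELS.items():
        variations = {label.replace(" ", x) for x in ["-", "_"]}
        if variations & parts:
            return state
    return None


def guess_version(stem: str) -> str | None:
    """Return the digits of a trailing 'vN' element, mirroring the version rule."""
    last = stem.split("_")[-1]
    if len(last) >= 2 and last.startswith("v") and last[1:].isdigit():
        return last[1:]
    return None


print(guess_state("klargjorte_data/person_data_v1.parquet"))  # PROCESSED_DATA
print(guess_version("person_data_v20"))  # 20
print(guess_version("person_data"))  # None
```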
53 changes: 4 additions & 49 deletions src/datadoc/backend/datadoc_metadata.py
@@ -4,7 +4,6 @@
import json
import logging
import pathlib
import typing as t
import uuid
from typing import TYPE_CHECKING

@@ -16,7 +15,6 @@
from datadoc.backend.model_backwards_compatibility import upgrade_metadata
from datadoc.backend.storage_adapter import StorageAdapter
from datadoc.enums import DatasetState
from datadoc.enums import SupportedLanguages
from datadoc.enums import VariableRole
from datadoc.frontend.fields import display_dataset
from datadoc.frontend.fields import display_variables
@@ -65,6 +63,7 @@ def __init__(
            self.extract_metadata_from_existing_document(self.metadata_document)

        elif dataset_path:
            # This is the most common use case.
            self.dataset = pathlib.Path(dataset_path)
            # The short_name is set as the dataset filename without file extension
            self.short_name = pathlib.Path(
@@ -76,7 +75,6 @@
            self.metadata_document.joinpath(
                self.short_name + METADATA_DOCUMENT_FILE_SUFFIX,
            )
            self.dataset_state = self.get_dataset_state(self.dataset)

        self.extract_metadata_from_files()

@@ -88,48 +86,6 @@ def __init__(
                self.current_user,
            )

    def get_dataset_state(
        self,
        dataset: pathlib.Path,
    ) -> DatasetState | None:
        """Use the path to attempt to guess the state of the dataset."""
        dataset_path_parts = set(dataset.parts)
        for state in DatasetState:
            # We assume that files are saved in the Norwegian language as specified by SSB.
            norwegian_dataset_state_path_part = state.get_value_for_language(
                SupportedLanguages.NORSK_BOKMÅL,
            ).lower()
            norwegian_dataset_state_path_part_variations = {
                norwegian_dataset_state_path_part.replace(" ", x) for x in ["-", "_"]
            }
            # Match on any of the variations anywhere in the path.
            if norwegian_dataset_state_path_part_variations.intersection(
                dataset_path_parts,
            ):
                return state
        return None

    @staticmethod
    def get_dataset_version(
        dataset_stem: str,
    ) -> str | None:
        """Find version information if it exists in the filename.

        e.g. 'v1' in filename 'person_data_v1.parquet'
        """
        minimum_elements_in_file_name: t.Final[int] = 2
        minimum_characters_in_version_string: t.Final[int] = 2
        split_file_name = str(dataset_stem).split("_")
        if len(split_file_name) >= minimum_elements_in_file_name:
            last_filename_element = str(split_file_name[-1])
            if (
                len(last_filename_element) >= minimum_characters_in_version_string
                and last_filename_element[0:1] == "v"
                and last_filename_element[1:].isdigit()
            ):
                return last_filename_element[1:]
        return None

    def extract_metadata_from_files(self) -> None:
        """Read metadata from an existing metadata document.
@@ -139,7 +95,7 @@ def extract_metadata_from_files(self) -> None:
        if self.metadata_document is not None and self.metadata_document.exists():
            self.extract_metadata_from_existing_document(self.metadata_document)
        elif self.dataset is not None:
            self.extract_metadata_from_dataset(self.dataset, self.short_name or "")
            self.extract_metadata_from_dataset(self.dataset)

        self.meta.dataset.id = uuid.uuid4()

@@ -205,7 +161,6 @@ def is_metadata_in_container_structure(
    def extract_metadata_from_dataset(
        self,
        dataset: pathlib.Path,
        short_name: str,
    ) -> None:
        """Obtain what metadata we can from the dataset itself.
@@ -218,8 +173,8 @@

        self.meta.dataset = model.Dataset(
            short_name=self.short_name,
            dataset_state=self.dataset_state,
            version=self.get_dataset_version(short_name),
            dataset_state=dapla_dataset_path_info.dataset_state,
            version=dapla_dataset_path_info.dataset_version,
            contains_data_from=str(dapla_dataset_path_info.contains_data_from),
            contains_data_until=str(dapla_dataset_path_info.contains_data_until),
            data_source_path=self.dataset,
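
With this change the state and version fields in `model.Dataset` come straight from `DaplaDatasetPathInfo` properties. A minimal usage sketch of that object on its own; the path is a hypothetical example following the naming convention used in the tests below:

```python
from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo

path_info = DaplaDatasetPathInfo(
    "inndata/person_data_p2021-12-31_p2021-12-31_v2.parquet",
)
print(path_info.dataset_state)        # DatasetState.INPUT_DATA
print(path_info.dataset_version)      # '2'
print(path_info.contains_data_from)   # 2021-12-31
print(path_info.contains_data_until)  # 2021-12-31
```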
59 changes: 59 additions & 0 deletions tests/backend/test_dapla_dataset_path_info.py
@@ -1,9 +1,15 @@
from __future__ import annotations

import copy
import datetime
import pathlib
from dataclasses import dataclass
from pathlib import PurePath

import pytest

from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo
from datadoc.enums import DatasetState
from tests.utils import TEST_PARQUET_FILEPATH


@@ -134,6 +140,59 @@ def test_extract_period_info_no_period_info_in_path(data: str):
    assert DaplaDatasetPathInfo(data).contains_data_from is None


@pytest.fixture()
def full_dataset_state_path(
    dataset_state_path: str,
) -> pathlib.PurePath:
    """Create a longer path structure from just one section.

    Examples:
        >>> full_dataset_state_path('inndata')
        'tests/inndata/resources/person_data_v1.parquet'
    """
    split_path = list(PurePath(TEST_PARQUET_FILEPATH).parts)
    new_path = copy.copy(split_path)
    new_path.insert(-2, dataset_state_path)
    return PurePath().joinpath(*new_path)
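
The fixture splices the state directory into the path with `list.insert(-2, ...)`, i.e. just before the second-to-last element, so it lands above the file's parent directory. A quick standalone illustration:

```python
from pathlib import PurePath

parts = list(PurePath("tests/resources/person_data_v1.parquet").parts)
parts.insert(-2, "inndata")  # insert before the second-to-last element
print(PurePath().joinpath(*parts))  # tests/inndata/resources/person_data_v1.parquet
```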


@pytest.mark.parametrize(
    ("dataset_state_path", "expected_result"),
    [
        ("kildedata", DatasetState.SOURCE_DATA),
        ("inndata", DatasetState.INPUT_DATA),
        ("roskildedata/klargjorte-data", DatasetState.PROCESSED_DATA),
        ("klargjorte_data", DatasetState.PROCESSED_DATA),
        ("klargjorte-data", DatasetState.PROCESSED_DATA),
        ("statistikk", DatasetState.STATISTICS),
        ("", None),
    ],
)
def test_get_dataset_state(
    full_dataset_state_path: pathlib.Path,
    expected_result: DatasetState,
):
    actual_state = DaplaDatasetPathInfo(full_dataset_state_path).dataset_state
    assert actual_state == expected_result


@pytest.mark.parametrize(
    ("path", "expected"),
    [
        ("person_data_v1", "1"),
        ("person_data_v2", "2"),
        ("person_data_vwrong", None),
        ("person_data", None),
        ("person_testdata_p2021-12-31_p2021-12-31_v20", "20"),
    ],
)
def test_get_dataset_version(
    path: str,
    expected: str | None,
):
    assert DaplaDatasetPathInfo(path).dataset_version == expected


# These tests cover both 'date until' coming after 'date from' and a mix of valid and invalid SSB keys
@pytest.mark.parametrize(
    "dataset_path_name",
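
Note the pattern these tests rely on: `full_dataset_state_path` is a fixture that requests `dataset_state_path`, which is not itself a fixture but a parametrized argument on the test; pytest resolves it across that boundary. A minimal self-contained sketch of the pattern, with hypothetical names:

```python
import pytest


@pytest.fixture()
def doubled(number: int) -> int:
    # `number` is resolved from the parametrize() on whichever test requests this fixture.
    return number * 2


@pytest.mark.parametrize(("number", "expected"), [(2, 4), (5, 10)])
def test_doubled(doubled: int, expected: int) -> None:
    assert doubled == expected
```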
53 changes: 0 additions & 53 deletions tests/backend/test_datadoc_metadata.py
@@ -2,10 +2,7 @@
from __future__ import annotations

import json
import pathlib
from copy import copy
from pathlib import Path
from pathlib import PurePath
from typing import TYPE_CHECKING
from uuid import UUID

@@ -21,47 +18,13 @@
from datadoc.enums import VariableRole
from tests.utils import TEST_EXISTING_METADATA_DIRECTORY
from tests.utils import TEST_EXISTING_METADATA_FILE_NAME
from tests.utils import TEST_PARQUET_FILEPATH
from tests.utils import TEST_RESOURCES_DIRECTORY
from tests.utils import TEST_RESOURCES_METADATA_DOCUMENT

if TYPE_CHECKING:
    from datetime import datetime


def make_paths() -> list[tuple[str, DatasetState | None]]:
    split_path = list(PurePath(TEST_PARQUET_FILEPATH).parts)
    initial_data = [
        ("kildedata", DatasetState.SOURCE_DATA),
        ("inndata", DatasetState.INPUT_DATA),
        ("roskildedata/klargjorte-data", DatasetState.PROCESSED_DATA),
        ("klargjorte_data", DatasetState.PROCESSED_DATA),
        ("klargjorte-data", DatasetState.PROCESSED_DATA),
        ("statistikk", DatasetState.STATISTICS),
        ("", None),
    ]
    test_data = []

    # Construct paths with each of the potential options in them
    for to_insert, state in initial_data:
        new_path = copy(split_path)
        new_path.insert(-2, to_insert)
        joined_path = PurePath().joinpath(*new_path)
        test_data.append((str(joined_path), state))

    return test_data


@pytest.mark.parametrize(("path", "expected_result"), make_paths())
def test_get_dataset_state(
    path: str,
    expected_result: DatasetState,
    metadata: DataDocMetadata,
):
    actual_state = metadata.get_dataset_state(pathlib.Path(path))
    assert actual_state == expected_result


@pytest.mark.usefixtures("existing_metadata_file")
def test_existing_metadata_file(
    metadata: DataDocMetadata,
@@ -83,22 +46,6 @@ def test_metadata_document_percent_complete(metadata: DataDocMetadata):
    assert metadata.percent_complete == 17  # noqa: PLR2004


@pytest.mark.parametrize(
    ("short_name", "expected"),
    [
        ("person_data_v1", "1"),
        ("person_data_v2", "2"),
        ("person_data", None),
        ("person_testdata_p2021-12-31_p2021-12-31_v20", "20"),
    ],
)
def test_get_dataset_version(
    short_name: str,
    expected: str | None,
):
    assert DataDocMetadata.get_dataset_version(short_name) == expected


def test_write_metadata_document(
    dummy_timestamp: datetime,
    metadata: DataDocMetadata,
5 changes: 0 additions & 5 deletions tests/conftest.py
@@ -19,7 +19,6 @@
from .utils import TEST_BUCKET_PARQUET_FILEPATH
from .utils import TEST_EXISTING_METADATA_DIRECTORY
from .utils import TEST_EXISTING_METADATA_FILE_NAME
from .utils import TEST_EXISTING_METADATA_WITH_VALID_ID_DIRECTORY
from .utils import TEST_PARQUET_FILEPATH
from .utils import TEST_RESOURCES_DIRECTORY
from .utils import TEST_RESOURCES_METADATA_DOCUMENT
@@ -67,10 +66,6 @@ def existing_metadata_file(existing_metadata_path: Path) -> str:


@pytest.fixture()
@pytest.mark.parametrize(
    "existing_metadata_file",
    [TEST_EXISTING_METADATA_WITH_VALID_ID_DIRECTORY],
)
def existing_metadata_with_valid_id_file(existing_metadata_file: Path) -> Path:
    return existing_metadata_file

