From 80788df90cebfab2e7e5b838250f198465ee69a9 Mon Sep 17 00:00:00 2001
From: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com>
Date: Thu, 1 Feb 2024 09:42:07 +0100
Subject: [PATCH] Move extract methods to DaplaDatasetPathInfo (#143)

---
 .../backend/dapla_dataset_path_info.py        | 61 +++++++++++++++++++
 src/datadoc/backend/datadoc_metadata.py       | 53 ++--------------
 tests/backend/test_dapla_dataset_path_info.py | 59 ++++++++++++++++++
 tests/backend/test_datadoc_metadata.py        | 53 ----------------
 tests/conftest.py                             |  5 --
 5 files changed, 124 insertions(+), 107 deletions(-)

diff --git a/src/datadoc/backend/dapla_dataset_path_info.py b/src/datadoc/backend/dapla_dataset_path_info.py
index e0199467..34293e30 100644
--- a/src/datadoc/backend/dapla_dataset_path_info.py
+++ b/src/datadoc/backend/dapla_dataset_path_info.py
@@ -5,10 +5,14 @@
 import re
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
+from typing import Final
 from typing import Literal
 
 import arrow
 
+from datadoc.enums import DatasetState
+from datadoc.enums import SupportedLanguages
+
 if TYPE_CHECKING:
     import datetime
     import os
@@ -368,3 +372,60 @@ def contains_data_until(self) -> datetime.date | None:
             .ceil(date_format.timeframe)
             .date()
         )
+
+    @property
+    def dataset_state(
+        self,
+    ) -> DatasetState | None:
+        """Extract the dataset state from the path.
+
+        Examples:
+        >>> DaplaDatasetPathInfo('klargjorte_data/person_data_v1.parquet').dataset_state
+
+
+        >>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state
+
+
+        >>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state
+        None
+        """
+        dataset_path_parts = set(self.dataset_path.parts)
+        for s in DatasetState:
+            # We assume that files are saved in the Norwegian language as specified by SSB.
+            norwegian_dataset_state_path_part = s.get_value_for_language(
+                SupportedLanguages.NORSK_BOKMÅL,
+            ).lower()
+            norwegian_dataset_state_path_part_variations = {
+                norwegian_dataset_state_path_part.replace(" ", x) for x in ["-", "_"]
+            }
+            # Match on any of the variations anywhere in the path.
+            if norwegian_dataset_state_path_part_variations.intersection(
+                dataset_path_parts,
+            ):
+                return s
+
+        return None
+
+    @property
+    def dataset_version(
+        self,
+    ) -> str | None:
+        """Extract version information if it exists in the filename.
+
+        Examples:
+        >>> DaplaDatasetPathInfo('person_data_v1.parquet').dataset_version
+        '1'
+        >>> DaplaDatasetPathInfo('person_data_v20.parquet').dataset_version
+        '20'
+        >>> DaplaDatasetPathInfo('person_data.parquet').dataset_version
+        None
+        """
+        minimum_elements_in_file_name: Final[int] = 2
+        minimum_characters_in_version_string: Final[int] = 2
+        if len(self.dataset_name_sections) >= minimum_elements_in_file_name:
+            last_filename_element = str(self.dataset_name_sections[-1])
+            if (
+                len(last_filename_element) >= minimum_characters_in_version_string
+                and last_filename_element[0:1] == "v"
+                and last_filename_element[1:].isdigit()
+            ):
+                return last_filename_element[1:]
+        return None
diff --git a/src/datadoc/backend/datadoc_metadata.py b/src/datadoc/backend/datadoc_metadata.py
index fd627101..bfe23777 100644
--- a/src/datadoc/backend/datadoc_metadata.py
+++ b/src/datadoc/backend/datadoc_metadata.py
@@ -4,7 +4,6 @@
 import json
 import logging
 import pathlib
-import typing as t
 import uuid
 from typing import TYPE_CHECKING
 
@@ -16,7 +15,6 @@
 from datadoc.backend.model_backwards_compatibility import upgrade_metadata
 from datadoc.backend.storage_adapter import StorageAdapter
 from datadoc.enums import DatasetState
-from datadoc.enums import SupportedLanguages
 from datadoc.enums import VariableRole
 from datadoc.frontend.fields import display_dataset
 from datadoc.frontend.fields import display_variables
@@ -65,6 +63,7 @@ def __init__(
             self.extract_metadata_from_existing_document(self.metadata_document)
 
         elif dataset_path:
+            # This is the most common use case.
            self.dataset = pathlib.Path(dataset_path)
             # The short_name is set as the dataset filename without file extension
             self.short_name = pathlib.Path(
@@ -76,7 +75,6 @@
             self.metadata_document.joinpath(
                 self.short_name + METADATA_DOCUMENT_FILE_SUFFIX,
             )
-            self.dataset_state = self.get_dataset_state(self.dataset)
 
             self.extract_metadata_from_files()
 
@@ -88,48 +86,6 @@ def __init__(
             self.current_user,
         )
 
-    def get_dataset_state(
-        self,
-        dataset: pathlib.Path,
-    ) -> DatasetState | None:
-        """Use the path to attempt to guess the state of the dataset."""
-        dataset_path_parts = set(dataset.parts)
-        for state in DatasetState:
-            # We assume that files are saved in the Norwegian language as specified by SSB.
-            norwegian_dataset_state_path_part = state.get_value_for_language(
-                SupportedLanguages.NORSK_BOKMÅL,
-            ).lower()
-            norwegian_dataset_state_path_part_variations = {
-                norwegian_dataset_state_path_part.replace(" ", x) for x in ["-", "_"]
-            }
-            # Match on any of the variations anywhere in the path.
-            if norwegian_dataset_state_path_part_variations.intersection(
-                dataset_path_parts,
-            ):
-                return state
-        return None
-
-    @staticmethod
-    def get_dataset_version(
-        dataset_stem: str,
-    ) -> str | None:
-        """Find version information if exists in filename.
-
-        eg. 'v1' in filename 'person_data_v1.parquet'
-        """
-        minimum_elements_in_file_name: t.Final[int] = 2
-        minimum_characters_in_version_string: t.Final[int] = 2
-        split_file_name = str(dataset_stem).split("_")
-        if len(split_file_name) >= minimum_elements_in_file_name:
-            last_filename_element = str(split_file_name[-1])
-            if (
-                len(last_filename_element) >= minimum_characters_in_version_string
-                and last_filename_element[0:1] == "v"
-                and last_filename_element[1:].isdigit()
-            ):
-                return last_filename_element[1:]
-        return None
-
     def extract_metadata_from_files(self) -> None:
         """Read metadata from an existing metadata document.
 
@@ -139,7 +95,7 @@ def extract_metadata_from_files(self) -> None:
         if self.metadata_document is not None and self.metadata_document.exists():
             self.extract_metadata_from_existing_document(self.metadata_document)
         elif self.dataset is not None:
-            self.extract_metadata_from_dataset(self.dataset, self.short_name or "")
+            self.extract_metadata_from_dataset(self.dataset)
 
         self.meta.dataset.id = uuid.uuid4()
 
@@ -205,7 +161,6 @@ def is_metadata_in_container_structure(
     def extract_metadata_from_dataset(
         self,
         dataset: pathlib.Path,
-        short_name: str,
     ) -> None:
         """Obtain what metadata we can from the dataset itself.
 
@@ -218,8 +173,8 @@
 
         self.meta.dataset = model.Dataset(
             short_name=self.short_name,
-            dataset_state=self.dataset_state,
-            version=self.get_dataset_version(short_name),
+            dataset_state=dapla_dataset_path_info.dataset_state,
+            version=dapla_dataset_path_info.dataset_version,
             contains_data_from=str(dapla_dataset_path_info.contains_data_from),
             contains_data_until=str(dapla_dataset_path_info.contains_data_until),
             data_source_path=self.dataset,
diff --git a/tests/backend/test_dapla_dataset_path_info.py b/tests/backend/test_dapla_dataset_path_info.py
index d74b7040..8ed12e79 100644
--- a/tests/backend/test_dapla_dataset_path_info.py
+++ b/tests/backend/test_dapla_dataset_path_info.py
@@ -1,9 +1,15 @@
+from __future__ import annotations
+
+import copy
 import datetime
+import pathlib
 from dataclasses import dataclass
+from pathlib import PurePath
 
 import pytest
 
 from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo
+from datadoc.enums import DatasetState
 from tests.utils import TEST_PARQUET_FILEPATH
 
 
@@ -134,6 +140,59 @@ def test_extract_period_info_no_period_info_in_path(data: str):
     assert DaplaDatasetPathInfo(data).contains_data_from is None
 
 
+@pytest.fixture()
+def full_dataset_state_path(
+    dataset_state_path: str,
+) -> pathlib.PurePath:
+    """Create a longer path structure from just one section.
+
+    Examples:
+    >>> full_dataset_state_path('inndata')
+    'tests/inndata/resources/person_data_v1.parquet'
+    """
+    split_path = list(PurePath(TEST_PARQUET_FILEPATH).parts)
+    new_path = copy.copy(split_path)
+    new_path.insert(-2, dataset_state_path)
+    return PurePath().joinpath(*new_path)
+
+
+@pytest.mark.parametrize(
+    ("dataset_state_path", "expected_result"),
+    [
+        ("kildedata", DatasetState.SOURCE_DATA),
+        ("inndata", DatasetState.INPUT_DATA),
+        ("roskildedata/klargjorte-data", DatasetState.PROCESSED_DATA),
+        ("klargjorte_data", DatasetState.PROCESSED_DATA),
+        ("klargjorte-data", DatasetState.PROCESSED_DATA),
+        ("statistikk", DatasetState.STATISTICS),
+        ("", None),
+    ],
+)
+def test_get_dataset_state(
+    full_dataset_state_path: pathlib.Path,
+    expected_result: DatasetState,
+):
+    actual_state = DaplaDatasetPathInfo(full_dataset_state_path).dataset_state
+    assert actual_state == expected_result
+
+
+@pytest.mark.parametrize(
+    ("path", "expected"),
+    [
+        ("person_data_v1", "1"),
+        ("person_data_v2", "2"),
+        ("person_data_vwrong", None),
+        ("person_data", None),
+        ("person_testdata_p2021-12-31_p2021-12-31_v20", "20"),
+    ],
+)
+def test_get_dataset_version(
+    path: str,
+    expected: str | None,
+):
+    assert DaplaDatasetPathInfo(path).dataset_version == expected
+
+
 # These tests covers both date until after date from, mix of SSB keys and invalid SSB keys
 @pytest.mark.parametrize(
     "dataset_path_name",
diff --git a/tests/backend/test_datadoc_metadata.py b/tests/backend/test_datadoc_metadata.py
index b252c5b3..d14c5cf0 100644
--- a/tests/backend/test_datadoc_metadata.py
+++ b/tests/backend/test_datadoc_metadata.py
@@ -2,10 +2,7 @@
 from __future__ import annotations
 
 import json
-import pathlib
-from copy import copy
 from pathlib import Path
-from pathlib import PurePath
 from typing import TYPE_CHECKING
 from uuid import UUID
 
@@ -21,7 +18,6 @@
 from datadoc.enums import VariableRole
 from tests.utils import TEST_EXISTING_METADATA_DIRECTORY
 from tests.utils import TEST_EXISTING_METADATA_FILE_NAME
-from tests.utils import TEST_PARQUET_FILEPATH
 from tests.utils import TEST_RESOURCES_DIRECTORY
 from tests.utils import TEST_RESOURCES_METADATA_DOCUMENT
 
@@ -29,39 +25,6 @@ if TYPE_CHECKING:
     from datetime import datetime
 
 
-def make_paths() -> list[tuple[str, DatasetState | None]]:
-    split_path = list(PurePath(TEST_PARQUET_FILEPATH).parts)
-    initial_data = [
-        ("kildedata", DatasetState.SOURCE_DATA),
-        ("inndata", DatasetState.INPUT_DATA),
-        ("roskildedata/klargjorte-data", DatasetState.PROCESSED_DATA),
-        ("klargjorte_data", DatasetState.PROCESSED_DATA),
-        ("klargjorte-data", DatasetState.PROCESSED_DATA),
-        ("statistikk", DatasetState.STATISTICS),
-        ("", None),
-    ]
-    test_data = []
-
-    # Construct paths with each of the potential options in them
-    for to_insert, state in initial_data:
-        new_path = copy(split_path)
-        new_path.insert(-2, to_insert)
-        joined_path = PurePath().joinpath(*new_path)
-        test_data.append((str(joined_path), state))
-
-    return test_data
-
-
-@pytest.mark.parametrize(("path", "expected_result"), make_paths())
-def test_get_dataset_state(
-    path: str,
-    expected_result: DatasetState,
-    metadata: DataDocMetadata,
-):
-    actual_state = metadata.get_dataset_state(pathlib.Path(path))
-    assert actual_state == expected_result
-
-
 @pytest.mark.usefixtures("existing_metadata_file")
 def test_existing_metadata_file(
     metadata: DataDocMetadata,
@@ -83,22 +46,6 @@ def test_metadata_document_percent_complete(metadata: DataDocMetadata):
     assert metadata.percent_complete == 17  # noqa: PLR2004
 
 
-@pytest.mark.parametrize(
("short_name", "expected"), - [ - ("person_data_v1", "1"), - ("person_data_v2", "2"), - ("person_data", None), - ("person_testdata_p2021-12-31_p2021-12-31_v20", "20"), - ], -) -def test_get_dataset_version( - short_name: str, - expected: str | None, -): - assert DataDocMetadata.get_dataset_version(short_name) == expected - - def test_write_metadata_document( dummy_timestamp: datetime, metadata: DataDocMetadata, diff --git a/tests/conftest.py b/tests/conftest.py index ce798203..83477560 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,7 +19,6 @@ from .utils import TEST_BUCKET_PARQUET_FILEPATH from .utils import TEST_EXISTING_METADATA_DIRECTORY from .utils import TEST_EXISTING_METADATA_FILE_NAME -from .utils import TEST_EXISTING_METADATA_WITH_VALID_ID_DIRECTORY from .utils import TEST_PARQUET_FILEPATH from .utils import TEST_RESOURCES_DIRECTORY from .utils import TEST_RESOURCES_METADATA_DOCUMENT @@ -67,10 +66,6 @@ def existing_metadata_file(existing_metadata_path: Path) -> str: @pytest.fixture() -@pytest.mark.parametrize( - "existing_metadata_file", - [TEST_EXISTING_METADATA_WITH_VALID_ID_DIRECTORY], -) def existing_metadata_with_valid_id_file(existing_metadata_file: Path) -> Path: return existing_metadata_file