From 736521e69061500195a1738478249ccc2eab1527 Mon Sep 17 00:00:00 2001 From: Jan Sander <63044278+JanhSander@users.noreply.github.com> Date: Thu, 7 Mar 2024 11:13:49 +0100 Subject: [PATCH 1/7] Fixed unittests --- poetry.lock | 16 ++--- pyproject.toml | 2 +- src/datadoc/app.py | 8 ++- .../backend/{unit_types.py => code_list.py} | 58 ++++++++-------- .../external_sources/external_sources.py | 1 - src/datadoc/config.py | 5 ++ .../frontend/fields/display_dataset.py | 20 +++++- src/datadoc/state.py | 6 +- .../{test_unit_types.py => test_code_list.py} | 66 +++++++++---------- tests/backend/test_datadoc_metadata.py | 3 +- tests/conftest.py | 40 +++++------ .../callbacks/test_dataset_callbacks.py | 12 ++-- tests/frontend/fields/test_display_dataset.py | 4 +- .../code_list.csv} | 0 .../code_list_en.csv} | 0 .../code_list_nb.csv} | 0 .../code_list_nn.csv} | 0 .../{unit_types => code_list}/empty.csv | 0 .../{unit_types => code_list}/no_code.csv | 0 ...tdata_p2021-12-31_p2021-12-31_v1__DOC.json | 6 +- tests/test_smoke.py | 4 +- 21 files changed, 132 insertions(+), 119 deletions(-) rename src/datadoc/backend/{unit_types.py => code_list.py} (81%) rename tests/backend/{test_unit_types.py => test_code_list.py} (60%) rename tests/resources/{unit_types/unit_types.csv => code_list/code_list.csv} (100%) rename tests/resources/{unit_types/unit_types_en.csv => code_list/code_list_en.csv} (100%) rename tests/resources/{unit_types/unit_types_nb.csv => code_list/code_list_nb.csv} (100%) rename tests/resources/{unit_types/unit_types_nn.csv => code_list/code_list_nn.csv} (100%) rename tests/resources/{unit_types => code_list}/empty.csv (100%) rename tests/resources/{unit_types => code_list}/no_code.csv (100%) diff --git a/poetry.lock b/poetry.lock index 2e1776bc..b8bb15b0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1226,12 +1226,12 @@ files = [ google-auth = ">=2.14.1,<3.0.dev0" googleapis-common-protos = ">=1.56.2,<2.0.dev0" grpcio = [ - {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, + {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, ] grpcio-status = [ - {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, + {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, ] protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0.dev0" requests = ">=2.18.0,<3.0.0.dev0" @@ -1318,8 +1318,8 @@ grpc-google-iam-v1 = ">=0.12.4,<1.0.0dev" grpcio = ">=1.51.3,<2.0dev" grpcio-status = ">=1.33.2" proto-plus = [ - {version = ">=1.22.2,<2.0.0dev", markers = "python_version >= \"3.11\""}, {version = ">=1.22.0,<2.0.0dev", markers = "python_version < \"3.11\""}, + {version = ">=1.22.2,<2.0.0dev", markers = "python_version >= \"3.11\""}, ] protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" @@ -2595,9 +2595,9 @@ files = [ [package.dependencies] lxml = {version = ">=4.9.2", optional = true, markers = "extra == \"xml\""} numpy = [ + {version = ">=1.22.4,<2", markers = "python_version < \"3.11\""}, {version = ">=1.23.2,<2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""}, - {version = ">=1.22.4,<2", markers = "python_version < \"3.11\""}, ] odfpy = {version = ">=1.4.1", optional = true, markers = "extra == \"excel\""} openpyxl = {version = ">=3.1.0", optional = true, markers = "extra == \"excel\""} @@ -4139,13 +4139,13 @@ dash = ">=2.15.0" [[package]] name = "ssb-datadoc-model" -version = "4.2.0" +version = "4.3.1" description = "Data Model for use in Statistics Norway's Metadata system" optional = false python-versions = ">=3.10" files = [ - {file = "ssb_datadoc_model-4.2.0-py3-none-any.whl", hash = "sha256:402157eafd855387cc9015d07e6912f8e882a753afcc2165cc753b7ce158cd71"}, - {file = "ssb_datadoc_model-4.2.0.tar.gz", hash = "sha256:474204c21b3acae0871ac457f51a0e75c15a3806d5d382a5e0ecce1082b9d04c"}, + {file = "ssb_datadoc_model-4.3.1-py3-none-any.whl", hash = "sha256:a1400bff45c27237acb87156d5cd695d0e12dd33b7c04746e336516dbfe00e61"}, + {file = "ssb_datadoc_model-4.3.1.tar.gz", hash = "sha256:f6836dfe26a331229b317155a5eecd5e0479799097ffba25f136bb3947ab1771"}, ] [package.dependencies] @@ -4675,4 +4675,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "76358066a497e342d515bd98d8ace7cd022b03e569ef3e5902857ad05e82c39c" +content-hash = "7f1449637ef52c69a2daaf14c59d601ea850af5032c97e93766ca6ff0a2f4082" diff --git a/pyproject.toml b/pyproject.toml index 95e4011f..88036730 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dash = ">=2.15.0" pydantic = "==2.5.2" dash-bootstrap-components = ">=1.1.0" pandas = ">=1.4.2" -ssb-datadoc-model = "==4.2.0" +ssb-datadoc-model = "==4.3.1" dapla-toolbelt = ">=1.3.3" gunicorn = ">=21.2.0" flask-healthz = ">=0.0.3" diff --git a/src/datadoc/app.py b/src/datadoc/app.py index 4fc0ae10..69a6cb1b 100644 --- a/src/datadoc/app.py +++ b/src/datadoc/app.py @@ -14,9 +14,9 @@ from datadoc import config from datadoc import state +from datadoc.backend.code_list import CodeList from datadoc.backend.datadoc_metadata import DataDocMetadata from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping -from datadoc.backend.unit_types import UnitTypes from datadoc.enums import SupportedLanguages from datadoc.frontend.callbacks.register_callbacks import register_callbacks from datadoc.frontend.callbacks.register_callbacks import ( @@ -139,10 +139,14 @@ def collect_data_from_external_sources() -> None: config.get_statistical_subject_source_url(), ) - state.unit_types = UnitTypes( + state.unit_types = CodeList( config.get_unit_code(), ) + state.organisational_units = CodeList( + config.get_organisational_unit_code(), + ) + def main(dataset_path: str | None = None) -> None: """Entrypoint when running as a script.""" diff --git a/src/datadoc/backend/unit_types.py b/src/datadoc/backend/code_list.py similarity index 81% rename from src/datadoc/backend/unit_types.py rename to src/datadoc/backend/code_list.py index 2249111f..44c8bf85 100644 --- a/src/datadoc/backend/unit_types.py +++ b/src/datadoc/backend/code_list.py @@ -16,8 +16,8 @@ @dataclass -class UnitType: - """Data structure for the a unit type.""" +class CodeListItem: + """Data structure for the a code list item.""" titles: dict[str, str] unit_code: str @@ -25,28 +25,31 @@ class UnitType: def get_title(self, language: SupportedLanguages) -> str: """Get the title in the given language.""" try: - return self.titles[ - ( - # Adjust to language codes in the UnitTypes structure. - "nb" - if language - in [ - SupportedLanguages.NORSK_BOKMÅL, - SupportedLanguages.NORSK_NYNORSK, - ] - else "en" - ) - ] + return self.titles[language] except KeyError: - logger.exception( - "Could not find title for subject %s and language: %s", - self, - language.name, - ) - return "" + try: + return self.titles[ + ( + # Adjust to language codes in the unit_type codelist. + "nb" + if language + in [ + SupportedLanguages.NORSK_BOKMÅL, + SupportedLanguages.NORSK_NYNORSK, + ] + else "en" + ) + ] + except KeyError: + logger.exception( + "Could not find title for subject %s and language: %s", + self, + language.name, + ) + return "" -class UnitTypes(GetExternalSource): +class CodeList(GetExternalSource): """Class for retrieving classifications from Klass.""" def __init__(self, classification_id: int | None) -> None: @@ -58,13 +61,9 @@ def __init__(self, classification_id: int | None) -> None: SupportedLanguages.NORSK_BOKMÅL.value, SupportedLanguages.ENGLISH.value, ] - - self._classifications: list[UnitType] = [] - + self._classifications: list[CodeListItem] = [] self.classification_id = classification_id - self.classifications_dataframes: dict[str, pd.DataFrame] | None = None - super().__init__() def _fetch_data_from_external_source( @@ -85,7 +84,6 @@ def _fetch_data_from_external_source( .get_codes() .data ) - except Exception: logger.exception( "Exception while getting classifications from Klass", @@ -113,7 +111,7 @@ def _extract_titles( def _create_unit_types_from_dataframe( self, classifications_dataframes: dict[SupportedLanguages, pd.DataFrame], - ) -> list[UnitType]: + ) -> list[CodeListItem]: """Method that finds the name column in the dataframe, and returns all values in a list.""" classification_names = self._extract_titles(classifications_dataframes) classification_codes: list @@ -128,7 +126,7 @@ def _create_unit_types_from_dataframe( unit_types = [] for a, b in zip(classification_names, classification_codes): unit_types.append( - UnitType(a, b), + CodeListItem(a, b), ) return unit_types @@ -151,7 +149,7 @@ def _get_classification_dataframe_if_loaded(self) -> bool: return False @property - def classifications(self) -> list[UnitType]: + def classifications(self) -> list[CodeListItem]: """Getter for primary subjects.""" self._get_classification_dataframe_if_loaded() logger.debug("Got %s classifications subjects", len(self._classifications)) diff --git a/src/datadoc/backend/external_sources/external_sources.py b/src/datadoc/backend/external_sources/external_sources.py index c6bdd5db..2c1c7525 100644 --- a/src/datadoc/backend/external_sources/external_sources.py +++ b/src/datadoc/backend/external_sources/external_sources.py @@ -21,7 +21,6 @@ def __init__(self) -> None: Initializes the future object. """ self.future: concurrent.futures.Future[T | None] | None = None - executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) self.future = executor.submit( self._fetch_data_from_external_source, diff --git a/src/datadoc/config.py b/src/datadoc/config.py index 1b72b009..a7b28ad2 100644 --- a/src/datadoc/config.py +++ b/src/datadoc/config.py @@ -133,3 +133,8 @@ def get_oidc_token() -> str | None: def get_unit_code() -> int | None: """The code for the Unit Type code list in Klass.""" return int(_get_config_item("DATADOC_UNIT_CODE") or 702) + + +def get_organisational_unit_code() -> int | None: + """The code for the organisational units code list in Klass.""" + return int(_get_config_item("DATADOC_ORGANISATIONAL_UNIT_CODE") or 83) diff --git a/src/datadoc/frontend/fields/display_dataset.py b/src/datadoc/frontend/fields/display_dataset.py index 3c49a3e3..c30df914 100644 --- a/src/datadoc/frontend/fields/display_dataset.py +++ b/src/datadoc/frontend/fields/display_dataset.py @@ -66,6 +66,19 @@ def get_unit_type_options( ] +def get_owner_options( + language: SupportedLanguages, +) -> list[dict[str, str]]: + """Collect the owner options for the given language.""" + return [ + { + "label": f"{option.unit_code} - {option.get_title(language)}", + "value": option.unit_code, + } + for option in state.organisational_units.classifications + ] + + class DatasetIdentifiers(str, Enum): """As defined here: https://statistics-norway.atlassian.net/l/c/aoSfEWJU.""" @@ -225,13 +238,14 @@ class DatasetIdentifiers(str, Enum): editable=False, value_getter=get_metadata_and_stringify, ), - DatasetIdentifiers.OWNER: DisplayDatasetMetadata( + DatasetIdentifiers.OWNER: DisplayDatasetMetadataDropdown( identifier=DatasetIdentifiers.OWNER.value, display_name="Eier", description="Maskingenerert seksjonstilhørighet til den som oppretter metadata om datasettet, men kan korrigeres manuelt", obligatory=True, - editable=False, - multiple_language_support=True, + editable=True, + multiple_language_support=False, + options_getter=get_owner_options, ), DatasetIdentifiers.FILE_PATH: DisplayDatasetMetadata( identifier=DatasetIdentifiers.FILE_PATH.value, diff --git a/src/datadoc/state.py b/src/datadoc/state.py index 2219bb33..0e43c6c0 100644 --- a/src/datadoc/state.py +++ b/src/datadoc/state.py @@ -13,9 +13,9 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + from datadoc.backend.code_list import CodeList from datadoc.backend.datadoc_metadata import DataDocMetadata from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping - from datadoc.backend.unit_types import UnitTypes from datadoc.enums import SupportedLanguages @@ -26,4 +26,6 @@ statistic_subject_mapping: StatisticSubjectMapping -unit_types: UnitTypes +unit_types: CodeList + +organisational_units: CodeList diff --git a/tests/backend/test_unit_types.py b/tests/backend/test_code_list.py similarity index 60% rename from tests/backend/test_unit_types.py rename to tests/backend/test_code_list.py index 85998334..fa49add5 100644 --- a/tests/backend/test_unit_types.py +++ b/tests/backend/test_code_list.py @@ -1,26 +1,26 @@ import pytest -from datadoc.backend.unit_types import UnitType -from datadoc.backend.unit_types import UnitTypes +from datadoc.backend.code_list import CodeList +from datadoc.backend.code_list import CodeListItem from tests.utils import TEST_RESOURCES_DIRECTORY -UNIT_TYPES_DIR = "unit_types" +CODE_LIST_DIR = "code_list" @pytest.mark.parametrize( ( - "unit_types_csv_filepath_nb", - "unit_types_csv_filepath_nn", - "unit_types_csv_filepath_en", + "code_list_csv_filepath_nb", + "code_list_csv_filepath_nn", + "code_list_csv_filepath_en", "expected", ), [ ( - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "unit_types_nb.csv", - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "unit_types_nn.csv", - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "unit_types_en.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nb.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nn.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_en.csv", [ - UnitType( + CodeListItem( titles={ "nb": "Adresse", "nn": "Adresse", @@ -28,7 +28,7 @@ }, unit_code="01", ), - UnitType( + CodeListItem( titles={ "nb": "Arbeidsulykke", "nn": "Arbeidsulykke", @@ -36,7 +36,7 @@ }, unit_code="02", ), - UnitType( + CodeListItem( titles={ "nb": "Bolig", "nn": "Bolig", @@ -47,17 +47,17 @@ ], ), ( - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "empty.csv", - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "empty.csv", - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "empty.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", [], ), ( - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "unit_types_nb.csv", - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "empty.csv", - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "empty.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nb.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", [ - UnitType( + CodeListItem( titles={ "nb": "Adresse", "nn": None, @@ -65,7 +65,7 @@ }, unit_code="01", ), - UnitType( + CodeListItem( titles={ "nb": "Arbeidsulykke", "nn": None, @@ -73,7 +73,7 @@ }, unit_code="02", ), - UnitType( + CodeListItem( titles={ "nb": "Bolig", "nn": None, @@ -84,11 +84,11 @@ ], ), ( - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "no_code.csv", - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "no_code.csv", - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "no_code.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "no_code.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "no_code.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "no_code.csv", [ - UnitType( + CodeListItem( titles={ "nb": "Adresse", "nn": "Adresse", @@ -96,7 +96,7 @@ }, unit_code=None, ), - UnitType( + CodeListItem( titles={ "nb": "Arbeidsulykke", "nn": "Arbeidsulykke", @@ -104,7 +104,7 @@ }, unit_code=None, ), - UnitType( + CodeListItem( titles={ "nb": "Bolig", "nn": "Bolig", @@ -118,14 +118,14 @@ ) @pytest.mark.usefixtures("_mock_fetch_dataframe") def test_read_dataframe( - unit_types_fake_structure: UnitTypes, + code_list_fake_structure: CodeList, expected: list[str], ): - unit_types_fake_structure.wait_for_external_result() - assert unit_types_fake_structure.classifications == expected + code_list_fake_structure.wait_for_external_result() + assert code_list_fake_structure.classifications == expected def test_non_existent_code(): - unit_types = UnitTypes(0) - unit_types.wait_for_external_result() - assert unit_types.classifications == [] + code_list = CodeList(0) + code_list.wait_for_external_result() + assert code_list.classifications == [] diff --git a/tests/backend/test_datadoc_metadata.py b/tests/backend/test_datadoc_metadata.py index 65c7fef5..a14db8eb 100644 --- a/tests/backend/test_datadoc_metadata.py +++ b/tests/backend/test_datadoc_metadata.py @@ -77,7 +77,7 @@ def test_metadata_document_percent_complete(metadata: DataDocMetadata): metadata.dataset = document.dataset # type: ignore [assignment] metadata.variables = document.variables # type: ignore [assignment] - assert metadata.percent_complete == 17 # noqa: PLR2004 + assert metadata.percent_complete == 16 # noqa: PLR2004 def test_write_metadata_document( @@ -279,7 +279,6 @@ def test_dataset_status_default_value( subject_mapping_fake_statistical_structure, str(dataset_path), ) - assert datadoc_metadata.dataset.dataset_status == expected_type diff --git a/tests/conftest.py b/tests/conftest.py index c2e288d9..d4ba6243 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,9 +19,9 @@ from datadoc_model import model from datadoc import state +from datadoc.backend.code_list import CodeList from datadoc.backend.datadoc_metadata import DataDocMetadata from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping -from datadoc.backend.unit_types import UnitTypes from datadoc.backend.user_info import TestUserInfo from tests.backend.test_statistic_subject_mapping import ( STATISTICAL_SUBJECT_STRUCTURE_DIR, @@ -33,7 +33,7 @@ from .utils import TEST_PARQUET_FILEPATH from .utils import TEST_RESOURCES_DIRECTORY -UNIT_TYPES_DIR = "unit_types" +CODE_LIST_DIR = "code_list" if TYPE_CHECKING: from pytest_mock import MockerFixture @@ -204,45 +204,45 @@ def subject_mapping_http_exception( @pytest.fixture() -def unit_types_csv_filepath_nb() -> pathlib.Path: - return TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "unit_types_nb.csv" +def code_list_csv_filepath_nb() -> pathlib.Path: + return TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nb.csv" @pytest.fixture() -def unit_types_csv_filepath_nn() -> pathlib.Path: - return TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "unit_types_nn.csv" +def code_list_csv_filepath_nn() -> pathlib.Path: + return TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nn.csv" @pytest.fixture() -def unit_types_csv_filepath_en() -> pathlib.Path: - return TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "unit_types_en.csv" +def code_list_csv_filepath_en() -> pathlib.Path: + return TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_en.csv" @pytest.fixture() def _mock_fetch_dataframe( mocker, - unit_types_csv_filepath_nb: pathlib.Path, - unit_types_csv_filepath_nn: pathlib.Path, - unit_types_csv_filepath_en: pathlib.Path, + code_list_csv_filepath_nb: pathlib.Path, + code_list_csv_filepath_nn: pathlib.Path, + code_list_csv_filepath_en: pathlib.Path, ) -> None: - def fake_unit_types() -> dict[str, pd.DataFrame]: + def fake_code_list() -> dict[str, pd.DataFrame]: return { - "nb": pd.read_csv(unit_types_csv_filepath_nb, converters={"code": str}), - "nn": pd.read_csv(unit_types_csv_filepath_nn, converters={"code": str}), - "en": pd.read_csv(unit_types_csv_filepath_en, converters={"code": str}), + "nb": pd.read_csv(code_list_csv_filepath_nb, converters={"code": str}), + "nn": pd.read_csv(code_list_csv_filepath_nn, converters={"code": str}), + "en": pd.read_csv(code_list_csv_filepath_en, converters={"code": str}), } mocker.patch( - "datadoc.backend.unit_types.UnitTypes._fetch_data_from_external_source", - functools.partial(fake_unit_types), + "datadoc.backend.code_list.CodeList._fetch_data_from_external_source", + functools.partial(fake_code_list), ) @pytest.fixture() -def unit_types_fake_structure( +def code_list_fake_structure( _mock_fetch_dataframe, -) -> UnitTypes: - return UnitTypes(100) +) -> CodeList: + return CodeList(100) @pytest.fixture() diff --git a/tests/frontend/callbacks/test_dataset_callbacks.py b/tests/frontend/callbacks/test_dataset_callbacks.py index 562beec4..93c6fb05 100644 --- a/tests/frontend/callbacks/test_dataset_callbacks.py +++ b/tests/frontend/callbacks/test_dataset_callbacks.py @@ -27,8 +27,8 @@ from tests.utils import TEST_PARQUET_FILEPATH if TYPE_CHECKING: + from datadoc.backend.code_list import CodeList from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping - from datadoc.backend.unit_types import UnitTypes from datadoc.frontend.callbacks.utils import MetadataInputTypes DATASET_CALLBACKS_MODULE = "datadoc.frontend.callbacks.dataset" @@ -119,11 +119,6 @@ def file_path(): "2f72477a-f051-43ee-bf8b-0d8f47b5e0a7", UUID("2f72477a-f051-43ee-bf8b-0d8f47b5e0a7"), ), - ( - DatasetIdentifiers.OWNER, - "Seksjon for dataplattform", - enums.LanguageStringType(nb="Seksjon for dataplattform"), - ), ], ) def test_accept_dataset_metadata_input_valid_data( @@ -243,14 +238,15 @@ def test_update_dataset_metadata_language_enums(): @pytest.mark.parametrize("language", list(SupportedLanguages)) def test_change_language_dataset_metadata_options_enums( subject_mapping_fake_statistical_structure: StatisticSubjectMapping, - unit_types_fake_structure: UnitTypes, + code_list_fake_structure: CodeList, metadata: DataDocMetadata, enum_for_options: LanguageStringsEnum, language: SupportedLanguages, ): state.metadata = metadata state.statistic_subject_mapping = subject_mapping_fake_statistical_structure - state.unit_types = unit_types_fake_structure + state.unit_types = code_list_fake_structure + state.organisational_units = code_list_fake_structure value = change_language_dataset_metadata(language) for options in cast(list[list[dict[str, str]]], value[0:-1]): diff --git a/tests/frontend/fields/test_display_dataset.py b/tests/frontend/fields/test_display_dataset.py index d11c7d86..8550cf59 100644 --- a/tests/frontend/fields/test_display_dataset.py +++ b/tests/frontend/fields/test_display_dataset.py @@ -4,10 +4,10 @@ from datadoc.enums import SupportedLanguages from datadoc.frontend.fields.display_dataset import get_statistical_subject_options from datadoc.frontend.fields.display_dataset import get_unit_type_options +from tests.backend.test_code_list import CODE_LIST_DIR from tests.backend.test_statistic_subject_mapping import ( STATISTICAL_SUBJECT_STRUCTURE_DIR, ) -from tests.backend.test_unit_types import UNIT_TYPES_DIR from tests.utils import TEST_RESOURCES_DIRECTORY @@ -51,7 +51,7 @@ def test_get_statistical_subject_options( ("unit_types_csv_filepath_nb", "expected"), [ ( - TEST_RESOURCES_DIRECTORY / UNIT_TYPES_DIR / "unit_types_nb.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "unit_types_nb.csv", [ {"label": "Adresse", "value": "01"}, {"label": "Arbeidsulykke", "value": "02"}, diff --git a/tests/resources/unit_types/unit_types.csv b/tests/resources/code_list/code_list.csv similarity index 100% rename from tests/resources/unit_types/unit_types.csv rename to tests/resources/code_list/code_list.csv diff --git a/tests/resources/unit_types/unit_types_en.csv b/tests/resources/code_list/code_list_en.csv similarity index 100% rename from tests/resources/unit_types/unit_types_en.csv rename to tests/resources/code_list/code_list_en.csv diff --git a/tests/resources/unit_types/unit_types_nb.csv b/tests/resources/code_list/code_list_nb.csv similarity index 100% rename from tests/resources/unit_types/unit_types_nb.csv rename to tests/resources/code_list/code_list_nb.csv diff --git a/tests/resources/unit_types/unit_types_nn.csv b/tests/resources/code_list/code_list_nn.csv similarity index 100% rename from tests/resources/unit_types/unit_types_nn.csv rename to tests/resources/code_list/code_list_nn.csv diff --git a/tests/resources/unit_types/empty.csv b/tests/resources/code_list/empty.csv similarity index 100% rename from tests/resources/unit_types/empty.csv rename to tests/resources/code_list/empty.csv diff --git a/tests/resources/unit_types/no_code.csv b/tests/resources/code_list/no_code.csv similarity index 100% rename from tests/resources/unit_types/no_code.csv rename to tests/resources/code_list/no_code.csv diff --git a/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json b/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json index e8e4bc98..4cb53dd2 100644 --- a/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json +++ b/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json @@ -52,11 +52,7 @@ "nb": "Norge" }, "id": "2f72477a-f051-43ee-bf8b-0d8f47b5e0a7", - "owner": { - "en": "", - "nn": "", - "nb": "Seksjon NNN" - }, + "owner": "NNN", "file_path": null, "metadata_created_date": "2022-10-07T07:35:01Z", "metadata_created_by": "default_user@ssb.no", diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 91f5300d..c3a1b51d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -4,9 +4,9 @@ from datadoc.app import get_app -def test_get_app(subject_mapping_fake_statistical_structure, unit_types_fake_structure): +def test_get_app(subject_mapping_fake_statistical_structure, code_list_fake_structure): state.statistic_subject_mapping = subject_mapping_fake_statistical_structure - state.unit_types = unit_types_fake_structure + state.code_list = code_list_fake_structure app, _ = get_app() assert app.config["name"] == "Datadoc" From d17d2e35fc4c3f00f7ef273428c4978cbf042610 Mon Sep 17 00:00:00 2001 From: Jan Sander <63044278+JanhSander@users.noreply.github.com> Date: Thu, 7 Mar 2024 11:23:32 +0100 Subject: [PATCH 2/7] Rename --- tests/frontend/fields/test_display_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/frontend/fields/test_display_dataset.py b/tests/frontend/fields/test_display_dataset.py index 8550cf59..99888e3f 100644 --- a/tests/frontend/fields/test_display_dataset.py +++ b/tests/frontend/fields/test_display_dataset.py @@ -48,10 +48,10 @@ def test_get_statistical_subject_options( @pytest.mark.parametrize( - ("unit_types_csv_filepath_nb", "expected"), + ("code_list_csv_filepath_nb", "expected"), [ ( - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "unit_types_nb.csv", + TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nb.csv", [ {"label": "Adresse", "value": "01"}, {"label": "Arbeidsulykke", "value": "02"}, @@ -61,9 +61,9 @@ def test_get_statistical_subject_options( ], ) def test_get_unit_type_options( - unit_types_fake_structure, + code_list_fake_structure, expected, ): - state.unit_types = unit_types_fake_structure + state.unit_types = code_list_fake_structure state.unit_types.wait_for_external_result() assert get_unit_type_options(SupportedLanguages.NORSK_BOKMÅL) == expected From 127be43e99c5c89b1c76c47b023a38c76ee1bf6e Mon Sep 17 00:00:00 2001 From: Jan Sander <63044278+JanhSander@users.noreply.github.com> Date: Thu, 7 Mar 2024 12:40:08 +0100 Subject: [PATCH 3/7] Update src/datadoc/backend/code_list.py Co-authored-by: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com> --- src/datadoc/backend/code_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datadoc/backend/code_list.py b/src/datadoc/backend/code_list.py index 44c8bf85..896ca17b 100644 --- a/src/datadoc/backend/code_list.py +++ b/src/datadoc/backend/code_list.py @@ -30,7 +30,6 @@ def get_title(self, language: SupportedLanguages) -> str: try: return self.titles[ ( - # Adjust to language codes in the unit_type codelist. "nb" if language in [ From 42d338a5923f4eb85682191391f40d3c48a0c8e0 Mon Sep 17 00:00:00 2001 From: Jan Sander <63044278+JanhSander@users.noreply.github.com> Date: Thu, 7 Mar 2024 12:40:20 +0100 Subject: [PATCH 4/7] Update src/datadoc/backend/code_list.py Co-authored-by: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com> --- src/datadoc/backend/code_list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datadoc/backend/code_list.py b/src/datadoc/backend/code_list.py index 896ca17b..4e99a2c0 100644 --- a/src/datadoc/backend/code_list.py +++ b/src/datadoc/backend/code_list.py @@ -17,7 +17,7 @@ @dataclass class CodeListItem: - """Data structure for the a code list item.""" + """Data structure for a code list item.""" titles: dict[str, str] unit_code: str From 3cb49dff41c1ba4ed44638e4dd855b88735945ef Mon Sep 17 00:00:00 2001 From: Jan Sander <63044278+JanhSander@users.noreply.github.com> Date: Fri, 8 Mar 2024 11:30:29 +0100 Subject: [PATCH 5/7] Resolved some PR comments --- poetry.lock | 4 ++-- src/datadoc/backend/code_list.py | 6 +++--- src/datadoc/frontend/fields/display_dataset.py | 6 +++--- tests/backend/test_code_list.py | 18 +++++++++--------- .../callbacks/test_dataset_callbacks.py | 5 +++++ ...stdata_p2021-12-31_p2021-12-31_v1__DOC.json | 6 +++++- 6 files changed, 27 insertions(+), 18 deletions(-) diff --git a/poetry.lock b/poetry.lock index 3d65d202..1741bf63 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -4675,4 +4675,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "7f1449637ef52c69a2daaf14c59d601ea850af5032c97e93766ca6ff0a2f4082" +content-hash = "ed14a20f2c283622c84c08597e90f8fbbfb0484ac54480d4289e15959c3460e9" diff --git a/src/datadoc/backend/code_list.py b/src/datadoc/backend/code_list.py index 4e99a2c0..8be28699 100644 --- a/src/datadoc/backend/code_list.py +++ b/src/datadoc/backend/code_list.py @@ -20,7 +20,7 @@ class CodeListItem: """Data structure for a code list item.""" titles: dict[str, str] - unit_code: str + code: str def get_title(self, language: SupportedLanguages) -> str: """Get the title in the given language.""" @@ -107,7 +107,7 @@ def _extract_titles( list_of_titles.append(titles) return list_of_titles - def _create_unit_types_from_dataframe( + def _create_code_list_from_dataframe( self, classifications_dataframes: dict[SupportedLanguages, pd.DataFrame], ) -> list[CodeListItem]: @@ -134,7 +134,7 @@ def _get_classification_dataframe_if_loaded(self) -> bool: if not self._classifications: self.classifications_dataframes = self.retrieve_external_data() if self.classifications_dataframes is not None: - self._classifications = self._create_unit_types_from_dataframe( + self._classifications = self._create_code_list_from_dataframe( self.classifications_dataframes, ) logger.debug( diff --git a/src/datadoc/frontend/fields/display_dataset.py b/src/datadoc/frontend/fields/display_dataset.py index c30df914..89f7716e 100644 --- a/src/datadoc/frontend/fields/display_dataset.py +++ b/src/datadoc/frontend/fields/display_dataset.py @@ -60,7 +60,7 @@ def get_unit_type_options( return [ { "label": unit_type.get_title(language), - "value": unit_type.unit_code, + "value": unit_type.code, } for unit_type in state.unit_types.classifications ] @@ -72,8 +72,8 @@ def get_owner_options( """Collect the owner options for the given language.""" return [ { - "label": f"{option.unit_code} - {option.get_title(language)}", - "value": option.unit_code, + "label": f"{option.code} - {option.get_title(language)}", + "value": option.code, } for option in state.organisational_units.classifications ] diff --git a/tests/backend/test_code_list.py b/tests/backend/test_code_list.py index fa49add5..bb4d78a0 100644 --- a/tests/backend/test_code_list.py +++ b/tests/backend/test_code_list.py @@ -26,7 +26,7 @@ "nn": "Adresse", "en": "Adresse", }, - unit_code="01", + code="01", ), CodeListItem( titles={ @@ -34,7 +34,7 @@ "nn": "Arbeidsulykke", "en": "Arbeidsulykke", }, - unit_code="02", + code="02", ), CodeListItem( titles={ @@ -42,7 +42,7 @@ "nn": "Bolig", "en": "Bolig", }, - unit_code="03", + code="03", ), ], ), @@ -63,7 +63,7 @@ "nn": None, "en": None, }, - unit_code="01", + code="01", ), CodeListItem( titles={ @@ -71,7 +71,7 @@ "nn": None, "en": None, }, - unit_code="02", + code="02", ), CodeListItem( titles={ @@ -79,7 +79,7 @@ "nn": None, "en": None, }, - unit_code="03", + code="03", ), ], ), @@ -94,7 +94,7 @@ "nn": "Adresse", "en": "Adresse", }, - unit_code=None, + code=None, ), CodeListItem( titles={ @@ -102,7 +102,7 @@ "nn": "Arbeidsulykke", "en": "Arbeidsulykke", }, - unit_code=None, + code=None, ), CodeListItem( titles={ @@ -110,7 +110,7 @@ "nn": "Bolig", "en": "Bolig", }, - unit_code=None, + code=None, ), ], ), diff --git a/tests/frontend/callbacks/test_dataset_callbacks.py b/tests/frontend/callbacks/test_dataset_callbacks.py index b7aaffe0..cb8057cb 100644 --- a/tests/frontend/callbacks/test_dataset_callbacks.py +++ b/tests/frontend/callbacks/test_dataset_callbacks.py @@ -119,6 +119,11 @@ def file_path(): "2f72477a-f051-43ee-bf8b-0d8f47b5e0a7", UUID("2f72477a-f051-43ee-bf8b-0d8f47b5e0a7"), ), + ( + DatasetIdentifiers.OWNER, + "Seksjon for dataplattform", + "Seksjon for dataplattform", + ), ], ) def test_accept_dataset_metadata_input_valid_data( diff --git a/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json b/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json index 4cb53dd2..e8e4bc98 100644 --- a/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json +++ b/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json @@ -52,7 +52,11 @@ "nb": "Norge" }, "id": "2f72477a-f051-43ee-bf8b-0d8f47b5e0a7", - "owner": "NNN", + "owner": { + "en": "", + "nn": "", + "nb": "Seksjon NNN" + }, "file_path": null, "metadata_created_date": "2022-10-07T07:35:01Z", "metadata_created_by": "default_user@ssb.no", From 36db33def9e9f9bba66bda5ff7626a1d590f4319 Mon Sep 17 00:00:00 2001 From: Jan Sander <63044278+JanhSander@users.noreply.github.com> Date: Mon, 11 Mar 2024 08:35:47 +0100 Subject: [PATCH 6/7] Model backwards compability --- poetry.lock | 8 +- pyproject.toml | 2 +- .../backend/model_backwards_compatibility.py | 31 ++- .../test_model_backwards_compatibility.py | 2 +- .../v2_1_0/person_data_v1__DOC.json | 261 ++++++++++++++++++ ...tdata_p2021-12-31_p2021-12-31_v1__DOC.json | 8 +- 6 files changed, 290 insertions(+), 22 deletions(-) create mode 100644 tests/resources/existing_metadata_file/compatibility/v2_1_0/person_data_v1__DOC.json diff --git a/poetry.lock b/poetry.lock index 1741bf63..4738e9dd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4139,13 +4139,13 @@ dash = ">=2.15.0" [[package]] name = "ssb-datadoc-model" -version = "4.3.1" +version = "4.3.2" description = "Data Model for use in Statistics Norway's Metadata system" optional = false python-versions = ">=3.10" files = [ - {file = "ssb_datadoc_model-4.3.1-py3-none-any.whl", hash = "sha256:a1400bff45c27237acb87156d5cd695d0e12dd33b7c04746e336516dbfe00e61"}, - {file = "ssb_datadoc_model-4.3.1.tar.gz", hash = "sha256:f6836dfe26a331229b317155a5eecd5e0479799097ffba25f136bb3947ab1771"}, + {file = "ssb_datadoc_model-4.3.2-py3-none-any.whl", hash = "sha256:86b998ddb7b5e926896a6ce37c1d5ac5c80b512f5b0ccd6b821050a625a7b71c"}, + {file = "ssb_datadoc_model-4.3.2.tar.gz", hash = "sha256:85566a5b5a24f40954d10e7ae4130c452941ce8362550d6fea27541775c981a7"}, ] [package.dependencies] @@ -4675,4 +4675,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "ed14a20f2c283622c84c08597e90f8fbbfb0484ac54480d4289e15959c3460e9" +content-hash = "713694ade4a05085744e77aa840336b010bfcc5ad30944fd5474945cd7596ef4" diff --git a/pyproject.toml b/pyproject.toml index b4932860..218554b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dash = ">=2.15.0" pydantic = "==2.5.2" dash-bootstrap-components = ">=1.1.0" pandas = ">=1.4.2" -ssb-datadoc-model = "==4.3.1" +ssb-datadoc-model = "4.3.2" dapla-toolbelt = ">=1.3.3" gunicorn = ">=21.2.0" flask-healthz = ">=0.0.3" diff --git a/src/datadoc/backend/model_backwards_compatibility.py b/src/datadoc/backend/model_backwards_compatibility.py index b0ed6282..ea9e2e12 100644 --- a/src/datadoc/backend/model_backwards_compatibility.py +++ b/src/datadoc/backend/model_backwards_compatibility.py @@ -63,6 +63,24 @@ def handle_current_version(supplied_metadata: dict[str, Any]) -> dict[str, Any]: return supplied_metadata +def handle_version_2_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: + """Handle breaking changes for v2.1.0. + + Datatype changed from LanguageStringType to str for + - unit_type + - owner + TODO: PR ref here? + """ + dropdown_fields = ["unit_type", "owner"] + for field in dropdown_fields: + data = supplied_metadata["dataset"][field] + supplied_metadata["dataset"][field] = str( + data["nb"] or data["nn"] or data["en"], + ) + supplied_metadata["document_version"] = "2.2.0" + return supplied_metadata + + def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: """Handle breaking changes for v1.0.0.""" datetime_fields = [ @@ -77,13 +95,11 @@ def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: ), timespec="seconds", ) - if isinstance(supplied_metadata["dataset"]["data_source"], str): supplied_metadata["dataset"]["data_source"] = LanguageStringType( en=supplied_metadata["dataset"]["data_source"], ) supplied_metadata["document_version"] = "2.1.0" - return supplied_metadata @@ -113,14 +129,9 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]: # Register all the supported versions and their handlers. # MUST be ordered from oldest to newest. BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1) -BackwardsCompatibleVersion( - version="1.0.0", - handler=handle_version_1_0_0, -) -BackwardsCompatibleVersion( - version="2.1.0", - handler=handle_current_version, -) +BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0) +BackwardsCompatibleVersion(version="2.1.0", handler=handle_version_2_1_0) +BackwardsCompatibleVersion(version="2.2.0", handler=handle_current_version) def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]: diff --git a/tests/backend/test_model_backwards_compatibility.py b/tests/backend/test_model_backwards_compatibility.py index 12e26fb8..0c532055 100644 --- a/tests/backend/test_model_backwards_compatibility.py +++ b/tests/backend/test_model_backwards_compatibility.py @@ -19,7 +19,7 @@ def test_existing_metadata_current_model_version(): - current_model_version = "2.1.0" + current_model_version = "2.2.0" fresh_metadata = {"document_version": current_model_version} upgraded_metadata = upgrade_metadata(fresh_metadata) assert upgraded_metadata == fresh_metadata diff --git a/tests/resources/existing_metadata_file/compatibility/v2_1_0/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/compatibility/v2_1_0/person_data_v1__DOC.json new file mode 100644 index 00000000..7e5d9a9b --- /dev/null +++ b/tests/resources/existing_metadata_file/compatibility/v2_1_0/person_data_v1__DOC.json @@ -0,0 +1,261 @@ +{ + "percentage_complete": 98, + "document_version": "2.1.0", + "dataset": { + "short_name": "person_data_v1", + "assessment": "SENSITIVE", + "dataset_status": "DRAFT", + "dataset_state": "PROCESSED_DATA", + "name": { + "en": "successfully_read_existing_file", + "nn": "", + "nb": "Persondata" + }, + "description": { + "en": "", + "nn": "", + "nb": "" + }, + "data_source": { + "en": "", + "nn": "", + "nb": "Skatteetaten" + }, + "register_uri": null, + "population_description": { + "en": "", + "nn": "", + "nb": "Norsk befolkning" + }, + "version": "1", + "version_description": null, + "unit_type": { + "en": "", + "nn": "", + "nb": "PERSON" + }, + "temporality_type": "STATUS", + "subject_field": { + "en": "", + "nn": "", + "nb": "Skatt" + }, + "keyword": [ + "Skatt", + "Person", + "Helsepenger" + ], + "spatial_coverage_description": { + "en": "", + "nn": "", + "nb": "Norge" + }, + "id": "143fca77-ef56-419c-a1e1-d69c4199f020", + "owner": { + "en": "", + "nn": "", + "nb": "723" + }, + "file_path": null, + "metadata_created_date": "2022-09-05T11:07:14Z", + "metadata_created_by": "default_user@ssb.no", + "metadata_last_updated_date": "2024-01-08T15:41:05.681664Z", + "metadata_last_updated_by": "default_user@ssb.no", + "contains_data_from": "2010-09-05", + "contains_data_until": "2022-09-05" + }, + "variables": [ + { + "short_name": "pers_id", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "STRING", + "variable_role": "IDENTIFIER", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/26/nb", + "direct_person_identifying": true, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "tidspunkt", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "DATETIME", + "variable_role": "START_TIME", + "definition_uri": null, + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sivilstand", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "STRING", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/91/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "alm_inntekt", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "INTEGER", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/1438/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sykepenger", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "INTEGER", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3366/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "ber_bruttoformue", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "INTEGER", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3327/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "fullf_utdanning", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "STRING", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3242/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "hoveddiagnose", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "STRING", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/2578/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + } + ] + } diff --git a/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json b/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json index e8e4bc98..54947ae6 100644 --- a/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json +++ b/tests/resources/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json @@ -2,7 +2,7 @@ "document_version": "0.0.1", "datadoc": { "percentage_complete": 97, - "document_version": "2.1.0", + "document_version": "2.2.0", "dataset": { "short_name": "person_testdata_p2021-12-31_p2021-12-31", "assessment": "PROTECTED", @@ -52,11 +52,7 @@ "nb": "Norge" }, "id": "2f72477a-f051-43ee-bf8b-0d8f47b5e0a7", - "owner": { - "en": "", - "nn": "", - "nb": "Seksjon NNN" - }, + "owner": "722", "file_path": null, "metadata_created_date": "2022-10-07T07:35:01Z", "metadata_created_by": "default_user@ssb.no", From 582a2c3b13f37d42155c06864b6cc491275a508e Mon Sep 17 00:00:00 2001 From: Jan Sander <63044278+JanhSander@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:22:05 +0100 Subject: [PATCH 7/7] Updated for backwards compability --- src/datadoc/backend/datadoc_metadata.py | 23 ++----------------- .../backend/model_backwards_compatibility.py | 22 ++++-------------- .../v0_1_1/person_data_v1__DOC.json | 6 ++++- .../v1_0_0/person_data_v1__DOC.json | 6 ++++- .../v2_1_0/person_data_v1__DOC.json | 6 +---- .../invalid_id_field/person_data_v1__DOC.json | 2 +- .../person_data_v1__DOC.json | 4 ++-- .../valid_id_field/person_data_v1__DOC.json | 2 +- 8 files changed, 21 insertions(+), 50 deletions(-) diff --git a/src/datadoc/backend/datadoc_metadata.py b/src/datadoc/backend/datadoc_metadata.py index 5a2643fe..fe005a4e 100644 --- a/src/datadoc/backend/datadoc_metadata.py +++ b/src/datadoc/backend/datadoc_metadata.py @@ -8,7 +8,6 @@ import uuid from typing import TYPE_CHECKING -import pydantic from cloudpathlib import CloudPath from cloudpathlib import GSClient from cloudpathlib import GSPath @@ -49,16 +48,13 @@ def __init__( ) -> None: """Read in a dataset if supplied, otherwise naively instantiate the class.""" self._statistic_subject_mapping = statistic_subject_mapping - self.metadata_document: pathlib.Path | CloudPath | None = None self.container: model.MetadataContainer | None = None self.dataset_path: pathlib.Path | CloudPath | None = None self.short_name: str | None = None self.dataset = model.Dataset() self.variables: list = [] - self.variables_lookup: dict[str, model.Variable] = {} - if metadata_document_path: # In this case the user has specified an independent metadata document for editing # without a dataset. @@ -70,7 +66,6 @@ def __init__( self.metadata_document = self.dataset_path.parent / ( self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX ) - self.extract_metadata_from_files() @staticmethod @@ -121,10 +116,7 @@ def extract_metadata_from_existing_document( try: with document.open(mode="r", encoding="utf-8") as file: fresh_metadata = json.load(file) - logger.info( - "Opened existing metadata file %s", - document, - ) + logger.info("Opened existing metadata file %s", document) if self.is_metadata_in_container_structure(fresh_metadata): self.container = model.MetadataContainer.model_validate_json( json.dumps(fresh_metadata), @@ -132,16 +124,13 @@ def extract_metadata_from_existing_document( datadoc_metadata = fresh_metadata["datadoc"] else: datadoc_metadata = fresh_metadata - if datadoc_metadata is None: # In this case we've read in a file with an empty "datadoc" structure. # A typical example of this is a file produced from a pseudonymization process. return - datadoc_metadata = upgrade_metadata( datadoc_metadata, ) - meta = model.DatadocMetadata.model_validate_json( json.dumps(datadoc_metadata), ) @@ -149,7 +138,6 @@ def extract_metadata_from_existing_document( self.dataset = meta.dataset if meta.variables is not None: self.variables = meta.variables - except json.JSONDecodeError: logger.warning( "Could not open existing metadata file %s. \ @@ -167,14 +155,7 @@ def is_metadata_in_container_structure( The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc. This method returns True if the metadata is in the container structure, False otherwise. """ - try: - model.MetadataContainer.model_validate_json( - json.dumps(metadata), - ) - except pydantic.ValidationError: - return False - else: - return True + return "datadoc" in metadata def extract_metadata_from_dataset( self, diff --git a/src/datadoc/backend/model_backwards_compatibility.py b/src/datadoc/backend/model_backwards_compatibility.py index ea9e2e12..2d482d67 100644 --- a/src/datadoc/backend/model_backwards_compatibility.py +++ b/src/datadoc/backend/model_backwards_compatibility.py @@ -66,27 +66,17 @@ def handle_current_version(supplied_metadata: dict[str, Any]) -> dict[str, Any]: def handle_version_2_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: """Handle breaking changes for v2.1.0. - Datatype changed from LanguageStringType to str for - - unit_type - - owner - TODO: PR ref here? + Datatype changed from LanguageStringType to str for owner """ - dropdown_fields = ["unit_type", "owner"] - for field in dropdown_fields: - data = supplied_metadata["dataset"][field] - supplied_metadata["dataset"][field] = str( - data["nb"] or data["nn"] or data["en"], - ) + data = supplied_metadata["dataset"]["owner"] + supplied_metadata["dataset"]["owner"] = str(data["nb"] or data["nn"] or data["en"]) supplied_metadata["document_version"] = "2.2.0" return supplied_metadata def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: """Handle breaking changes for v1.0.0.""" - datetime_fields = [ - ("metadata_created_date"), - ("metadata_last_updated_date"), - ] + datetime_fields = [("metadata_created_date"), ("metadata_last_updated_date")] for field in datetime_fields: if supplied_metadata["dataset"][field]: supplied_metadata["dataset"][field] = datetime.isoformat( @@ -118,7 +108,6 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]: supplied_metadata["dataset"][new_key] = supplied_metadata["dataset"].pop( old_key, ) - # Replace empty strings with None, empty strings are not valid for LanguageStrings values supplied_metadata["dataset"] = { k: None if v == "" else v for k, v in supplied_metadata["dataset"].items() @@ -139,15 +128,12 @@ def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]: # Special case for current version, we expose the current_model_version parameter for test purposes supplied_version = fresh_metadata[VERSION_FIELD_NAME] start_running_handlers = False - # Run all the handlers in order from the supplied version onwards for k, v in SUPPORTED_VERSIONS.items(): if k == supplied_version: start_running_handlers = True if start_running_handlers: fresh_metadata = v.handler(fresh_metadata) - if not start_running_handlers: raise UnknownModelVersionError(supplied_version) - return fresh_metadata diff --git a/tests/resources/existing_metadata_file/compatibility/v0_1_1/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/compatibility/v0_1_1/person_data_v1__DOC.json index c3f9ff55..724976b0 100644 --- a/tests/resources/existing_metadata_file/compatibility/v0_1_1/person_data_v1__DOC.json +++ b/tests/resources/existing_metadata_file/compatibility/v0_1_1/person_data_v1__DOC.json @@ -20,7 +20,11 @@ "subject_field": null, "spatial_coverage_description": null, "id": null, - "owner": null, + "owner": { + "en": "", + "nn": "", + "nb": "724" + }, "data_source_path": "klargjorte_data/person_data_v1.parquet", "created_date": "2022-07-13T17:28:01.617657", "created_by": "default_user@ssb.no", diff --git a/tests/resources/existing_metadata_file/compatibility/v1_0_0/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/compatibility/v1_0_0/person_data_v1__DOC.json index f23018ac..6b062b2e 100644 --- a/tests/resources/existing_metadata_file/compatibility/v1_0_0/person_data_v1__DOC.json +++ b/tests/resources/existing_metadata_file/compatibility/v1_0_0/person_data_v1__DOC.json @@ -46,7 +46,11 @@ "nb": "Norge" }, "id": "143fca77-ef56-419c-a1e1-d69c4199f020", - "owner": null, + "owner": { + "en": "", + "nn": "", + "nb": "722" + }, "data_source_path": "klargjorte_data/person_data_v1.parquet", "metadata_created_date": "2022-09-05T13:07:14.066023", "metadata_created_by": "default_user@ssb.no", diff --git a/tests/resources/existing_metadata_file/compatibility/v2_1_0/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/compatibility/v2_1_0/person_data_v1__DOC.json index 7e5d9a9b..908e7752 100644 --- a/tests/resources/existing_metadata_file/compatibility/v2_1_0/person_data_v1__DOC.json +++ b/tests/resources/existing_metadata_file/compatibility/v2_1_0/person_data_v1__DOC.json @@ -29,11 +29,7 @@ }, "version": "1", "version_description": null, - "unit_type": { - "en": "", - "nn": "", - "nb": "PERSON" - }, + "unit_type": "PERSON", "temporality_type": "STATUS", "subject_field": { "en": "", diff --git a/tests/resources/existing_metadata_file/invalid_id_field/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/invalid_id_field/person_data_v1__DOC.json index 653c2a9c..b7eccdcd 100644 --- a/tests/resources/existing_metadata_file/invalid_id_field/person_data_v1__DOC.json +++ b/tests/resources/existing_metadata_file/invalid_id_field/person_data_v1__DOC.json @@ -2,7 +2,7 @@ "document_version": "0.0.1", "datadoc": { "percentage_complete": 6, - "document_version": "2.1.0", + "document_version": "2.2.0", "dataset": { "short_name": "person_data_v1", "assessment": "OPEN", diff --git a/tests/resources/existing_metadata_file/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/person_data_v1__DOC.json index ed7dcdc3..71ade1b8 100644 --- a/tests/resources/existing_metadata_file/person_data_v1__DOC.json +++ b/tests/resources/existing_metadata_file/person_data_v1__DOC.json @@ -2,7 +2,7 @@ "document_version": "0.0.1", "datadoc": { "percentage_complete": 98, - "document_version": "2.1.0", + "document_version": "2.2.0", "dataset": { "short_name": "person_data_v1", "assessment": "SENSITIVE", @@ -49,7 +49,7 @@ "nb": "Norge" }, "id": "143fca77-ef56-419c-a1e1-d69c4199f020", - "owner": null, + "owner": "NNN", "file_path": null, "metadata_created_date": "2022-09-05T11:07:14Z", "metadata_created_by": "default_user@ssb.no", diff --git a/tests/resources/existing_metadata_file/valid_id_field/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/valid_id_field/person_data_v1__DOC.json index 90a178bd..0662d9bb 100644 --- a/tests/resources/existing_metadata_file/valid_id_field/person_data_v1__DOC.json +++ b/tests/resources/existing_metadata_file/valid_id_field/person_data_v1__DOC.json @@ -2,7 +2,7 @@ "document_version": "0.0.1", "datadoc": { "percentage_complete": 6, - "document_version": "2.1.0", + "document_version": "2.2.0", "dataset": { "short_name": "person_data_v1", "assessment": "OPEN",