From 7f64c6a38db6bdf5e5a471463d888e38ded1d1fe Mon Sep 17 00:00:00 2001
From: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com>
Date: Wed, 6 Mar 2024 10:25:41 +0100
Subject: [PATCH] Support reading metadata files from pseudo (#202)

* Support reading metadata files from pseudo

* Simplify logic for extracting metadata

* Fix mypy

* Fix checks
---
 src/datadoc/backend/datadoc_metadata.py      | 31 ++++++++++++----
 tests/backend/test_datadoc_metadata.py       | 35 ++++++++++++++-----
 .../test_model_backwards_compatibility.py    |  4 +--
 tests/conftest.py                            |  4 +--
 .../pseudo/person_data_v1__DOC.json          | 22 ++++++++++++
 5 files changed, 77 insertions(+), 19 deletions(-)
 create mode 100644 tests/resources/existing_metadata_file/pseudo/person_data_v1__DOC.json

diff --git a/src/datadoc/backend/datadoc_metadata.py b/src/datadoc/backend/datadoc_metadata.py
index bd330dd3..5a2643fe 100644
--- a/src/datadoc/backend/datadoc_metadata.py
+++ b/src/datadoc/backend/datadoc_metadata.py
@@ -8,6 +8,7 @@
 import uuid
 from typing import TYPE_CHECKING
 
+import pydantic
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
@@ -57,21 +58,20 @@ def __init__(
 
         self.variables: list = []
         self.variables_lookup: dict[str, model.Variable] = {}
+
         if metadata_document_path:
             # In this case the user has specified an independent metadata document for editing
             # without a dataset.
             self.metadata_document = self._open_path(metadata_document_path)
-            self.extract_metadata_from_existing_document(self.metadata_document)
         elif dataset_path:
             self.dataset_path = self._open_path(dataset_path)
-            # The short_name is set as the dataset filename without file extension
-            self.short_name = self.dataset_path.stem
             # Build the metadata document path based on the dataset path
             # Example: /path/to/dataset.parquet -> /path/to/dataset__DOC.json
             self.metadata_document = self.dataset_path.parent / (
                 self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
             )
-            self.extract_metadata_from_files()
+
+        self.extract_metadata_from_files()
 
     @staticmethod
     def _open_path(path: str) -> pathlib.Path | CloudPath:
@@ -93,7 +93,12 @@ def extract_metadata_from_files(self) -> None:
         """
         if self.metadata_document is not None and self.metadata_document.exists():
             self.extract_metadata_from_existing_document(self.metadata_document)
-        elif self.dataset_path is not None:
+
+        if (
+            self.dataset_path is not None
+            and self.dataset == model.Dataset()
+            and len(self.variables) == 0
+        ):
             self.extract_metadata_from_dataset(self.dataset_path)
             self.dataset.id = uuid.uuid4()
             # Set default values for variables where appropriate
@@ -128,6 +133,11 @@ def extract_metadata_from_existing_document(
         else:
             datadoc_metadata = fresh_metadata
 
+        if datadoc_metadata is None:
+            # In this case we've read in a file where the "datadoc" structure is null.
+            # A typical example of this is a file produced by a pseudonymization process.
+            return
+
         datadoc_metadata = upgrade_metadata(
             datadoc_metadata,
         )
@@ -157,7 +167,14 @@ def is_metadata_in_container_structure(
         The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc.
         This method returns True if the metadata is in the container structure, False otherwise.
""" - return "datadoc" in metadata and "dataset" in metadata["datadoc"] + try: + model.MetadataContainer.model_validate_json( + json.dumps(metadata), + ) + except pydantic.ValidationError: + return False + else: + return True def extract_metadata_from_dataset( self, @@ -176,7 +193,7 @@ def extract_metadata_from_dataset( ) self.dataset = model.Dataset( - short_name=self.short_name, + short_name=self.dataset_path.stem if self.dataset_path else None, dataset_state=dapla_dataset_path_info.dataset_state, dataset_status=DataSetStatus.DRAFT, assessment=self.get_assessment_by_state( diff --git a/tests/backend/test_datadoc_metadata.py b/tests/backend/test_datadoc_metadata.py index 65c7fef5..362734e7 100644 --- a/tests/backend/test_datadoc_metadata.py +++ b/tests/backend/test_datadoc_metadata.py @@ -143,15 +143,15 @@ def test_metadata_id(metadata: DataDocMetadata): [TEST_EXISTING_METADATA_DIRECTORY / "invalid_id_field"], ) def test_existing_metadata_none_id( - existing_metadata_file: str, + existing_metadata_file: Path, metadata: DataDocMetadata, ): - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: pre_open_id: None = json.load(f)["datadoc"]["dataset"]["id"] assert pre_open_id is None assert isinstance(metadata.dataset.id, UUID) metadata.write_metadata_document() - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: post_write_id = json.load(f)["datadoc"]["dataset"]["id"] assert post_write_id == str(metadata.dataset.id) @@ -161,18 +161,18 @@ def test_existing_metadata_none_id( [TEST_EXISTING_METADATA_DIRECTORY / "valid_id_field"], ) def test_existing_metadata_valid_id( - existing_metadata_file: str, + existing_metadata_file: Path, metadata: DataDocMetadata, ): pre_open_id = "" post_write_id = "" - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: pre_open_id = json.load(f)["datadoc"]["dataset"]["id"] assert pre_open_id is not None assert isinstance(metadata.dataset.id, UUID) assert str(metadata.dataset.id) == pre_open_id metadata.write_metadata_document() - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: post_write_id = json.load(f)["datadoc"]["dataset"]["id"] assert post_write_id == pre_open_id @@ -188,11 +188,11 @@ def test_direct_person_identifying_default_value(metadata: DataDocMetadata): def test_save_file_path_metadata_field( - existing_metadata_file: str, + existing_metadata_file: Path, metadata: DataDocMetadata, ): metadata.write_metadata_document() - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: saved_file_path = json.load(f)["datadoc"]["dataset"]["file_path"] assert saved_file_path == str(metadata.dataset_path) @@ -345,3 +345,22 @@ def test_extract_subject_field_value_from_statistic_structure_xml( # TODO @mmwinther: Remove multiple_language_support once the model is updated. 
     # https://github.com/statisticsnorway/ssb-datadoc-model/issues/41
     assert metadata.dataset.subject_field.en == expected_subject_code  # type: ignore [union-attr]
+
+
+@pytest.mark.parametrize(
+    "existing_metadata_path",
+    [TEST_EXISTING_METADATA_DIRECTORY / "pseudo"],
+)
+def test_existing_pseudo_metadata_file(
+    existing_metadata_file: Path,
+    metadata: DataDocMetadata,
+):
+    pre_open_metadata = json.loads(existing_metadata_file.read_text())
+    metadata.write_metadata_document()
+    post_open_metadata = json.loads(existing_metadata_file.read_text())
+
+    assert len(metadata.variables) == 8  # noqa: PLR2004
+    assert (
+        pre_open_metadata["pseudonymization"] == post_open_metadata["pseudonymization"]
+    )
+    assert post_open_metadata["datadoc"] is not None
diff --git a/tests/backend/test_model_backwards_compatibility.py b/tests/backend/test_model_backwards_compatibility.py
index 1b8c9b34..12e26fb8 100644
--- a/tests/backend/test_model_backwards_compatibility.py
+++ b/tests/backend/test_model_backwards_compatibility.py
@@ -37,10 +37,10 @@ def test_existing_metadata_unknown_model_version():
     ids=BACKWARDS_COMPATIBLE_VERSION_NAMES,
 )
 def test_backwards_compatibility(
-    existing_metadata_file: str,
+    existing_metadata_file: Path,
     metadata: DataDocMetadata,
 ):
-    with Path.open(Path(existing_metadata_file)) as f:
+    with existing_metadata_file.open() as f:
         file_metadata = json.loads(f.read())
 
     # Just test a single value to make sure we have a working model
diff --git a/tests/conftest.py b/tests/conftest.py
index c2e288d9..a7a85344 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -91,13 +91,13 @@ def existing_metadata_path() -> Path:
 
 
 @pytest.fixture()
-def existing_metadata_file(tmp_path: Path, existing_metadata_path: Path) -> str:
+def existing_metadata_file(tmp_path: Path, existing_metadata_path: Path) -> Path:
     # Setup by copying the file into the relevant directory
     shutil.copy(
         existing_metadata_path / TEST_EXISTING_METADATA_FILE_NAME,
         tmp_path / TEST_EXISTING_METADATA_FILE_NAME,
     )
-    return str(tmp_path / TEST_EXISTING_METADATA_FILE_NAME)
+    return tmp_path / TEST_EXISTING_METADATA_FILE_NAME
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/resources/existing_metadata_file/pseudo/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/pseudo/person_data_v1__DOC.json
new file mode 100644
index 00000000..2e2e3a18
--- /dev/null
+++ b/tests/resources/existing_metadata_file/pseudo/person_data_v1__DOC.json
@@ -0,0 +1,22 @@
+{
+  "document_version": "0.0.1",
+  "datadoc": null,
+  "pseudonymization": {
+    "document_version": "0.1.0",
+    "pseudo_dataset": null,
+    "pseudo_variables": [
+      {
+        "short_name": "fnr",
+        "data_element_path": "fnr",
+        "data_element_pattern": "**/fnr",
+        "stable_identifier_type": null,
+        "stable_identifier_version": null,
+        "encryption_algorithm": "TINK-DAEAD",
+        "encryption_key_reference": "ssb-common-key-1",
+        "encryption_algorithm_parameters": [{ "keyId": "ssb-common-key-1" }],
+        "source_variable": null,
+        "source_variable_datatype": null
+      }
+    ]
+  }
+}
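
A minimal, self-contained sketch (not part of the patch) of the container-structure check introduced above, assuming pydantic v2. MetadataContainer below is a simplified stand-in for the real model in ssb-datadoc-model; only the fields this example exercises are modelled. The switch to validation matters because the old key-based check, `"dataset" in metadata["datadoc"]`, would raise TypeError on a document where "datadoc" is null, exactly the kind of file the new pseudo fixture represents.

import json

import pydantic


class MetadataContainer(pydantic.BaseModel):
    """Simplified stand-in for the container model in ssb-datadoc-model."""

    document_version: str
    datadoc: dict | None = None
    pseudonymization: dict | None = None


def is_metadata_in_container_structure(metadata: dict) -> bool:
    # Validating against the container model replaces the old hard-coded
    # key check, so a container document where "datadoc" is null is still
    # recognised as being in the container structure.
    try:
        MetadataContainer.model_validate_json(json.dumps(metadata))
    except pydantic.ValidationError:
        return False
    else:
        return True


# A pseudonymization-only document (with "datadoc": null) is in the container:
assert is_metadata_in_container_structure(
    {"document_version": "0.0.1", "datadoc": None, "pseudonymization": {}},
)
# A pre-container document (no document_version) is not:
assert not is_metadata_in_container_structure({"dataset": {}, "variables": []})

Serialising the dict back to JSON lets pydantic validate it in JSON mode, mirroring the `json.dumps(metadata)` round trip in the patched method.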
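A hedged usage sketch of the behaviour this patch enables: opening a metadata document produced by a pseudonymization process, without a dataset. The metadata_document_path keyword mirrors the parameter referenced in __init__ above, but the full constructor signature is not shown in this diff, and the file path is illustrative.

from datadoc.backend.datadoc_metadata import DataDocMetadata

# Open an existing metadata document on its own (no dataset). Its "datadoc"
# member is null, so extract_metadata_from_existing_document() returns early
# and the in-memory dataset/variables keep their defaults, while the
# "pseudonymization" section survives the next write_metadata_document()
# call, as the new test_existing_pseudo_metadata_file test asserts.
meta = DataDocMetadata(
    metadata_document_path="person_data_v1__DOC.json",  # illustrative path
)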