From 7f64c6a38db6bdf5e5a471463d888e38ded1d1fe Mon Sep 17 00:00:00 2001
From: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com>
Date: Wed, 6 Mar 2024 10:25:41 +0100
Subject: [PATCH] Support reading metadata files from pseudo (#202)

* Support reading metadata files from pseudo

* Simplify logic for extracting metadata

* Fix mypy

* Fix checks
---
 src/datadoc/backend/datadoc_metadata.py      | 31 ++++++++++++----
 tests/backend/test_datadoc_metadata.py       | 35 ++++++++++++++-----
 .../test_model_backwards_compatibility.py    |  4 +--
 tests/conftest.py                            |  4 +--
 .../pseudo/person_data_v1__DOC.json          | 22 ++++++++++++
 5 files changed, 77 insertions(+), 19 deletions(-)
 create mode 100644 tests/resources/existing_metadata_file/pseudo/person_data_v1__DOC.json

diff --git a/src/datadoc/backend/datadoc_metadata.py b/src/datadoc/backend/datadoc_metadata.py
index bd330dd3..5a2643fe 100644
--- a/src/datadoc/backend/datadoc_metadata.py
+++ b/src/datadoc/backend/datadoc_metadata.py
@@ -8,6 +8,7 @@
 import uuid
 from typing import TYPE_CHECKING
 
+import pydantic
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
@@ -57,21 +58,20 @@ def __init__(
 
         self.variables: list = []
         self.variables_lookup: dict[str, model.Variable] = {}
+
         if metadata_document_path:
             # In this case the user has specified an independent metadata document for editing
             # without a dataset.
             self.metadata_document = self._open_path(metadata_document_path)
-            self.extract_metadata_from_existing_document(self.metadata_document)
         elif dataset_path:
             self.dataset_path = self._open_path(dataset_path)
-            # The short_name is set as the dataset filename without file extension
-            self.short_name = self.dataset_path.stem
             # Build the metadata document path based on the dataset path
             # Example: /path/to/dataset.parquet -> /path/to/dataset__DOC.json
             self.metadata_document = self.dataset_path.parent / (
                 self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
             )
-            self.extract_metadata_from_files()
+
+        self.extract_metadata_from_files()
 
     @staticmethod
     def _open_path(path: str) -> pathlib.Path | CloudPath:
@@ -93,7 +93,12 @@ def extract_metadata_from_files(self) -> None:
         """
         if self.metadata_document is not None and self.metadata_document.exists():
             self.extract_metadata_from_existing_document(self.metadata_document)
-        elif self.dataset_path is not None:
+
+        if (
+            self.dataset_path is not None
+            and self.dataset == model.Dataset()
+            and len(self.variables) == 0
+        ):
             self.extract_metadata_from_dataset(self.dataset_path)
             self.dataset.id = uuid.uuid4()
             # Set default values for variables where appropriate
@@ -128,6 +133,11 @@ def extract_metadata_from_existing_document(
         else:
             datadoc_metadata = fresh_metadata
 
+        if datadoc_metadata is None:
+            # In this case we've read in a file where the "datadoc" structure is null.
+            # A typical example of this is a file produced by a pseudonymization process.
+            return
+
         datadoc_metadata = upgrade_metadata(
             datadoc_metadata,
         )
@@ -157,7 +167,14 @@ def is_metadata_in_container_structure(
         The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc.
         This method returns True if the metadata is in the container structure, False otherwise.
""" - return "datadoc" in metadata and "dataset" in metadata["datadoc"] + try: + model.MetadataContainer.model_validate_json( + json.dumps(metadata), + ) + except pydantic.ValidationError: + return False + else: + return True def extract_metadata_from_dataset( self, @@ -176,7 +193,7 @@ def extract_metadata_from_dataset( ) self.dataset = model.Dataset( - short_name=self.short_name, + short_name=self.dataset_path.stem if self.dataset_path else None, dataset_state=dapla_dataset_path_info.dataset_state, dataset_status=DataSetStatus.DRAFT, assessment=self.get_assessment_by_state( diff --git a/tests/backend/test_datadoc_metadata.py b/tests/backend/test_datadoc_metadata.py index 65c7fef5..362734e7 100644 --- a/tests/backend/test_datadoc_metadata.py +++ b/tests/backend/test_datadoc_metadata.py @@ -143,15 +143,15 @@ def test_metadata_id(metadata: DataDocMetadata): [TEST_EXISTING_METADATA_DIRECTORY / "invalid_id_field"], ) def test_existing_metadata_none_id( - existing_metadata_file: str, + existing_metadata_file: Path, metadata: DataDocMetadata, ): - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: pre_open_id: None = json.load(f)["datadoc"]["dataset"]["id"] assert pre_open_id is None assert isinstance(metadata.dataset.id, UUID) metadata.write_metadata_document() - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: post_write_id = json.load(f)["datadoc"]["dataset"]["id"] assert post_write_id == str(metadata.dataset.id) @@ -161,18 +161,18 @@ def test_existing_metadata_none_id( [TEST_EXISTING_METADATA_DIRECTORY / "valid_id_field"], ) def test_existing_metadata_valid_id( - existing_metadata_file: str, + existing_metadata_file: Path, metadata: DataDocMetadata, ): pre_open_id = "" post_write_id = "" - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: pre_open_id = json.load(f)["datadoc"]["dataset"]["id"] assert pre_open_id is not None assert isinstance(metadata.dataset.id, UUID) assert str(metadata.dataset.id) == pre_open_id metadata.write_metadata_document() - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: post_write_id = json.load(f)["datadoc"]["dataset"]["id"] assert post_write_id == pre_open_id @@ -188,11 +188,11 @@ def test_direct_person_identifying_default_value(metadata: DataDocMetadata): def test_save_file_path_metadata_field( - existing_metadata_file: str, + existing_metadata_file: Path, metadata: DataDocMetadata, ): metadata.write_metadata_document() - with Path.open(Path(existing_metadata_file)) as f: + with existing_metadata_file.open() as f: saved_file_path = json.load(f)["datadoc"]["dataset"]["file_path"] assert saved_file_path == str(metadata.dataset_path) @@ -345,3 +345,22 @@ def test_extract_subject_field_value_from_statistic_structure_xml( # TODO @mmwinther: Remove multiple_language_support once the model is updated. 
     # https://github.com/statisticsnorway/ssb-datadoc-model/issues/41
     assert metadata.dataset.subject_field.en == expected_subject_code  # type: ignore [union-attr]
+
+
+@pytest.mark.parametrize(
+    "existing_metadata_path",
+    [TEST_EXISTING_METADATA_DIRECTORY / "pseudo"],
+)
+def test_existing_pseudo_metadata_file(
+    existing_metadata_file: Path,
+    metadata: DataDocMetadata,
+):
+    pre_open_metadata = json.loads(existing_metadata_file.read_text())
+    metadata.write_metadata_document()
+    post_open_metadata = json.loads(existing_metadata_file.read_text())
+
+    assert len(metadata.variables) == 8  # noqa: PLR2004
+    assert (
+        pre_open_metadata["pseudonymization"] == post_open_metadata["pseudonymization"]
+    )
+    assert post_open_metadata["datadoc"] is not None
diff --git a/tests/backend/test_model_backwards_compatibility.py b/tests/backend/test_model_backwards_compatibility.py
index 1b8c9b34..12e26fb8 100644
--- a/tests/backend/test_model_backwards_compatibility.py
+++ b/tests/backend/test_model_backwards_compatibility.py
@@ -37,10 +37,10 @@ def test_existing_metadata_unknown_model_version():
     ids=BACKWARDS_COMPATIBLE_VERSION_NAMES,
 )
 def test_backwards_compatibility(
-    existing_metadata_file: str,
+    existing_metadata_file: Path,
     metadata: DataDocMetadata,
 ):
-    with Path.open(Path(existing_metadata_file)) as f:
+    with existing_metadata_file.open() as f:
         file_metadata = json.loads(f.read())
 
     # Just test a single value to make sure we have a working model
diff --git a/tests/conftest.py b/tests/conftest.py
index c2e288d9..a7a85344 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -91,13 +91,13 @@ def existing_metadata_path() -> Path:
 
 
 @pytest.fixture()
-def existing_metadata_file(tmp_path: Path, existing_metadata_path: Path) -> str:
+def existing_metadata_file(tmp_path: Path, existing_metadata_path: Path) -> Path:
     # Setup by copying the file into the relevant directory
     shutil.copy(
         existing_metadata_path / TEST_EXISTING_METADATA_FILE_NAME,
         tmp_path / TEST_EXISTING_METADATA_FILE_NAME,
     )
-    return str(tmp_path / TEST_EXISTING_METADATA_FILE_NAME)
+    return tmp_path / TEST_EXISTING_METADATA_FILE_NAME
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/resources/existing_metadata_file/pseudo/person_data_v1__DOC.json b/tests/resources/existing_metadata_file/pseudo/person_data_v1__DOC.json
new file mode 100644
index 00000000..2e2e3a18
--- /dev/null
+++ b/tests/resources/existing_metadata_file/pseudo/person_data_v1__DOC.json
@@ -0,0 +1,22 @@
+{
+  "document_version": "0.0.1",
+  "datadoc": null,
+  "pseudonymization": {
+    "document_version": "0.1.0",
+    "pseudo_dataset": null,
+    "pseudo_variables": [
+      {
+        "short_name": "fnr",
+        "data_element_path": "fnr",
+        "data_element_pattern": "**/fnr",
+        "stable_identifier_type": null,
+        "stable_identifier_version": null,
+        "encryption_algorithm": "TINK-DAEAD",
+        "encryption_key_reference": "ssb-common-key-1",
+        "encryption_algorithm_parameters": [{ "keyId": "ssb-common-key-1" }],
+        "source_variable": null,
+        "source_variable_datatype": null
+      }
+    ]
+  }
+}
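
A minimal, self-contained sketch (not part of the patch) of the container-structure check introduced above, assuming pydantic v2. MetadataContainer below is a simplified stand-in for the real model in ssb-datadoc-model; only the fields this example exercises are modelled. The switch to validation matters because the old key-based check, `"dataset" in metadata["datadoc"]`, would raise TypeError on a document where "datadoc" is null, exactly the kind of file the new pseudo fixture represents.

import json

import pydantic


class MetadataContainer(pydantic.BaseModel):
    """Simplified stand-in for the container model in ssb-datadoc-model."""

    document_version: str
    datadoc: dict | None = None
    pseudonymization: dict | None = None


def is_metadata_in_container_structure(metadata: dict) -> bool:
    # Validating against the container model replaces the old hard-coded
    # key check, so a container document where "datadoc" is null is still
    # recognised as being in the container structure.
    try:
        MetadataContainer.model_validate_json(json.dumps(metadata))
    except pydantic.ValidationError:
        return False
    else:
        return True


# A pseudonymization-only document (with "datadoc": null) is in the container:
assert is_metadata_in_container_structure(
    {"document_version": "0.0.1", "datadoc": None, "pseudonymization": {}},
)
# A pre-container document (no document_version) is not:
assert not is_metadata_in_container_structure({"dataset": {}, "variables": []})

Serialising the dict back to JSON lets pydantic validate it in JSON mode, mirroring the `json.dumps(metadata)` round trip in the patched method.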
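A hedged usage sketch of the behaviour this patch enables: opening a metadata document produced by a pseudonymization process, without a dataset. The metadata_document_path keyword mirrors the parameter referenced in __init__ above, but the full constructor signature is not shown in this diff, and the file path is illustrative.

from datadoc.backend.datadoc_metadata import DataDocMetadata

# Open an existing metadata document on its own (no dataset). Its "datadoc"
# member is null, so extract_metadata_from_existing_document() returns early
# and the in-memory dataset/variables keep their defaults, while the
# "pseudonymization" section survives the next write_metadata_document()
# call, as the new test_existing_pseudo_metadata_file test asserts.
meta = DataDocMetadata(
    metadata_document_path="person_data_v1__DOC.json",  # illustrative path
)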