Support reading metadata files from pseudo (#202)
* Support reading metadata files from pseudo

* Simplify logic for extracting metadata

* Fix mypy

* Fix checks
mmwinther authored Mar 6, 2024
1 parent 14d515f commit 7f64c6a
Showing 5 changed files with 77 additions and 19 deletions.
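
The point of this change is that metadata documents produced by a pseudonymization process carry a container whose "datadoc" section is null, and DataDocMetadata should read such files without failing. A minimal, illustrative sketch of that behaviour (not the project's actual code; read_datadoc_section is a hypothetical helper):

import json

def read_datadoc_section(document_text: str) -> dict | None:
    """Return the 'datadoc' section of a container, or None if absent/null."""
    # Pseudo-produced containers carry "datadoc": null; treat that as
    # "no editable metadata yet" rather than as an error.
    container = json.loads(document_text)
    return container.get("datadoc")

pseudo_container = '{"document_version": "0.0.1", "datadoc": null, "pseudonymization": {}}'
assert read_datadoc_section(pseudo_container) is None
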
31 changes: 24 additions & 7 deletions src/datadoc/backend/datadoc_metadata.py
@@ -8,6 +8,7 @@
import uuid
from typing import TYPE_CHECKING

import pydantic
from cloudpathlib import CloudPath
from cloudpathlib import GSClient
from cloudpathlib import GSPath
@@ -57,21 +58,20 @@ def __init__(
self.variables: list = []

self.variables_lookup: dict[str, model.Variable] = {}

if metadata_document_path:
# In this case the user has specified an independent metadata document for editing
# without a dataset.
self.metadata_document = self._open_path(metadata_document_path)
self.extract_metadata_from_existing_document(self.metadata_document)
elif dataset_path:
self.dataset_path = self._open_path(dataset_path)
# The short_name is set as the dataset filename without file extension
self.short_name = self.dataset_path.stem
# Build the metadata document path based on the dataset path
# Example: /path/to/dataset.parquet -> /path/to/dataset__DOC.json
self.metadata_document = self.dataset_path.parent / (
self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
)
self.extract_metadata_from_files()

self.extract_metadata_from_files()

@staticmethod
def _open_path(path: str) -> pathlib.Path | CloudPath:
@@ -93,7 +93,12 @@ def extract_metadata_from_files(self) -> None:
"""
if self.metadata_document is not None and self.metadata_document.exists():
self.extract_metadata_from_existing_document(self.metadata_document)
elif self.dataset_path is not None:

if (
self.dataset_path is not None
and self.dataset == model.Dataset()
and len(self.variables) == 0
):
self.extract_metadata_from_dataset(self.dataset_path)
self.dataset.id = uuid.uuid4()
# Set default values for variables where appropriate
@@ -128,6 +133,11 @@ def extract_metadata_from_existing_document(
else:
datadoc_metadata = fresh_metadata

if datadoc_metadata is None:
# In this case we've read in a file with an empty "datadoc" structure.
# A typical example of this is a file produced from a pseudonymization process.
return

datadoc_metadata = upgrade_metadata(
datadoc_metadata,
)
@@ -157,7 +167,14 @@ def is_metadata_in_container_structure(
The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc.
This method returns True if the metadata is in the container structure, False otherwise.
"""
return "datadoc" in metadata and "dataset" in metadata["datadoc"]
try:
model.MetadataContainer.model_validate_json(
json.dumps(metadata),
)
except pydantic.ValidationError:
return False
else:
return True

def extract_metadata_from_dataset(
self,
@@ -176,7 +193,7 @@ def extract_metadata_from_dataset(
)

self.dataset = model.Dataset(
short_name=self.short_name,
short_name=self.dataset_path.stem if self.dataset_path else None,
dataset_state=dapla_dataset_path_info.dataset_state,
dataset_status=DataSetStatus.DRAFT,
assessment=self.get_assessment_by_state(
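
The key change in this file is that is_metadata_in_container_structure now delegates to pydantic validation instead of probing dict keys. A self-contained sketch of the pattern, using a hypothetical container model in place of ssb-datadoc-model's MetadataContainer (the field names here are illustrative, not the real schema):

import json
import pydantic

class Container(pydantic.BaseModel):
    """Stand-in for model.MetadataContainer (hypothetical fields)."""
    document_version: str
    datadoc: dict | None = None

def is_container(metadata: dict) -> bool:
    # Validation either succeeds (the dict is a container) or raises.
    try:
        Container.model_validate_json(json.dumps(metadata))
    except pydantic.ValidationError:
        return False
    else:
        return True

assert is_container({"document_version": "0.0.1", "datadoc": None})
assert not is_container({"dataset": {"short_name": "x"}})  # bare metadata, no container

Compared with the old key check, this accepts a container whose "datadoc" section is null, which is exactly the shape a pseudonymization process produces.
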
35 changes: 27 additions & 8 deletions tests/backend/test_datadoc_metadata.py
@@ -143,15 +143,15 @@ def test_metadata_id(metadata: DataDocMetadata):
[TEST_EXISTING_METADATA_DIRECTORY / "invalid_id_field"],
)
def test_existing_metadata_none_id(
existing_metadata_file: str,
existing_metadata_file: Path,
metadata: DataDocMetadata,
):
with Path.open(Path(existing_metadata_file)) as f:
with existing_metadata_file.open() as f:
pre_open_id: None = json.load(f)["datadoc"]["dataset"]["id"]
assert pre_open_id is None
assert isinstance(metadata.dataset.id, UUID)
metadata.write_metadata_document()
with Path.open(Path(existing_metadata_file)) as f:
with existing_metadata_file.open() as f:
post_write_id = json.load(f)["datadoc"]["dataset"]["id"]
assert post_write_id == str(metadata.dataset.id)

@@ -161,18 +161,18 @@ def test_existing_metadata_none_id(
[TEST_EXISTING_METADATA_DIRECTORY / "valid_id_field"],
)
def test_existing_metadata_valid_id(
existing_metadata_file: str,
existing_metadata_file: Path,
metadata: DataDocMetadata,
):
pre_open_id = ""
post_write_id = ""
with Path.open(Path(existing_metadata_file)) as f:
with existing_metadata_file.open() as f:
pre_open_id = json.load(f)["datadoc"]["dataset"]["id"]
assert pre_open_id is not None
assert isinstance(metadata.dataset.id, UUID)
assert str(metadata.dataset.id) == pre_open_id
metadata.write_metadata_document()
with Path.open(Path(existing_metadata_file)) as f:
with existing_metadata_file.open() as f:
post_write_id = json.load(f)["datadoc"]["dataset"]["id"]
assert post_write_id == pre_open_id

@@ -188,11 +188,11 @@ def test_direct_person_identifying_default_value(metadata: DataDocMetadata):


def test_save_file_path_metadata_field(
existing_metadata_file: str,
existing_metadata_file: Path,
metadata: DataDocMetadata,
):
metadata.write_metadata_document()
with Path.open(Path(existing_metadata_file)) as f:
with existing_metadata_file.open() as f:
saved_file_path = json.load(f)["datadoc"]["dataset"]["file_path"]
assert saved_file_path == str(metadata.dataset_path)

@@ -345,3 +345,22 @@ def test_extract_subject_field_value_from_statistic_structure_xml(
# TODO @mmwinther: Remove multiple_language_support once the model is updated.
# https://github.com/statisticsnorway/ssb-datadoc-model/issues/41
assert metadata.dataset.subject_field.en == expected_subject_code # type: ignore [union-attr]


@pytest.mark.parametrize(
"existing_metadata_path",
[TEST_EXISTING_METADATA_DIRECTORY / "pseudo"],
)
def test_existing_pseudo_metadata_file(
existing_metadata_file: Path,
metadata: DataDocMetadata,
):
pre_open_metadata = json.loads(existing_metadata_file.read_text())
metadata.write_metadata_document()
post_open_metadata = json.loads(existing_metadata_file.read_text())

assert len(metadata.variables) == 8 # noqa: PLR2004
assert (
pre_open_metadata["pseudonymization"] == post_open_metadata["pseudonymization"]
)
assert post_open_metadata["datadoc"] is not None
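
test_existing_pseudo_metadata_file checks that writing the document back leaves the pseudonymization section intact while populating the datadoc section. A minimal sketch of that preserve-on-write pattern (write_back is a hypothetical helper, not the project's API):

import json
from pathlib import Path

def write_back(path: Path, datadoc_section: dict) -> None:
    container = json.loads(path.read_text())
    container["datadoc"] = datadoc_section  # other sections pass through untouched
    path.write_text(json.dumps(container, indent=2))
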
4 changes: 2 additions & 2 deletions tests/backend/test_model_backwards_compatibility.py
@@ -37,10 +37,10 @@ def test_existing_metadata_unknown_model_version():
ids=BACKWARDS_COMPATIBLE_VERSION_NAMES,
)
def test_backwards_compatibility(
existing_metadata_file: str,
existing_metadata_file: Path,
metadata: DataDocMetadata,
):
with Path.open(Path(existing_metadata_file)) as f:
with existing_metadata_file.open() as f:
file_metadata = json.loads(f.read())

# Just test a single value to make sure we have a working model
4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -91,13 +91,13 @@ def existing_metadata_path() -> Path:


@pytest.fixture()
def existing_metadata_file(tmp_path: Path, existing_metadata_path: Path) -> str:
def existing_metadata_file(tmp_path: Path, existing_metadata_path: Path) -> Path:
# Setup by copying the file into the relevant directory
shutil.copy(
existing_metadata_path / TEST_EXISTING_METADATA_FILE_NAME,
tmp_path / TEST_EXISTING_METADATA_FILE_NAME,
)
return str(tmp_path / TEST_EXISTING_METADATA_FILE_NAME)
return tmp_path / TEST_EXISTING_METADATA_FILE_NAME


@pytest.fixture(autouse=True)
22 changes: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
{
"document_version": "0.0.1",
"datadoc": null,
"pseudonymization": {
"document_version": "0.1.0",
"pseudo_dataset": null,
"pseudo_variables": [
{
"short_name": "fnr",
"data_element_path": "fnr",
"data_element_pattern": "**/fnr",
"stable_identifier_type": null,
"stable_identifier_version": null,
"encryption_algorithm": "TINK-DAEAD",
"encryption_key_reference": "ssb-common-key-1",
"encryption_algorithm_parameters": [{ "keyId": "ssb-common-key-1" }],
"source_variable": null,
"source_variable_datatype": null
}
]
}
}
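
This new fixture is the "pseudo" metadata document exercised by test_existing_pseudo_metadata_file above: the "datadoc" key is null, so only the pseudonymization section carries data. A short sketch of reading it directly (the file path here is illustrative, not the fixture's real location):

import json
from pathlib import Path

content = json.loads(Path("pseudo/metadata__DOC.json").read_text())  # hypothetical path
assert content["datadoc"] is None
assert content["pseudonymization"]["pseudo_variables"][0]["encryption_algorithm"] == "TINK-DAEAD"
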
