Skip to content

Commit

Permalink
Update to latest datadoc model (#291)
Browse files Browse the repository at this point in the history
* Added latest model. Updated enums. Started work with backwards compatibility.

* Removed some variables

* Added v2_2_0 json test file

* Some work with backwards compatibility

* Added conversion of language string type

* Refactored checking if there is a metadata container

* Some more work

* adding container

* Add assert

* One more assert

* Updated tests

* Handled upgrade of metadata for version 0_1_1 to 2_2_0

* Changed metadata field upgrade to look at all possible previous values

* Changed the order, when upgrading from LanguageStringType to String

* Some more work

* Added possibility to add new languages to language strings

* Fixed tests

* Fixed tests

* Removed unused code

* Removing custom_type and special_value from GUI

* Fixed error showing None in the input components

* Added check for not none for list

* Fixed pre commit errors

* Fixed pre commit errors

* Change return type

* Fix MyPy errors

* Removed dummy comment

* Removed TODO

* Renamed method

* Disabled unused field

* Disabled unused fields

* Removed TODO

* Removed commented code

* Fixed dataset input field tests

* Removed special value and custom type, and changed tests to be compatible

* Update src/datadoc/frontend/fields/display_dataset.py

Co-authored-by: Miles Mason Winther <[email protected]>

* Added dropdown field to use restriction, and changed contains personal data to checkbox

* Fixed error from merge conflict

* Fixed pre commit

* Fixed pre commit

* Fixed mypy errors

* Added noqa to lambda function

* Fixed typo

---------

Co-authored-by: Joergen <[email protected]>
Co-authored-by: Jorgen-5 <[email protected]>
Co-authored-by: Miles Mason Winther <[email protected]>
  • Loading branch information
4 people authored Apr 15, 2024
1 parent 51b2786 commit 64fc301
Show file tree
Hide file tree
Showing 23 changed files with 1,408 additions and 431 deletions.
594 changes: 370 additions & 224 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ dash = ">=2.15.0"
pydantic = "==2.5.2"
dash-bootstrap-components = ">=1.1.0"
pandas = ">=1.4.2"
ssb-datadoc-model = "4.3.2"
ssb-datadoc-model = "5.1.0"
dapla-toolbelt = ">=1.3.3"
gunicorn = ">=21.2.0"
flask-healthz = ">=0.0.3"
Expand Down
11 changes: 9 additions & 2 deletions src/datadoc/backend/dapla_dataset_path_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,15 @@ def _extract_norwegian_dataset_state_path_part(
) -> set:
norwegian_dataset_state_path_part = dataset_state.get_value_for_language(
SupportedLanguages.NORSK_BOKMÅL,
).lower()
return {norwegian_dataset_state_path_part.replace(" ", x) for x in ["-", "_"]}
)
if norwegian_dataset_state_path_part is not None:
norwegian_dataset_state_path_part = (
norwegian_dataset_state_path_part.lower()
)
return_value = {
norwegian_dataset_state_path_part.replace(" ", x) for x in ["-", "_"]
}
return return_value

@property
def dataset_short_name(
Expand Down
28 changes: 8 additions & 20 deletions src/datadoc/backend/datadoc_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
from datadoc.backend import user_info
from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo
from datadoc.backend.dataset_parser import DatasetParser
from datadoc.backend.model_backwards_compatibility import (
is_metadata_in_container_structure,
)
from datadoc.backend.model_backwards_compatibility import upgrade_metadata
from datadoc.enums import Assessment
from datadoc.enums import DataSetState
Expand Down Expand Up @@ -118,7 +121,10 @@ def extract_metadata_from_existing_document(
with document.open(mode="r", encoding="utf-8") as file:
fresh_metadata = json.load(file)
logger.info("Opened existing metadata file %s", document)
if self.is_metadata_in_container_structure(fresh_metadata):
fresh_metadata = upgrade_metadata(
fresh_metadata,
)
if is_metadata_in_container_structure(fresh_metadata):
self.container = model.MetadataContainer.model_validate_json(
json.dumps(fresh_metadata),
)
Expand All @@ -129,9 +135,6 @@ def extract_metadata_from_existing_document(
# In this case we've read in a file with an empty "datadoc" structure.
# A typical example of this is a file produced from a pseudonymization process.
return
datadoc_metadata = upgrade_metadata(
datadoc_metadata,
)
meta = model.DatadocMetadata.model_validate_json(
json.dumps(datadoc_metadata),
)
Expand All @@ -147,17 +150,6 @@ def extract_metadata_from_existing_document(
exc_info=True,
)

def is_metadata_in_container_structure(
self,
metadata: dict,
) -> bool:
"""At a certain point a metadata 'container' was introduced.
The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc.
This method returns True if the metadata is in the container structure, False otherwise.
"""
return "datadoc" in metadata

def extract_metadata_from_dataset(
self,
dataset: pathlib.Path | CloudPath,
Expand Down Expand Up @@ -188,11 +180,7 @@ def extract_metadata_from_dataset(
metadata_created_by=user_info.get_user_info_for_current_platform().short_email,
# TODO @mmwinther: Remove multiple_language_support once the model is updated.
# https://github.com/statisticsnorway/ssb-datadoc-model/issues/41
subject_field=model.LanguageStringType(
en=subject_field,
nb=subject_field,
nn=subject_field,
),
subject_field=subject_field,
)
self.variables = self.ds_schema.get_fields()

Expand Down
14 changes: 9 additions & 5 deletions src/datadoc/backend/dataset_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import pandas as pd
from datadoc_model.model import LanguageStringType
from datadoc_model.model import LanguageStringTypeItem
from datadoc_model.model import Variable
from pyarrow import parquet as pq

Expand Down Expand Up @@ -212,11 +213,14 @@ def get_fields(self) -> list[Variable]:
# Assume labels are defined in the default language (NORSK_BOKMÅL)
# If this is not correct, the user may fix it via the UI
name=LanguageStringType(
**{
state.current_metadata_language.value: sas_reader.columns[ # type: ignore [attr-defined]
i
].label,
},
[
LanguageStringTypeItem(
languageCode=state.current_metadata_language.value,
languageText=sas_reader.columns[ # type: ignore [attr-defined]
i
].label,
),
],
),
# Access the python type for the value and transform it to a DataDoc Data type
data_type=self.transform_data_type(type(v).__name__.lower()),
Expand Down
126 changes: 117 additions & 9 deletions src/datadoc/backend/model_backwards_compatibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
from typing import TYPE_CHECKING
from typing import Any

from datadoc_model.model import LanguageStringType

if TYPE_CHECKING:
from collections.abc import Callable

Expand Down Expand Up @@ -63,6 +61,79 @@ def handle_current_version(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
return supplied_metadata


def _find_and_update_language_strings(supplied_metadata: dict | None) -> dict | None:
    """Convert every old-style language dict found in *supplied_metadata*.

    Values that are dicts carrying an "en" key are rewritten in place to the
    list-of-items representation. Returns the (mutated) dict, or None when the
    input is not a dict.
    """
    if not isinstance(supplied_metadata, dict):
        return None
    for field_name, field_value in supplied_metadata.items():
        if isinstance(field_value, dict) and "en" in field_value:
            supplied_metadata[field_name] = _convert_language_string_type(field_value)
    return supplied_metadata


def _convert_language_string_type(supplied_value: dict) -> list[dict[str, str]]:
return [
{
"languageCode": "en",
"languageText": supplied_value["en"],
},
{
"languageCode": "nn",
"languageText": supplied_value["nn"],
},
{
"languageCode": "nb",
"languageText": supplied_value["nb"],
},
]


def _remove_element_from_model(
supplied_metadata: dict[str, Any],
element_to_remove: str,
) -> None:
if element_to_remove in supplied_metadata:
del supplied_metadata[element_to_remove]


def handle_version_2_2_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
    """Handle breaking changes for v2.2.0.

    Flattens subject_field to a plain string (preferring nb, then nn, then en),
    drops fields removed from the model (register_uri, sentinel_value_uri),
    nulls out custom_type/special_value, converts old-style language strings,
    and bumps the datadoc document_version to 3.1.0.
    """
    dataset = supplied_metadata["datadoc"]["dataset"]

    subject = dataset["subject_field"]
    if subject is not None:
        # Pick the first populated language variant, in priority order nb, nn, en.
        dataset["subject_field"] = str(subject["nb"] or subject["nn"] or subject["en"])

    _remove_element_from_model(dataset, "register_uri")

    variables = supplied_metadata["datadoc"]["variables"]
    for index, variable in enumerate(variables):
        _remove_element_from_model(variable, "sentinel_value_uri")
        variable["special_value"] = None
        variable["custom_type"] = None
        variables[index] = _find_and_update_language_strings(variable)

    dataset["custom_type"] = None
    supplied_metadata["datadoc"]["dataset"] = _find_and_update_language_strings(dataset)
    supplied_metadata["datadoc"]["document_version"] = "3.1.0"
    return supplied_metadata


def add_container(existing_metadata: dict) -> dict:
    """Add container for previous versions.

    Wraps the supplied metadata in the container structure introduced later,
    with the initial container document_version and no pseudonymization block.
    """
    container: dict = {"document_version": "0.0.1"}
    container["datadoc"] = existing_metadata
    container["pseudonymization"] = None
    return container


def handle_version_2_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
"""Handle breaking changes for v2.1.0.
Expand All @@ -71,7 +142,7 @@ def handle_version_2_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
data = supplied_metadata["dataset"]["owner"]
supplied_metadata["dataset"]["owner"] = str(data["nb"] or data["nn"] or data["en"])
supplied_metadata["document_version"] = "2.2.0"
return supplied_metadata
return add_container(supplied_metadata)


def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
Expand All @@ -86,9 +157,14 @@ def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
timespec="seconds",
)
if isinstance(supplied_metadata["dataset"]["data_source"], str):
supplied_metadata["dataset"]["data_source"] = LanguageStringType(
en=supplied_metadata["dataset"]["data_source"],
)
supplied_metadata["dataset"]["data_source"] = {
"en": supplied_metadata["dataset"]["data_source"],
"nn": "",
"nb": "",
}

_remove_element_from_model(supplied_metadata["dataset"], "data_source_path")

supplied_metadata["document_version"] = "2.1.0"
return supplied_metadata

Expand All @@ -112,21 +188,42 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
supplied_metadata["dataset"] = {
k: None if v == "" else v for k, v in supplied_metadata["dataset"].items()
}

key_renaming = [("data_type", "datatype")]

for i in range(len(supplied_metadata["variables"])):
for new_key, old_key in key_renaming:
supplied_metadata["variables"][i][new_key] = supplied_metadata["variables"][
i
].pop(
old_key,
)

return supplied_metadata


# Register all the supported versions and their handlers.
# MUST be ordered from oldest to newest.
BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1)
BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0)
BackwardsCompatibleVersion(version="2.1.0", handler=handle_version_2_1_0)
BackwardsCompatibleVersion(version="2.2.0", handler=handle_current_version)
BackwardsCompatibleVersion(
version="2.1.0",
handler=handle_version_2_1_0,
) # Her må det lages container
BackwardsCompatibleVersion(version="2.2.0", handler=handle_version_2_2_0)
BackwardsCompatibleVersion(version="3.1.0", handler=handle_current_version)


def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]:
"""Run the handler for this version to upgrade the document to the latest version."""
# Special case for current version, we expose the current_model_version parameter for test purposes
supplied_version = fresh_metadata[VERSION_FIELD_NAME]

if is_metadata_in_container_structure(fresh_metadata):
if fresh_metadata["datadoc"] is None:
return fresh_metadata
supplied_version = fresh_metadata["datadoc"][VERSION_FIELD_NAME]
else:
supplied_version = fresh_metadata[VERSION_FIELD_NAME]
start_running_handlers = False
# Run all the handlers in order from the supplied version onwards
for k, v in SUPPORTED_VERSIONS.items():
Expand All @@ -137,3 +234,14 @@ def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]:
if not start_running_handlers:
raise UnknownModelVersionError(supplied_version)
return fresh_metadata


def is_metadata_in_container_structure(
    metadata: dict,
) -> bool:
    """At a certain point a metadata 'container' was introduced.

    The container provides a structure for different 'types' of metadata, such as
    'datadoc', 'pseudonymization' etc. Returns True if the metadata is in the
    container structure, False otherwise.
    """
    container_marker = "datadoc"
    return container_marker in metadata
Loading

0 comments on commit 64fc301

Please sign in to comment.