Merge pull request #206 from statisticsnorway/feat/dpmeta-62-add-klass-codes-to-owner

Changes to owner
JanhSander authored Mar 11, 2024
2 parents 518804e + 83107aa commit 9c547f8
Showing 30 changed files with 435 additions and 175 deletions.
10 changes: 5 additions & 5 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ dash = ">=2.15.0"
 pydantic = "==2.5.2"
 dash-bootstrap-components = ">=1.1.0"
 pandas = ">=1.4.2"
-ssb-datadoc-model = "==4.2.0"
+ssb-datadoc-model = "4.3.2"
 dapla-toolbelt = ">=1.3.3"
 gunicorn = ">=21.2.0"
 flask-healthz = ">=0.0.3"
8 changes: 6 additions & 2 deletions src/datadoc/app.py
@@ -14,9 +14,9 @@

 from datadoc import config
 from datadoc import state
+from datadoc.backend.code_list import CodeList
 from datadoc.backend.datadoc_metadata import DataDocMetadata
 from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping
-from datadoc.backend.unit_types import UnitTypes
 from datadoc.enums import SupportedLanguages
 from datadoc.frontend.callbacks.register_callbacks import register_callbacks
 from datadoc.frontend.components.alerts import dataset_validation_error
@@ -122,10 +122,14 @@ def collect_data_from_external_sources() -> None:
         config.get_statistical_subject_source_url(),
     )

-    state.unit_types = UnitTypes(
+    state.unit_types = CodeList(
         config.get_unit_code(),
     )

+    state.organisational_units = CodeList(
+        config.get_organisational_unit_code(),
+    )
+

 def main(dataset_path: str | None = None) -> None:
     """Entrypoint when running as a script."""
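
For orientation, a minimal sketch (editorial, not part of the diff) of what collect_data_from_external_sources now sets up, assuming the config helpers added in src/datadoc/config.py further down:

    from datadoc import config
    from datadoc import state
    from datadoc.backend.code_list import CodeList

    # Both code lists are fetched from Klass in background threads (see
    # external_sources.py below); the classification ids default to 702 and 83.
    state.unit_types = CodeList(config.get_unit_code())
    state.organisational_units = CodeList(config.get_organisational_unit_code())
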
src/datadoc/backend/{unit_types.py → code_list.py}
@@ -16,37 +16,39 @@


 @dataclass
-class UnitType:
-    """Data structure for the a unit type."""
+class CodeListItem:
+    """Data structure for a code list item."""

     titles: dict[str, str]
-    unit_code: str
+    code: str

     def get_title(self, language: SupportedLanguages) -> str:
         """Get the title in the given language."""
         try:
-            return self.titles[
-                (
-                    # Adjust to language codes in the UnitTypes structure.
-                    "nb"
-                    if language
-                    in [
-                        SupportedLanguages.NORSK_BOKMÅL,
-                        SupportedLanguages.NORSK_NYNORSK,
-                    ]
-                    else "en"
-                )
-            ]
+            return self.titles[language]
         except KeyError:
-            logger.exception(
-                "Could not find title for subject %s and language: %s",
-                self,
-                language.name,
-            )
-            return ""
+            try:
+                return self.titles[
+                    (
+                        "nb"
+                        if language
+                        in [
+                            SupportedLanguages.NORSK_BOKMÅL,
+                            SupportedLanguages.NORSK_NYNORSK,
+                        ]
+                        else "en"
+                    )
+                ]
+            except KeyError:
+                logger.exception(
+                    "Could not find title for subject %s and language: %s",
+                    self,
+                    language.name,
+                )
+                return ""


-class UnitTypes(GetExternalSource):
+class CodeList(GetExternalSource):
     """Class for retrieving classifications from Klass."""

     def __init__(self, classification_id: int | None) -> None:
@@ -58,13 +60,9 @@ def __init__(self, classification_id: int | None) -> None:
             SupportedLanguages.NORSK_BOKMÅL.value,
             SupportedLanguages.ENGLISH.value,
         ]
-
-        self._classifications: list[UnitType] = []
-
+        self._classifications: list[CodeListItem] = []
         self.classification_id = classification_id
-
         self.classifications_dataframes: dict[str, pd.DataFrame] | None = None
-
         super().__init__()

     def _fetch_data_from_external_source(
@@ -85,7 +83,6 @@ def _fetch_data_from_external_source(
                 .get_codes()
                 .data
             )
-
         except Exception:
             logger.exception(
                 "Exception while getting classifications from Klass",
@@ -110,10 +107,10 @@ def _extract_titles(
             list_of_titles.append(titles)
         return list_of_titles

-    def _create_unit_types_from_dataframe(
+    def _create_code_list_from_dataframe(
         self,
         classifications_dataframes: dict[SupportedLanguages, pd.DataFrame],
-    ) -> list[UnitType]:
+    ) -> list[CodeListItem]:
         """Method that finds the name column in the dataframe, and returns all values in a list."""
         classification_names = self._extract_titles(classifications_dataframes)
         classification_codes: list
@@ -128,7 +125,7 @@ def _create_unit_types_from_dataframe(
         unit_types = []
         for a, b in zip(classification_names, classification_codes):
             unit_types.append(
-                UnitType(a, b),
+                CodeListItem(a, b),
             )
         return unit_types

@@ -137,7 +134,7 @@ def _get_classification_dataframe_if_loaded(self) -> bool:
         if not self._classifications:
             self.classifications_dataframes = self.retrieve_external_data()
             if self.classifications_dataframes is not None:
-                self._classifications = self._create_unit_types_from_dataframe(
+                self._classifications = self._create_code_list_from_dataframe(
                     self.classifications_dataframes,
                 )
                 logger.debug(
@@ -151,7 +148,7 @@ def _get_classification_dataframe_if_loaded(self) -> bool:
         return False

     @property
-    def classifications(self) -> list[UnitType]:
+    def classifications(self) -> list[CodeListItem]:
         """Getter for primary subjects."""
         self._get_classification_dataframe_if_loaded()
         logger.debug("Got %s classifications subjects", len(self._classifications))
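
A short usage sketch of the renamed classes (editorial illustration, not from the PR):

    from datadoc.backend.code_list import CodeList
    from datadoc.backend.code_list import CodeListItem
    from datadoc.enums import SupportedLanguages

    # Constructed by hand for illustration; in the app these come from Klass.
    item = CodeListItem(titles={"nb": "Person", "en": "Person"}, code="01")

    # There is no "nn" entry in titles, so the reworked get_title falls back
    # to the "nb" title for Nynorsk.
    assert item.get_title(SupportedLanguages.NORSK_NYNORSK) == "Person"

    # Accessing .classifications triggers the lazy fetch from Klass.
    unit_types = CodeList(classification_id=702)
    for it in unit_types.classifications:
        print(it.code, it.get_title(SupportedLanguages.ENGLISH))

The nested try keeps direct language-key hits cheap while still tolerating code lists that only carry "nb" and "en" titles.
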
23 changes: 2 additions & 21 deletions src/datadoc/backend/datadoc_metadata.py
@@ -8,7 +8,6 @@
 import uuid
 from typing import TYPE_CHECKING

-import pydantic
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
@@ -51,16 +50,13 @@ def __init__(
     ) -> None:
         """Read in a dataset if supplied, otherwise naively instantiate the class."""
         self._statistic_subject_mapping = statistic_subject_mapping
-
         self.metadata_document: pathlib.Path | CloudPath | None = None
         self.container: model.MetadataContainer | None = None
         self.dataset_path: pathlib.Path | CloudPath | None = None
         self.short_name: str | None = None
         self.dataset = model.Dataset()
         self.variables: list = []
-
         self.variables_lookup: dict[str, model.Variable] = {}
-
         if metadata_document_path:
             # In this case the user has specified an independent metadata document for editing
             # without a dataset.
@@ -72,7 +68,6 @@ def __init__(
             self.metadata_document = self.dataset_path.parent / (
                 self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
             )
-
         self.extract_metadata_from_files()

     @staticmethod
@@ -123,35 +118,28 @@ def extract_metadata_from_existing_document(
         try:
             with document.open(mode="r", encoding="utf-8") as file:
                 fresh_metadata = json.load(file)
-            logger.info(
-                "Opened existing metadata file %s",
-                document,
-            )
+            logger.info("Opened existing metadata file %s", document)
             if self.is_metadata_in_container_structure(fresh_metadata):
                 self.container = model.MetadataContainer.model_validate_json(
                     json.dumps(fresh_metadata),
                 )
                 datadoc_metadata = fresh_metadata["datadoc"]
             else:
                 datadoc_metadata = fresh_metadata
-
             if datadoc_metadata is None:
                 # In this case we've read in a file with an empty "datadoc" structure.
                 # A typical example of this is a file produced from a pseudonymization process.
                 return
-
             datadoc_metadata = upgrade_metadata(
                 datadoc_metadata,
             )
-
             meta = model.DatadocMetadata.model_validate_json(
                 json.dumps(datadoc_metadata),
             )
             if meta.dataset is not None:
                 self.dataset = meta.dataset
             if meta.variables is not None:
                 self.variables = meta.variables
-
         except json.JSONDecodeError:
             logger.warning(
                 "Could not open existing metadata file %s. \
@@ -169,14 +157,7 @@ def is_metadata_in_container_structure(
         The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc.
         This method returns True if the metadata is in the container structure, False otherwise.
         """
-        try:
-            model.MetadataContainer.model_validate_json(
-                json.dumps(metadata),
-            )
-        except pydantic.ValidationError:
-            return False
-        else:
-            return True
+        return "datadoc" in metadata

     def extract_metadata_from_dataset(
         self,
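
The container test drops the full pydantic round-trip in favour of a key check. A small illustration (both documents are invented):

    # Hypothetical metadata documents, for illustration only.
    container_doc = {"datadoc": {"document_version": "2.2.0"}, "pseudonymization": None}
    plain_doc = {"document_version": "2.2.0", "dataset": {}}

    # New behaviour: the presence of a top-level "datadoc" key decides the
    # answer, even when other container fields would fail model validation.
    assert "datadoc" in container_doc
    assert "datadoc" not in plain_doc

This is cheaper, no longer misclassifies containers whose inner payload is temporarily invalid, and lets the pydantic import go.
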
1 change: 0 additions & 1 deletion src/datadoc/backend/external_sources/external_sources.py
@@ -21,7 +21,6 @@ def __init__(self) -> None:
         Initializes the future object.
         """
         self.future: concurrent.futures.Future[T | None] | None = None
-
         executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
         self.future = executor.submit(
             self._fetch_data_from_external_source,
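
For context, the pattern around this change (an editorial sketch, not the module's real code): the constructor submits the fetch to a single-worker executor, so subclasses such as CodeList only block when the result is first needed. This assumes a retrieve_external_data helper that resolves the future, as called in code_list.py above:

    import concurrent.futures

    class ExternalSourceSketch:
        """Minimal stand-in for GetExternalSource (illustration only)."""

        def __init__(self) -> None:
            executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            # Kick off the fetch immediately; callers collect the result later.
            self.future = executor.submit(self._fetch_data_from_external_source)

        def _fetch_data_from_external_source(self) -> dict:
            # Real subclasses call the Klass API here.
            return {"codes": []}

        def retrieve_external_data(self) -> dict | None:
            return self.future.result() if self.future else None
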
33 changes: 15 additions & 18 deletions src/datadoc/backend/model_backwards_compatibility.py
@@ -63,12 +63,20 @@ def handle_current_version(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
     return supplied_metadata


+def handle_version_2_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
+    """Handle breaking changes for v2.1.0.
+
+    Datatype changed from LanguageStringType to str for owner.
+    """
+    data = supplied_metadata["dataset"]["owner"]
+    supplied_metadata["dataset"]["owner"] = str(data["nb"] or data["nn"] or data["en"])
+    supplied_metadata["document_version"] = "2.2.0"
+    return supplied_metadata
+
+
 def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
     """Handle breaking changes for v1.0.0."""
-    datetime_fields = [
-        ("metadata_created_date"),
-        ("metadata_last_updated_date"),
-    ]
+    datetime_fields = [("metadata_created_date"), ("metadata_last_updated_date")]
     for field in datetime_fields:
         if supplied_metadata["dataset"][field]:
             supplied_metadata["dataset"][field] = datetime.isoformat(
@@ -77,13 +85,11 @@ def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
                 ),
                 timespec="seconds",
             )
-
     if isinstance(supplied_metadata["dataset"]["data_source"], str):
         supplied_metadata["dataset"]["data_source"] = LanguageStringType(
             en=supplied_metadata["dataset"]["data_source"],
         )
     supplied_metadata["document_version"] = "2.1.0"
-
     return supplied_metadata


@@ -102,7 +108,6 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
         supplied_metadata["dataset"][new_key] = supplied_metadata["dataset"].pop(
             old_key,
         )
-
     # Replace empty strings with None, empty strings are not valid for LanguageStrings values
     supplied_metadata["dataset"] = {
         k: None if v == "" else v for k, v in supplied_metadata["dataset"].items()
@@ -113,30 +118,22 @@
 # Register all the supported versions and their handlers.
 # MUST be ordered from oldest to newest.
 BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1)
-BackwardsCompatibleVersion(
-    version="1.0.0",
-    handler=handle_version_1_0_0,
-)
-BackwardsCompatibleVersion(
-    version="2.1.0",
-    handler=handle_current_version,
-)
+BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0)
+BackwardsCompatibleVersion(version="2.1.0", handler=handle_version_2_1_0)
+BackwardsCompatibleVersion(version="2.2.0", handler=handle_current_version)


 def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]:
     """Run the handler for this version to upgrade the document to the latest version."""
     # Special case for current version, we expose the current_model_version parameter for test purposes
     supplied_version = fresh_metadata[VERSION_FIELD_NAME]
     start_running_handlers = False
-
     # Run all the handlers in order from the supplied version onwards
     for k, v in SUPPORTED_VERSIONS.items():
         if k == supplied_version:
             start_running_handlers = True
         if start_running_handlers:
             fresh_metadata = v.handler(fresh_metadata)
-
     if not start_running_handlers:
         raise UnknownModelVersionError(supplied_version)
-
     return fresh_metadata
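
A worked illustration of the new handler (the sample document is invented): an owner stored as a LanguageStringType-style dict under document_version 2.1.0 is flattened to a plain string and the document is re-versioned to 2.2.0, after which upgrade_metadata hands it on to handle_current_version.

    from datadoc.backend.model_backwards_compatibility import handle_version_2_1_0

    doc = {
        "document_version": "2.1.0",
        "dataset": {
            "owner": {"nb": "Seksjon for befolkningsstatistikk", "nn": None, "en": None},
        },
    }
    upgraded = handle_version_2_1_0(doc)
    assert upgraded["dataset"]["owner"] == "Seksjon for befolkningsstatistikk"
    assert upgraded["document_version"] == "2.2.0"
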
5 changes: 5 additions & 0 deletions src/datadoc/config.py
@@ -128,3 +128,8 @@ def get_oidc_token() -> str | None:
 def get_unit_code() -> int | None:
     """The code for the Unit Type code list in Klass."""
     return int(_get_config_item("DATADOC_UNIT_CODE") or 702)
+
+
+def get_organisational_unit_code() -> int | None:
+    """The code for the organisational units code list in Klass."""
+    return int(_get_config_item("DATADOC_ORGANISATIONAL_UNIT_CODE") or 83)
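
Like DATADOC_UNIT_CODE above it, the new id can be overridden through the environment; a sketch, assuming _get_config_item reads environment variables:

    import os

    os.environ["DATADOC_ORGANISATIONAL_UNIT_CODE"] = "123"
    # get_organisational_unit_code() now returns 123; unset, it falls back to 83.
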