From 3b12a66c0f590d63c12247e918faf7409632edc2 Mon Sep 17 00:00:00 2001 From: Cecilie Seim <68303562+tilen1976@users.noreply.github.com> Date: Mon, 19 Aug 2024 11:35:55 +0200 Subject: [PATCH] Import dapla-toolbelt-metadata (#395) * delete backend * add dapla-toolbelt-metadata * change imports * change import * fix conftest * ruff allow norwegian characters * refactor imports from dapla-toolbelt-metadata * remove model from ssb-datadoc-model * remove requests dependency * remove requests dependency * remove pandas dependency * remove dapla-toolbelt, gunicorn,pyarrow,ssb-klass-python, pyjwt,cloudpathlib and beatifulsoup4 dependencies * remove * update docs - add TODOs for possible removal * re-add gunicorn - in use in dockerfile * remove unused method _missing method in enum.py * remove unused fixture 'metadata merged' * remove unused fixtures * Update pyproject.toml Co-authored-by: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com> * Update pyproject.toml Co-authored-by: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com> --------- Co-authored-by: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com> --- docs/reference.md | 178 ++-- poetry.lock | 54 +- pyproject.toml | 15 +- src/datadoc/app.py | 6 +- src/datadoc/backend/__init__.py | 1 - src/datadoc/backend/code_list.py | 241 ------ src/datadoc/backend/constants.py | 90 -- src/datadoc/backend/core.py | 544 ------------ .../backend/dapla_dataset_path_info.py | 691 --------------- src/datadoc/backend/dataset_parser.py | 240 ------ .../backend/external_sources/__init__.py | 1 - .../external_sources/external_sources.py | 87 -- .../backend/model_backwards_compatibility.py | 520 ------------ src/datadoc/backend/model_validation.py | 188 ---- .../backend/statistic_subject_mapping.py | 182 ---- src/datadoc/backend/user_info.py | 88 -- src/datadoc/backend/utils.py | 388 --------- src/datadoc/config.py | 11 +- src/datadoc/enums.py | 325 ++++--- src/datadoc/frontend/callbacks/dataset.py | 18 +- .../frontend/callbacks/register_callbacks.py | 4 +- src/datadoc/frontend/callbacks/utils.py | 2 +- src/datadoc/frontend/callbacks/variables.py | 15 +- src/datadoc/frontend/components/builders.py | 2 +- .../frontend/{text.py => constants.py} | 2 + src/datadoc/frontend/fields/display_base.py | 29 +- .../frontend/fields/display_dataset.py | 25 +- .../frontend/fields/display_variables.py | 15 +- src/datadoc/state.py | 8 +- tests/backend/__init__.py | 1 - tests/backend/test_code_list.py | 131 --- tests/backend/test_dapla_dataset_path_info.py | 307 ------- tests/backend/test_datadoc_metadata.py | 800 ------------------ tests/backend/test_dataset_parser.py | 130 --- .../test_model_backwards_compatibility.py | 103 --- .../backend/test_statistic_subject_mapping.py | 176 ---- tests/backend/test_user_info.py | 112 --- tests/backend/test_utils.py | 41 - tests/backend/test_validators.py | 305 ------- tests/conftest.py | 94 +- .../callbacks/test_callbacks_utils.py | 2 +- .../callbacks/test_dataset_callbacks.py | 35 +- .../callbacks/test_variables_callbacks.py | 42 +- .../test_build_dataset_edit_section.py | 11 +- .../components/test_build_edit_section.py | 2 +- .../components/test_build_input_section.py | 2 +- tests/frontend/fields/test_display_dataset.py | 6 +- tests/test_model.py | 7 +- 48 files changed, 479 insertions(+), 5798 deletions(-) delete mode 100644 src/datadoc/backend/__init__.py delete mode 100644 src/datadoc/backend/code_list.py delete mode 100644 src/datadoc/backend/constants.py delete mode 100644 
src/datadoc/backend/core.py delete mode 100644 src/datadoc/backend/dapla_dataset_path_info.py delete mode 100644 src/datadoc/backend/dataset_parser.py delete mode 100644 src/datadoc/backend/external_sources/__init__.py delete mode 100644 src/datadoc/backend/external_sources/external_sources.py delete mode 100644 src/datadoc/backend/model_backwards_compatibility.py delete mode 100644 src/datadoc/backend/model_validation.py delete mode 100644 src/datadoc/backend/statistic_subject_mapping.py delete mode 100644 src/datadoc/backend/user_info.py delete mode 100644 src/datadoc/backend/utils.py rename src/datadoc/frontend/{text.py => constants.py} (73%) delete mode 100644 tests/backend/__init__.py delete mode 100644 tests/backend/test_code_list.py delete mode 100644 tests/backend/test_dapla_dataset_path_info.py delete mode 100644 tests/backend/test_datadoc_metadata.py delete mode 100644 tests/backend/test_dataset_parser.py delete mode 100644 tests/backend/test_model_backwards_compatibility.py delete mode 100644 tests/backend/test_statistic_subject_mapping.py delete mode 100644 tests/backend/test_user_info.py delete mode 100644 tests/backend/test_utils.py delete mode 100644 tests/backend/test_validators.py diff --git a/docs/reference.md b/docs/reference.md index 9444ec93..b2ec9adc 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -15,6 +15,18 @@ delete the .rst file afterwards. datadoc package =============== +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + datadoc.frontend + datadoc.logging_configuration + +Submodules +---------- + datadoc.app module ------------------ @@ -23,6 +35,22 @@ datadoc.app module :undoc-members: :show-inheritance: +datadoc.config module +--------------------- + +.. automodule:: datadoc.config + :members: + :undoc-members: + :show-inheritance: + +datadoc.constants module +------------------------ + +.. automodule:: datadoc.constants + :members: + :undoc-members: + :show-inheritance: + datadoc.enums module -------------------- @@ -50,38 +78,45 @@ datadoc.utils module datadoc.wsgi module ------------------- -datadoc.backend package -======================= - +.. automodule:: datadoc.wsgi + :members: + :undoc-members: + :show-inheritance: -datadoc.backend.datadoc\_metadata module ----------------------------------------- +Module contents +--------------- -.. automodule:: datadoc.backend.core +.. automodule:: datadoc :members: :undoc-members: :show-inheritance: -datadoc.backend.dataset\_parser module --------------------------------------- +datadoc.logging\_configuration package +====================================== + +Submodules +---------- -.. automodule:: datadoc.backend.dataset_parser +datadoc.logging\_configuration.json\_formatter module +----------------------------------------------------- + +.. automodule:: datadoc.logging_configuration.json_formatter :members: :undoc-members: :show-inheritance: -datadoc.backend.model\_backwards\_compatibility module ------------------------------------------------------- +datadoc.logging\_configuration.logging\_config module +----------------------------------------------------- -.. automodule:: datadoc.backend.model_backwards_compatibility +.. automodule:: datadoc.logging_configuration.logging_config :members: :undoc-members: :show-inheritance: -datadoc.backend.storage\_adapter module ---------------------------------------- +Module contents +--------------- -.. automodule:: datadoc.backend.storage_adapter +.. 
automodule:: datadoc.logging_configuration :members: :undoc-members: :show-inheritance: @@ -89,55 +124,79 @@ datadoc.backend.storage\_adapter module datadoc.frontend package ======================== +Subpackages +----------- -datadoc.frontend.callbacks package -================================== +.. toctree:: + :maxdepth: 4 + datadoc.frontend.callbacks + datadoc.frontend.components + datadoc.frontend.fields -datadoc.frontend.callbacks.dataset module ------------------------------------------ +Submodules +---------- -.. automodule:: datadoc.frontend.callbacks.dataset +datadoc.frontend.constants module +--------------------------------- + +.. automodule:: datadoc.frontend.constants :members: :undoc-members: :show-inheritance: -datadoc.frontend.callbacks.register\_callbacks module ------------------------------------------------------ +Module contents +--------------- -.. automodule:: datadoc.frontend.callbacks.register_callbacks +.. automodule:: datadoc.frontend :members: :undoc-members: :show-inheritance: -datadoc.frontend.callbacks.utils module ---------------------------------------- +datadoc.frontend.fields package +=============================== -.. automodule:: datadoc.frontend.callbacks.utils +Submodules +---------- + +datadoc.frontend.fields.display\_base module +-------------------------------------------- + +.. automodule:: datadoc.frontend.fields.display_base :members: :undoc-members: :show-inheritance: -datadoc.frontend.callbacks.variables module -------------------------------------------- +datadoc.frontend.fields.display\_dataset module +----------------------------------------------- -.. automodule:: datadoc.frontend.callbacks.variables +.. automodule:: datadoc.frontend.fields.display_dataset :members: :undoc-members: :show-inheritance: -datadoc.frontend.components package -=================================== +datadoc.frontend.fields.display\_variables module +------------------------------------------------- +.. automodule:: datadoc.frontend.fields.display_variables + :members: + :undoc-members: + :show-inheritance: -datadoc.frontend.components.alerts module ------------------------------------------ +Module contents +--------------- -.. automodule:: datadoc.frontend.components.alerts +.. automodule:: datadoc.frontend.fields :members: :undoc-members: :show-inheritance: +datadoc.frontend.components package +=================================== + +Submodules +---------- + datadoc.frontend.components.builders module ------------------------------------------- @@ -154,48 +213,65 @@ datadoc.frontend.components.control\_bars module :undoc-members: :show-inheritance: -datadoc.frontend.components.dataset\_tab module ------------------------------------------------ +datadoc.frontend.components.identifiers module +---------------------------------------------- -.. automodule:: datadoc.frontend.components.dataset_tab +.. automodule:: datadoc.frontend.components.identifiers :members: :undoc-members: :show-inheritance: -datadoc.frontend.components.variables\_tab module -------------------------------------------------- +Module contents +--------------- -.. automodule:: datadoc.frontend.components.variables_tab +.. 
automodule:: datadoc.frontend.components :members: :undoc-members: :show-inheritance: -datadoc.frontend.fields package -=============================== +datadoc.frontend.callbacks package +================================== +Submodules +---------- -datadoc.frontend.fields.display\_base module --------------------------------------------- +datadoc.frontend.callbacks.dataset module +----------------------------------------- -.. automodule:: datadoc.frontend.fields.display_base +.. automodule:: datadoc.frontend.callbacks.dataset :members: :undoc-members: :show-inheritance: -datadoc.frontend.fields.display\_dataset module ------------------------------------------------ +datadoc.frontend.callbacks.register\_callbacks module +----------------------------------------------------- -.. automodule:: datadoc.frontend.fields.display_dataset +.. automodule:: datadoc.frontend.callbacks.register_callbacks :members: :undoc-members: :show-inheritance: -datadoc.frontend.fields.display\_variables module -------------------------------------------------- +datadoc.frontend.callbacks.utils module +--------------------------------------- -.. automodule:: datadoc.frontend.fields.display_variables +.. automodule:: datadoc.frontend.callbacks.utils :members: :undoc-members: :show-inheritance: +datadoc.frontend.callbacks.variables module +------------------------------------------- + +.. automodule:: datadoc.frontend.callbacks.variables + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: datadoc.frontend.callbacks + :members: + :undoc-members: + :show-inheritance: ``` diff --git a/poetry.lock b/poetry.lock index 643579dd..22686273 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -344,6 +344,20 @@ files = [ {file = "blinker-1.8.2.tar.gz", hash = "sha256:8f77b09d3bf7c795e969e9486f39c2c5e9c39d4ee07424be2bc594ece9642d83"}, ] +[[package]] +name = "bs4" +version = "0.0.2" +description = "Dummy package for Beautiful Soup (beautifulsoup4)" +optional = false +python-versions = "*" +files = [ + {file = "bs4-0.0.2-py2.py3-none-any.whl", hash = "sha256:abf8742c0805ef7f662dce4b51cca104cffe52b835238afc169142ab9b3fbccc"}, + {file = "bs4-0.0.2.tar.gz", hash = "sha256:a48685c58f50fe127722417bae83fe6badf500d54b55f7e39ffe43b798653925"}, +] + +[package.dependencies] +beautifulsoup4 = "*" + [[package]] name = "cachetools" version = "5.4.0" @@ -796,6 +810,32 @@ pyjwt = ">=2.6.0" requests = ">=2.27.1" tomli = ">=1.1.0" +[[package]] +name = "dapla-toolbelt-metadata" +version = "0.2.0" +description = "Dapla Toolbelt Metadata" +optional = false +python-versions = "<4.0,>=3.10" +files = [ + {file = "dapla_toolbelt_metadata-0.2.0-py3-none-any.whl", hash = "sha256:b2479b36d91b03b96735cd2e7a55631a7ffd1f4eb28aea17f297d07cc354897a"}, + {file = "dapla_toolbelt_metadata-0.2.0.tar.gz", hash = "sha256:6bd69eddcac54985fcb541751a929ddf16010243b90d6631427e13296b4e440c"}, +] + +[package.dependencies] +arrow = ">=1.3.0" +beautifulsoup4 = ">=4.12.3" +bs4 = ">=0.0.2,<0.0.3" +cloudpathlib = {version = ">=0.17.0", extras = ["gs"]} +dapla-toolbelt = ">=1.3.3" +pandas = ">=1.4.2" +pyarrow = ">=8.0.0" +pydantic = ">=2.5.2" +pyjwt = ">=2.8.0" +python-dotenv = ">=1.0.1" +requests = ">=2.31.0" +ssb-datadoc-model = ">=6.0.0,<7.0.0" +ssb-klass-python = ">=0.0.9" + [[package]] name = "dash" version = "2.17.1" @@ -990,13 +1030,13 @@ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipyth [[package]] name = "faker" -version = "27.0.0" +version = "26.3.0" description = "Faker is a Python package that generates fake data for you." 
optional = false python-versions = ">=3.8" files = [ - {file = "Faker-27.0.0-py3-none-any.whl", hash = "sha256:55ed0c4ed7bf16800c64823805f6fbbe6d4823db4b7c0903f6f890b8e4d6c34b"}, - {file = "faker-27.0.0.tar.gz", hash = "sha256:32c78b68d2ba97aaad78422e4035785de2b4bb46b81e428190fc11978da9036c"}, + {file = "Faker-26.3.0-py3-none-any.whl", hash = "sha256:97fe1e7e953dd640ca2cd4dfac4db7c4d2432dd1b7a244a3313517707f3b54e9"}, + {file = "Faker-26.3.0.tar.gz", hash = "sha256:7c10ebdf74aaa0cc4fe6ec6db5a71e8598ec33503524bd4b5f4494785a5670dd"}, ] [package.dependencies] @@ -2790,9 +2830,9 @@ files = [ [package.dependencies] lxml = {version = ">=4.9.2", optional = true, markers = "extra == \"xml\""} numpy = [ - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, ] odfpy = {version = ">=1.4.1", optional = true, markers = "extra == \"excel\""} openpyxl = {version = ">=3.1.0", optional = true, markers = "extra == \"excel\""} @@ -3276,8 +3316,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.6.1", markers = "python_version < \"3.13\""}, {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, + {version = ">=4.6.1", markers = "python_version < \"3.13\""}, ] [package.extras] @@ -5210,4 +5250,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "983f3554264eb2156d216c20ce1d42f683a4755c2061a0bfa388222c3082774e" +content-hash = "0e17893f396e1e140d092af6b293b4aa3d08214c6e02e40353e0a2e400cca0fb" diff --git a/pyproject.toml b/pyproject.toml index ecbdbdc9..6e432dee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,23 +24,15 @@ Changelog = "https://github.com/statisticsnorway/datadoc/releases" [tool.poetry.dependencies] python = ">=3.10,<4.0" -pyarrow = ">=8.0.0" dash = ">=2.15.0" pydantic = ">=2.5.2" dash-bootstrap-components = ">=1.1.0" -pandas = ">=1.4.2" -dapla-toolbelt = ">=1.3.3" -gunicorn = ">=21.2.0" flask-healthz = ">=0.0.3" arrow = ">=1.3.0" python-dotenv = ">=1.0.1" -requests = ">=2.31.0" -beautifulsoup4 = ">=4.12.3" -cloudpathlib = { extras = ["gs"], version = ">=0.17.0" } -pyjwt = ">=2.8.0" -ssb-klass-python = ">=0.0.9" ssb-dash-components = ">=0.8.1" -ssb-datadoc-model = "^6.0.0" +dapla-toolbelt-metadata = ">=0.2.0" +gunicorn = ">=23.0.0" [tool.poetry.group.dev.dependencies] mypy = ">=0.950" @@ -115,8 +107,6 @@ module = [ "pyarrow", "pyarrow.parquet", "dash.development.base_component", - "datadoc_model", - "datadoc_model.model", "pytest_mock", ] ignore_missing_imports = true @@ -159,6 +149,7 @@ ignore = [ "E501", # Let black handle line length "TRY003", "FIX002", # It's OK to use TODOs if they meet the requirements + "PLC2401", # Allow non-ASCII character because of norwegian letters ] [tool.ruff.lint.isort] diff --git a/src/datadoc/app.py b/src/datadoc/app.py index cbfa58a2..af515462 100644 --- a/src/datadoc/app.py +++ b/src/datadoc/app.py @@ -9,6 +9,9 @@ import logging from pathlib import Path +from dapla_metadata.datasets import Datadoc +from dapla_metadata.datasets.code_list import CodeList +from dapla_metadata.datasets.statistic_subject_mapping import StatisticSubjectMapping from dash import Dash from dash import dcc from dash import html @@ -16,9 +19,6 @@ from datadoc import config from datadoc import state -from 
datadoc.backend.code_list import CodeList -from datadoc.backend.core import Datadoc -from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping from datadoc.frontend.callbacks.register_callbacks import register_callbacks from datadoc.frontend.components.control_bars import build_controls_bar from datadoc.frontend.components.control_bars import build_footer_control_bar diff --git a/src/datadoc/backend/__init__.py b/src/datadoc/backend/__init__.py deleted file mode 100644 index 750f62ba..00000000 --- a/src/datadoc/backend/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Code which does not directly interact with th UI.""" diff --git a/src/datadoc/backend/code_list.py b/src/datadoc/backend/code_list.py deleted file mode 100644 index 7f438818..00000000 --- a/src/datadoc/backend/code_list.py +++ /dev/null @@ -1,241 +0,0 @@ -from __future__ import annotations - -import logging -from dataclasses import dataclass -from typing import TYPE_CHECKING - -from datadoc.backend.external_sources.external_sources import GetExternalSource -from datadoc.enums import SupportedLanguages - -if TYPE_CHECKING: - import concurrent - - import pandas as pd - -from klass.classes.classification import KlassClassification - -logger = logging.getLogger(__name__) - - -@dataclass -class CodeListItem: - """Data structure for a code list item. - - Attributes: - titles: A dictionary mapping language codes to titles. - code: The code associated with the item. - """ - - titles: dict[str, str] - code: str - - def get_title(self, language: SupportedLanguages) -> str: - """Return the title in the specified language. - - Args: - language: The language code for which to get the title. - - Returns: - The title in the specified language. It returns the title in Norwegian - Bokmål ("nb") if the language is either Norwegian Bokmål or Norwegian - Nynorsk, otherwise it returns the title in English ("en"). If none of - these are available, it returns an empty string and logs an exception. - """ - try: - return self.titles[language] - except KeyError: - try: - return self.titles[ - ( - "nb" - if language - in [ - SupportedLanguages.NORSK_BOKMÅL, - SupportedLanguages.NORSK_NYNORSK, - ] - else "en" - ) - ] - except KeyError: - logger.exception( - "Could not find title for subject %s and language: %s", - self, - language.name, - ) - return "" - - -class CodeList(GetExternalSource): - """Class for retrieving classifications from Klass. - - This class fetches a classification given a classification ID - and supports multiple languages. - - Attributes: - supported_languages: A list of supported language codes. - _classifications: A list to store classification items. - classification_id: The ID of the classification to retrieve. - classifications_dataframes: A dictionary to store dataframes of - classifications. - """ - - def __init__( - self, - executor: concurrent.futures.ThreadPoolExecutor, - classification_id: int | None, - ) -> None: - """Initialize the CodeList with the given classification ID and executor. - - Args: - executor: An instance of ThreadPoolExecutor to manage the asynchronous - execution of data fetching. - classification_id: The ID of the classification to retrieve. 
- """ - self.supported_languages = [ - SupportedLanguages.NORSK_BOKMÅL.value, - SupportedLanguages.ENGLISH.value, - ] - self._classifications: list[CodeListItem] = [] - self.classification_id = classification_id - self.classifications_dataframes: dict[str, pd.DataFrame] | None = None - super().__init__(executor) - - def _fetch_data_from_external_source( - self, - ) -> dict[str, pd.DataFrame] | None: - """Fetch the classifications from Klass by classification ID. - - This method retrieves classification data for each supported language and - stores it in a dictionary where the keys are language codes and the values - are pandas DataFrames containing the classification data. - - Returns: - A dictionary mapping language codes to pandas DataFrames containing the - classification data for the given classification ID. - If an exception occurs during the fetching process, logs the exception - and returns None. - """ - classifications_dataframes = {} - for i in self.supported_languages: - try: - classifications_dataframes[i] = ( - KlassClassification( - str(self.classification_id), - i, - ) - .get_codes() - .data - ) - except Exception: # noqa: PERF203 - logger.exception( - "Exception while getting classifications from Klass", - ) - return None - else: - return classifications_dataframes - return None - - def _extract_titles( - self, - dataframes: dict[SupportedLanguages, pd.DataFrame], - ) -> list[dict[str, str]]: - """Extract titles from the dataframes for each supported language. - - This method processes the provided dataframes and extracts the title from - each row for all supported languages, creating a list of dictionaries where - each dictionary maps language codes to titles. - - Args: - dataframes: A dictionary mapping language codes to pandas DataFrames - containing classification data. - - Returns: - A list of dictionaries, each mapping language codes to titles. - If a title is not available in a dataframe, the corresponding dictionary - value will be None. - """ - list_of_titles = [] - languages = list(dataframes) - for i in range(len(dataframes[SupportedLanguages.NORSK_BOKMÅL])): - titles = {} - for j in languages: - if "name" in dataframes[j]: - titles[str(j)] = dataframes[j].loc[:, "name"][i] - else: - titles[str(j)] = None - list_of_titles.append(titles) - return list_of_titles - - def _create_code_list_from_dataframe( - self, - classifications_dataframes: dict[SupportedLanguages, pd.DataFrame], - ) -> list[CodeListItem]: - """Create a list of CodeListItem objects from the classification dataframes. - - This method extracts titles from the provided dataframes and pairs them - with their corresponding classification codes to create a list of - CodeListItem objects. - - Args: - classifications_dataframes: A dictionary mapping language codes to - pandas DataFrames containing classification data. - - Returns: - A list of CodeListItem objects containing classification titles - and codes. 
- """ - classification_names = self._extract_titles(classifications_dataframes) - classification_codes: list - if "code" in classifications_dataframes[SupportedLanguages.NORSK_BOKMÅL]: - classification_codes = ( - classifications_dataframes[SupportedLanguages.NORSK_BOKMÅL] - .loc[:, "code"] - .to_list() - ) - else: - classification_codes = [None] * len(classification_names) - classification_items = [] - for a, b in zip(classification_names, classification_codes): - classification_items.append( - CodeListItem(a, b), - ) - return classification_items - - def _get_classification_dataframe_if_loaded(self) -> bool: - """Check if the classification data from Klass is loaded. - - This method verifies whether the classification data has been loaded. - If not, it retrieves the data from an external source and populates the - classifications. It logs the process and returns a boolean indicating the - success of the operation. - - Returns: - True if the data is loaded and classifications are successfully extracted, - False otherwise. - """ - if not self._classifications: - self.classifications_dataframes = self.retrieve_external_data() - if self.classifications_dataframes is not None: - self._classifications = self._create_code_list_from_dataframe( - self.classifications_dataframes, - ) - logger.debug( - "Thread finished. found %s classifications", - len(self._classifications), - ) - return True - logger.warning( - "Thread is not done. Cannot get classifications from the dataframe.", - ) - return False - - @property - def classifications(self) -> list[CodeListItem]: - """Get the list of classifications. - - Returns: - A list of CodeListItem objects. - """ - self._get_classification_dataframe_if_loaded() - logger.debug("Got %s classifications subjects", len(self._classifications)) - return self._classifications diff --git a/src/datadoc/backend/constants.py b/src/datadoc/backend/constants.py deleted file mode 100644 index 110e4246..00000000 --- a/src/datadoc/backend/constants.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Repository for constant values in Datadoc backend.""" - -from datadoc.enums import LanguageStringType -from datadoc.enums import LanguageStringTypeItem - -VALIDATION_ERROR = "Validation error: " - -DATE_VALIDATION_MESSAGE = f"{VALIDATION_ERROR}contains_data_from must be the same or earlier date than contains_data_until" - -OBLIGATORY_METADATA_WARNING = "Obligatory metadata is missing: " - -INCONSISTENCIES_MESSAGE = "Inconsistencies found between extracted and existing metadata. 
Inconsistencies are:" - -OBLIGATORY_DATASET_METADATA_IDENTIFIERS: list = [ - "assessment", - "dataset_state", - "dataset_status", - "name", - "description", - "data_source", - "population_description", - "version", - "version_description", - "unit_type", - "temporality_type", - "subject_field", - "spatial_coverage_description", - "owner", - "contains_data_from", - "contains_data_until", - "contains_personal_data", -] - -OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE = [ - "name", - "description", - "population_description", - "version_description", - "spatial_coverage_description", -] - -OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS = [ - "name", - "data_type", - "variable_role", - "is_personal_data", -] - -OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE = [ - "name", -] - -DEFAULT_SPATIAL_COVERAGE_DESCRIPTION = LanguageStringType( - [ - LanguageStringTypeItem( - languageCode="nb", - languageText="Norge", - ), - LanguageStringTypeItem( - languageCode="nn", - languageText="Noreg", - ), - LanguageStringTypeItem( - languageCode="en", - languageText="Norway", - ), - ], -) - -NUM_OBLIGATORY_DATASET_FIELDS = len(OBLIGATORY_DATASET_METADATA_IDENTIFIERS) - -NUM_OBLIGATORY_VARIABLES_FIELDS = len(OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS) - -DATASET_FIELDS_FROM_EXISTING_METADATA = [ - "dataset_status", - "name", - "description", - "data_source", - "population_description", - "unit_type", - "temporality_type", - "subject_field", - "keyword", - "spatial_coverage_description", - "contains_personal_data", - "use_restriction", - "use_restriction_date", - "custom_type", - "owner", -] diff --git a/src/datadoc/backend/core.py b/src/datadoc/backend/core.py deleted file mode 100644 index 066eafdd..00000000 --- a/src/datadoc/backend/core.py +++ /dev/null @@ -1,544 +0,0 @@ -"""Handle reading, updating and writing of metadata.""" - -from __future__ import annotations - -import concurrent -import copy -import json -import logging -import warnings -from pathlib import Path -from typing import TYPE_CHECKING - -from datadoc_model import model - -from datadoc import config -from datadoc.backend import user_info -from datadoc.backend.constants import DATASET_FIELDS_FROM_EXISTING_METADATA -from datadoc.backend.constants import DEFAULT_SPATIAL_COVERAGE_DESCRIPTION -from datadoc.backend.constants import INCONSISTENCIES_MESSAGE -from datadoc.backend.constants import NUM_OBLIGATORY_DATASET_FIELDS -from datadoc.backend.constants import NUM_OBLIGATORY_VARIABLES_FIELDS -from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo -from datadoc.backend.dataset_parser import DatasetParser -from datadoc.backend.model_backwards_compatibility import ( - is_metadata_in_container_structure, -) -from datadoc.backend.model_backwards_compatibility import upgrade_metadata -from datadoc.backend.model_validation import ValidateDatadocMetadata -from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping -from datadoc.backend.utils import calculate_percentage -from datadoc.backend.utils import derive_assessment_from_state -from datadoc.backend.utils import normalize_path -from datadoc.backend.utils import num_obligatory_dataset_fields_completed -from datadoc.backend.utils import num_obligatory_variables_fields_completed -from datadoc.backend.utils import set_default_values_dataset -from datadoc.backend.utils import set_default_values_variables -from datadoc.enums import DataSetStatus -from datadoc.utils import METADATA_DOCUMENT_FILE_SUFFIX -from datadoc.utils import get_timestamp_now - -if 
TYPE_CHECKING: - import pathlib - from datetime import datetime - - from cloudpathlib import CloudPath - - -logger = logging.getLogger(__name__) - - -class InconsistentDatasetsWarning(UserWarning): - """Existing and new datasets differ significantly from one another.""" - - -class InconsistentDatasetsError(ValueError): - """Existing and new datasets differ significantly from one another.""" - - -class Datadoc: - """Handle reading, updating and writing of metadata. - - If a metadata document exists, it is this information that is loaded. Nothing - is inferred from the dataset. If only a dataset path is supplied the metadata - document path is build based on the dataset path. - - Example: /path/to/dataset.parquet -> /path/to/dataset__DOC.json - - Attributes: - dataset_path: A file path to the path to where the dataset is stored. - metadata_document_path: A path to a metadata document if it exists. - statistic_subject_mapping: An instance of StatisticSubjectMapping. - """ - - def __init__( - self, - dataset_path: str | None = None, - metadata_document_path: str | None = None, - statistic_subject_mapping: StatisticSubjectMapping | None = None, - *, - errors_as_warnings: bool = False, - ) -> None: - """Initialize the Datadoc instance. - - If a dataset path is supplied, it attempts to locate and load the - corresponding metadata document. If no dataset path is provided, the class - is instantiated without loading any metadata. - - Args: - dataset_path: The file path to the dataset. Defaults to None. - metadata_document_path: The file path to the metadata document. - Defaults to None. - statistic_subject_mapping: An instance of StatisticSubjectMapping. - Defaults to None - errors_as_warnings: Disable raising exceptions if inconsistencies - are found between existing and extracted metadata. - """ - self._statistic_subject_mapping = statistic_subject_mapping - self.errors_as_warnings = errors_as_warnings - self.metadata_document: pathlib.Path | CloudPath | None = None - self.container: model.MetadataContainer | None = None - self.dataset_path: pathlib.Path | CloudPath | None = None - self.dataset = model.Dataset() - self.variables: list = [] - self.variables_lookup: dict[str, model.Variable] = {} - self.explicitly_defined_metadata_document = False - if metadata_document_path: - self.metadata_document = normalize_path(metadata_document_path) - self.explicitly_defined_metadata_document = True - if not self.metadata_document.exists(): - msg = f"Metadata document does not exist! Provided path: {self.metadata_document}" - raise ValueError( - msg, - ) - if dataset_path: - self.dataset_path = normalize_path(dataset_path) - if not metadata_document_path: - self.metadata_document = self.build_metadata_document_path( - self.dataset_path, - ) - if metadata_document_path or dataset_path: - self._extract_metadata_from_files() - - def _extract_metadata_from_files(self) -> None: - """Read metadata from an existing metadata document or create one. - - If a metadata document exists, it reads and extracts metadata from it. - If no metadata document is found, it creates metadata from scratch by - extracting information from the dataset file. - - This method ensures that: - - Metadata is extracted from an existing document if available. - - If metadata is not available, it is extracted from the dataset file. - - The dataset ID is set if not already present. - - Default values are set for variables, particularly the variable role on - creation. 
- - Default values for variables ID and 'is_personal_data' are set if the - values are None. - - The 'contains_personal_data' attribute is set to False if not specified. - - A lookup dictionary for variables is created based on their short names. - """ - extracted_metadata: model.DatadocMetadata | None = None - existing_metadata: model.DatadocMetadata | None = None - if self.metadata_document is not None and self.metadata_document.exists(): - existing_metadata = self._extract_metadata_from_existing_document( - self.metadata_document, - ) - if ( - self.dataset_path is not None - and self.dataset == model.Dataset() - and len(self.variables) == 0 - ): - extracted_metadata = self._extract_metadata_from_dataset(self.dataset_path) - - if ( - self.dataset_path - and self.explicitly_defined_metadata_document - and self.metadata_document is not None - and self.metadata_document.exists() - and extracted_metadata is not None - and existing_metadata is not None - ): - if ( - extracted_metadata.dataset is not None - and extracted_metadata.dataset.file_path is not None - ): - existing_file_path = extracted_metadata.dataset.file_path - else: - msg = "Could not access existing dataset file path" - raise ValueError(msg) - self._check_ready_to_merge( - self.dataset_path, - Path(existing_file_path), - extracted_metadata, - existing_metadata, - errors_as_warnings=self.errors_as_warnings, - ) - merged_metadata = self._merge_metadata( - extracted_metadata, - existing_metadata, - ) - # We need to override this so that the document gets saved to the correct - # location, otherwise we would overwrite the existing document! - self.metadata_document = self.build_metadata_document_path( - self.dataset_path, - ) - if merged_metadata.dataset and merged_metadata.variables: - self.dataset = merged_metadata.dataset - self.variables = merged_metadata.variables - else: - msg = "Could not read metadata" - raise ValueError(msg) - elif ( - existing_metadata - and existing_metadata.dataset - and existing_metadata.variables - ): - self.dataset = existing_metadata.dataset - self.variables = existing_metadata.variables - elif ( - extracted_metadata - and extracted_metadata.dataset - and extracted_metadata.variables - ): - self.dataset = extracted_metadata.dataset - self.variables = extracted_metadata.variables - else: - msg = "Could not read metadata" - raise ValueError(msg) - set_default_values_variables(self.variables) - set_default_values_dataset(self.dataset) - self.variables_lookup = { - v.short_name: v for v in self.variables if v.short_name - } - - @staticmethod - def _check_ready_to_merge( - new_dataset_path: Path | CloudPath, - existing_dataset_path: Path, - extracted_metadata: model.DatadocMetadata, - existing_metadata: model.DatadocMetadata, - *, - errors_as_warnings: bool, - ) -> None: - """Check if the datasets are consistent enough to make a successful merge of metadata. - - Args: - new_dataset_path: Path to the dataset to be documented. - existing_dataset_path: Path stored in the existing metadata. - extracted_metadata: Metadata extracted from a physical dataset. - existing_metadata: Metadata from a previously created metadata document. - errors_as_warnings: True if failing checks should be raised as warnings, not errors. 
- - Raises: - InconsistentDatasetsError: If inconsistencies are found and `errors_as_warnings == False` - """ - new_dataset_path_info = DaplaDatasetPathInfo(new_dataset_path) - existing_dataset_path_info = DaplaDatasetPathInfo(existing_dataset_path) - results = [ - { - "name": "Bucket name", - "success": ( - new_dataset_path_info.bucket_name - == existing_dataset_path_info.bucket_name - ), - }, - { - "name": "Data product name", - "success": ( - new_dataset_path_info.statistic_short_name - == existing_dataset_path_info.statistic_short_name - ), - }, - { - "name": "Dataset state", - "success": ( - new_dataset_path_info.dataset_state - == existing_dataset_path_info.dataset_state - ), - }, - { - "name": "Dataset short name", - "success": ( - new_dataset_path_info.dataset_short_name - == existing_dataset_path_info.dataset_short_name - ), - }, - { - "name": "Variable names", - "success": ( - {v.short_name for v in extracted_metadata.variables or []} - == {v.short_name for v in existing_metadata.variables or []} - ), - }, - { - "name": "Variable datatypes", - "success": ( - [v.data_type for v in extracted_metadata.variables or []] - == [v.data_type for v in existing_metadata.variables or []] - ), - }, - ] - if failures := [result for result in results if not result["success"]]: - msg = f"{INCONSISTENCIES_MESSAGE} {', '.join(str(f['name']) for f in failures)}" - if errors_as_warnings: - warnings.warn( - message=msg, - category=InconsistentDatasetsWarning, - stacklevel=2, - ) - else: - raise InconsistentDatasetsError( - msg, - ) - - @staticmethod - def _merge_metadata( - extracted_metadata: model.DatadocMetadata | None, - existing_metadata: model.DatadocMetadata | None, - ) -> model.DatadocMetadata: - if not existing_metadata: - logger.warning( - "No existing metadata found, no merge to perform. Continuing with extracted metadata.", - ) - return extracted_metadata or model.DatadocMetadata() - if not extracted_metadata: - return existing_metadata - # Use the extracted metadata as a base - merged_metadata = model.DatadocMetadata( - dataset=copy.deepcopy(extracted_metadata.dataset), - variables=[], - ) - if ( - merged_metadata.dataset is not None - and existing_metadata.dataset is not None - ): - # Override the fields as defined - for field in DATASET_FIELDS_FROM_EXISTING_METADATA: - setattr( - merged_metadata.dataset, - field, - getattr(existing_metadata.dataset, field), - ) - - # Merge variables. 
- # For each extracted variable, copy existing metadata into the merged metadata - if ( - existing_metadata.variables is not None - and extracted_metadata is not None - and extracted_metadata.variables is not None - and merged_metadata.variables is not None - ): - for extracted in extracted_metadata.variables: - existing = next( - ( - existing - for existing in existing_metadata.variables - if existing.short_name == extracted.short_name - ), - None, - ) - if existing: - existing.id = None # Set to None so that it will be set assigned a fresh ID later - existing.contains_data_from = ( - extracted.contains_data_from or existing.contains_data_from - ) - existing.contains_data_until = ( - extracted.contains_data_until or existing.contains_data_until - ) - merged_metadata.variables.append(existing) - else: - # If there is no existing metadata for this variable, we just use what we have extracted - merged_metadata.variables.append(extracted) - return merged_metadata - - def _extract_metadata_from_existing_document( - self, - document: pathlib.Path | CloudPath, - ) -> model.DatadocMetadata | None: - """Read metadata from an existing metadata document. - - If an existing metadata document is available, this method reads and - loads the metadata from it. It validates and upgrades the metadata as - necessary. If we have read in a file with an empty "datadoc" structure - the process ends. - A typical example causing a empty datadoc is a file produced from a - pseudonymization process. - - Args: - document: A path to the existing metadata document. - - Raises: - json.JSONDecodeError: If the metadata document cannot be parsed. - """ - fresh_metadata = {} - try: - with document.open(mode="r", encoding="utf-8") as file: - fresh_metadata = json.load(file) - logger.info("Opened existing metadata file %s", document) - fresh_metadata = upgrade_metadata( - fresh_metadata, - ) - if is_metadata_in_container_structure(fresh_metadata): - self.container = model.MetadataContainer.model_validate_json( - json.dumps(fresh_metadata), - ) - datadoc_metadata = fresh_metadata["datadoc"] - else: - datadoc_metadata = fresh_metadata - if datadoc_metadata is None: - return None - return model.DatadocMetadata.model_validate_json( - json.dumps(datadoc_metadata), - ) - except json.JSONDecodeError: - logger.warning( - "Could not open existing metadata file %s. \ - Falling back to collecting data from the dataset", - document, - exc_info=True, - ) - return None - - def _extract_subject_field_from_path( - self, - dapla_dataset_path_info: DaplaDatasetPathInfo, - ) -> str | None: - """Extract the statistic short name from the dataset file path. - - Map the extracted statistic short name to its corresponding statistical - subject. - - Args: - dapla_dataset_path_info: The object representing the decomposed file - path. - - Returns: - The code for the statistical subject or None if we couldn't map to one. - """ - if self._statistic_subject_mapping is None: - with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor: - return StatisticSubjectMapping( - executor, - config.get_statistical_subject_source_url(), - ).get_secondary_subject( - dapla_dataset_path_info.statistic_short_name, - ) - else: - return self._statistic_subject_mapping.get_secondary_subject( - dapla_dataset_path_info.statistic_short_name, - ) - - def _extract_metadata_from_dataset( - self, - dataset: pathlib.Path | CloudPath, - ) -> model.DatadocMetadata: - """Obtain what metadata we can from the dataset itself. 
- - This makes it easier for the user by 'pre-filling' certain fields. - Certain elements are dependent on the dataset being saved according - to SSB's standard. - - Args: - dataset: The path to the dataset file, which can be a local or - cloud path. - - Side Effects: - Updates the following instance attributes: - - ds_schema: An instance of DatasetParser initialized for the - given dataset file. - - dataset: An instance of model.Dataset with pre-filled metadata - fields. - - variables: A list of fields extracted from the dataset schema. - """ - dapla_dataset_path_info = DaplaDatasetPathInfo(dataset) - metadata = model.DatadocMetadata() - - metadata.dataset = model.Dataset( - short_name=dapla_dataset_path_info.dataset_short_name, - dataset_state=dapla_dataset_path_info.dataset_state, - dataset_status=DataSetStatus.DRAFT, - assessment=( - derive_assessment_from_state( - dapla_dataset_path_info.dataset_state, - ) - if dapla_dataset_path_info.dataset_state is not None - else None - ), - version=dapla_dataset_path_info.dataset_version, - contains_data_from=dapla_dataset_path_info.contains_data_from, - contains_data_until=dapla_dataset_path_info.contains_data_until, - file_path=str(self.dataset_path), - metadata_created_by=user_info.get_user_info_for_current_platform().short_email, - subject_field=self._extract_subject_field_from_path( - dapla_dataset_path_info, - ), - spatial_coverage_description=DEFAULT_SPATIAL_COVERAGE_DESCRIPTION, - ) - metadata.variables = DatasetParser.for_file(dataset).get_fields() - return metadata - - @staticmethod - def build_metadata_document_path( - dataset_path: pathlib.Path | CloudPath, - ) -> pathlib.Path | CloudPath: - """Build the path to the metadata document corresponding to the given dataset. - - Args: - dataset_path: Path to the dataset we wish to create metadata for. - """ - return dataset_path.parent / (dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX) - - def write_metadata_document(self) -> None: - """Write all currently known metadata to file. - - Side Effects: - - Updates the dataset's metadata_last_updated_date and - metadata_last_updated_by attributes. - - Updates the dataset's file_path attribute. - - Validates the metadata model and stores it in a MetadataContainer. - - Writes the validated metadata to a file if the metadata_document - attribute is set. - - Logs the action and the content of the metadata document. - - Raises: - ValueError: If no metadata document is specified for saving. - """ - timestamp: datetime = get_timestamp_now() - self.dataset.metadata_last_updated_date = timestamp - self.dataset.metadata_last_updated_by = ( - user_info.get_user_info_for_current_platform().short_email - ) - self.dataset.file_path = str(self.dataset_path) - datadoc: ValidateDatadocMetadata = ValidateDatadocMetadata( - percentage_complete=self.percent_complete, - dataset=self.dataset, - variables=self.variables, - ) - if self.container: - self.container.datadoc = datadoc - else: - self.container = model.MetadataContainer(datadoc=datadoc) - if self.metadata_document: - content = self.container.model_dump_json(indent=4) - self.metadata_document.write_text(content) - logger.info("Saved metadata document %s", self.metadata_document) - logger.info("Metadata content:\n%s", content) - else: - msg = "No metadata document to save" - raise ValueError(msg) - - @property - def percent_complete(self) -> int: - """The percentage of obligatory metadata completed. - - A metadata field is counted as complete when any non-None value is - assigned. 
This value is saved int the metadata document as a simple quality indicator. - """ - num_all_fields = NUM_OBLIGATORY_DATASET_FIELDS + ( - NUM_OBLIGATORY_VARIABLES_FIELDS * len(self.variables) - ) - num_set_fields = num_obligatory_dataset_fields_completed( - self.dataset, - ) + num_obligatory_variables_fields_completed(self.variables) - return calculate_percentage(num_set_fields, num_all_fields) diff --git a/src/datadoc/backend/dapla_dataset_path_info.py b/src/datadoc/backend/dapla_dataset_path_info.py deleted file mode 100644 index 818ed26c..00000000 --- a/src/datadoc/backend/dapla_dataset_path_info.py +++ /dev/null @@ -1,691 +0,0 @@ -"""Extract info from a path following SSB's dataset naming convention.""" - -from __future__ import annotations - -import logging -import pathlib -import re -from abc import ABC -from abc import abstractmethod -from dataclasses import dataclass -from typing import TYPE_CHECKING -from typing import Final -from typing import Literal - -import arrow -from cloudpathlib import GSPath - -from datadoc.enums import DataSetState -from datadoc.enums import SupportedLanguages - -if TYPE_CHECKING: - import datetime - import os - from datetime import date - -logger = logging.getLogger(__name__) - -GS_PREFIX_FROM_PATHLIB = "gs:/" - - -@dataclass -class DateFormat(ABC): - """A super class for date formats.""" - - name: str - regex_pattern: str - arrow_pattern: str - timeframe: Literal["year", "month", "day", "week"] - - @abstractmethod - def get_floor(self, period_string: str) -> date | None: - """Abstract method implemented in the child class. - - Return the first date of the timeframe period. - - Args: - period_string: A string representing the timeframe period. - """ - - @abstractmethod - def get_ceil(self, period_string: str) -> date | None: - """Abstract method implemented in the child class. - - Return the last date of the timeframe period. - - Args: - period_string: A string representing the timeframe period. - """ - - -@dataclass -class IsoDateFormat(DateFormat): - """A subclass of Dateformat with relevant patterns for ISO dates.""" - - def get_floor(self, period_string: str) -> date | None: - """Return first date of timeframe period defined in ISO date format. - - Examples: - >>> ISO_YEAR_MONTH.get_floor("1980-08") - datetime.date(1980, 8, 1) - - >>> ISO_YEAR.get_floor("2021") - datetime.date(2021, 1, 1) - """ - return arrow.get(period_string, self.arrow_pattern).floor(self.timeframe).date() - - def get_ceil(self, period_string: str) -> date | None: - """Return last date of timeframe period defined in ISO date format. - - Examples: - >>> ISO_YEAR.get_ceil("1921") - datetime.date(1921, 12, 31) - - >>> ISO_YEAR_MONTH.get_ceil("2021-05") - datetime.date(2021, 5, 31) - """ - return arrow.get(period_string, self.arrow_pattern).ceil(self.timeframe).date() - - -ISO_YEAR = IsoDateFormat( - name="ISO_YEAR", - regex_pattern=r"^\d{4}$", - arrow_pattern="YYYY", - timeframe="year", -) -ISO_YEAR_MONTH = IsoDateFormat( - name="ISO_YEAR_MONTH", - regex_pattern=r"^\d{4}\-\d{2}$", - arrow_pattern="YYYY-MM", - timeframe="month", -) -ISO_YEAR_MONTH_DAY = IsoDateFormat( - name="ISO_YEAR_MONTH_DAY", - regex_pattern=r"^\d{4}\-\d{2}\-\d{2}$", - arrow_pattern="YYYY-MM-DD", - timeframe="day", -) -ISO_YEAR_WEEK = IsoDateFormat( - name="ISO_YEAR_WEEK", - regex_pattern=r"^\d{4}\-{0,1}W\d{2}$", - arrow_pattern="W", - timeframe="week", -) - - -@dataclass -class SsbDateFormat(DateFormat): - """A subclass of Dateformat with relevant patterns for SSB unique dates. 
- - Attributes: - ssb_dates: A dictionary where keys are date format strings and values - are corresponding date patterns specific to SSB. - """ - - ssb_dates: dict - - def get_floor(self, period_string: str) -> date | None: - """Return first date of the timeframe period defined in SSB date format. - - Convert SSB format to date-string and return the first date. - - Args: - period_string: A string representing the timeframe period in - SSB format. - - Returns: - The first date of the period if the period_string is a valid - SSB format, otherwise None. - - Example: - >>> SSB_BIMESTER.get_floor("2003B8") - None - - >>> SSB_BIMESTER.get_floor("2003B4") - datetime.date(2003, 7, 1) - """ - try: - year = period_string[:4] - month = self.ssb_dates[period_string[-2:]]["start"] - period = year + month - return arrow.get(period, self.arrow_pattern).floor(self.timeframe).date() - except KeyError: - logger.exception("Error while converting to SSB date format") - return None - - def get_ceil(self, period_string: str) -> date | None: - """Return last date of the timeframe period defined in SSB date format. - - Convert SSB format to date-string and return the last date. - - Args: - period_string: A string representing the timeframe period in SSB - format. - - Returns: - The last date of the period if the period_string is a valid SSB format, - otherwise None. - - Example: - >>> SSB_TRIANNUAL.get_ceil("1999T11") - None - - >>> SSB_HALF_YEAR.get_ceil("2024H1") - datetime.date(2024, 6, 30) - """ - try: - year = period_string[:4] - month = self.ssb_dates[period_string[-2:]]["end"] - period = year + month - return arrow.get(period, self.arrow_pattern).ceil(self.timeframe).date() - except KeyError: - return None - - -SSB_BIMESTER = SsbDateFormat( - name="SSB_BIMESTER", - regex_pattern=r"^\d{4}[B]\d{1}$", - arrow_pattern="YYYYMM", - timeframe="month", - ssb_dates={ - "B1": { - "start": "01", - "end": "02", - }, - "B2": { - "start": "03", - "end": "04", - }, - "B3": { - "start": "05", - "end": "06", - }, - "B4": { - "start": "07", - "end": "08", - }, - "B5": { - "start": "09", - "end": "10", - }, - "B6": { - "start": "11", - "end": "12", - }, - }, -) - -SSB_QUARTERLY = SsbDateFormat( - name="SSB_QUARTERLY", - regex_pattern=r"^\d{4}[Q]\d{1}$", - arrow_pattern="YYYYMM", - timeframe="month", - ssb_dates={ - "Q1": { - "start": "01", - "end": "03", - }, - "Q2": { - "start": "04", - "end": "06", - }, - "Q3": { - "start": "07", - "end": "09", - }, - "Q4": { - "start": "10", - "end": "12", - }, - }, -) - -SSB_TRIANNUAL = SsbDateFormat( - name="SSB_TRIANNUAL", - regex_pattern=r"^\d{4}[T]\d{1}$", - arrow_pattern="YYYYMM", - timeframe="month", - ssb_dates={ - "T1": { - "start": "01", - "end": "04", - }, - "T2": { - "start": "05", - "end": "08", - }, - "T3": { - "start": "09", - "end": "12", - }, - }, -) -SSB_HALF_YEAR = SsbDateFormat( - name="SSB_HALF_YEAR", - regex_pattern=r"^\d{4}[H]\d{1}$", - arrow_pattern="YYYYMM", - timeframe="month", - ssb_dates={ - "H1": { - "start": "01", - "end": "06", - }, - "H2": { - "start": "07", - "end": "12", - }, - }, -) - -SUPPORTED_DATE_FORMATS: list[IsoDateFormat | SsbDateFormat] = [ - ISO_YEAR, - ISO_YEAR_MONTH, - ISO_YEAR_MONTH_DAY, - ISO_YEAR_WEEK, - SSB_BIMESTER, - SSB_QUARTERLY, - SSB_TRIANNUAL, - SSB_HALF_YEAR, -] - - -def categorize_period_string(period: str) -> IsoDateFormat | SsbDateFormat: - """Categorize a period string into one of the supported date formats. - - Args: - period: A string representing the period to be categorized. 
- - Returns: - An instance of either IsoDateFormat or SsbDateFormat depending on the - format of the input period string. - - Raises: - NotImplementedError: If the period string is not recognized as either an - ISO or SSB date format. - - Examples: - >>> date_format = categorize_period_string('2022-W01') - >>> date_format.name - ISO_YEAR_WEEK - - >>> date_format = categorize_period_string('1954T2') - >>> date_format.name - SSB_TRIANNUAL - - >>> categorize_period_string('unknown format') - Traceback (most recent call last): - ... - NotImplementedError: Period format unknown format is not supported - """ - for date_format in SUPPORTED_DATE_FORMATS: - if re.match(date_format.regex_pattern, period): - return date_format - - msg = f"Period format {period} is not supported" - raise NotImplementedError( - msg, - ) - - -class DaplaDatasetPathInfo: - """Extract info from a path following SSB's dataset naming convention.""" - - def __init__(self, dataset_path: str | os.PathLike[str]) -> None: - """Digest the path so that it's ready for further parsing.""" - self.dataset_string = str(dataset_path) - self.dataset_path = pathlib.Path(dataset_path) - self.dataset_name_sections = self.dataset_path.stem.split("_") - self._period_strings = self._extract_period_strings(self.dataset_name_sections) - - @staticmethod - def _get_period_string_indices(dataset_name_sections: list[str]) -> list[int]: - """Get all the indices at which period strings are found in list. - - Args: - dataset_name_sections: A list of strings representing sections of a - dataset name. - - Returns: - A list of indices where period strings are found within the - dataset_name_sections. - - Examples: - >>> DaplaDatasetPathInfo._get_period_string_indices(['kommune', 'p2022', 'v1']) - [1] - - >>> DaplaDatasetPathInfo._get_period_string_indices(['kommune', 'p2022-01', 'p2023-06', 'v1']) - [1, 2] - - >>> DaplaDatasetPathInfo._get_period_string_indices(['kommune', 'p1990Q1', 'v1']) - [1] - - >>> DaplaDatasetPathInfo._get_period_string_indices(['varehandel','v1']) - [] - """ - - def insert_p(regex: str) -> str: - r"""Insert a 'p' as the second character. - - Args: - regex: A string representing the regular expression pattern to be - modified. - - Returns: - The modified regular expression pattern with 'p' inserted as the - second character. - - Examples: - >>> insert_p(r"^\d{4}[H]\d{1}$") - '^p\d{4}[H]\d{1}$' - """ - return regex[:1] + "p" + regex[1:] - - return [ - i - for i, x in enumerate(dataset_name_sections) - if any( - re.match(insert_p(date_format.regex_pattern), x) - for date_format in SUPPORTED_DATE_FORMATS - ) - ] - - @staticmethod - def _extract_period_strings(dataset_name_sections: list[str]) -> list[str]: - """Extract period strings from dataset name sections. - - Iterates over the dataset name sections and returns a list of strings - that match the year regex, stripping the first character. This extracts - the year periods from the dataset name. - - Args: - dataset_name_sections: A list of strings representing sections of a - dataset name. - - Returns: - A list of extracted period strings, with the first character stripped - from each match. 
- - Examples: - >>> DaplaDatasetPathInfo._extract_period_strings(['p2022', 'kommune', 'v1']) - ['2022'] - - >>> DaplaDatasetPathInfo._extract_period_strings(['p2022-01', 'p2023-06', 'kommune', 'v1']) - ['2022-01', '2023-06'] - - >>> DaplaDatasetPathInfo._extract_period_strings(['p1990Q1', 'kommune', 'v1']) - ['1990Q1'] - - >>> DaplaDatasetPathInfo._extract_period_strings(['varehandel','v1']) - [] - """ - return [ - dataset_name_sections[i][1:] - for i in DaplaDatasetPathInfo._get_period_string_indices( - dataset_name_sections, - ) - ] - - def _extract_period_string_from_index(self, index: int) -> str | None: - """Extract a period string by its index from the list of period strings. - - Args: - index: The index of the period string to extract. - - Returns: - The extracted period string if it exists, otherwise None. - """ - try: - return self._period_strings[index] - except IndexError: - return None - - def _extract_norwegian_dataset_state_path_part( - self, - dataset_state: DataSetState, - ) -> set: - """Extract the Norwegian dataset state path part. - - Args: - dataset_state: The dataset state. - - Returns: - A set of variations of the Norwegian dataset state path part. - """ - norwegian_dataset_state_path_part = dataset_state.get_value_for_language( - SupportedLanguages.NORSK_BOKMÅL, - ) - if norwegian_dataset_state_path_part is not None: - norwegian_dataset_state_path_part = ( - norwegian_dataset_state_path_part.lower() - ) - return_value = { - norwegian_dataset_state_path_part.replace(" ", x) for x in ["-", "_"] - } - return return_value - - @property - def bucket_name( - self, - ) -> str | None: - """Extract the bucket name from the dataset path. - - Returns: - The bucket name or None if the dataset path is not a GCS path. - - Examples: - >>> DaplaDatasetPathInfo('gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name - ssb-staging-dapla-felles-data-delt - - >>> DaplaDatasetPathInfo(pathlib.Path('gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet')).bucket_name - ssb-staging-dapla-felles-data-delt - - >>> DaplaDatasetPathInfo('gs:/ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name - ssb-staging-dapla-felles-data-delt - - >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name - None - """ - prefix: str | None = None - if self.dataset_string.startswith(GSPath.cloud_prefix): - prefix = GSPath.cloud_prefix - elif self.dataset_string.startswith(GS_PREFIX_FROM_PATHLIB): - prefix = GS_PREFIX_FROM_PATHLIB - else: - return None - - return pathlib.Path( - self.dataset_string.removeprefix(prefix), - ).parts[0] - - @property - def dataset_short_name( - self, - ) -> str | None: - """Extract the dataset short name from the filepath. - - The dataset short name is defined as the first section of the stem, up to - the period information or the version information if no period information - is present. - - Returns: - The extracted dataset short name if it can be determined, otherwise - None. 
- - Examples: - >>> DaplaDatasetPathInfo('prosjekt/befolkning/klargjorte_data/person_data_v1.parquet').dataset_short_name - person_data - - >>> DaplaDatasetPathInfo('befolkning/inndata/sykepenger_p2022Q1_p2022Q2_v23.parquet').dataset_short_name - sykepenger - - >>> DaplaDatasetPathInfo('my_data/simple_dataset_name.parquet').dataset_short_name - simple_dataset_name - """ - if self.contains_data_from or self.contains_data_until: - short_name_sections = self.dataset_name_sections[ - : min( - DaplaDatasetPathInfo._get_period_string_indices( - self.dataset_name_sections, - ), - ) - ] - elif self.dataset_version: - short_name_sections = self.dataset_name_sections[:-1] - else: - short_name_sections = self.dataset_name_sections - - return "_".join(short_name_sections) - - @property - def contains_data_from(self) -> datetime.date | None: - """The earliest date from which data in the dataset is relevant for. - - Returns: - The earliest relevant date for the dataset if available, otherwise None. - """ - period_string = self._extract_period_string_from_index(0) - if not period_string or ( - len(self._period_strings) > 1 and period_string > self._period_strings[1] - ): - return None - date_format = categorize_period_string(period_string) - return date_format.get_floor(period_string) - - @property - def contains_data_until(self) -> datetime.date | None: - """The latest date until which data in the dataset is relevant for. - - Returns: - The latest relevant date for the dataset if available, otherwise None. - """ - first_period_string = self._extract_period_string_from_index(0) - second_period_string = self._extract_period_string_from_index(1) - period_string = second_period_string or first_period_string - if not period_string or ( - second_period_string - and first_period_string is not None - and second_period_string < first_period_string - ): - return None - date_format = categorize_period_string(period_string) - return date_format.get_ceil(period_string) - - @property - def dataset_state( - self, - ) -> DataSetState | None: - """Extract the dataset state from the path. - - We assume that files are saved in the Norwegian language as specified by - SSB. - - Returns: - The extracted dataset state if it can be determined from the path, - otherwise None. - - Examples: - >>> DaplaDatasetPathInfo('klargjorte_data/person_data_v1.parquet').dataset_state - - - >>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state - - - >>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state - None - """ - dataset_path_parts = set(self.dataset_path.parts) - for s in DataSetState: - norwegian_dataset_state_path_part_variations = ( - self._extract_norwegian_dataset_state_path_part(s) - ) - # Match on any of the variations anywhere in the path. - if norwegian_dataset_state_path_part_variations.intersection( - dataset_path_parts, - ): - return s - return None - - @property - def dataset_version( - self, - ) -> str | None: - """Extract version information if exists in filename. - - Returns: - The extracted version information if available in the filename, - otherwise None. 
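As a quick orientation to the contains_data_from and contains_data_until properties above, a minimal sketch of the behaviour they implemented before this removal. The file names are illustrative, the import path is the pre-removal datadoc.backend location, and the exact dates assume the ISO year format defined earlier in this module floors to 1 January and ceils to 31 December.

import datetime

from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo

# A single ISO year period: floor and ceil of the same period string.
info = DaplaDatasetPathInfo("klargjorte_data/person_data_p2021_v2.parquet")
assert info.contains_data_from == datetime.date(2021, 1, 1)
assert info.contains_data_until == datetime.date(2021, 12, 31)

# Two periods: the first supplies the start date, the second the end date.
info = DaplaDatasetPathInfo("befolkning/inndata/sykepenger_p2022_p2023_v1.parquet")
assert info.contains_data_from == datetime.date(2022, 1, 1)
assert info.contains_data_until == datetime.date(2023, 12, 31)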
- - Examples: - >>> DaplaDatasetPathInfo('person_data_v1.parquet').dataset_version - '1' - - >>> DaplaDatasetPathInfo('person_data_v20.parquet').dataset_version - '20' - - >>> DaplaDatasetPathInfo('person_data.parquet').dataset_version - None - """ - minimum_elements_in_file_name: Final[int] = 2 - minimum_characters_in_version_string: Final[int] = 2 - if len(self.dataset_name_sections) >= minimum_elements_in_file_name: - last_filename_element = str(self.dataset_name_sections[-1]) - if ( - len(last_filename_element) >= minimum_characters_in_version_string - and last_filename_element[0:1] == "v" - and last_filename_element[1:].isdigit() - ): - return last_filename_element[1:] - return None - - @property - def statistic_short_name( - self, - ) -> str | None: - """Extract the statistical short name from the filepath. - - Extract the statistical short name from the filepath right before the - dataset state based on the Dapla filepath naming convention. - - Returns: - The extracted statistical short name if it can be determined, - otherwise None. - - Examples: - >>> DaplaDatasetPathInfo('prosjekt/befolkning/klargjorte_data/person_data_v1.parquet').statistic_short_name - befolkning - - >>> DaplaDatasetPathInfo('befolkning/inndata/person_data_v1.parquet').statistic_short_name - befolkning - - >>> DaplaDatasetPathInfo('befolkning/person_data.parquet').statistic_short_name - None - """ - dataset_state = self.dataset_state - if dataset_state is not None: - dataset_state_names = self._extract_norwegian_dataset_state_path_part( - dataset_state, - ) - dataset_path_parts = list(self.dataset_path.parts) - for i in dataset_state_names: - if i in dataset_path_parts and dataset_path_parts.index(i) != 0: - return dataset_path_parts[dataset_path_parts.index(i) - 1] - return None - - def path_complies_with_naming_standard(self) -> bool: - """Check if path is valid according to SSB standard. - - Read more about SSB naming convention in the Dapla manual: - https://manual.dapla.ssb.no/statistikkere/navnestandard.html - - Returns: - True if the path conforms to the SSB naming standard, otherwise False. - """ - if ( - self.dataset_state - and self.statistic_short_name - and self.contains_data_from - and self.contains_data_until - and self.dataset_version - ): - return True - return False diff --git a/src/datadoc/backend/dataset_parser.py b/src/datadoc/backend/dataset_parser.py deleted file mode 100644 index 7acf13d0..00000000 --- a/src/datadoc/backend/dataset_parser.py +++ /dev/null @@ -1,240 +0,0 @@ -"""Abstractions for dataset file formats. - -Handles reading in the data and transforming data types to generic metadata types. 
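Tying the properties above together, path_complies_with_naming_standard is a truthiness check over dataset state, statistic short name, both period dates and version. A small sketch against the pre-removal module; the bucket and file names are illustrative only.

from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo

compliant = DaplaDatasetPathInfo(
    "gs://ssb-staging-dapla-felles-data-delt/befolkning/inndata/person_data_p2021_p2022_v1.parquet",
)
assert compliant.dataset_state is not None          # "inndata" resolves to a dataset state
assert compliant.statistic_short_name == "befolkning"
assert compliant.dataset_version == "1"
assert compliant.path_complies_with_naming_standard()

# A path without state, period or version information fails the check.
assert not DaplaDatasetPathInfo(
    "my_data/simple_dataset_name.parquet",
).path_complies_with_naming_standard()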
-""" - -from __future__ import annotations - -import pathlib # noqa: TCH003 import is needed for docs build -import re -import typing as t -from abc import ABC -from abc import abstractmethod - -import pandas as pd -from datadoc_model.model import LanguageStringType -from datadoc_model.model import LanguageStringTypeItem -from datadoc_model.model import Variable -from pyarrow import parquet as pq - -from datadoc.enums import DataType -from datadoc.enums import SupportedLanguages - -if t.TYPE_CHECKING: - import pyarrow as pa - from cloudpathlib import CloudPath - -KNOWN_INTEGER_TYPES = ( - "int", - "int_", - "int8", - "int16", - "int32", - "int64", - "integer", - "long", - "uint", - "uint8", - "uint16", - "uint32", - "uint64", -) - -KNOWN_FLOAT_TYPES = ( - "double", - "float", - "float_", - "float16", - "float32", - "float64", - "decimal", - "number", - "numeric", - "num", -) - -KNOWN_STRING_TYPES = ( - "string", - "str", - "char", - "varchar", - "varchar2", - "text", - "txt", - "bytes", -) - -KNOWN_DATETIME_TYPES = ( - "timestamp", - "timestamp[us]", - "timestamp[ns]", - "datetime64", - " datetime64[ns]", - " datetime64[us]", - "date", - "datetime", - "time", -) - -KNOWN_BOOLEAN_TYPES = ("bool", "bool_", "boolean") - - -TYPE_CORRESPONDENCE: list[tuple[tuple[str, ...], DataType]] = [ - (KNOWN_INTEGER_TYPES, DataType.INTEGER), - (KNOWN_FLOAT_TYPES, DataType.FLOAT), - (KNOWN_STRING_TYPES, DataType.STRING), - (KNOWN_DATETIME_TYPES, DataType.DATETIME), - (KNOWN_BOOLEAN_TYPES, DataType.BOOLEAN), -] -TYPE_MAP: dict[str, DataType] = {} -for concrete_type, abstract_type in TYPE_CORRESPONDENCE: - TYPE_MAP.update({c: abstract_type for c in concrete_type}) - -TDatasetParser = t.TypeVar("TDatasetParser", bound="DatasetParser") - - -class DatasetParser(ABC): - """Abstract Base Class for all Dataset parsers. - - Implements: - - A static factory method to get the correct implementation for each file extension. - - A static method for data type conversion. - - Requires implementation by subclasses: - - A method to extract variables (columns) from the dataset, so they may be documented. - """ - - def __init__(self, dataset: pathlib.Path | CloudPath) -> None: - """Initialize for a given dataset.""" - self.dataset = dataset - - @staticmethod - def for_file(dataset: pathlib.Path | CloudPath) -> DatasetParser: - """Return the correct subclass based on the given dataset file.""" - supported_file_types: dict[ - str, - type[DatasetParser], - ] = { - ".parquet": DatasetParserParquet, - ".sas7bdat": DatasetParserSas7Bdat, - ".parquet.gzip": DatasetParserParquet, - } - file_type = "Unknown" - try: - file_type = dataset.suffix - # Gzipped parquet files can be read with DatasetParserParquet - match = re.search(r"(.parquet.gzip)", str(dataset).lower()) - file_type = ".parquet.gzip" if match else file_type - # Extract the appropriate reader class from the SUPPORTED_FILE_TYPES dict and return an instance of it - reader = supported_file_types[file_type](dataset) - except IndexError as e: - # Thrown when just one element is returned from split, meaning there is no file extension supplied - msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(supported_file_types.keys())}" - raise FileNotFoundError( - msg, - ) from e - except KeyError as e: - # In this case the file type is not supported, so we throw a helpful exception - msg = f"{file_type = } is not supported. 
Please open one of the following supported files types: {', '.join(supported_file_types.keys())} or contact the maintainers to request support." - raise NotImplementedError( - msg, - ) from e - else: - return reader - - @staticmethod - def transform_data_type(data_type: str) -> DataType | None: - """Transform a concrete data type to an abstract data type. - - In statistical metadata, one is not interested in how the data is - technically stored, but in the meaning of the data type. Because of - this, we transform known data types to their abstract metadata - representations. - - If we encounter a data type we don't know, we just ignore it and let - the user handle it in the GUI. - - Arguments: - data_type: The concrete data type to map. - """ - return TYPE_MAP.get(data_type.lower(), None) - - @abstractmethod - def get_fields(self) -> list[Variable]: - """Abstract method, must be implemented by subclasses.""" - - -class DatasetParserParquet(DatasetParser): - """Concrete implementation for parsing parquet files.""" - - def __init__(self, dataset: pathlib.Path | CloudPath) -> None: - """Call the super init method for initialization. - - Args: - dataset: Path to the dataset to parse. - """ - super().__init__(dataset) - - def get_fields(self) -> list[Variable]: - """Extract the fields from this dataset.""" - with self.dataset.open(mode="rb") as f: - schema: pa.Schema = pq.read_schema(f) # type: ignore [arg-type] - return [ - Variable( - short_name=data_field.name.strip(), - data_type=self.transform_data_type(str(data_field.type)), - ) - for data_field in schema - if data_field.name - != "__index_level_0__" # Index columns should not be documented - ] - - -class DatasetParserSas7Bdat(DatasetParser): - """Concrete implementation for parsing SAS7BDAT files.""" - - def __init__(self, dataset: pathlib.Path | CloudPath) -> None: - """Call the super init method for initialization. - - Args: - dataset: Path to the dataset to parse. 
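For reviewers skimming the factory above: it dispatches on the file suffix, and the type map collapses concrete storage types to abstract metadata types. A short sketch against the pre-removal module; the parquet path is illustrative, so get_fields only succeeds for a file that actually exists.

from pathlib import Path

from datadoc.backend.dataset_parser import DatasetParser
from datadoc.enums import DataType

# Concrete storage types map to abstract metadata types; unknown types give None.
assert DatasetParser.transform_data_type("int64") is DataType.INTEGER
assert DatasetParser.transform_data_type("varchar2") is DataType.STRING
assert DatasetParser.transform_data_type("some_exotic_type") is None

# The factory picks the subclass from the suffix (".parquet", ".parquet.gzip" or ".sas7bdat").
parser = DatasetParser.for_file(Path("person_data_p2021_v1.parquet"))
fields = parser.get_fields()  # one Variable(short_name, data_type) per column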
- """ - super().__init__(dataset) - - def get_fields(self) -> list[Variable]: - """Extract the fields from this dataset.""" - fields = [] - with self.dataset.open(mode="rb") as f: - # Use an iterator to avoid reading in the entire dataset - sas_reader = pd.read_sas(f, format="sas7bdat", iterator=True) - - # Get the first row from the iterator - try: - row = next(sas_reader) - except StopIteration as e: - msg = f"Could not read data from {self.dataset}" - raise RuntimeError(msg) from e - - # Get all the values from the row and loop through them - for i, v in enumerate(row.to_numpy().tolist()[0]): - fields.append( - Variable( - short_name=sas_reader.columns[i].name, # type: ignore [attr-defined] - # Assume labels are defined in the default language (NORSK_BOKMÅL) - # If this is not correct, the user may fix it via the UI - name=LanguageStringType( - [ - LanguageStringTypeItem( - languageCode=SupportedLanguages.NORSK_BOKMÅL.value, - languageText=sas_reader.columns[ # type: ignore [attr-defined] - i - ].label, - ), - ], - ), - # Access the python type for the value and transform it to a DataDoc Data type - data_type=self.transform_data_type(type(v).__name__.lower()), - ), - ) - - return fields diff --git a/src/datadoc/backend/external_sources/__init__.py b/src/datadoc/backend/external_sources/__init__.py deleted file mode 100644 index d7505726..00000000 --- a/src/datadoc/backend/external_sources/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Abstract parent class for interacting with external resources asynchorously.""" diff --git a/src/datadoc/backend/external_sources/external_sources.py b/src/datadoc/backend/external_sources/external_sources.py deleted file mode 100644 index 70c9da29..00000000 --- a/src/datadoc/backend/external_sources/external_sources.py +++ /dev/null @@ -1,87 +0,0 @@ -from __future__ import annotations - -import logging -from abc import ABC -from abc import abstractmethod -from typing import TYPE_CHECKING -from typing import Generic -from typing import TypeVar - -if TYPE_CHECKING: - import concurrent.futures - -logger = logging.getLogger(__name__) - -T = TypeVar("T") - - -class GetExternalSource(ABC, Generic[T]): - """Abstract base class for retrieving data from external sources asynchronously. - - This class provides methods to initiate an asynchronous data retrieval - operation, check its status, and retrieve the result once the operation - completes. Subclasses must implement the `_fetch_data_from_external_source` - method to define how data is fetched from the specific external source. - """ - - def __init__(self, executor: concurrent.futures.ThreadPoolExecutor) -> None: - """Initialize the GetExternalSource with an executor to manage asynchronous tasks. - - This constructor initializes a future object that will hold the result of the - asynchronous data fetching operation from an external source. - - Args: - executor: An instance of ThreadPoolExecutor to manage the asynchronous - execution of data fetching. - """ - self.future = executor.submit( - self._fetch_data_from_external_source, - ) - - def wait_for_external_result(self) -> None: - """Wait for the thread responsible for loading the external request to finish. - - If there is no future to wait for, it logs a warning and returns immediately. - """ - if not self.future: - logger.warning("No future to wait for.") - return - self.future.result() - - def check_if_external_data_is_loaded(self) -> bool: - """Check if the thread getting the external data has finished running. 
- - Returns: - True if the data fetching operation is complete, False otherwise. - """ - if self.future: - return self.future.done() - return False - - def retrieve_external_data(self) -> T | None: - """Retrieve the result of the data fetching operation. - - This method checks if the asynchronous data fetching operation has - completed. If the operation is finished, it returns the result. - Otherwise, it returns None. - - Returns: - The result of the data fetching operation if it is complete or None - if the operation has not yet finished. - """ - if self.future: - return self.future.result() - return None - - @abstractmethod - def _fetch_data_from_external_source(self) -> T | None: - """Handle external data retrieval. - - Abstract method to be implemented in the subclass. - This method should define the logic for retrieving data from the specific - external source. - - Returns: - The data retrieved from the external source. - """ - raise NotImplementedError diff --git a/src/datadoc/backend/model_backwards_compatibility.py b/src/datadoc/backend/model_backwards_compatibility.py deleted file mode 100644 index 50be9298..00000000 --- a/src/datadoc/backend/model_backwards_compatibility.py +++ /dev/null @@ -1,520 +0,0 @@ -"""Upgrade old metadata files to be compatible with new versions. - -An important principle of Datadoc is that we ALWAYS guarantee backwards -compatibility of existing metadata documents. This means that we guarantee -that a user will never lose data, even if their document is decades old. - -For each document version we release with breaking changes, we implement a -handler and register the version by defining a BackwardsCompatibleVersion -instance. These documents will then be upgraded when they're opened in Datadoc. - -A test must also be implemented for each new version. -""" - -from __future__ import annotations - -from collections import OrderedDict -from dataclasses import dataclass -from datetime import datetime -from datetime import timezone -from typing import TYPE_CHECKING -from typing import Any - -import arrow - -if TYPE_CHECKING: - from collections.abc import Callable - -VERSION_FIELD_NAME = "document_version" - - -class UnknownModelVersionError(Exception): - """Exception raised for unknown model versions. - - This error is thrown when an unrecognized model version is encountered. - """ - - def __init__( - self, - supplied_version: str, - *args: tuple[Any, ...], - ) -> None: - """Initialize the exception with the supplied version. - - Args: - supplied_version: The version of the model that was not recognized. - *args: Additional arguments for the Exception base class. - """ - super().__init__(args) - self.supplied_version = supplied_version - - def __str__(self) -> str: - """Return string representation.""" - return f"Document Version ({self.supplied_version}) of discovered file is not supported" - - -SUPPORTED_VERSIONS: OrderedDict[str, BackwardsCompatibleVersion] = OrderedDict() - - -@dataclass() -class BackwardsCompatibleVersion: - """A version which we support with backwards compatibility. - - This class registers a version and its corresponding handler function - for backwards compatibility. - """ - - version: str - handler: Callable[[dict[str, Any]], dict[str, Any]] - - def __post_init__(self) -> None: - """Register this version in the supported versions map. - - This method adds the instance to the `SUPPORTED_VERSIONS` dictionary - using the version as the key. 
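The GetExternalSource base class above only requires a single method from its subclasses. A minimal sketch of how it was subclassed, against the pre-removal module; the subclass name, the URL and the choice of requests as HTTP client are made up for illustration.

from __future__ import annotations

import concurrent.futures

import requests

from datadoc.backend.external_sources.external_sources import GetExternalSource


class CodeListSource(GetExternalSource[str]):
    """Fetch a document from a (made-up) URL in a background thread."""

    def _fetch_data_from_external_source(self) -> str | None:
        try:
            return requests.get("https://example.com/codelist.xml", timeout=30).text
        except requests.RequestException:
            return None


executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
source = CodeListSource(executor)       # the fetch is submitted immediately
source.wait_for_external_result()       # block until the worker thread is done
data = source.retrieve_external_data()  # the fetched text, or None on failure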
- """ - SUPPORTED_VERSIONS[self.version] = self - - -def handle_current_version(supplied_metadata: dict[str, Any]) -> dict[str, Any]: - """Handle the current version of the metadata. - - This function returns the supplied metadata unmodified. - - Args: - supplied_metadata: The metadata for the current version. - - Returns: - The unmodified supplied metadata. - """ - return supplied_metadata - - -def _find_and_update_language_strings(supplied_metadata: dict | None) -> dict | None: - """Find and update language-specific strings in the supplied metadata. - - This function iterates through the supplied metadata dictionary. - For each key-value pair, if the value is a dictionary containing "en" - it is passed to the `_convert_language_string_type` function to potentially - update its format. - - Args: - supplied_metadata: A metadata dictionary where values may include nested - dictionaries with language-specific strings. - - Returns: - The updated metadata dictionary. If the supplied metadata is not a - dictionary, it returns `None`. - """ - if isinstance(supplied_metadata, dict): - for key, value in supplied_metadata.items(): - if isinstance(value, dict) and "en" in value: - supplied_metadata[key] = _convert_language_string_type(value) - return supplied_metadata - return None - - -def _convert_language_string_type(supplied_value: dict) -> list[dict[str, str]]: - """Convert a dictionary of language-specific strings to a list of dictionaries. - - This function takes a dictionary with language codes as keys and - corresponding language-specific strings as values, and converts it to a list - of dictionaries with 'languageCode' and 'languageText' keys. - - Args: - supplied_value: A dictionary containing language codes as keys and - language strings as values. - - Returns: - A list of dictionaries, each containing 'languageCode' and 'languageText' - keys, representing the converted language strings. - """ - return [ - { - "languageCode": "en", - "languageText": supplied_value["en"], - }, - { - "languageCode": "nn", - "languageText": supplied_value["nn"], - }, - { - "languageCode": "nb", - "languageText": supplied_value["nb"], - }, - ] - - -def _remove_element_from_model( - supplied_metadata: dict[str, Any], - element_to_remove: str, -) -> None: - """Remove an element from the supplied metadata dictionary. - - This function deletes a specified element from the supplied metadata dictionary - if it exists. - - Args: - supplied_metadata: The metadata dictionary from which the element will be - removed. - element_to_remove: The key of the element to be removed from the metadata - dictionary. - """ - if element_to_remove in supplied_metadata: - del supplied_metadata[element_to_remove] - - -def _cast_to_date_type(value_to_update: str | None) -> str | None: - """Convert a string to a date string in ISO format. - - This function takes a string representing a date and converts it to a - date string in ISO format. If the input is `None`, it returns `None` without - modification. - - Args: - value_to_update: A string representing a date or `None`. - - Returns: - The date string in ISO format if the input was a valid date string, or - `None` if the input was `None`. - """ - if value_to_update is None: - return value_to_update - - return str( - arrow.get( - value_to_update, - ).date(), - ) - - -def handle_version_3_3_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: - """Handle breaking changes for version 3.3.0. 
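A concrete before/after for the _convert_language_string_type helper above (the values are made up):

from datadoc.backend.model_backwards_compatibility import _convert_language_string_type

old_style = {"en": "Population register", "nn": "Folkeregisteret", "nb": "Folkeregisteret"}
assert _convert_language_string_type(old_style) == [
    {"languageCode": "en", "languageText": "Population register"},
    {"languageCode": "nn", "languageText": "Folkeregisteret"},
    {"languageCode": "nb", "languageText": "Folkeregisteret"},
]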
- - This function modifies the supplied metadata to accommodate breaking changes - introduced in version 4.0.0. Specifically, it removes the - 'direct_person_identifying' field from each variable in 'datadoc.variables' - and updates the 'document_version' field to "4.0.0". - - Args: - supplied_metadata: The metadata dictionary to be updated. - - Returns: - The updated metadata dictionary. - """ - for i in range(len(supplied_metadata["datadoc"]["variables"])): - _remove_element_from_model( - supplied_metadata["datadoc"]["variables"][i], - "direct_person_identifying", - ) - supplied_metadata["datadoc"]["document_version"] = "4.0.0" - return supplied_metadata - - -def handle_version_3_2_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: - """Handle breaking changes for version 3.2.0. - - This function modifies the supplied metadata to accommodate breaking - changes introduced in version 3.3.0. Specifically, it updates the - 'contains_data_from' and 'contains_data_until' fields in both the 'dataset' - and 'variables' sections of the supplied metadata dictionary to ensure they - are stored as date strings. - It also updates the 'document_version' field to "3.3.0". - - Args: - supplied_metadata: The metadata dictionary to be updated. - - Returns: - The updated metadata dictionary. - """ - fields = ["contains_data_from", "contains_data_until"] - for field in fields: - supplied_metadata["datadoc"]["dataset"][field] = _cast_to_date_type( - supplied_metadata["datadoc"]["dataset"].get(field, None), - ) - for v in supplied_metadata["datadoc"]["variables"]: - v[field] = _cast_to_date_type(v.get(field, None)) - - supplied_metadata["datadoc"]["document_version"] = "3.3.0" - return supplied_metadata - - -def handle_version_3_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: - """Handle breaking changes for version 3.1.0. - - This function modifies the supplied metadata to accommodate breaking - changes introduced in version 3.2.0. Specifically, it updates the - 'data_source' field in both the 'dataset' and 'variables' sections of the - supplied metadata dictionary by converting value to string. - The 'document_version' field is also updated to "3.2.0". - - Args: - supplied_metadata: The metadata dictionary to be updated. - - Returns: - The updated metadata dictionary. - """ - data: list = supplied_metadata["datadoc"]["dataset"]["data_source"] - - if data is not None: - supplied_metadata["datadoc"]["dataset"]["data_source"] = str( - data[0]["languageText"], - ) - - for i in range(len(supplied_metadata["datadoc"]["variables"])): - data = supplied_metadata["datadoc"]["variables"][i]["data_source"] - if data is not None: - supplied_metadata["datadoc"]["variables"][i]["data_source"] = str( - data[0]["languageText"], - ) - - supplied_metadata["datadoc"]["document_version"] = "3.2.0" - return supplied_metadata - - -def handle_version_2_2_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: - """Handle breaking changes for version 2.2.0. - - This function modifies the supplied metadata to accommodate breaking changes - introduced in version 3.1.0. Specifically, it updates the 'subject_field' in - the 'dataset' section of the supplied metadata dictionary by converting it to - a string. It also removes the 'register_uri' field from the 'dataset'. - Additionally, it removes 'sentinel_value_uri' from each variable, - sets 'special_value' and 'custom_type' fields to None, and updates - language strings in the 'variables' and 'dataset' sections. - The 'document_version' is updated to "3.1.0". 
- - Args: - supplied_metadata: The metadata dictionary to be updated. - - Returns: - The updated metadata dictionary. - """ - if supplied_metadata["datadoc"]["dataset"]["subject_field"] is not None: - data = supplied_metadata["datadoc"]["dataset"]["subject_field"] - supplied_metadata["datadoc"]["dataset"]["subject_field"] = str( - data["nb"] or data["nn"] or data["en"], - ) - - _remove_element_from_model(supplied_metadata["datadoc"]["dataset"], "register_uri") - - for i in range(len(supplied_metadata["datadoc"]["variables"])): - _remove_element_from_model( - supplied_metadata["datadoc"]["variables"][i], - "sentinel_value_uri", - ) - supplied_metadata["datadoc"]["variables"][i]["special_value"] = None - supplied_metadata["datadoc"]["variables"][i]["custom_type"] = None - supplied_metadata["datadoc"]["variables"][ - i - ] = _find_and_update_language_strings( - supplied_metadata["datadoc"]["variables"][i], - ) - supplied_metadata["datadoc"]["dataset"]["custom_type"] = None - supplied_metadata["datadoc"]["dataset"] = _find_and_update_language_strings( - supplied_metadata["datadoc"]["dataset"], - ) - supplied_metadata["datadoc"]["document_version"] = "3.1.0" - return supplied_metadata - - -def add_container(existing_metadata: dict) -> dict: - """Add container for previous versions. - - Adds a container structure for previous versions of metadata. - This function wraps the existing metadata in a new container structure - that includes the 'document_version', 'datadoc', and 'pseudonymization' - fields. The 'document_version' is set to "0.0.1" and 'pseudonymization' - is set to None. - - Args: - existing_metadata: The original metadata dictionary to be wrapped. - - Returns: - A new dictionary containing the wrapped metadata with additional fields. - """ - return { - "document_version": "0.0.1", - "datadoc": existing_metadata, - "pseudonymization": None, - } - - -def handle_version_2_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: - """Handle breaking changes for version 2.1.0. - - This function modifies the supplied metadata to accommodate breaking changes - introduced in version 2.2.0. Specifically, it updates the 'owner' field in - the 'dataset' section of the supplied metadata dictionary by converting it - from a LanguageStringType to a string. - The 'document_version' is updated to "2.2.0". - - Args: - supplied_metadata: The metadata dictionary to be updated. - - Returns: - The updated metadata dictionary. - """ - data = supplied_metadata["dataset"]["owner"] - supplied_metadata["dataset"]["owner"] = str(data["nb"] or data["nn"] or data["en"]) - supplied_metadata["document_version"] = "2.2.0" - return add_container(supplied_metadata) - - -def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]: - """Handle breaking changes for version 1.0.0. - - This function modifies the supplied metadata to accommodate breaking changes - introduced in version 2.1.0. Specifically, it updates the date fields - 'metadata_created_date' and 'metadata_last_updated_date' to ISO 8601 format - with UTC timezone. It also converts the 'data_source' field from a string to a - dictionary with language keys if necessary and removes the 'data_source_path' - field. - The 'document_version' is updated to "2.1.0". - - Args: - supplied_metadata: The metadata dictionary to be updated. - - Returns: - The updated metadata dictionary. 
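What the add_container wrapper above produces, shown on a stripped-down document (field values are illustrative):

from datadoc.backend.model_backwards_compatibility import add_container

wrapped = add_container({"document_version": "2.2.0", "dataset": {}, "variables": []})
assert wrapped == {
    "document_version": "0.0.1",
    "datadoc": {"document_version": "2.2.0", "dataset": {}, "variables": []},
    "pseudonymization": None,
}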
- - """ - datetime_fields = [("metadata_created_date"), ("metadata_last_updated_date")] - for field in datetime_fields: - if supplied_metadata["dataset"][field]: - supplied_metadata["dataset"][field] = datetime.isoformat( - datetime.fromisoformat(supplied_metadata["dataset"][field]).astimezone( - tz=timezone.utc, - ), - timespec="seconds", - ) - if isinstance(supplied_metadata["dataset"]["data_source"], str): - supplied_metadata["dataset"]["data_source"] = { - "en": supplied_metadata["dataset"]["data_source"], - "nn": "", - "nb": "", - } - - _remove_element_from_model(supplied_metadata["dataset"], "data_source_path") - - supplied_metadata["document_version"] = "2.1.0" - return supplied_metadata - - -def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]: - """Handle breaking changes for version 0.1.1. - - This function modifies the supplied metadata to accommodate breaking changes - introduced in version 1.0.0. Specifically, it renames certain keys within the - `dataset` and `variables` sections, and replaces empty string values with - `None` for `dataset` keys. - - Args: - supplied_metadata: The metadata dictionary that needs to be updated. - - Returns: - The updated metadata dictionary. - - References: - PR ref: https://github.com/statisticsnorway/ssb-datadoc-model/pull/4 - """ - key_renaming = [ - ("metadata_created_date", "created_date"), - ("metadata_created_by", "created_by"), - ("metadata_last_updated_date", "last_updated_date"), - ("metadata_last_updated_by", "last_updated_by"), - ] - for new_key, old_key in key_renaming: - supplied_metadata["dataset"][new_key] = supplied_metadata["dataset"].pop( - old_key, - ) - # Replace empty strings with None, empty strings are not valid for LanguageStrings values - supplied_metadata["dataset"] = { - k: None if v == "" else v for k, v in supplied_metadata["dataset"].items() - } - - key_renaming = [("data_type", "datatype")] - - for i in range(len(supplied_metadata["variables"])): - for new_key, old_key in key_renaming: - supplied_metadata["variables"][i][new_key] = supplied_metadata["variables"][ - i - ].pop( - old_key, - ) - - return supplied_metadata - - -# Register all the supported versions and their handlers. -# MUST be ordered from oldest to newest. -BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1) -BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0) -BackwardsCompatibleVersion( - version="2.1.0", - handler=handle_version_2_1_0, -) # A container must be created at this version -BackwardsCompatibleVersion(version="2.2.0", handler=handle_version_2_2_0) -BackwardsCompatibleVersion(version="3.1.0", handler=handle_version_3_1_0) -BackwardsCompatibleVersion(version="3.2.0", handler=handle_version_3_2_0) -BackwardsCompatibleVersion(version="3.3.0", handler=handle_version_3_3_0) -BackwardsCompatibleVersion(version="4.0.0", handler=handle_current_version) - - -def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]: - """Upgrade the metadata to the latest version using registered handlers. - - This function checks the version of the provided metadata and applies a series - of upgrade handlers to migrate the metadata to the latest version. - It starts from the provided version and applies all subsequent handlers in - sequence. If the metadata is already in the latest version or the version - cannot be determined, appropriate actions are taken. - - Args: - fresh_metadata: The metadata dictionary to be upgraded. 
This dictionary - must include version information that determines which handlers to apply. - - Returns: - The upgraded metadata dictionary, after applying all necessary handlers. - - Raises: - UnknownModelVersionError: If the metadata's version is unknown or unsupported. - """ - # Special case for current version, we expose the current_model_version parameter for test purposes - if is_metadata_in_container_structure(fresh_metadata): - if fresh_metadata["datadoc"] is None: - return fresh_metadata - supplied_version = fresh_metadata["datadoc"][VERSION_FIELD_NAME] - else: - supplied_version = fresh_metadata[VERSION_FIELD_NAME] - start_running_handlers = False - # Run all the handlers in order from the supplied version onwards - for k, v in SUPPORTED_VERSIONS.items(): - if k == supplied_version: - start_running_handlers = True - if start_running_handlers: - fresh_metadata = v.handler(fresh_metadata) - if not start_running_handlers: - raise UnknownModelVersionError(supplied_version) - return fresh_metadata - - -def is_metadata_in_container_structure( - metadata: dict, -) -> bool: - """Check if the metadata is in the container structure. - - At a certain point a metadata 'container' was introduced. - The container provides a structure for different 'types' of metadata, such as - 'datadoc', 'pseudonymization' etc. - This function determines if the given metadata dictionary follows this container - structure by checking for the presence of the 'datadoc' field. - - Args: - metadata: The metadata dictionary to check. - - Returns: - True if the metadata is in the container structure (i.e., contains the - 'datadoc' field), False otherwise. - """ - return "datadoc" in metadata diff --git a/src/datadoc/backend/model_validation.py b/src/datadoc/backend/model_validation.py deleted file mode 100644 index bda5a5fe..00000000 --- a/src/datadoc/backend/model_validation.py +++ /dev/null @@ -1,188 +0,0 @@ -"""Handle validation for metadata with pydantic validators and custom warnings.""" - -from __future__ import annotations - -import logging -import warnings -from typing import TYPE_CHECKING -from typing import Self -from typing import TextIO - -from datadoc_model import model -from pydantic import model_validator - -from datadoc.backend.constants import DATE_VALIDATION_MESSAGE -from datadoc.backend.constants import NUM_OBLIGATORY_DATASET_FIELDS -from datadoc.backend.constants import NUM_OBLIGATORY_VARIABLES_FIELDS -from datadoc.backend.constants import OBLIGATORY_METADATA_WARNING -from datadoc.backend.utils import get_missing_obligatory_dataset_fields -from datadoc.backend.utils import get_missing_obligatory_variables_fields -from datadoc.backend.utils import incorrect_date_order -from datadoc.backend.utils import num_obligatory_dataset_fields_completed -from datadoc.backend.utils import num_obligatory_variables_fields_completed -from datadoc.backend.utils import set_variables_inherit_from_dataset -from datadoc.utils import get_timestamp_now - -if TYPE_CHECKING: - from datetime import datetime - -logger = logging.getLogger(__name__) - - -class ValidateDatadocMetadata(model.DatadocMetadata): - """Class that inherits from DatadocMetadata, providing additional validation.""" - - @model_validator(mode="after") - def check_date_order(self) -> Self: - """Validate the order of date fields. - - Check that dataset and variable date fields `contains_data_from` and - `contains_data_until` are in chronological order. - - Mode: This validator runs after other validation. 
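The upgrade_metadata flow above walks SUPPORTED_VERSIONS from the supplied version to the newest one. A minimal sketch with an artificially small 3.3.0 document already in the container structure; a version string that is not registered raises UnknownModelVersionError instead.

from datadoc.backend.model_backwards_compatibility import upgrade_metadata

old_document = {
    "document_version": "0.0.1",
    "datadoc": {"document_version": "3.3.0", "dataset": {}, "variables": []},
    "pseudonymization": None,
}
upgraded = upgrade_metadata(old_document)
# The 3.3.0 handler and the current-version handler have both run.
assert upgraded["datadoc"]["document_version"] == "4.0.0"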
- - Returns: - The instance of the model after validation. - - Raises: - ValueError: If `contains_data_until` date is earlier than - `contains_data_from date`. - """ - if self.dataset is not None and incorrect_date_order( - self.dataset.contains_data_from, - self.dataset.contains_data_until, - ): - raise ValueError(DATE_VALIDATION_MESSAGE) - if self.variables is not None: - for v in self.variables: - if incorrect_date_order(v.contains_data_from, v.contains_data_until): - raise ValueError(DATE_VALIDATION_MESSAGE) - return self - - @model_validator(mode="after") - def check_metadata_created_date(self) -> Self: - """Ensure `metadata_created_date` is set for the dataset. - - Sets the current timestamp if `metadata_created_date` is None. - - Mode: This validator runs after other validation. - - Returns: - The instance of the model after validation. - """ - timestamp: datetime = get_timestamp_now() # --check-untyped-defs - if self.dataset is not None and self.dataset.metadata_created_date is None: - self.dataset.metadata_created_date = timestamp - return self - - @model_validator(mode="after") - def check_inherit_values(self) -> Self: - """Inherit values from dataset to variables if not set. - - Sets values for 'data source', 'temporality type', 'contains data from', - and 'contains data until' if they are None. - - Mode: This validator runs after other validation. - - Returns: - The instance of the model after validation. - """ - if self.variables and self.dataset is not None: - set_variables_inherit_from_dataset(self.dataset, self.variables) - return self - - @model_validator(mode="after") - def check_obligatory_dataset_metadata(self) -> Self: - """Check obligatory dataset fields and issue a warning if any are missing. - - Mode: - This validator runs after other validation. - - Returns: - The instance of the model after validation. - - Raises: - ObligatoryDatasetWarning: If not all obligatory dataset metadata fields - are filled in. - """ - if ( - self.dataset is not None - and num_obligatory_dataset_fields_completed( - self.dataset, - ) - != NUM_OBLIGATORY_DATASET_FIELDS - ): - warnings.warn( - f"{OBLIGATORY_METADATA_WARNING} {get_missing_obligatory_dataset_fields(self.dataset)}", - ObligatoryDatasetWarning, - stacklevel=2, - ) - logger.warning( - "Type warning: %s.%s %s", - ObligatoryDatasetWarning, - OBLIGATORY_METADATA_WARNING, - get_missing_obligatory_dataset_fields(self.dataset), - ) - - return self - - @model_validator(mode="after") - def check_obligatory_variables_metadata(self) -> Self: - """Check obligatory variable fields and issue a warning if any are missing. - - Mode: - This validator runs after other validation. - - Returns: - The instance of the model after validation. - - Raises: - ObligatoryVariableWarning: If not all obligatory variable metadata fields - are filled in. 
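A sketch of how the check_date_order validator above surfaces to callers: pydantic wraps the ValueError in a ValidationError when the model is constructed. The Dataset keyword arguments mirror the doctest usage elsewhere in this diff; the dates themselves are made up.

from datadoc_model import model
from pydantic import ValidationError

from datadoc.backend.model_validation import ValidateDatadocMetadata

try:
    ValidateDatadocMetadata(
        dataset=model.Dataset(
            contains_data_from="2024-01-01",
            contains_data_until="2023-01-01",  # earlier than contains_data_from
        ),
        variables=[],
    )
except ValidationError as error:
    print(error)  # carries DATE_VALIDATION_MESSAGE from check_date_order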
- """ - if self.variables is not None and num_obligatory_variables_fields_completed( - self.variables, - ) != (NUM_OBLIGATORY_VARIABLES_FIELDS * len(self.variables)): - warnings.warn( - f"{OBLIGATORY_METADATA_WARNING} {get_missing_obligatory_variables_fields(self.variables)}", - ObligatoryVariableWarning, - stacklevel=2, - ) - logger.warning( - "Type warning: %s.%s %s", - ObligatoryVariableWarning, - OBLIGATORY_METADATA_WARNING, - get_missing_obligatory_variables_fields(self.variables), - ) - - return self - - -class ValidationWarning(UserWarning): - """Custom warning for validation purposes.""" - - -class ObligatoryDatasetWarning(UserWarning): - """Custom warning for checking obligatory metadata for dataset.""" - - -class ObligatoryVariableWarning(UserWarning): - """Custom warning for checking obligatory metadata for variables.""" - - -def custom_warning_handler( # noqa: PLR0913 remove fields causes incompatible types - message: Warning | str, - category: type[Warning], - filename: str, - lineno: int, - file: TextIO | None = None, # noqa: ARG001 remove causes incompatible types - line: str | None = None, # noqa: ARG001 remove causes incompatible types -) -> None: - """Handle warnings.""" - print( # noqa: T201 - f"Warning: {message}, Category: {category.__name__}, Filename: {filename}, Line: {lineno}", - ) - - -warnings.showwarning = custom_warning_handler -warnings.simplefilter("always") diff --git a/src/datadoc/backend/statistic_subject_mapping.py b/src/datadoc/backend/statistic_subject_mapping.py deleted file mode 100644 index cd35c8b2..00000000 --- a/src/datadoc/backend/statistic_subject_mapping.py +++ /dev/null @@ -1,182 +0,0 @@ -from __future__ import annotations - -import logging -from dataclasses import dataclass -from typing import TYPE_CHECKING - -import bs4 -import requests -from bs4 import BeautifulSoup -from bs4 import ResultSet - -from datadoc.backend.external_sources.external_sources import GetExternalSource -from datadoc.enums import SupportedLanguages - -if TYPE_CHECKING: - import concurrent - -logger = logging.getLogger(__name__) - - -@dataclass -class Subject: - """Base class for Primary and Secondary subjects. - - A statistical subject is a related grouping of statistics. - """ - - titles: dict[str, str] - subject_code: str - - def get_title(self, language: SupportedLanguages) -> str: - """Get the title in the given language.""" - try: - return self.titles[ - ( - # Adjust to language codes in the StatisticSubjectMapping structure. - "no" - if language - in [ - SupportedLanguages.NORSK_BOKMÅL, - SupportedLanguages.NORSK_NYNORSK, - ] - else "en" - ) - ] - except KeyError: - logger.exception( - "Could not find title for subject %s and language: %s", - self, - language.name, - ) - return "" - - -@dataclass -class SecondarySubject(Subject): - """Data structure for secondary subjects or 'delemne'.""" - - statistic_short_names: list[str] - - -@dataclass -class PrimarySubject(Subject): - """Data structure for primary subjects or 'hovedemne'.""" - - secondary_subjects: list[SecondarySubject] - - -class StatisticSubjectMapping(GetExternalSource): - """Provide mapping between statistic short name and primary and secondary subject.""" - - def __init__( - self, - executor: concurrent.futures.ThreadPoolExecutor, - source_url: str | None, - ) -> None: - """Retrieve the statistical structure document from the given URL. - - Initializes the mapping based on values in the statistical structure document sourced at `source_url`. 
- - Args: - executor: The ThreadPoolExecutor which will run the job of fetching the statistical structure document. - source_url: The URL from which to fetch the statistical structure document. - """ - self.source_url = source_url - - self._statistic_subject_structure_xml: ResultSet | None = None - - self._primary_subjects: list[PrimarySubject] = [] - - super().__init__(executor) - - def get_secondary_subject(self, statistic_short_name: str | None) -> str | None: - """Looks up the secondary subject for the given statistic short name in the mapping dict. - - Returns the secondary subject string if found, else None. - """ - for p in self.primary_subjects: - for s in p.secondary_subjects: - if statistic_short_name in s.statistic_short_names: - logger.debug("Got %s from %s", s, statistic_short_name) - return s.subject_code - - logger.debug("No secondary subject found for %s", statistic_short_name) - return None - - @staticmethod - def _extract_titles(titles_xml: bs4.element.Tag) -> dict[str, str]: - titles = {} - for title in titles_xml.find_all("tittel"): - titles[title["sprak"]] = title.text - return titles - - def _fetch_data_from_external_source(self) -> ResultSet | None: - """Fetch statistical structure document from source_url. - - Returns a BeautifulSoup ResultSet. - """ - try: - url = str(self.source_url) - response = requests.get(url, timeout=30) - response.encoding = "utf-8" - logger.debug("Got response %s from %s", response, url) - soup = BeautifulSoup(response.text, features="xml") - return soup.find_all("hovedemne") - except requests.exceptions.RequestException: - logger.exception( - "Exception while fetching statistical structure ", - ) - return None - - def _parse_statistic_subject_structure_xml( - self, - statistical_structure_xml: ResultSet, - ) -> list[PrimarySubject]: - primary_subjects: list[PrimarySubject] = [] - for p in statistical_structure_xml: - secondary_subjects: list[SecondarySubject] = [ - SecondarySubject( - self._extract_titles(s.titler), - s["emnekode"], - [statistikk["kortnavn"] for statistikk in s.find_all("Statistikk")], - ) - for s in p.find_all("delemne") - ] - - primary_subjects.append( - PrimarySubject( - self._extract_titles(p.titler), - p["emnekode"], - secondary_subjects, - ), - ) - return primary_subjects - - @property - def primary_subjects(self) -> list[PrimarySubject]: - """Getter for primary subjects.""" - if not self._primary_subjects: - self._parse_xml_if_loaded() - logger.debug("Got %s primary subjects", len(self._primary_subjects)) - return self._primary_subjects - - def _parse_xml_if_loaded(self) -> bool: - """Checks if the xml is loaded, then parses the xml if it is loaded. - - Returns `True` if it is loaded and parsed. - """ - if self.check_if_external_data_is_loaded(): - self._statistic_subject_structure_xml = self.retrieve_external_data() - - if self._statistic_subject_structure_xml is not None: - self._primary_subjects = self._parse_statistic_subject_structure_xml( - self._statistic_subject_structure_xml, - ) - logger.debug( - "Thread finished. Parsed %s primary subjects", - len(self._primary_subjects), - ) - return True - logger.warning("Thread is not done. 
Cannot parse xml.") - return False diff --git a/src/datadoc/backend/user_info.py b/src/datadoc/backend/user_info.py deleted file mode 100644 index 14548445..00000000 --- a/src/datadoc/backend/user_info.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import annotations - -import contextlib -import logging -from typing import Protocol - -import jwt - -from datadoc import config -from datadoc.enums import DaplaRegion -from datadoc.enums import DaplaService - -logger = logging.getLogger(__name__) - - -PLACEHOLDER_EMAIL_ADDRESS = "default_user@ssb.no" - - -class UserInfo(Protocol): - """Information about the current user. - - Implementations may be provided for different platforms or testing. - """ - - @property - def short_email(self) -> str | None: - """Get the short email address.""" - ... - - -class UnknownUserInfo: - """Fallback when no implementation is found.""" - - @property - def short_email(self) -> str | None: - """Unknown email address.""" - return None - - -class TestUserInfo: - """Information about the current user for local development and testing.""" - - @property - def short_email(self) -> str | None: - """Get the short email address.""" - return PLACEHOLDER_EMAIL_ADDRESS - - -class DaplaLabUserInfo: - """Information about the current user when running on Dapla Lab.""" - - @property - def short_email(self) -> str | None: - """Get the short email address.""" - encoded_jwt = config.get_oidc_token() - if encoded_jwt: - # The JWT has been verified by the platform prior to injection, no need to verify. - decoded_jwt = jwt.decode(encoded_jwt, options={"verify_signature": False}) - with contextlib.suppress(KeyError): - # If email can't be found in the JWT, fall through and return None - return decoded_jwt["email"] - - logger.warning( - "Could not access JWT from environment. Could not get short email address.", - ) - return None - - -class JupyterHubUserInfo: - """Information about the current user when running on JupyterHub.""" - - @property - def short_email(self) -> str | None: - """Get the short email address.""" - return config.get_jupyterhub_user() - - -def get_user_info_for_current_platform() -> UserInfo: - """Return the correct implementation of UserInfo for the current platform.""" - if config.get_dapla_region() == DaplaRegion.DAPLA_LAB: - return DaplaLabUserInfo() - elif config.get_dapla_service() == DaplaService.JUPYTERLAB: # noqa: RET505 - return JupyterHubUserInfo() - else: - logger.warning( - "Was not possible to retrieve user information! 
Some fields may not be set.", - ) - return UnknownUserInfo() diff --git a/src/datadoc/backend/utils.py b/src/datadoc/backend/utils.py deleted file mode 100644 index b58671e4..00000000 --- a/src/datadoc/backend/utils.py +++ /dev/null @@ -1,388 +0,0 @@ -from __future__ import annotations - -import datetime # noqa: TCH003 import is needed in xdoctest -import logging -import pathlib -import uuid - -from cloudpathlib import CloudPath -from cloudpathlib import GSClient -from cloudpathlib import GSPath -from dapla import AuthClient -from datadoc_model import model - -from datadoc.backend.constants import NUM_OBLIGATORY_VARIABLES_FIELDS -from datadoc.backend.constants import OBLIGATORY_DATASET_METADATA_IDENTIFIERS -from datadoc.backend.constants import ( - OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE, -) -from datadoc.backend.constants import OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS -from datadoc.backend.constants import ( - OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE, -) -from datadoc.enums import Assessment -from datadoc.enums import DataSetState -from datadoc.enums import VariableRole - -logger = logging.getLogger(__name__) - - -def normalize_path(path: str) -> pathlib.Path | CloudPath: - """Obtain a pathlib compatible Path. - - Obtains a pathlib compatible Path regardless of whether the file is on a filesystem or in GCS. - - Args: - path: Path on a filesystem or in cloud storage. - - Returns: - Pathlib compatible object. - """ - if path.startswith(GSPath.cloud_prefix): - client = GSClient(credentials=AuthClient.fetch_google_credentials()) - return GSPath(path, client=client) - return pathlib.Path(path) - - -def calculate_percentage(completed: int, total: int) -> int: - """Calculate percentage as a rounded integer. - - Args: - completed: The number of completed items. - total: The total number of items. - - Returns: - The rounded percentage of completed items out of the total. - """ - return round((completed / total) * 100) - - -def derive_assessment_from_state(state: DataSetState) -> Assessment: - """Derive assessment from dataset state. - - Args: - state: The state of the dataset. - - Returns: - The derived assessment of the dataset. - """ - match (state): - case ( - DataSetState.INPUT_DATA - | DataSetState.PROCESSED_DATA - | DataSetState.STATISTICS - ): - return Assessment.PROTECTED - case DataSetState.OUTPUT_DATA: - return Assessment.OPEN - case DataSetState.SOURCE_DATA: - return Assessment.SENSITIVE - - -def set_default_values_variables(variables: list) -> None: - """Set default values on variables. - - Args: - variables: A list of variable objects to set default values on. - - Example: - >>> variables = [model.Variable(short_name="pers",id=None, is_personal_data = None), model.Variable(short_name="fnr",id='9662875c-c245-41de-b667-12ad2091a1ee', is_personal_data='PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA')] - >>> set_default_values_variables(variables) - >>> isinstance(variables[0].id, uuid.UUID) - True - - >>> variables[1].is_personal_data == 'PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA' - True - - >>> variables[0].is_personal_data == 'NOT_PERSONAL_DATA' - True - """ - for v in variables: - if v.id is None: - v.id = uuid.uuid4() - if v.is_personal_data is None: - v.is_personal_data = model.IsPersonalData.NOT_PERSONAL_DATA - if v.variable_role is None: - v.variable_role = VariableRole.MEASURE - - -def set_default_values_dataset(dataset: model.Dataset) -> None: - """Set default values on dataset. - - Args: - dataset: The dataset object to set default values on. 
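A few of the helpers above in one place, sketched against the pre-removal module; the path and the 7-of-9 counts are illustrative.

from datadoc.backend.utils import (
    calculate_percentage,
    derive_assessment_from_state,
    normalize_path,
)
from datadoc.enums import Assessment, DataSetState

# The state-to-assessment mapping implemented in derive_assessment_from_state.
assert derive_assessment_from_state(DataSetState.SOURCE_DATA) is Assessment.SENSITIVE
assert derive_assessment_from_state(DataSetState.OUTPUT_DATA) is Assessment.OPEN
assert derive_assessment_from_state(DataSetState.INPUT_DATA) is Assessment.PROTECTED

# normalize_path gives a GSPath for "gs://" locations and a pathlib.Path otherwise.
local_path = normalize_path("klargjorte_data/person_data_p2021_v2.parquet")

# calculate_percentage rounds to the nearest whole percent.
assert calculate_percentage(7, 9) == 78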
- - Example: - >>> dataset = model.Dataset(id=None, contains_personal_data=None) - >>> set_default_values_dataset(dataset) - >>> dataset.id is not None - True - - >>> dataset.contains_personal_data == False - True - """ - if not dataset.id: - dataset.id = uuid.uuid4() - if dataset.contains_personal_data is None: - dataset.contains_personal_data = False - - -def set_variables_inherit_from_dataset( - dataset: model.Dataset, - variables: list, -) -> None: - """Set specific dataset values on a list of variable objects. - - This function populates 'data source', 'temporality type', 'contains data from', - and 'contains data until' fields in each variable if they are not set (None). - The values are inherited from the corresponding fields in the dataset. - - Args: - dataset: The dataset object from which to inherit values. - variables: A list of variable objects to update with dataset values. - - Example: - >>> dataset = model.Dataset(short_name='person_data_v1',data_source='01',temporality_type='STATUS',id='9662875c-c245-41de-b667-12ad2091a1ee',contains_data_from="2010-09-05",contains_data_until="2022-09-05") - >>> variables = [model.Variable(short_name="pers",data_source =None,temporality_type = None, contains_data_from = None,contains_data_until = None)] - >>> set_variables_inherit_from_dataset(dataset, variables) - >>> variables[0].data_source == dataset.data_source - True - - >>> variables[0].temporality_type is None - False - - >>> variables[0].contains_data_from == dataset.contains_data_from - True - - >>> variables[0].contains_data_until == dataset.contains_data_until - True - """ - for v in variables: - v.contains_data_from = v.contains_data_from or dataset.contains_data_from - v.contains_data_until = v.contains_data_until or dataset.contains_data_until - v.temporality_type = v.temporality_type or dataset.temporality_type - v.data_source = v.data_source or dataset.data_source - - -def incorrect_date_order( - date_from: datetime.date | None, - date_until: datetime.date | None, -) -> bool: - """Evaluate the chronological order of two dates. - - This function checks if 'date until' is earlier than 'date from'. If so, it - indicates an incorrect date order. - - Args: - date_from: The start date of the time period. - date_until: The end date of the time period. - - Returns: - True if 'date_until' is earlier than 'date_from' or if only 'date_from' is None, False otherwise. - - Example: - >>> incorrect_date_order(datetime.date(1980, 1, 1), datetime.date(1967, 1, 1)) - True - - >>> incorrect_date_order(datetime.date(1967, 1, 1), datetime.date(1980, 1, 1)) - False - - >>> incorrect_date_order(None, datetime.date(2024,7,1)) - True - """ - if date_from is None and date_until is not None: - return True - return date_from is not None and date_until is not None and date_until < date_from - - -def _is_missing_multilanguage_value( - field_name: str, - field_value, # noqa: ANN001 Skip type hint to enable dynamically handling value for LanguageStringType not indexable - obligatory_list: list, -) -> bool: - """Check obligatory fields with multilanguage value. - - This function checks whether a given field, which is supposed to have - multilanguage values, is missing values in all specified languages. - - Args: - field_name: The name of the field to check. - field_value: The value of the field. Expected to be of type LanguageStringType. - obligatory_list: A list of obligatory field names that should have multilanguage values. 
- - Returns: - True if no value in any of languages for one field, False otherwise. - """ - return bool( - field_name in obligatory_list - and field_value - and ( - len(field_value[0]) > 0 - and not field_value[0]["languageText"] - and (len(field_value) <= 1 or not field_value[1]["languageText"]) - and ( - len(field_value) <= 2 # noqa: PLR2004 approve magic value - or not field_value[2]["languageText"] - ) - ), - ) - - -def _is_missing_metadata( - field_name: str, - field_value, # noqa: ANN001 Skip type hint because method '_is_missing_multilanguage_value' - obligatory_list: list, - obligatory_multi_language_list: list, -) -> bool: - """Check if an obligatory field is missing its value. - - This function checks whether a given field, which may be a simple string or a - multilanguage value, is missing its value. It considers two lists: one for - obligatory fields and another for obligatory multilanguage fields. - - Args: - field_name: The name of the field to check. - field_value: The value of the field. Can be of type str, or LanguageStringType for - multilanguage fields. - obligatory_list: List of obligatory fields. - obligatory_multi_language_list: List of obligatory fields with multilanguage - values. - - Returns: - True if the field doesn't have a value, False otherwise. - """ - return bool( - field_name in obligatory_list - and field_value is None - or _is_missing_multilanguage_value( - field_name, - field_value, - obligatory_multi_language_list, - ), - ) - - -def num_obligatory_dataset_fields_completed(dataset: model.Dataset) -> int: - """Count the number of completed obligatory dataset fields. - - This function returns the total count of obligatory fields in the dataset that - have values (are not None). - - Args: - dataset: The dataset object for which to count the fields. - - Returns: - The number of obligatory dataset fields that have been completed (not None). - """ - return len(OBLIGATORY_DATASET_METADATA_IDENTIFIERS) - len( - get_missing_obligatory_dataset_fields(dataset), - ) - - -def num_obligatory_variables_fields_completed(variables: list) -> int: - """Count the number of obligatory fields completed for all variables. - - This function calculates the total number of obligatory fields that have - values (are not None) for one variable in the list. - - Args: - variables: A list with variable objects. - - Returns: - The total number of obligatory variable fields that have been completed - (not None) for all variables. - """ - num_completed = 0 - for v in variables: - num_completed += num_obligatory_variable_fields_completed(v) - return num_completed - - -def num_obligatory_variable_fields_completed(variable: model.Variable) -> int: - """Count the number of obligatory fields completed for one variable. - - This function calculates the total number of obligatory fields that have - values (are not None) for one variable in the list. - - Args: - variable: The variable to count obligatory fields for. - - Returns: - The total number of obligatory variable fields that have been completed - (not None) for one variable. - """ - missing_metadata = [ - key - for key, value in variable.model_dump().items() - if _is_missing_metadata( - key, - value, - OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS, - OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE, - ) - ] - return NUM_OBLIGATORY_VARIABLES_FIELDS - len(missing_metadata) - - -def get_missing_obligatory_dataset_fields(dataset: model.Dataset) -> list: - """Identify all obligatory dataset fields that are missing values. 
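One way the counting helpers above can be combined. The pairing with calculate_percentage is this sketch's own illustration, not necessarily how core.py wired it, and the Dataset values are made up.

from datadoc_model import model

from datadoc.backend.constants import NUM_OBLIGATORY_DATASET_FIELDS
from datadoc.backend.utils import (
    calculate_percentage,
    get_missing_obligatory_dataset_fields,
    num_obligatory_dataset_fields_completed,
)

dataset = model.Dataset(short_name="person_data_v1", contains_data_from="2010-09-05")
missing = get_missing_obligatory_dataset_fields(dataset)      # obligatory fields still unset
completed = num_obligatory_dataset_fields_completed(dataset)  # obligatory fields with values
print(calculate_percentage(completed, NUM_OBLIGATORY_DATASET_FIELDS), "% complete")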
- - This function checks for obligatory fields that are either directly missing - (i.e., set to `None`) or have multilanguage values with empty content. - - Args: - dataset: The dataset object to examine. This object must support the - `model_dump()` method which returns a dictionary of field names and - values. - - Returns: - A list of field names (as strings) that are missing values. This includes: - - Fields that are directly `None` and are listed as obligatory metadata. - - Multilanguage fields (listed as obligatory metadata`) where - the value exists but the primary language text is empty. - """ - return [ - key - for key, value in dataset.model_dump().items() - if _is_missing_metadata( - key, - value, - OBLIGATORY_DATASET_METADATA_IDENTIFIERS, - OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE, - ) - ] - - -def get_missing_obligatory_variables_fields(variables: list) -> list[dict]: - """Identify obligatory variable fields that are missing values for each variable. - - This function checks for obligatory fields that are either directly missing - (i.e., set to `None`) or have multilanguage values with empty content. - - Args: - variables: A list of variable objects to check for missing obligatory fields. - - Returns: - A list of dictionaries with variable short names as keys and list of missing - obligatory variable fields as values. This includes: - - Fields that are directly `None` and are llisted as obligatory metadata. - - Multilanguage fields (listed as obligatory metadata) where the value - exists but the primary language text is empty. - """ - missing_variables_fields = [ - { - variable.short_name: [ - key - for key, value in variable.model_dump().items() - if _is_missing_metadata( - key, - value, - OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS, - OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE, - ) - ], - } - for variable in variables - ] - # Filtering out variable keys with empty values list - return [item for item in missing_variables_fields if next(iter(item.values()))] diff --git a/src/datadoc/config.py b/src/datadoc/config.py index c67bdd38..d0a29a19 100644 --- a/src/datadoc/config.py +++ b/src/datadoc/config.py @@ -8,12 +8,11 @@ from pprint import pformat from typing import Literal +from dapla_metadata.datasets import enums from dotenv import dotenv_values from dotenv import load_dotenv from datadoc.constants import DAPLA_MANUAL_TEXT -from datadoc.enums import DaplaRegion -from datadoc.enums import DaplaService from datadoc.frontend.components.builders import build_link_object logging.basicConfig(level=logging.DEBUG, force=True) @@ -117,18 +116,18 @@ def get_statistical_subject_source_url() -> str | None: return _get_config_item("DATADOC_STATISTICAL_SUBJECT_SOURCE_URL") -def get_dapla_region() -> DaplaRegion | None: +def get_dapla_region() -> enums.DaplaRegion | None: """Get the Dapla region we're running on.""" if region := _get_config_item(DAPLA_REGION): - return DaplaRegion(region) + return enums.DaplaRegion(region) return None -def get_dapla_service() -> DaplaService | None: +def get_dapla_service() -> enums.DaplaService | None: """Get the Dapla service we're running on.""" if service := _get_config_item(DAPLA_SERVICE): - return DaplaService(service) + return enums.DaplaService(service) return None diff --git a/src/datadoc/enums.py b/src/datadoc/enums.py index 1b2fa666..e6d7e5a6 100644 --- a/src/datadoc/enums.py +++ b/src/datadoc/enums.py @@ -4,39 +4,8 @@ from enum import Enum -from datadoc_model import model -from datadoc_model.model import 
LanguageStringType -from datadoc_model.model import LanguageStringTypeItem - - -class DaplaRegion(str, Enum): - """Dapla platforms/regions.""" - - DAPLA_LAB = "DAPLA_LAB" - BIP = "BIP" - ON_PREM = "ON_PREM" - CLOUD_RUN = "CLOUD_RUN" - - -class DaplaService(str, Enum): - """Dapla services.""" - - DATADOC = "DATADOC" - JUPYTERLAB = "JUPYTERLAB" - VS_CODE = "VS_CODE" - R_STUDIO = "R_STUDIO" - KILDOMATEN = "KILDOMATEN" - - -class SupportedLanguages(str, Enum): - """The list of languages metadata may be recorded in. - - Reference: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry - """ - - NORSK_BOKMÅL = "nb" - NORSK_NYNORSK = "nn" - ENGLISH = "en" +from dapla_metadata.datasets import enums +from dapla_metadata.datasets import model class LanguageStringsEnum(Enum): @@ -44,7 +13,7 @@ class LanguageStringsEnum(Enum): def __init__( self, - language_strings: LanguageStringType, + language_strings: model.LanguageStringType, ) -> None: """Store the LanguageStringType object for displaying enum values in multiple languages. @@ -55,21 +24,9 @@ def __init__( self._value_ = self.name self.language_strings = language_strings - @classmethod - def _missing_(cls, value: object) -> LanguageStringsEnum: - """Support constructing an enum member from a supplied name string.""" - try: - member: LanguageStringsEnum = cls._member_map_[str(value)] # type: ignore [assignment] - except KeyError as e: - # Raise the expected exception with a useful explanation - message = f"{value} is not a valid {cls.__qualname__}" - raise ValueError(message) from e - else: - return member - def get_value_for_language( self, - language: SupportedLanguages, + language: enums.SupportedLanguages, ) -> str | None: """Retrieve the string for the relevant language.""" if self.language_strings.root is not None: @@ -82,34 +39,34 @@ def get_value_for_language( class Assessment(LanguageStringsEnum): """Sensitivity of data.""" - SENSITIVE = LanguageStringType( + SENSITIVE = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.Assessment.SENSITIVE.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="SENSITIV"), - LanguageStringTypeItem(languageCode="nb", languageText="SENSITIV"), + model.LanguageStringTypeItem(languageCode="nn", languageText="SENSITIV"), + model.LanguageStringTypeItem(languageCode="nb", languageText="SENSITIV"), ], ) - PROTECTED = LanguageStringType( + PROTECTED = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.Assessment.PROTECTED.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="SKJERMET"), - LanguageStringTypeItem(languageCode="nb", languageText="SKJERMET"), + model.LanguageStringTypeItem(languageCode="nn", languageText="SKJERMET"), + model.LanguageStringTypeItem(languageCode="nb", languageText="SKJERMET"), ], ) - OPEN = LanguageStringType( + OPEN = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.Assessment.OPEN.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="ÅPEN"), - LanguageStringTypeItem(languageCode="nb", languageText="ÅPEN"), + model.LanguageStringTypeItem(languageCode="nn", languageText="ÅPEN"), + model.LanguageStringTypeItem(languageCode="nb", languageText="ÅPEN"), ], ) @@ -117,44 +74,44 @@ class Assessment(LanguageStringsEnum): class DataSetStatus(LanguageStringsEnum): """Lifecycle status of a dataset.""" - DRAFT = 
LanguageStringType( + DRAFT = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetStatus.DRAFT.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="UTKAST"), - LanguageStringTypeItem(languageCode="nb", languageText="UTKAST"), + model.LanguageStringTypeItem(languageCode="nn", languageText="UTKAST"), + model.LanguageStringTypeItem(languageCode="nb", languageText="UTKAST"), ], ) - INTERNAL = LanguageStringType( + INTERNAL = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetStatus.INTERNAL.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="INTERN"), - LanguageStringTypeItem(languageCode="nb", languageText="INTERN"), + model.LanguageStringTypeItem(languageCode="nn", languageText="INTERN"), + model.LanguageStringTypeItem(languageCode="nb", languageText="INTERN"), ], ) - EXTERNAL = LanguageStringType( + EXTERNAL = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetStatus.EXTERNAL.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="EKSTERN"), - LanguageStringTypeItem(languageCode="nb", languageText="EKSTERN"), + model.LanguageStringTypeItem(languageCode="nn", languageText="EKSTERN"), + model.LanguageStringTypeItem(languageCode="nb", languageText="EKSTERN"), ], ) - DEPRECATED = LanguageStringType( + DEPRECATED = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetStatus.DEPRECATED.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="UTGÅTT"), - LanguageStringTypeItem(languageCode="nb", languageText="UTGÅTT"), + model.LanguageStringTypeItem(languageCode="nn", languageText="UTGÅTT"), + model.LanguageStringTypeItem(languageCode="nb", languageText="UTGÅTT"), ], ) @@ -162,54 +119,60 @@ class DataSetStatus(LanguageStringsEnum): class DataSetState(LanguageStringsEnum): """Processing state of a dataset.""" - SOURCE_DATA = LanguageStringType( + SOURCE_DATA = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetState.SOURCE_DATA.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="KILDEDATA"), - LanguageStringTypeItem(languageCode="nb", languageText="KILDEDATA"), + model.LanguageStringTypeItem(languageCode="nn", languageText="KILDEDATA"), + model.LanguageStringTypeItem(languageCode="nb", languageText="KILDEDATA"), ], ) - INPUT_DATA = LanguageStringType( + INPUT_DATA = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetState.INPUT_DATA.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="INNDATA"), - LanguageStringTypeItem(languageCode="nb", languageText="INNDATA"), + model.LanguageStringTypeItem(languageCode="nn", languageText="INNDATA"), + model.LanguageStringTypeItem(languageCode="nb", languageText="INNDATA"), ], ) - PROCESSED_DATA = LanguageStringType( + PROCESSED_DATA = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetState.PROCESSED_DATA.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="KLARGJORTE DATA"), - LanguageStringTypeItem(languageCode="nb", languageText="KLARGJORTE DATA"), + model.LanguageStringTypeItem( + languageCode="nn", + 
languageText="KLARGJORTE DATA", + ), + model.LanguageStringTypeItem( + languageCode="nb", + languageText="KLARGJORTE DATA", + ), ], ) - STATISTICS = LanguageStringType( + STATISTICS = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetState.STATISTICS.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="STATISTIKK"), - LanguageStringTypeItem(languageCode="nb", languageText="STATISTIKK"), + model.LanguageStringTypeItem(languageCode="nn", languageText="STATISTIKK"), + model.LanguageStringTypeItem(languageCode="nb", languageText="STATISTIKK"), ], ) - OUTPUT_DATA = LanguageStringType( + OUTPUT_DATA = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataSetState.OUTPUT_DATA.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="UTDATA"), - LanguageStringTypeItem(languageCode="nb", languageText="UTDATA"), + model.LanguageStringTypeItem(languageCode="nn", languageText="UTDATA"), + model.LanguageStringTypeItem(languageCode="nb", languageText="UTDATA"), ], ) @@ -220,44 +183,44 @@ class TemporalityTypeType(LanguageStringsEnum): More information about temporality type: https://statistics-norway.atlassian.net/l/c/HV12q90R """ - FIXED = LanguageStringType( + FIXED = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.TemporalityTypeType.FIXED.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="FAST"), - LanguageStringTypeItem(languageCode="nb", languageText="FAST"), + model.LanguageStringTypeItem(languageCode="nn", languageText="FAST"), + model.LanguageStringTypeItem(languageCode="nb", languageText="FAST"), ], ) - STATUS = LanguageStringType( + STATUS = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.TemporalityTypeType.STATUS.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="TVERRSNITT"), - LanguageStringTypeItem(languageCode="nb", languageText="TVERRSNITT"), + model.LanguageStringTypeItem(languageCode="nn", languageText="TVERRSNITT"), + model.LanguageStringTypeItem(languageCode="nb", languageText="TVERRSNITT"), ], ) - ACCUMULATED = LanguageStringType( + ACCUMULATED = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.TemporalityTypeType.ACCUMULATED.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="AKKUMULERT"), - LanguageStringTypeItem(languageCode="nb", languageText="AKKUMULERT"), + model.LanguageStringTypeItem(languageCode="nn", languageText="AKKUMULERT"), + model.LanguageStringTypeItem(languageCode="nb", languageText="AKKUMULERT"), ], ) - EVENT = LanguageStringType( + EVENT = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.TemporalityTypeType.EVENT.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="HENDELSE"), - LanguageStringTypeItem(languageCode="nb", languageText="HENDELSE"), + model.LanguageStringTypeItem(languageCode="nn", languageText="HENDELSE"), + model.LanguageStringTypeItem(languageCode="nb", languageText="HENDELSE"), ], ) @@ -265,54 +228,54 @@ class TemporalityTypeType(LanguageStringsEnum): class DataType(LanguageStringsEnum): """Simplified data types for metadata purposes.""" - STRING = LanguageStringType( + STRING = model.LanguageStringType( [ - 
LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataType.STRING.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="TEKST"), - LanguageStringTypeItem(languageCode="nb", languageText="TEKST"), + model.LanguageStringTypeItem(languageCode="nn", languageText="TEKST"), + model.LanguageStringTypeItem(languageCode="nb", languageText="TEKST"), ], ) - INTEGER = LanguageStringType( + INTEGER = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataType.INTEGER.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="HELTALL"), - LanguageStringTypeItem(languageCode="nb", languageText="HELTALL"), + model.LanguageStringTypeItem(languageCode="nn", languageText="HELTALL"), + model.LanguageStringTypeItem(languageCode="nb", languageText="HELTALL"), ], ) - FLOAT = LanguageStringType( + FLOAT = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataType.FLOAT.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="DESIMALTALL"), - LanguageStringTypeItem(languageCode="nb", languageText="DESIMALTALL"), + model.LanguageStringTypeItem(languageCode="nn", languageText="DESIMALTALL"), + model.LanguageStringTypeItem(languageCode="nb", languageText="DESIMALTALL"), ], ) - DATETIME = LanguageStringType( + DATETIME = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataType.DATETIME.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="DATOTID"), - LanguageStringTypeItem(languageCode="nb", languageText="DATOTID"), + model.LanguageStringTypeItem(languageCode="nn", languageText="DATOTID"), + model.LanguageStringTypeItem(languageCode="nb", languageText="DATOTID"), ], ) - BOOLEAN = LanguageStringType( + BOOLEAN = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.DataType.BOOLEAN.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="BOOLSK"), - LanguageStringTypeItem(languageCode="nb", languageText="BOOLSK"), + model.LanguageStringTypeItem(languageCode="nn", languageText="BOOLSK"), + model.LanguageStringTypeItem(languageCode="nb", languageText="BOOLSK"), ], ) @@ -320,49 +283,49 @@ class DataType(LanguageStringsEnum): class IsPersonalData(LanguageStringsEnum): """Is the variable instance personal data and if so, how is it encrypted.""" - NOT_PERSONAL_DATA = LanguageStringType( + NOT_PERSONAL_DATA = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.IsPersonalData.NOT_PERSONAL_DATA.value, ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nn", languageText="IKKE PERSONOPPLYSNING", ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="IKKE PERSONOPPLYSNING", ), ], ) - PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA = LanguageStringType( + PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.IsPersonalData.PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA.value, ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nn", languageText="PSEUDONYMISERT/KRYPTERT PERSONOPPLYSNING", ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", 
languageText="PSEUDONYMISERT/KRYPTERT PERSONOPPLYSNING", ), ], ) - NON_PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA = LanguageStringType( + NON_PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.IsPersonalData.NON_PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA.value, ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nn", languageText="IKKE PSEUDONYMISERT/KRYPTERT PERSONOPPLYSNING", ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="IKKE PSEUDONYMISERT/KRYPTERT PERSONOPPLYSNING", ), @@ -373,54 +336,66 @@ class IsPersonalData(LanguageStringsEnum): class VariableRole(LanguageStringsEnum): """The role of a variable in a dataset.""" - IDENTIFIER = LanguageStringType( + IDENTIFIER = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.VariableRole.IDENTIFIER.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="IDENTIFIKATOR"), - LanguageStringTypeItem(languageCode="nb", languageText="IDENTIFIKATOR"), + model.LanguageStringTypeItem( + languageCode="nn", + languageText="IDENTIFIKATOR", + ), + model.LanguageStringTypeItem( + languageCode="nb", + languageText="IDENTIFIKATOR", + ), ], ) - MEASURE = LanguageStringType( + MEASURE = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.VariableRole.MEASURE.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="MÅLEVARIABEL"), - LanguageStringTypeItem(languageCode="nb", languageText="MÅLEVARIABEL"), + model.LanguageStringTypeItem( + languageCode="nn", + languageText="MÅLEVARIABEL", + ), + model.LanguageStringTypeItem( + languageCode="nb", + languageText="MÅLEVARIABEL", + ), ], ) - START_TIME = LanguageStringType( + START_TIME = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.VariableRole.START_TIME.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="STARTTID"), - LanguageStringTypeItem(languageCode="nb", languageText="STARTTID"), + model.LanguageStringTypeItem(languageCode="nn", languageText="STARTTID"), + model.LanguageStringTypeItem(languageCode="nb", languageText="STARTTID"), ], ) - STOP_TIME = LanguageStringType( + STOP_TIME = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.VariableRole.STOP_TIME.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="STOPPTID"), - LanguageStringTypeItem(languageCode="nb", languageText="STOPPTID"), + model.LanguageStringTypeItem(languageCode="nn", languageText="STOPPTID"), + model.LanguageStringTypeItem(languageCode="nb", languageText="STOPPTID"), ], ) - ATTRIBUTE = LanguageStringType( + ATTRIBUTE = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.VariableRole.ATTRIBUTE.value, ), - LanguageStringTypeItem(languageCode="nn", languageText="ATTRIBUTT"), - LanguageStringTypeItem(languageCode="nb", languageText="ATTRIBUTT"), + model.LanguageStringTypeItem(languageCode="nn", languageText="ATTRIBUTT"), + model.LanguageStringTypeItem(languageCode="nb", languageText="ATTRIBUTT"), ], ) @@ -428,49 +403,49 @@ class VariableRole(LanguageStringsEnum): class UseRestriction(LanguageStringsEnum): """Lifecycle status of a dataset.""" - DELETION_ANONYMIZATION 
= LanguageStringType( + DELETION_ANONYMIZATION = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.UseRestriction.DELETION_ANONYMIZATION.value, ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nn", languageText="SLETTING/ANONYMISERING", ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="SLETTING/ANONYMISERING", ), ], ) - PROCESS_LIMITATIONS = LanguageStringType( + PROCESS_LIMITATIONS = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.UseRestriction.PROCESS_LIMITATIONS.value, ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nn", languageText="BEHANDLINGSBEGRENSNINGER", ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="BEHANDLINGSBEGRENSNINGER", ), ], ) - SECONDARY_USE_RESTRICTIONS = LanguageStringType( + SECONDARY_USE_RESTRICTIONS = model.LanguageStringType( [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText=model.UseRestriction.SECONDARY_USE_RESTRICTIONS.value, ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nn", languageText="SEKUNDÆRBRUKSRESTRIKSJONER", ), - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="SEKUNDÆRBRUKSRESTRIKSJONER", ), diff --git a/src/datadoc/frontend/callbacks/dataset.py b/src/datadoc/frontend/callbacks/dataset.py index bdb656f7..a49bc44b 100644 --- a/src/datadoc/frontend/callbacks/dataset.py +++ b/src/datadoc/frontend/callbacks/dataset.py @@ -8,13 +8,13 @@ from typing import TYPE_CHECKING import arrow +from dapla_metadata.datasets import DaplaDatasetPathInfo +from dapla_metadata.datasets import Datadoc from dash import no_update from pydantic import ValidationError from datadoc import config from datadoc import state -from datadoc.backend.core import Datadoc -from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo from datadoc.constants import CHECK_OBLIGATORY_METADATA_DATASET_MESSAGE from datadoc.constants import MISSING_METADATA_WARNING from datadoc.frontend.callbacks.utils import VALIDATION_ERROR @@ -33,6 +33,8 @@ ) from datadoc.frontend.components.builders import AlertTypes from datadoc.frontend.components.builders import build_ssb_alert +from datadoc.frontend.constants import INVALID_DATE_ORDER +from datadoc.frontend.constants import INVALID_VALUE from datadoc.frontend.fields.display_dataset import DISPLAY_DATASET from datadoc.frontend.fields.display_dataset import ( DROPDOWN_DATASET_METADATA_IDENTIFIERS, @@ -45,13 +47,11 @@ ) from datadoc.frontend.fields.display_dataset import TIMEZONE_AWARE_METADATA_IDENTIFIERS from datadoc.frontend.fields.display_dataset import DatasetIdentifiers -from datadoc.frontend.text import INVALID_DATE_ORDER -from datadoc.frontend.text import INVALID_VALUE from datadoc.utils import METADATA_DOCUMENT_FILE_SUFFIX if TYPE_CHECKING: import dash_bootstrap_components as dbc - from datadoc_model.model import LanguageStringType + from dapla_metadata.datasets import model logger = logging.getLogger(__name__) @@ -136,17 +136,17 @@ def process_keyword(value: str) -> list[str]: def process_special_cases( - value: MetadataInputTypes | LanguageStringType, + value: MetadataInputTypes | model.LanguageStringType, metadata_identifier: str, language: str | None = None, -) -> MetadataInputTypes | LanguageStringType: +) -> MetadataInputTypes | 
model.LanguageStringType: """Pre-process metadata where needed. Some types of metadata need processing before being saved to the model. Handle these cases here, other values are returned unchanged. """ - updated_value: MetadataInputTypes | LanguageStringType + updated_value: MetadataInputTypes | model.LanguageStringType if metadata_identifier == DatasetIdentifiers.KEYWORD.value and isinstance( value, str, @@ -188,7 +188,7 @@ def process_special_cases( def accept_dataset_metadata_input( - value: MetadataInputTypes | LanguageStringType, + value: MetadataInputTypes | model.LanguageStringType, metadata_identifier: str, language: str | None = None, ) -> tuple[bool, str]: diff --git a/src/datadoc/frontend/callbacks/register_callbacks.py b/src/datadoc/frontend/callbacks/register_callbacks.py index f1e9e1bf..e94599f1 100644 --- a/src/datadoc/frontend/callbacks/register_callbacks.py +++ b/src/datadoc/frontend/callbacks/register_callbacks.py @@ -9,6 +9,8 @@ import warnings from typing import TYPE_CHECKING +from dapla_metadata.datasets import ObligatoryDatasetWarning +from dapla_metadata.datasets import ObligatoryVariableWarning from dash import MATCH from dash import Dash from dash import Input @@ -19,8 +21,6 @@ from dash import no_update from datadoc import state -from datadoc.backend.model_validation import ObligatoryDatasetWarning -from datadoc.backend.model_validation import ObligatoryVariableWarning from datadoc.frontend.callbacks.dataset import accept_dataset_metadata_date_input from datadoc.frontend.callbacks.dataset import accept_dataset_metadata_input from datadoc.frontend.callbacks.dataset import dataset_control diff --git a/src/datadoc/frontend/callbacks/utils.py b/src/datadoc/frontend/callbacks/utils.py index 67a27588..044eaf51 100644 --- a/src/datadoc/frontend/callbacks/utils.py +++ b/src/datadoc/frontend/callbacks/utils.py @@ -9,8 +9,8 @@ import arrow import ssb_dash_components as ssb +from dapla_metadata.datasets import model from dash import html -from datadoc_model import model from datadoc import config from datadoc import state diff --git a/src/datadoc/frontend/callbacks/variables.py b/src/datadoc/frontend/callbacks/variables.py index 9afab330..858f89d6 100644 --- a/src/datadoc/frontend/callbacks/variables.py +++ b/src/datadoc/frontend/callbacks/variables.py @@ -18,6 +18,8 @@ from datadoc.frontend.components.builders import build_edit_section from datadoc.frontend.components.builders import build_ssb_accordion from datadoc.frontend.components.builders import build_ssb_alert +from datadoc.frontend.constants import INVALID_DATE_ORDER +from datadoc.frontend.constants import INVALID_VALUE from datadoc.frontend.fields.display_variables import DISPLAY_VARIABLES from datadoc.frontend.fields.display_variables import ( MULTIPLE_LANGUAGE_VARIABLES_METADATA, @@ -28,13 +30,10 @@ ) from datadoc.frontend.fields.display_variables import OPTIONAL_VARIABLES_METADATA from datadoc.frontend.fields.display_variables import VariableIdentifiers -from datadoc.frontend.text import INVALID_DATE_ORDER -from datadoc.frontend.text import INVALID_VALUE if TYPE_CHECKING: import dash_bootstrap_components as dbc - from datadoc_model import model - from datadoc_model.model import LanguageStringType + from dapla_metadata.datasets import model logger = logging.getLogger(__name__) @@ -77,10 +76,10 @@ def populate_variables_workspace( def handle_multi_language_metadata( metadata_field: str, - new_value: MetadataInputTypes | LanguageStringType, + new_value: MetadataInputTypes | model.LanguageStringType, updated_row_id: 
str, language: str, -) -> MetadataInputTypes | LanguageStringType: +) -> MetadataInputTypes | model.LanguageStringType: """Handle updates to fields which support multiple languages.""" if new_value is None: # This edge case occurs when the user removes the text in an input field @@ -267,7 +266,7 @@ def variable_identifier_multilanguage( def set_variables_values_inherit_dataset_values( - value: MetadataInputTypes | LanguageStringType, + value: MetadataInputTypes | model.LanguageStringType, metadata_identifier: str, ) -> None: """Set variable value based on dataset value.""" @@ -282,7 +281,7 @@ def set_variables_values_inherit_dataset_values( def set_variables_value_multilanguage_inherit_dataset_values( - value: MetadataInputTypes | LanguageStringType, + value: MetadataInputTypes | model.LanguageStringType, metadata_identifier: str, language: str, ) -> None: diff --git a/src/datadoc/frontend/components/builders.py b/src/datadoc/frontend/components/builders.py index a3c915ed..7fbb878a 100644 --- a/src/datadoc/frontend/components/builders.py +++ b/src/datadoc/frontend/components/builders.py @@ -16,7 +16,7 @@ from datadoc.frontend.fields.display_base import FieldTypes if TYPE_CHECKING: - from datadoc_model import model + from dapla_metadata.datasets import model class AlertTypes(Enum): diff --git a/src/datadoc/frontend/text.py b/src/datadoc/frontend/constants.py similarity index 73% rename from src/datadoc/frontend/text.py rename to src/datadoc/frontend/constants.py index 00e4940d..f92d72f5 100644 --- a/src/datadoc/frontend/text.py +++ b/src/datadoc/frontend/constants.py @@ -1,2 +1,4 @@ +"""Repository for constant values in Datadoc frontend module.""" + INVALID_VALUE = "Ugyldig verdi angitt!" INVALID_DATE_ORDER = "Verdien for {contains_data_from_display_name} må være en lik eller tidligere dato som {contains_data_until_display_name}" diff --git a/src/datadoc/frontend/fields/display_base.py b/src/datadoc/frontend/fields/display_base.py index 9c290196..254b9a14 100644 --- a/src/datadoc/frontend/fields/display_base.py +++ b/src/datadoc/frontend/fields/display_base.py @@ -10,19 +10,19 @@ from typing import Any import ssb_dash_components as ssb +from dapla_metadata.datasets import enums from dash import html from datadoc import state -from datadoc.enums import LanguageStringsEnum -from datadoc.enums import SupportedLanguages if TYPE_CHECKING: from collections.abc import Callable + from dapla_metadata.datasets import model from dash.development.base_component import Component - from datadoc_model.model import LanguageStringType from pydantic import BaseModel + from datadoc.enums import LanguageStringsEnum from datadoc.frontend.callbacks.utils import MetadataInputTypes logger = logging.getLogger(__name__) @@ -39,17 +39,17 @@ METADATA_LANGUAGES = [ { - "supported_language": SupportedLanguages.NORSK_BOKMÅL, + "supported_language": enums.SupportedLanguages.NORSK_BOKMÅL, "language_title": "Bokmål", "language_value": "nb", }, { - "supported_language": SupportedLanguages.NORSK_NYNORSK, + "supported_language": enums.SupportedLanguages.NORSK_NYNORSK, "language_title": "Nynorsk", "language_value": "nn", }, { - "supported_language": SupportedLanguages.ENGLISH, + "supported_language": enums.SupportedLanguages.ENGLISH, "language_title": "English", "language_value": "en", }, @@ -62,7 +62,8 @@ def get_enum_options( """Generate the list of options based on the currently chosen language.""" dropdown_options = [ { - "title": i.get_value_for_language(SupportedLanguages.NORSK_BOKMÅL) or "", + "title": 
i.get_value_for_language(enums.SupportedLanguages.NORSK_BOKMÅL) + or "", "id": i.name, } for i in enum # type: ignore [attr-defined] @@ -75,7 +76,7 @@ def get_data_source_options() -> list[dict[str, str]]: """Collect the unit type options.""" dropdown_options = [ { - "title": data_sources.get_title(SupportedLanguages.NORSK_BOKMÅL), + "title": data_sources.get_title(enums.SupportedLanguages.NORSK_BOKMÅL), "id": data_sources.code, } for data_sources in state.data_sources.classifications @@ -98,8 +99,8 @@ def get_metadata_and_stringify(metadata: BaseModel, identifier: str) -> str | No def _get_string_type_item( - language_strings: LanguageStringType, - current_metadata_language: SupportedLanguages, + language_strings: model.LanguageStringType, + current_metadata_language: enums.SupportedLanguages, ) -> str | None: if language_strings.root is not None: for i in language_strings.root: @@ -111,10 +112,10 @@ def _get_string_type_item( def get_multi_language_metadata_and_stringify( metadata: BaseModel, identifier: str, - language: SupportedLanguages, + language: enums.SupportedLanguages, ) -> str | None: """Get a metadata value supporting multiple languages from the model.""" - value: LanguageStringType | None = getattr(metadata, identifier) + value: model.LanguageStringType | None = getattr(metadata, identifier) if value is None: return "" return _get_string_type_item(value, language) @@ -276,7 +277,7 @@ def render_input_group( value=get_multi_language_metadata_and_stringify( metadata, self.identifier, - SupportedLanguages(i["supported_language"]), + enums.SupportedLanguages(i["supported_language"]), ), debounce=True, id={ @@ -298,7 +299,7 @@ def render_input_group( value=get_multi_language_metadata_and_stringify( metadata, self.identifier, - SupportedLanguages(i["supported_language"]), + enums.SupportedLanguages(i["supported_language"]), ), debounce=True, id={ diff --git a/src/datadoc/frontend/fields/display_dataset.py b/src/datadoc/frontend/fields/display_dataset.py index 330e5ddc..f0cc934a 100644 --- a/src/datadoc/frontend/fields/display_dataset.py +++ b/src/datadoc/frontend/fields/display_dataset.py @@ -6,9 +6,14 @@ import logging from enum import Enum -from datadoc import enums +from dapla_metadata.datasets import enums + from datadoc import state -from datadoc.enums import SupportedLanguages +from datadoc.enums import Assessment +from datadoc.enums import DataSetState +from datadoc.enums import DataSetStatus +from datadoc.enums import TemporalityTypeType +from datadoc.enums import UseRestriction from datadoc.frontend.fields.display_base import DATASET_METADATA_DATE_INPUT from datadoc.frontend.fields.display_base import DATASET_METADATA_MULTILANGUAGE_INPUT from datadoc.frontend.fields.display_base import DROPDOWN_DESELECT_OPTION @@ -30,7 +35,7 @@ def get_statistical_subject_options() -> list[dict[str, str]]: """Generate the list of options for statistical subject.""" dropdown_options = [ { - "title": f"{primary.get_title(SupportedLanguages.NORSK_BOKMÅL)} - {secondary.get_title(SupportedLanguages.NORSK_BOKMÅL)}", + "title": f"{primary.get_title(enums.SupportedLanguages.NORSK_BOKMÅL)} - {secondary.get_title(enums.SupportedLanguages.NORSK_BOKMÅL)}", "id": secondary.subject_code, } for primary in state.statistic_subject_mapping.primary_subjects @@ -44,7 +49,7 @@ def get_unit_type_options() -> list[dict[str, str]]: """Collect the unit type options.""" dropdown_options = [ { - "title": unit_type.get_title(SupportedLanguages.NORSK_BOKMÅL), + "title": 
unit_type.get_title(enums.SupportedLanguages.NORSK_BOKMÅL), "id": unit_type.code, } for unit_type in state.unit_types.classifications @@ -57,7 +62,7 @@ def get_owner_options() -> list[dict[str, str]]: """Collect the owner options.""" dropdown_options = [ { - "title": f"{option.code} - {option.get_title(SupportedLanguages.NORSK_BOKMÅL)}", + "title": f"{option.code} - {option.get_title(enums.SupportedLanguages.NORSK_BOKMÅL)}", "id": option.code, } for option in state.organisational_units.classifications @@ -116,7 +121,7 @@ class DatasetIdentifiers(str, Enum): obligatory=True, options_getter=functools.partial( get_enum_options, - enums.Assessment, + Assessment, ), ), DatasetIdentifiers.DATASET_STATUS: MetadataDropdownField( @@ -125,7 +130,7 @@ class DatasetIdentifiers(str, Enum): description="Oppgi om metadataene er under arbeid (utkast), kan deles internt (intern), kan deles eksternt(ekstern) eller er avsluttet/erstattet (utgått). Det kan være restriksjoner knyttet til deling både internt og eksternt.", options_getter=functools.partial( get_enum_options, - enums.DataSetStatus, + DataSetStatus, ), obligatory=True, ), @@ -136,7 +141,7 @@ class DatasetIdentifiers(str, Enum): obligatory=True, options_getter=functools.partial( get_enum_options, - enums.DataSetState, + DataSetState, ), ), DatasetIdentifiers.NAME: MetadataMultiLanguageField( @@ -194,7 +199,7 @@ class DatasetIdentifiers(str, Enum): description="Temporalitetstypen sier noe om tidsdimensjonen i datasettet. Fast er data med verdi som ikke endres over tid (f.eks. fødselsdato), tverrsnitt er data som er målt på et gitt tidspunkt, akkumulert er data som er samlet over en viss tidsperiode (f.eks. inntekt gjennom et år) og hendelse/forløp registrerer tidspunkt og tidsperiode for ulike hendelser /tilstander, f.eks. (skifte av) bosted.", options_getter=functools.partial( get_enum_options, - enums.TemporalityTypeType, + TemporalityTypeType, ), obligatory=True, ), @@ -290,7 +295,7 @@ class DatasetIdentifiers(str, Enum): description="Oppgi om det er knyttet noen bruksrestriksjoner til datasettet, f.eks. 
krav om sletting/anonymisering.", options_getter=functools.partial( get_enum_options, - enums.UseRestriction, + UseRestriction, ), ), DatasetIdentifiers.USE_RESTRICTION_DATE: MetadataDateField( diff --git a/src/datadoc/frontend/fields/display_variables.py b/src/datadoc/frontend/fields/display_variables.py index ae68f1e6..a3902e22 100644 --- a/src/datadoc/frontend/fields/display_variables.py +++ b/src/datadoc/frontend/fields/display_variables.py @@ -5,8 +5,13 @@ import functools from enum import Enum -from datadoc import enums +from dapla_metadata.datasets import enums + from datadoc import state +from datadoc.enums import DataType +from datadoc.enums import IsPersonalData +from datadoc.enums import TemporalityTypeType +from datadoc.enums import VariableRole from datadoc.frontend.fields.display_base import VARIABLES_METADATA_DATE_INPUT from datadoc.frontend.fields.display_base import VARIABLES_METADATA_MULTILANGUAGE_INPUT from datadoc.frontend.fields.display_base import FieldTypes @@ -80,7 +85,7 @@ class VariableIdentifiers(str, Enum): obligatory=True, options_getter=functools.partial( get_enum_options, - enums.DataType, + DataType, ), ), VariableIdentifiers.VARIABLE_ROLE: MetadataDropdownField( @@ -90,7 +95,7 @@ class VariableIdentifiers(str, Enum): obligatory=True, options_getter=functools.partial( get_enum_options, - enums.VariableRole, + VariableRole, ), ), VariableIdentifiers.DEFINITION_URI: MetadataInputField( @@ -106,7 +111,7 @@ class VariableIdentifiers(str, Enum): obligatory=True, options_getter=functools.partial( get_enum_options, - enums.IsPersonalData, + IsPersonalData, ), ), VariableIdentifiers.DATA_SOURCE: MetadataDropdownField( @@ -133,7 +138,7 @@ class VariableIdentifiers(str, Enum): description="Temporalitetstypen settes vanligvis på datasettnivå, men dersom datasettet består av variabler med ulike temporalitetstyper, kan den settes på variabelnivå. Temporalitet sier noe om tidsdimensjonen i datasettet. Fast er data med verdi som ikke endres over tid (f.eks. fødselsdato), tverrsnitt er data som er målt på et gitt tidspunkt, akkumulert er data som er samlet over en viss tidsperiode (f.eks. inntekt gjennom et år) og hendelse/forløp registrerer tidspunkt og tidsperiode for ulike hendelser /tilstander, f.eks. 
(skifte av) bosted.", options_getter=functools.partial( get_enum_options, - enums.TemporalityTypeType, + TemporalityTypeType, ), ), VariableIdentifiers.MEASUREMENT_UNIT: MetadataDropdownField( diff --git a/src/datadoc/state.py b/src/datadoc/state.py index 47859b9e..610a4665 100644 --- a/src/datadoc/state.py +++ b/src/datadoc/state.py @@ -13,9 +13,11 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from datadoc.backend.code_list import CodeList - from datadoc.backend.core import Datadoc - from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping + from dapla_metadata.datasets.code_list import CodeList + from dapla_metadata.datasets.core import Datadoc + from dapla_metadata.datasets.statistic_subject_mapping import ( + StatisticSubjectMapping, + ) # Global metadata container diff --git a/tests/backend/__init__.py b/tests/backend/__init__.py deleted file mode 100644 index 73cc186f..00000000 --- a/tests/backend/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Unit tests for backend Datadoc.""" diff --git a/tests/backend/test_code_list.py b/tests/backend/test_code_list.py deleted file mode 100644 index d14a4618..00000000 --- a/tests/backend/test_code_list.py +++ /dev/null @@ -1,131 +0,0 @@ -import pytest - -from datadoc.backend.code_list import CodeList -from datadoc.backend.code_list import CodeListItem -from tests.utils import TEST_RESOURCES_DIRECTORY - -CODE_LIST_DIR = "code_list" - - -@pytest.mark.parametrize( - ( - "code_list_csv_filepath_nb", - "code_list_csv_filepath_nn", - "code_list_csv_filepath_en", - "expected", - ), - [ - ( - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nb.csv", - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nn.csv", - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_en.csv", - [ - CodeListItem( - titles={ - "nb": "Adresse", - "nn": "Adresse", - "en": "Adresse", - }, - code="01", - ), - CodeListItem( - titles={ - "nb": "Arbeidsulykke", - "nn": "Arbeidsulykke", - "en": "Arbeidsulykke", - }, - code="02", - ), - CodeListItem( - titles={ - "nb": "Bolig", - "nn": "Bolig", - "en": "Bolig", - }, - code="03", - ), - ], - ), - ( - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", - [], - ), - ( - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nb.csv", - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "empty.csv", - [ - CodeListItem( - titles={ - "nb": "Adresse", - "nn": None, - "en": None, - }, - code="01", - ), - CodeListItem( - titles={ - "nb": "Arbeidsulykke", - "nn": None, - "en": None, - }, - code="02", - ), - CodeListItem( - titles={ - "nb": "Bolig", - "nn": None, - "en": None, - }, - code="03", - ), - ], - ), - ( - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "no_code.csv", - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "no_code.csv", - TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "no_code.csv", - [ - CodeListItem( - titles={ - "nb": "Adresse", - "nn": "Adresse", - "en": "Adresse", - }, - code=None, - ), - CodeListItem( - titles={ - "nb": "Arbeidsulykke", - "nn": "Arbeidsulykke", - "en": "Arbeidsulykke", - }, - code=None, - ), - CodeListItem( - titles={ - "nb": "Bolig", - "nn": "Bolig", - "en": "Bolig", - }, - code=None, - ), - ], - ), - ], -) -@pytest.mark.usefixtures("_mock_fetch_dataframe") -def test_read_dataframe( - code_list_fake_structure: CodeList, - expected: list[str], -): - code_list_fake_structure.wait_for_external_result() - assert 
code_list_fake_structure.classifications == expected - - -def test_non_existent_code(thread_pool_executor): - code_list = CodeList(thread_pool_executor, 0) - code_list.wait_for_external_result() - assert code_list.classifications == [] diff --git a/tests/backend/test_dapla_dataset_path_info.py b/tests/backend/test_dapla_dataset_path_info.py deleted file mode 100644 index 17331f4a..00000000 --- a/tests/backend/test_dapla_dataset_path_info.py +++ /dev/null @@ -1,307 +0,0 @@ -from __future__ import annotations - -import datetime -from dataclasses import dataclass -from typing import TYPE_CHECKING - -import pytest - -from datadoc.backend.dapla_dataset_path_info import ISO_YEAR -from datadoc.backend.dapla_dataset_path_info import ISO_YEAR_MONTH -from datadoc.backend.dapla_dataset_path_info import ISO_YEAR_MONTH_DAY -from datadoc.backend.dapla_dataset_path_info import SSB_BIMESTER -from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo -from datadoc.enums import DataSetState -from tests.utils import TEST_BUCKET_PARQUET_FILEPATH_WITH_SHORTNAME -from tests.utils import TEST_PARQUET_FILEPATH - -if TYPE_CHECKING: - import pathlib - - -@dataclass -class DatasetPathTestCase: - """Structure to define attributes needed for a test case.""" - - path: str - expected_contains_data_from: datetime.date - expected_contains_data_until: datetime.date - - -TEST_CASES = [ - DatasetPathTestCase( - path="grensehandel_imputert_p2022-10-01_p2022-12-31_v1.parquet", - expected_contains_data_from=datetime.date(2022, 10, 1), - expected_contains_data_until=datetime.date(2022, 12, 31), - ), - DatasetPathTestCase( - path="grensehandel_imputert_p2022-10_p2022-12_v1.parquet", - expected_contains_data_from=datetime.date(2022, 10, 1), - expected_contains_data_until=datetime.date(2022, 12, 31), - ), - DatasetPathTestCase( - path="flygende_objekter_p2019_v1.parquet", - expected_contains_data_from=datetime.date(2019, 1, 1), - expected_contains_data_until=datetime.date(2019, 12, 31), - ), - DatasetPathTestCase( - path="framskrevne-befolkningsendringer_p2019_p2050_v1.parquet", - expected_contains_data_from=datetime.date(2019, 1, 1), - expected_contains_data_until=datetime.date(2050, 12, 31), - ), - DatasetPathTestCase( - path="ufo_observasjoner_p2019_p2020_v1.parquet", - expected_contains_data_from=datetime.date(2019, 1, 1), - expected_contains_data_until=datetime.date(2020, 12, 31), - ), - DatasetPathTestCase( - path="omsetning_p2020W15_v1.parquet", - expected_contains_data_from=datetime.date(2020, 4, 6), - expected_contains_data_until=datetime.date(2020, 4, 12), - ), - DatasetPathTestCase( - path="omsetning_p1981-W52_v1.parquet", - expected_contains_data_from=datetime.date(1981, 12, 21), - expected_contains_data_until=datetime.date(1981, 12, 27), - ), - DatasetPathTestCase( - path="personinntekt_p2022H1_v1.parquet", - expected_contains_data_from=datetime.date(2022, 1, 1), - expected_contains_data_until=datetime.date(2022, 6, 30), - ), - DatasetPathTestCase( - path="nybilreg_p2022T1_v1.parquet", - expected_contains_data_from=datetime.date(2022, 1, 1), - expected_contains_data_until=datetime.date(2022, 4, 30), - ), - DatasetPathTestCase( - path="varehandel_p2018Q1_p2018Q4_v1.parquet", - expected_contains_data_from=datetime.date(2018, 1, 1), - expected_contains_data_until=datetime.date(2018, 12, 31), - ), - DatasetPathTestCase( - path="pensjon_p2018Q1_v1.parquet", - expected_contains_data_from=datetime.date(2018, 1, 1), - expected_contains_data_until=datetime.date(2018, 3, 31), - ), - DatasetPathTestCase( - 
path="skipsanloep_p2021B2_v1.parquet", - expected_contains_data_from=datetime.date(2021, 3, 1), - expected_contains_data_until=datetime.date(2021, 4, 30), - ), - DatasetPathTestCase( - path="skipsanloep_p2022B1_v1.parquet", - expected_contains_data_from=datetime.date(2022, 1, 1), - expected_contains_data_until=datetime.date(2022, 2, 28), - ), -] - - -@pytest.fixture( - ids=[tc.path for tc in TEST_CASES], - params=TEST_CASES, -) -def test_data(request: pytest.FixtureRequest) -> DatasetPathTestCase: - return request.param - - -@pytest.fixture() -def dataset_path(test_data: DatasetPathTestCase) -> DaplaDatasetPathInfo: - return DaplaDatasetPathInfo(test_data.path) - - -@pytest.fixture() -def expected_contains_data_from(test_data: DatasetPathTestCase) -> datetime.date: - return test_data.expected_contains_data_from - - -@pytest.fixture() -def expected_contains_data_until(test_data: DatasetPathTestCase) -> datetime.date: - return test_data.expected_contains_data_until - - -def test_extract_period_info_date_from( - dataset_path: DaplaDatasetPathInfo, - expected_contains_data_from: datetime.date, -): - assert dataset_path.contains_data_from == expected_contains_data_from - - -def test_extract_period_info_date_until( - dataset_path: DaplaDatasetPathInfo, - expected_contains_data_until: datetime.date, -): - assert dataset_path.contains_data_until == expected_contains_data_until - - -@pytest.mark.parametrize( - "data", - [ - "nonsen.data", - "nonsens2.parquet", - TEST_PARQUET_FILEPATH.name, - ], -) -def test_extract_period_info_no_period_info_in_path(data: str): - assert DaplaDatasetPathInfo(data).contains_data_from is None - - -@pytest.mark.parametrize( - ("path_parts_to_insert", "expected_result"), - [ - ("kildedata", DataSetState.SOURCE_DATA), - ("inndata", DataSetState.INPUT_DATA), - ("roskildedata/klargjorte-data", DataSetState.PROCESSED_DATA), - ("klargjorte_data", DataSetState.PROCESSED_DATA), - ("klargjorte-data", DataSetState.PROCESSED_DATA), - ("statistikk", DataSetState.STATISTICS), - ("", None), - ], -) -def test_get_dataset_state( - full_dataset_state_path: pathlib.Path, - expected_result: DataSetState, -): - actual_state = DaplaDatasetPathInfo(full_dataset_state_path).dataset_state - assert actual_state == expected_result - - -@pytest.mark.parametrize( - ("path", "expected"), - [ - ("person_data_v1", "1"), - ("person_data_v2", "2"), - ("person_data_vwrong", None), - ("person_data", None), - ("person_testdata_p2021-12-31_p2021-12-31_v20", "20"), - ], -) -def test_get_dataset_version( - path: str, - expected: str | None, -): - assert DaplaDatasetPathInfo(path).dataset_version == expected - - -# These tests covers both date until after date from, mix of SSB keys and invalid SSB keys -@pytest.mark.parametrize( - "dataset_path_name", - [ - "ufo_observasjoner_p2019_p1920_v1.parquet", - "varehandel_p2018H2_p2018H1_v1.parquet", - "varehandel_p2018Q1_p2018H2_v1.parquet", - "sykkeltransport_p1973B8_v1.parquet", - ], -) -def test_extract_period_info_date_from_invalid_pathname(dataset_path_name: str) -> None: - dataset = DaplaDatasetPathInfo(dataset_path_name) - assert dataset.contains_data_from is None - - -@pytest.mark.parametrize( - "dataset_path_name", - [ - "ufo_observasjoner_p2019_p1920_v1.parquet", - "varehandel_p2018H2_p2018H1_v1.parquet", - "varehandel_p2018Q1_p2018H2_v1.parquet", - "sykkeltransport_p1973B2_p2020T8_v1.parquet", - ], -) -def test_extract_period_info_date_until_invalid_pathname( - dataset_path_name: str, -) -> None: - dataset = DaplaDatasetPathInfo(dataset_path_name) - 
assert dataset.contains_data_until is None - - -@pytest.mark.parametrize( - ("date_format", "period"), - [ - (ISO_YEAR, "1980"), - (ISO_YEAR_MONTH, "1888-11"), - (ISO_YEAR_MONTH_DAY, "2203-01-24"), - (SSB_BIMESTER, "1963B3"), - ], -) -def test_date_format_return_date_object_period_start(date_format, period): - assert isinstance(date_format.get_floor(period), datetime.date) - - -@pytest.mark.parametrize( - ("date_format", "period"), - [ - (ISO_YEAR, "1980"), - (ISO_YEAR_MONTH, "1888-11"), - (ISO_YEAR_MONTH_DAY, "2203-01-24"), - (SSB_BIMESTER, "1963B3"), - ], -) -def test_date_format_return_date_object_period_end(date_format, period): - assert isinstance(date_format.get_ceil(period), datetime.date) - - -@pytest.mark.parametrize( - ("date_format", "period", "expected"), - [ - (ISO_YEAR, "1980", datetime.date(1980, 1, 1)), - (ISO_YEAR_MONTH, "1888-11", datetime.date(1888, 11, 1)), - (ISO_YEAR_MONTH_DAY, "2203-01-24", datetime.date(2203, 1, 24)), - (SSB_BIMESTER, "1963B3", datetime.date(1963, 5, 1)), - ], -) -def test_date_format_correct_from_date(date_format, period, expected: datetime.date): - assert date_format.get_floor(period) == expected - - -@pytest.mark.parametrize( - ("date_format", "period", "expected"), - [ - (ISO_YEAR, "1980", datetime.date(1980, 12, 31)), - (ISO_YEAR_MONTH, "1888-11", datetime.date(1888, 11, 30)), - (ISO_YEAR_MONTH_DAY, "2203-01-24", datetime.date(2203, 1, 24)), - (SSB_BIMESTER, "1963B3", datetime.date(1963, 6, 30)), - ], -) -def test_date_format_correct_end_date(date_format, period, expected): - assert date_format.get_ceil(period) == expected - - -@pytest.mark.parametrize( - ("data", "expected"), - [ - (TEST_BUCKET_PARQUET_FILEPATH_WITH_SHORTNAME, "befolkning"), - ( - "gs://ssb-staging-dapla-felles-data-delt/datadoc/person_data_v1.parquet", - None, - ), - ("inndata/person_data_v1.parquet", None), - ], -) -def test_extract_shortname_in_path(data: str, expected: str): - assert DaplaDatasetPathInfo(data).statistic_short_name == expected - - -@pytest.mark.parametrize( - ("data"), - [ - "gs://ssb-staging-dapla-felles-data-delt/person_data_p2022_v1.parquet", - "gs://ssb-staging-dapla-felles-data-delt/datadoc/person_data_v1.parquet", - "gs://ssb-staging-dapla-felles-data-delt/datadoc/person_data_p2021_v3.parquet", - "gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_v1.parquet", - "gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021.parquet", - ], -) -def test_path_complies_with_naming_standard_invalid_input(data: str): - assert DaplaDatasetPathInfo(data).path_complies_with_naming_standard() is False - - -@pytest.mark.parametrize( - ("data"), - [ - "gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet", - "gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_p2022_v2.parquet", - "gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/undermappe/person_data_p2021_v2.parquet", - ], -) -def test_path_complies_with_naming_standard_valid_input(data: str): - assert DaplaDatasetPathInfo(data).path_complies_with_naming_standard() is True diff --git a/tests/backend/test_datadoc_metadata.py b/tests/backend/test_datadoc_metadata.py deleted file mode 100644 index 1838c44c..00000000 --- a/tests/backend/test_datadoc_metadata.py +++ /dev/null @@ -1,800 +0,0 @@ -"""Tests for the DataDocMetadata class.""" - -from __future__ import annotations - -import contextlib -import json -import pathlib -import shutil -import warnings -from pathlib import Path -from typing import TYPE_CHECKING -from 
unittest.mock import MagicMock -from unittest.mock import patch -from uuid import UUID - -import arrow -import pytest -from datadoc_model.model import DatadocMetadata -from datadoc_model.model import Dataset -from datadoc_model.model import Variable - -from datadoc.backend.constants import DATASET_FIELDS_FROM_EXISTING_METADATA -from datadoc.backend.core import Datadoc -from datadoc.backend.core import InconsistentDatasetsError -from datadoc.backend.core import InconsistentDatasetsWarning -from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping -from datadoc.backend.user_info import PLACEHOLDER_EMAIL_ADDRESS -from datadoc.backend.user_info import TestUserInfo -from datadoc.enums import Assessment -from datadoc.enums import DataSetState -from datadoc.enums import DataSetStatus -from datadoc.enums import DataType -from datadoc.enums import IsPersonalData -from datadoc.enums import VariableRole -from tests.utils import TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH -from tests.utils import TEST_DATASETS_DIRECTORY -from tests.utils import TEST_EXISTING_METADATA_DIRECTORY -from tests.utils import TEST_EXISTING_METADATA_FILE_NAME -from tests.utils import TEST_EXISTING_METADATA_NAMING_STANDARD_FILEPATH -from tests.utils import TEST_NAMING_STANDARD_COMPATIBLE_DATASET -from tests.utils import TEST_PARQUET_FILEPATH -from tests.utils import TEST_PROCESSED_DATA_POPULATION_DIRECTORY -from tests.utils import TEST_RESOURCES_DIRECTORY - -if TYPE_CHECKING: - from collections.abc import Generator - from datetime import datetime - - -DATADOC_METADATA_MODULE = "datadoc.backend.core" - - -@pytest.fixture() -def generate_periodic_file( - existing_data_path: Path, - insert_string: str, -) -> Generator[Path, None, None]: - file_name = existing_data_path.name - insert_pos = file_name.find("_v1") - new_file_name = file_name[:insert_pos] + insert_string + file_name[insert_pos:] - new_path = TEST_RESOURCES_DIRECTORY / new_file_name - shutil.copy(existing_data_path, new_path) - yield new_path - if new_path.exists(): - new_path.unlink() - - -@pytest.mark.usefixtures("existing_metadata_file") -def test_existing_metadata_file( - metadata: Datadoc, -): - root = getattr(metadata.dataset.name, "root", []) - if root: - assert root[0].languageText == "successfully_read_existing_file" - else: - msg = "Root is none" - raise AssertionError(msg) - - -def test_metadata_document_percent_complete(metadata: Datadoc): - dataset = Dataset(dataset_state=DataSetState.OUTPUT_DATA) - variable_1 = Variable(data_type=DataType.BOOLEAN) - variable_2 = Variable(data_type=DataType.INTEGER) - document = DatadocMetadata( - percentage_complete=0, - dataset=dataset, - variables=[variable_1, variable_2], - ) - metadata.dataset = document.dataset # type: ignore [assignment] - metadata.variables = document.variables # type: ignore [assignment] - - assert metadata.percent_complete == 12 # noqa: PLR2004 - - -def test_write_metadata_document( - dummy_timestamp: datetime, - metadata: Datadoc, - tmp_path: pathlib.Path, -): - metadata.dataset.metadata_created_date = dummy_timestamp - metadata.write_metadata_document() - written_document = tmp_path / TEST_EXISTING_METADATA_FILE_NAME - assert Path.exists(written_document) - assert metadata.dataset.metadata_created_date == dummy_timestamp - assert metadata.dataset.metadata_created_by == PLACEHOLDER_EMAIL_ADDRESS - assert metadata.dataset.metadata_last_updated_date == dummy_timestamp - assert metadata.dataset.metadata_last_updated_by == PLACEHOLDER_EMAIL_ADDRESS - - with 
Path.open(written_document) as f: - written_metadata = json.loads(f.read()) - datadoc_metadata = written_metadata["datadoc"]["dataset"] - - assert ( - # Use our pydantic model to read in the datetime string so we get the correct format - Dataset( - metadata_created_date=datadoc_metadata["metadata_created_date"], - ).metadata_created_date - == dummy_timestamp - ) - assert datadoc_metadata["metadata_created_by"] == PLACEHOLDER_EMAIL_ADDRESS - assert ( - # Use our pydantic model to read in the datetime string so we get the correct format - Dataset( - metadata_last_updated_date=datadoc_metadata["metadata_last_updated_date"], - ).metadata_last_updated_date - == dummy_timestamp - ) - assert datadoc_metadata["metadata_last_updated_by"] == PLACEHOLDER_EMAIL_ADDRESS - - -@pytest.mark.usefixtures("existing_metadata_file") -@patch( - "datadoc.backend.user_info.get_user_info_for_current_platform", - return_value=TestUserInfo(), -) -def test_write_metadata_document_existing_document( - _mock_user_info: MagicMock, # noqa: PT019 it's a patch, not a fixture - dummy_timestamp: datetime, - metadata: Datadoc, -): - original_created_date = metadata.dataset.metadata_created_date - original_created_by = metadata.dataset.metadata_created_by - metadata.write_metadata_document() - assert metadata.dataset.metadata_created_by == original_created_by - assert metadata.dataset.metadata_created_date == original_created_date - assert metadata.dataset.metadata_last_updated_by == PLACEHOLDER_EMAIL_ADDRESS - assert metadata.dataset.metadata_last_updated_date == dummy_timestamp - - -def test_metadata_id(metadata: Datadoc): - assert isinstance(metadata.dataset.id, UUID) - - -@pytest.mark.parametrize( - "existing_metadata_path", - [TEST_EXISTING_METADATA_DIRECTORY / "invalid_id_field"], -) -def test_existing_metadata_none_id( - existing_metadata_file: Path, - metadata: Datadoc, -): - with existing_metadata_file.open() as f: - pre_open_id: None = json.load(f)["datadoc"]["dataset"]["id"] - assert pre_open_id is None - assert isinstance(metadata.dataset.id, UUID) - metadata.write_metadata_document() - with existing_metadata_file.open() as f: - post_write_id = json.load(f)["datadoc"]["dataset"]["id"] - assert post_write_id == str(metadata.dataset.id) - - -@pytest.mark.parametrize( - "existing_metadata_path", - [TEST_EXISTING_METADATA_DIRECTORY / "valid_id_field"], -) -def test_existing_metadata_valid_id( - existing_metadata_file: Path, - metadata: Datadoc, -): - pre_open_id = "" - post_write_id = "" - with existing_metadata_file.open() as f: - pre_open_id = json.load(f)["datadoc"]["dataset"]["id"] - assert pre_open_id is not None - assert isinstance(metadata.dataset.id, UUID) - assert str(metadata.dataset.id) == pre_open_id - metadata.write_metadata_document() - with existing_metadata_file.open() as f: - post_write_id = json.load(f)["datadoc"]["dataset"]["id"] - assert post_write_id == pre_open_id - - -def test_dataset_short_name(metadata: Datadoc): - assert metadata.dataset.short_name == "person_data" - - -def test_dataset_file_path(metadata: Datadoc): - assert metadata.dataset.file_path == str(metadata.dataset_path) - - -def test_variable_role_default_value(metadata: Datadoc): - assert all( - v.variable_role == VariableRole.MEASURE.value for v in metadata.variables - ) - - -def test_is_personal_data_value(metadata: Datadoc): - assert all( - v.is_personal_data == IsPersonalData.NOT_PERSONAL_DATA.value - for v in metadata.variables - ) - - -def test_save_file_path_metadata_field( - existing_metadata_file: Path, - metadata: 
Datadoc, -): - metadata.write_metadata_document() - with existing_metadata_file.open() as f: - saved_file_path = json.load(f)["datadoc"]["dataset"]["file_path"] - assert saved_file_path == str(metadata.dataset_path) - - -def test_save_file_path_dataset_and_no_metadata( - metadata: Datadoc, - tmp_path: pathlib.Path, -): - metadata.write_metadata_document() - with (tmp_path / TEST_EXISTING_METADATA_FILE_NAME).open() as f: - saved_file_path = json.load(f)["datadoc"]["dataset"]["file_path"] - assert saved_file_path == str(metadata.dataset_path) - - -@pytest.mark.parametrize( - ("insert_string", "expected_from", "expected_until"), - [ - ("_p2021", arrow.get("2021-01-01").date(), arrow.get("2021-12-31").date()), - ( - "_p2022_p2023", - arrow.get("2022-01-01").date(), - arrow.get("2023-12-31").date(), - ), - ], -) -def test_period_metadata_fields_saved( - subject_mapping_fake_statistical_structure: StatisticSubjectMapping, - generate_periodic_file, - expected_from, - expected_until, -): - metadata = Datadoc( - str(generate_periodic_file), - statistic_subject_mapping=subject_mapping_fake_statistical_structure, - ) - assert metadata.dataset.contains_data_from == expected_from - assert metadata.dataset.contains_data_until == expected_until - - -@pytest.mark.parametrize( - ("dataset_path", "expected_type"), - [ - ( - TEST_PROCESSED_DATA_POPULATION_DIRECTORY - / "person_testdata_p2021-12-31_p2021-12-31_v1.parquet", - DataSetStatus.INTERNAL.value, - ), - ( - TEST_PARQUET_FILEPATH, - DataSetStatus.DRAFT.value, - ), - ( - "", - None, - ), - ], -) -def test_dataset_status_default_value( - subject_mapping_fake_statistical_structure: StatisticSubjectMapping, - dataset_path: str, - expected_type: DataSetStatus | None, -): - datadoc_metadata = Datadoc( - str(dataset_path), - statistic_subject_mapping=subject_mapping_fake_statistical_structure, - ) - assert datadoc_metadata.dataset.dataset_status == expected_type - - -@pytest.mark.parametrize( - ("path_parts_to_insert", "expected_type"), - [ - ( - "kildedata", - Assessment.SENSITIVE.value, - ), - ( - "inndata", - Assessment.PROTECTED.value, - ), - ( - "klargjorte_data", - Assessment.PROTECTED.value, - ), - ( - "statistikk", - Assessment.PROTECTED.value, - ), - ( - "utdata", - Assessment.OPEN.value, - ), - ( - "", - None, - ), - ], -) -def test_dataset_assessment_default_value( - expected_type: Assessment | None, - copy_dataset_to_path: Path, - thread_pool_executor, -): - datadoc_metadata = Datadoc( - dataset_path=str(copy_dataset_to_path), - statistic_subject_mapping=StatisticSubjectMapping( - thread_pool_executor, - source_url="", - ), - ) - assert datadoc_metadata.dataset.assessment == expected_type - - -@pytest.mark.parametrize( - ("path_parts_to_insert", "expected_subject_code"), - [ - (["aa_kortnvan_01", "klargjorte_data"], "aa01"), - (["ab_kortnvan", "utdata"], "ab00"), - (["aa_kortnvan_01", "no_dataset_state"], None), - (["unknown_short_name", "klargjorte_data"], None), - ], -) -def test_extract_subject_field_value_from_statistic_structure_xml( - subject_mapping_fake_statistical_structure: StatisticSubjectMapping, - copy_dataset_to_path: Path, - expected_subject_code: str, -): - subject_mapping_fake_statistical_structure.wait_for_external_result() - metadata = Datadoc( - str(copy_dataset_to_path), - statistic_subject_mapping=subject_mapping_fake_statistical_structure, - ) - assert metadata.dataset.subject_field == expected_subject_code # type: ignore [union-attr] - - -@pytest.mark.parametrize( - "existing_metadata_path", - 
[TEST_EXISTING_METADATA_DIRECTORY / "pseudo"], -) -def test_existing_pseudo_metadata_file( - existing_metadata_file: Path, - metadata: Datadoc, -): - pre_open_metadata = json.loads(existing_metadata_file.read_text()) - metadata.write_metadata_document() - post_open_metadata = json.loads(existing_metadata_file.read_text()) - - assert len(metadata.variables) == 8 # noqa: PLR2004 - assert ( - pre_open_metadata["pseudonymization"] == post_open_metadata["pseudonymization"] - ) - assert post_open_metadata["datadoc"] is not None - - -def test_generate_variables_id( - metadata: Datadoc, -): - assert all(isinstance(v.id, UUID) for v in metadata.variables) - - -@pytest.mark.parametrize( - "existing_metadata_path", - [TEST_EXISTING_METADATA_DIRECTORY / "invalid_id_field"], -) -def test_existing_metadata_variables_none_id( - existing_metadata_file: Path, - metadata: Datadoc, -): - with existing_metadata_file.open() as f: - pre_open_id: list = [v["id"] for v in json.load(f)["datadoc"]["variables"]] - assert (i is None for i in pre_open_id) - - assert all(isinstance(v.id, UUID) for v in metadata.variables) - - metadata.write_metadata_document() - with existing_metadata_file.open() as f: - post_write_id: list = [v["id"] for v in json.load(f)["datadoc"]["variables"]] - - assert post_write_id == [str(v.id) for v in metadata.variables] - - -@pytest.mark.parametrize( - "existing_metadata_path", - [TEST_EXISTING_METADATA_DIRECTORY / "valid_variable_id_field"], -) -def test_existing_metadata_variables_valid_id( - existing_metadata_file: Path, - metadata: Datadoc, -): - with existing_metadata_file.open() as f: - pre_open_id: list = [v["id"] for v in json.load(f)["datadoc"]["variables"]] - - assert all(isinstance(v.id, UUID) for v in metadata.variables) - metadata_variable_ids = [str(v.id) for v in metadata.variables] - assert metadata_variable_ids == pre_open_id - - metadata.write_metadata_document() - with existing_metadata_file.open() as f: - post_write_id: list = [v["id"] for v in json.load(f)["datadoc"]["variables"]] - - assert pre_open_id == post_write_id - - -@pytest.mark.parametrize( - ("index", "expected_text"), - [ - (0, "Norge"), - (1, "Noreg"), - (2, "Norway"), - ], -) -def test_default_spatial_coverage_description( - metadata: Datadoc, - index: int, - expected_text: str, -): - ls = metadata.dataset.spatial_coverage_description - assert ls.root[index].languageText == expected_text # type: ignore[union-attr, index] - - -def test_open_extracted_and_existing_metadata(metadata_merged: Datadoc, tmp_path: Path): - assert ( - metadata_merged.metadata_document - == tmp_path - / "ifpn/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json" - ) - assert str(metadata_merged.dataset_path) is not None - - -def test_open_nonexistent_existing_metadata(existing_data_path: Path): - with pytest.raises( - ValueError, - match="Metadata document does not exist! 
Provided path:", - ): - Datadoc( - str(existing_data_path), - str(Datadoc.build_metadata_document_path(existing_data_path)), - ) - - -def test_merge_extracted_and_existing_dataset_metadata(metadata_merged: Datadoc): - metadata_extracted = Datadoc( - dataset_path=str(metadata_merged.dataset_path), - ) - metadata_existing = Datadoc( - metadata_document_path=str(TEST_EXISTING_METADATA_NAMING_STANDARD_FILEPATH), - ) - - # Should match extracted metadata from the dataset - assert metadata_merged.dataset.short_name == metadata_extracted.dataset.short_name - assert metadata_merged.dataset.assessment == metadata_extracted.dataset.assessment - assert ( - metadata_merged.dataset.dataset_state - == metadata_extracted.dataset.dataset_state - ) - assert metadata_merged.dataset.version == metadata_extracted.dataset.version - assert metadata_merged.dataset.file_path == metadata_extracted.dataset.file_path - assert ( - metadata_merged.dataset.metadata_created_by - == metadata_extracted.dataset.metadata_created_by - ) - assert ( - metadata_merged.dataset.metadata_last_updated_by - == metadata_extracted.dataset.metadata_last_updated_by - ) - assert ( - metadata_merged.dataset.contains_data_from - == metadata_extracted.dataset.contains_data_from - ) - assert ( - metadata_merged.dataset.contains_data_until - == metadata_extracted.dataset.contains_data_until - ) - - # Should match existing metadata - for field in DATASET_FIELDS_FROM_EXISTING_METADATA: - actual = getattr(metadata_merged.dataset, field) - assert actual == getattr( - metadata_existing.dataset, - field, - ), f"{field} in merged metadata did not match existing metadata" - - # Special cases - assert metadata_merged.dataset.version_description is None - assert metadata_merged.dataset.id != metadata_existing.dataset.id - assert metadata_merged.dataset.metadata_created_date is None - assert metadata_merged.dataset.metadata_last_updated_date is None - - -def test_merge_variables(tmp_path): - dataset = tmp_path / "fewer_variables_p2021-12-31_p2021-12-31_v1.parquet" - existing_document = TEST_EXISTING_METADATA_NAMING_STANDARD_FILEPATH - dataset.parent.mkdir(parents=True, exist_ok=True) - shutil.copy( - TEST_DATASETS_DIRECTORY / "fewer_variables_p2021-12-31_p2021-12-31_v1.parquet", - dataset, - ) - extracted = Datadoc( - dataset_path=str(dataset), - ) - existing = Datadoc( - metadata_document_path=str(existing_document), - ) - merged = Datadoc( - dataset_path=str(dataset), - metadata_document_path=str(existing_document), - errors_as_warnings=True, - ) - assert [v.short_name for v in merged.variables] == [ - v.short_name for v in extracted.variables - ] - assert all(v.id is not None for v in merged.variables) - assert [v.id for v in merged.variables] != [v.id for v in existing.variables] - assert all( - v.contains_data_from == merged.dataset.contains_data_from - for v in merged.variables - ) - assert all( - v.contains_data_until == merged.dataset.contains_data_until - for v in merged.variables - ) - - -def test_merge_with_fewer_variables_in_existing_metadata(tmp_path): - target = tmp_path / TEST_NAMING_STANDARD_COMPATIBLE_DATASET - target.parent.mkdir(parents=True, exist_ok=True) - shutil.copy( - TEST_DATASETS_DIRECTORY / TEST_NAMING_STANDARD_COMPATIBLE_DATASET, - target, - ) - datadoc = Datadoc( - str(target), - str( - TEST_EXISTING_METADATA_DIRECTORY - / "fewer_variables_p2020-12-31_p2020-12-31_v1__DOC.json", - ), - errors_as_warnings=True, - ) - assert [v.short_name for v in datadoc.variables] == [ - "fnr", - "sivilstand", - "bostedskommune", - 
"inntekt", - "bankinnskudd", - "dato", - ] - - -@pytest.mark.parametrize( - ("new_dataset_path", "existing_dataset_path"), - [ - ( - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - ), - ( - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH.replace("v1", "v2"), - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - ), - ( - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH.replace("p2021", "p2022"), - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - ), - ( - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH.replace( - "/ifpn", - "/deeper/folder/structure/ifpn", - ), - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - ), - ], - ids=[ - "identical path", - "differing version", - "differing period", - "different folder structure", - ], -) -@pytest.mark.parametrize( - "errors_as_warnings", - [True, False], - ids=["warnings", "errors"], -) -def test_check_ready_to_merge_consistent_paths( - new_dataset_path: str, - existing_dataset_path: str, - errors_as_warnings: bool, # noqa: FBT001 -): - with warnings.catch_warnings() if errors_as_warnings else contextlib.nullcontext(): # type: ignore [attr-defined] - if errors_as_warnings: - warnings.simplefilter("error") - Datadoc._check_ready_to_merge( # noqa: SLF001 - Path(new_dataset_path), - Path(existing_dataset_path), - DatadocMetadata(variables=[]), - DatadocMetadata(variables=[]), - errors_as_warnings=errors_as_warnings, - ) - - -@pytest.mark.parametrize( - ("new_dataset_path", "existing_dataset_path"), - [ - ( - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH.replace("produkt", "delt"), - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - ), - ( - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH.replace("ifpn", "blah"), - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - ), - ( - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH.replace( - "klargjorte_data", - "utdata", - ), - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - ), - ( - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH.replace( - "person_testdata", - "totally_different_dataset", - ), - TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH, - ), - ], - ids=["bucket", "data product", "dataset state", "dataset short name"], -) -@pytest.mark.parametrize( - "errors_as_warnings", - [True, False], - ids=["warnings", "errors"], -) -def test_check_ready_to_merge_inconsistent_paths( - new_dataset_path: str, - existing_dataset_path: str, - errors_as_warnings: bool, # noqa: FBT001 -): - with contextlib.ExitStack() as stack: - if errors_as_warnings: - stack.enter_context(pytest.warns(InconsistentDatasetsWarning)) - else: - stack.enter_context(pytest.raises(InconsistentDatasetsError)) - Datadoc._check_ready_to_merge( # noqa: SLF001 - Path(new_dataset_path), - Path(existing_dataset_path), - DatadocMetadata(variables=[]), - DatadocMetadata(variables=[]), - errors_as_warnings=errors_as_warnings, - ) - - -VARIABLE_SHORT_NAMES = [ - "fnr", - "sivilstand", - "bostedskommune", - "inntekt", - "bankinnskudd", - "dato", -] - - -VARIABLE_DATA_TYPES = [ - DataType.STRING, - DataType.STRING, - DataType.STRING, - DataType.INTEGER, - DataType.INTEGER, - DataType.DATETIME, -] - - -@pytest.mark.parametrize( - ("extracted_variables", "existing_variables"), - [ - (VARIABLE_SHORT_NAMES, VARIABLE_SHORT_NAMES[:-2]), - (VARIABLE_SHORT_NAMES[:-2], VARIABLE_SHORT_NAMES), - (VARIABLE_SHORT_NAMES, VARIABLE_SHORT_NAMES[:-1] + ["blah"]), - ], - ids=["fewer existing", "fewer extracted", "renamed"], -) -@pytest.mark.parametrize( - "errors_as_warnings", - [True, False], - ids=["warnings", "errors"], -) -def 
test_check_ready_to_merge_inconsistent_variable_names( - extracted_variables: list[str], - existing_variables: list[str], - errors_as_warnings: bool, # noqa: FBT001 -): - with contextlib.ExitStack() as stack: - if errors_as_warnings: - stack.enter_context(pytest.warns(InconsistentDatasetsWarning)) - else: - stack.enter_context(pytest.raises(InconsistentDatasetsError)) - Datadoc._check_ready_to_merge( # noqa: SLF001 - Path(TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH), - Path(TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH), - DatadocMetadata( - variables=[Variable(short_name=name) for name in extracted_variables], - ), - DatadocMetadata( - variables=[Variable(short_name=name) for name in existing_variables], - ), - errors_as_warnings=errors_as_warnings, - ) - - -@pytest.mark.parametrize( - "errors_as_warnings", - [True, False], - ids=["warnings", "errors"], -) -def test_check_ready_to_merge_consistent_variables( - errors_as_warnings: bool, # noqa: FBT001 -): - with warnings.catch_warnings() if errors_as_warnings else contextlib.nullcontext(): # type: ignore [attr-defined] - if errors_as_warnings: - warnings.simplefilter("error") - Datadoc._check_ready_to_merge( # noqa: SLF001 - Path(TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH), - Path(TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH), - DatadocMetadata( - variables=[ - Variable(short_name=name, data_type=data_type) - for name, data_type in zip( - VARIABLE_SHORT_NAMES, - VARIABLE_DATA_TYPES, - ) - ], - ), - DatadocMetadata( - variables=[ - Variable(short_name=name, data_type=data_type) - for name, data_type in zip( - VARIABLE_SHORT_NAMES, - VARIABLE_DATA_TYPES, - ) - ], - ), - errors_as_warnings=errors_as_warnings, - ) - - -@pytest.mark.parametrize( - "errors_as_warnings", - [True, False], - ids=["warnings", "errors"], -) -def test_check_ready_to_merge_inconsistent_variable_data_types( - errors_as_warnings: bool, # noqa: FBT001 -): - with contextlib.ExitStack() as stack: - if errors_as_warnings: - stack.enter_context(pytest.warns(InconsistentDatasetsWarning)) - else: - stack.enter_context(pytest.raises(InconsistentDatasetsError)) - Datadoc._check_ready_to_merge( # noqa: SLF001 - Path(TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH), - Path(TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH), - DatadocMetadata( - variables=[ - Variable(short_name=name, data_type=data_type) - for name, data_type in zip( - VARIABLE_SHORT_NAMES, - VARIABLE_DATA_TYPES[:-1] + [DataType.BOOLEAN], - ) - ], - ), - DatadocMetadata( - variables=[ - Variable(short_name=name, data_type=data_type) - for name, data_type in zip( - VARIABLE_SHORT_NAMES, - VARIABLE_DATA_TYPES, - ) - ], - ), - errors_as_warnings=errors_as_warnings, - ) diff --git a/tests/backend/test_dataset_parser.py b/tests/backend/test_dataset_parser.py deleted file mode 100644 index 9624930b..00000000 --- a/tests/backend/test_dataset_parser.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Tests for the DatasetParser class.""" - -import io -import pathlib - -import pandas as pd -import pytest -from datadoc_model.model import LanguageStringType -from datadoc_model.model import LanguageStringTypeItem -from datadoc_model.model import Variable - -from datadoc.backend.dataset_parser import KNOWN_BOOLEAN_TYPES -from datadoc.backend.dataset_parser import KNOWN_DATETIME_TYPES -from datadoc.backend.dataset_parser import KNOWN_FLOAT_TYPES -from datadoc.backend.dataset_parser import KNOWN_INTEGER_TYPES -from datadoc.backend.dataset_parser import KNOWN_STRING_TYPES -from datadoc.backend.dataset_parser import DatasetParser -from 
datadoc.backend.dataset_parser import DatasetParserParquet -from datadoc.enums import DataType -from tests.utils import TEST_PARQUET_FILEPATH -from tests.utils import TEST_PARQUET_GZIP_FILEPATH -from tests.utils import TEST_SAS7BDAT_FILEPATH - - -def test_use_abstract_class_directly(): - with pytest.raises(TypeError): - DatasetParser().get_fields() - - -@pytest.mark.parametrize( - "local_parser", - [ - DatasetParser.for_file(TEST_PARQUET_FILEPATH), - DatasetParser.for_file(TEST_PARQUET_GZIP_FILEPATH), - ], -) -def test_get_fields_parquet(local_parser: DatasetParserParquet): - expected_fields = [ - Variable(short_name="pers_id", data_type=DataType.STRING), - Variable(short_name="tidspunkt", data_type=DataType.DATETIME), - Variable(short_name="sivilstand", data_type=DataType.STRING), - Variable(short_name="alm_inntekt", data_type=DataType.INTEGER), - Variable(short_name="sykepenger", data_type=DataType.INTEGER), - Variable(short_name="ber_bruttoformue", data_type=DataType.INTEGER), - Variable(short_name="fullf_utdanning", data_type=DataType.STRING), - Variable(short_name="hoveddiagnose", data_type=DataType.STRING), - ] - fields = local_parser.get_fields() - - assert fields == expected_fields - - -def test_get_fields_sas7bdat(): - expected_fields = [ - Variable( - short_name="tekst", - name=LanguageStringType( - [LanguageStringTypeItem(languageCode="nb", languageText="Tekst")], - ), - data_type=DataType.STRING, - ), - Variable( - short_name="tall", - name=LanguageStringType( - [LanguageStringTypeItem(languageCode="nb", languageText="Tall")], - ), - data_type=DataType.FLOAT, - ), - Variable( - short_name="dato", - name=LanguageStringType( - [LanguageStringTypeItem(languageCode="nb", languageText="Dato")], - ), - data_type=DataType.DATETIME, - ), - ] - - reader = DatasetParser.for_file(TEST_SAS7BDAT_FILEPATH) - fields = reader.get_fields() - - assert fields == expected_fields - - -@pytest.mark.parametrize("file", ["my_dataset.csv", "my_dataset.xlsx", "my_dataset"]) -def test_dataset_parser_unsupported_files(file: pathlib.Path): - with pytest.raises(NotImplementedError): - DatasetParser.for_file(pathlib.Path(file)) - - -def test_transform_datatype_unknown_type(): - assert DatasetParser.transform_data_type("definitely not a known data type") is None - - -@pytest.mark.parametrize( - ("expected", "concrete_type"), - [ - *[(DataType.INTEGER, i) for i in KNOWN_INTEGER_TYPES], - *[(DataType.FLOAT, i) for i in KNOWN_FLOAT_TYPES], - *[(DataType.STRING, i) for i in KNOWN_STRING_TYPES], - *[(DataType.DATETIME, i) for i in KNOWN_DATETIME_TYPES], - *[(DataType.BOOLEAN, i) for i in KNOWN_BOOLEAN_TYPES], - ], -) -def test_transform_datatype(expected: DataType, concrete_type: str): - actual = DatasetParser.transform_data_type(concrete_type) - assert actual == expected - - -@pytest.fixture() -def parquet_with_index_column(tmp_path): - """Create a parquet file with a column called __index_level_0__.""" - test_data = pd.read_csv( - io.StringIO( - """a b -1 4 -2 5 -3 6 -""", - ), - sep="\t", - ) - - output_path = tmp_path / "test_with_index.parquet" - test_data.query("b % 2 == 0").to_parquet(output_path, engine="pyarrow") - return output_path - - -def test_parquet_with_index_column(parquet_with_index_column: pathlib.Path): - fields = DatasetParser.for_file(parquet_with_index_column).get_fields() - assert not any(f.short_name == "__index_level_0__" for f in fields) diff --git a/tests/backend/test_model_backwards_compatibility.py b/tests/backend/test_model_backwards_compatibility.py deleted file mode 100644 index 
3867b6a7..00000000 --- a/tests/backend/test_model_backwards_compatibility.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Tests for the ModelBackwardsCompatibility class.""" - -import json -from pathlib import Path - -import pytest - -from datadoc.backend.core import Datadoc -from datadoc.backend.model_backwards_compatibility import UnknownModelVersionError -from datadoc.backend.model_backwards_compatibility import add_container -from datadoc.backend.model_backwards_compatibility import handle_version_2_2_0 -from datadoc.backend.model_backwards_compatibility import handle_version_3_3_0 -from datadoc.backend.model_backwards_compatibility import ( - is_metadata_in_container_structure, -) -from datadoc.backend.model_backwards_compatibility import upgrade_metadata -from tests.utils import TEST_COMPATIBILITY_DIRECTORY -from tests.utils import TEST_EXISTING_METADATA_FILE_NAME - -BACKWARDS_COMPATIBLE_VERSION_DIRECTORIES = [ - d for d in TEST_COMPATIBILITY_DIRECTORY.iterdir() if d.is_dir() -] -BACKWARDS_COMPATIBLE_VERSION_NAMES = [ - d.stem for d in BACKWARDS_COMPATIBLE_VERSION_DIRECTORIES -] - - -def test_existing_metadata_current_model_version(): - current_model_version = "4.0.0" - fresh_metadata = {"document_version": current_model_version} - upgraded_metadata = upgrade_metadata(fresh_metadata) - assert upgraded_metadata == fresh_metadata - - -def test_handle_version_2_2_0() -> None: - pydir: Path = Path(__file__).resolve().parent - rootdir: Path = pydir.parent.parent - existing_metadata_file: Path = ( - rootdir - / TEST_COMPATIBILITY_DIRECTORY - / "v2_2_0" - / TEST_EXISTING_METADATA_FILE_NAME - ) - with existing_metadata_file.open(mode="r", encoding="utf-8") as file: - fresh_metadata = json.load(file) - upgraded_metadata = handle_version_2_2_0(fresh_metadata) - assert "custom_type" in upgraded_metadata["datadoc"]["dataset"] - assert "custom_type" in upgraded_metadata["datadoc"]["variables"][0] - assert "special_value" in upgraded_metadata["datadoc"]["variables"][0] - - -def test_handle_version_3_3_0() -> None: - pydir: Path = Path(__file__).resolve().parent - rootdir: Path = pydir.parent.parent - existing_metadata_file: Path = ( - rootdir - / TEST_COMPATIBILITY_DIRECTORY - / "v3_3_0" - / TEST_EXISTING_METADATA_FILE_NAME - ) - with existing_metadata_file.open(mode="r", encoding="utf-8") as file: - fresh_metadata = json.load(file) - upgraded_metadata = handle_version_3_3_0(fresh_metadata) - assert ( - "direct_person_identifying" not in upgraded_metadata["datadoc"]["variables"][0] - ) - - -def test_existing_metadata_unknown_model_version(): - fresh_metadata = {"document_version": "0.27.65"} - with pytest.raises(UnknownModelVersionError): - upgrade_metadata(fresh_metadata) - - -@pytest.mark.parametrize( - "existing_metadata_path", - BACKWARDS_COMPATIBLE_VERSION_DIRECTORIES, - ids=BACKWARDS_COMPATIBLE_VERSION_NAMES, -) -def test_backwards_compatibility( - existing_metadata_file: Path, - metadata: Datadoc, -): - with existing_metadata_file.open() as f: - file_metadata = json.loads(f.read()) - - if is_metadata_in_container_structure(file_metadata): - file_metadata = file_metadata["datadoc"] - - # Just test a single value to make sure we have a working model - assert metadata.dataset.short_name == file_metadata["dataset"]["short_name"] # type: ignore [union-attr, index] - - -def test_add_container(): - doc = { - "percentage_complete": 98, - "document_version": "2.1.0", - "dataset": {"short_name": "person_data_v1", "assessment": "SENSITIVE"}, - } - doc_with_container = add_container(doc) - assert 
doc_with_container["document_version"] == "0.0.1" - assert doc_with_container["datadoc"]["document_version"] == "2.1.0" - assert "pseudonymization" in doc_with_container diff --git a/tests/backend/test_statistic_subject_mapping.py b/tests/backend/test_statistic_subject_mapping.py deleted file mode 100644 index a03f4387..00000000 --- a/tests/backend/test_statistic_subject_mapping.py +++ /dev/null @@ -1,176 +0,0 @@ -import pytest -import requests -from bs4 import BeautifulSoup - -from datadoc.backend.statistic_subject_mapping import PrimarySubject -from datadoc.backend.statistic_subject_mapping import SecondarySubject -from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping -from tests.utils import TEST_RESOURCES_DIRECTORY - - -def test_no_source_url(thread_pool_executor): - subject_mapping = StatisticSubjectMapping(thread_pool_executor, None) - subject_mapping.wait_for_external_result() - assert subject_mapping.primary_subjects == [] - - -def test_extract_titles(): - xml_string = 'PartifinansieringFunding of political parties' - soup = BeautifulSoup(xml_string, features="xml") - assert StatisticSubjectMapping._extract_titles(soup) == { # noqa: SLF001 - "no": "Partifinansiering", - "en": "Funding of political parties", - } - - -STATISTICAL_SUBJECT_STRUCTURE_DIR = "statistical_subject_structure" - - -@pytest.mark.parametrize( - ("subject_xml_file_path", "expected"), - [ - ( - TEST_RESOURCES_DIRECTORY / STATISTICAL_SUBJECT_STRUCTURE_DIR / "simple.xml", - [ - PrimarySubject( - titles={"en": "aa english", "no": "aa norwegian"}, - subject_code="aa", - secondary_subjects=[ - SecondarySubject( - titles={"en": "aa00 english", "no": "aa00 norwegian"}, - subject_code="aa00", - statistic_short_names=["aa_kortnvan"], - ), - ], - ), - ], - ), - ( - TEST_RESOURCES_DIRECTORY / STATISTICAL_SUBJECT_STRUCTURE_DIR / "empty.xml", - [], - ), - ( - TEST_RESOURCES_DIRECTORY - / STATISTICAL_SUBJECT_STRUCTURE_DIR - / "missing_language.xml", - [ - PrimarySubject( - titles={ - "en": "aa english", - }, - subject_code="aa", - secondary_subjects=[ - SecondarySubject( - titles={ - "en": "aa00 english", - "no": "aa00 norwegian", - }, - subject_code="aa00", - statistic_short_names=[ - "aa_kortnvan", - ], - ), - SecondarySubject( - titles={ - "no": "aa01 norwegian", - }, - subject_code="aa01", - statistic_short_names=[ - "aa_kortnvan_01", - ], - ), - ], - ), - PrimarySubject( - titles={ - "en": "ab english", - }, - subject_code="ab", - secondary_subjects=[ - SecondarySubject( - titles={ - "en": "ab00 english", - "no": "ab00 norwegian", - }, - subject_code="ab00", - statistic_short_names=[ - "ab_kortnvan", - ], - ), - SecondarySubject( - titles={ - "en": "ab01 english", - }, - subject_code="ab01", - statistic_short_names=[ - "ab_kortnvan_01", - ], - ), - ], - ), - ], - ), - ], -) -@pytest.mark.usefixtures("_mock_fetch_statistical_structure") -def test_read_in_statistical_structure( - subject_mapping_fake_statistical_structure: StatisticSubjectMapping, - expected: list[PrimarySubject], -) -> None: - subject_mapping_fake_statistical_structure.wait_for_external_result() - assert subject_mapping_fake_statistical_structure.primary_subjects == expected - - -@pytest.mark.parametrize( - ("statistic_short_name", "expected_secondary_subject"), - [ - ("ab_kortnvan", "ab00"), - ("aa_kortnvan", "aa00"), - ("ab_kortnvan_01", "ab01"), - ("aa_kortnvan_01", "aa01"), - ("unknown_name", None), - (None, None), - ], -) -@pytest.mark.usefixtures("_mock_fetch_statistical_structure") -def test_get_secondary_subject( - 
subject_mapping_fake_statistical_structure: StatisticSubjectMapping, - statistic_short_name: str, - expected_secondary_subject: str, -) -> None: - subject_mapping_fake_statistical_structure.wait_for_external_result() - assert ( - subject_mapping_fake_statistical_structure.get_secondary_subject( - statistic_short_name, - ) - == expected_secondary_subject - ) - - -@pytest.fixture() -def subject_mapping_http_exception( - requests_mock, - exception_to_raise, - thread_pool_executor, -) -> StatisticSubjectMapping: - requests_mock.get( - "http://test.some.url.com", - exc=exception_to_raise, - ) - return StatisticSubjectMapping(thread_pool_executor, "http://test.some.url.com") - - -@pytest.mark.parametrize( - ("exception_to_raise"), - [ - (requests.exceptions.ConnectTimeout), - (requests.exceptions.HTTPError), - (requests.exceptions.ReadTimeout), - (requests.exceptions.ConnectionError), - ], -) -def test_subject_mapping_http_exception( - subject_mapping_http_exception: StatisticSubjectMapping, -) -> None: - subject_mapping_http_exception.wait_for_external_result() - assert subject_mapping_http_exception.primary_subjects == [] diff --git a/tests/backend/test_user_info.py b/tests/backend/test_user_info.py deleted file mode 100644 index 35d2cfae..00000000 --- a/tests/backend/test_user_info.py +++ /dev/null @@ -1,112 +0,0 @@ -import string - -import jwt -import pytest -from faker import Faker - -from datadoc.backend import user_info -from datadoc.backend.user_info import PLACEHOLDER_EMAIL_ADDRESS -from datadoc.backend.user_info import DaplaLabUserInfo -from datadoc.backend.user_info import JupyterHubUserInfo -from datadoc.backend.user_info import UnknownUserInfo -from datadoc.backend.user_info import UserInfo -from datadoc.config import DAPLA_REGION -from datadoc.config import DAPLA_SERVICE -from datadoc.config import JUPYTERHUB_USER -from datadoc.enums import DaplaRegion -from datadoc.enums import DaplaService - - -@pytest.fixture() -def raw_jwt_payload(faker: Faker) -> dict[str, object]: - user_name = "".join(faker.random_sample(elements=string.ascii_lowercase, length=3)) - email = f"{user_name}@ssb.no" - first_name = faker.first_name() - last_name = faker.last_name() - return { - "exp": faker.unix_time(), - "iat": faker.unix_time(), - "auth_time": faker.unix_time(), - "jti": faker.uuid4(), - "iss": faker.url(), - "aud": [ - faker.word(), - faker.uuid4(), - "broker", - "account", - ], - "sub": faker.uuid4(), - "typ": "Bearer", - "azp": "onyxia", - "session_state": faker.uuid4(), - "allowed-origins": ["*"], - "realm_access": { - "roles": [faker.word(), faker.word()], - }, - "resource_access": { - "broker": {"roles": [faker.word()]}, - "account": { - "roles": [faker.word()], - }, - }, - "scope": "openid email profile", - "sid": faker.uuid4(), - "email_verified": True, - "name": f"{first_name} {last_name}", - "short_username": f"ssb-{user_name}", - "preferred_username": email, - "given_name": first_name, - "family_name": last_name, - "email": email, - } - - -@pytest.fixture() -def fake_jwt(raw_jwt_payload): - return jwt.encode(raw_jwt_payload, "test secret", algorithm="HS256") - - -@pytest.mark.parametrize( - ("environment_variable_name", "environment_variable_value", "expected_class"), - [ - (DAPLA_SERVICE, DaplaService.JUPYTERLAB.value, JupyterHubUserInfo), - (DAPLA_REGION, DaplaRegion.DAPLA_LAB.value, DaplaLabUserInfo), - (None, None, UnknownUserInfo), - ], -) -def test_get_user_info_for_current_platform( - monkeypatch: pytest.MonkeyPatch, - environment_variable_name: str, - 
environment_variable_value: str, - expected_class: type[UserInfo], -): - if environment_variable_name: - monkeypatch.setenv(environment_variable_name, environment_variable_value) - assert isinstance(user_info.get_user_info_for_current_platform(), expected_class) - - -def test_jupyterhub_user_info_short_email(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv(JUPYTERHUB_USER, PLACEHOLDER_EMAIL_ADDRESS) - assert JupyterHubUserInfo().short_email == PLACEHOLDER_EMAIL_ADDRESS - - -def test_dapla_lab_user_info_short_email( - fake_jwt: str, - raw_jwt_payload: dict[str, object], - monkeypatch: pytest.MonkeyPatch, -): - monkeypatch.setenv("OIDC_TOKEN", fake_jwt) - assert DaplaLabUserInfo().short_email == raw_jwt_payload["email"] - - -def test_dapla_lab_user_info_short_email_no_jwt_available(): - assert DaplaLabUserInfo().short_email is None - - -@pytest.mark.parametrize(("raw_jwt_payload"), [{"no_email": "no_email_in_jwt"}]) -def test_dapla_lab_user_info_short_email_no_email_in_jwt( - fake_jwt: str, - monkeypatch: pytest.MonkeyPatch, -): - monkeypatch.setenv("OIDC_TOKEN", fake_jwt) - assert DaplaLabUserInfo().short_email is None diff --git a/tests/backend/test_utils.py b/tests/backend/test_utils.py deleted file mode 100644 index 041fc862..00000000 --- a/tests/backend/test_utils.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -import pathlib - -import pytest -from cloudpathlib.local import LocalGSClient -from cloudpathlib.local import LocalGSPath - -from datadoc.backend.utils import calculate_percentage -from datadoc.backend.utils import normalize_path -from tests.utils import TEST_BUCKET_PARQUET_FILEPATH -from tests.utils import TEST_PARQUET_FILEPATH - -BACKEND_UTILS_MODULE = "datadoc.backend.utils" - - -@pytest.mark.parametrize( - ("dataset_path", "expected_type"), - [ - (TEST_BUCKET_PARQUET_FILEPATH, LocalGSPath), - (str(TEST_PARQUET_FILEPATH), pathlib.Path), - ], -) -def test_normalize_path( - dataset_path: str, - expected_type: type[os.PathLike], - mocker, -): - mocker.patch(f"{BACKEND_UTILS_MODULE}.AuthClient", autospec=True) - mocker.patch(f"{BACKEND_UTILS_MODULE}.GSClient", LocalGSClient) - mocker.patch( - f"{BACKEND_UTILS_MODULE}.GSPath", - LocalGSPath, - ) - file = normalize_path( # for testing purposes - dataset_path, - ) - assert isinstance(file, expected_type) - - -def test_calculate_percentage(): - assert calculate_percentage(1, 3) == 33 # noqa: PLR2004 diff --git a/tests/backend/test_validators.py b/tests/backend/test_validators.py deleted file mode 100644 index a4fa701c..00000000 --- a/tests/backend/test_validators.py +++ /dev/null @@ -1,305 +0,0 @@ -"""Tests for validators for DatadocMetadata class.""" - -from __future__ import annotations - -import datetime -import re -import warnings -from typing import TYPE_CHECKING - -import datadoc_model -import pytest -from datadoc_model import model -from pydantic import ValidationError - -from datadoc import state -from datadoc.backend.constants import OBLIGATORY_METADATA_WARNING -from datadoc.backend.model_validation import ObligatoryDatasetWarning -from datadoc.backend.model_validation import ObligatoryVariableWarning -from datadoc.backend.utils import incorrect_date_order -from datadoc.enums import TemporalityTypeType - -if TYPE_CHECKING: - from datadoc.backend.core import Datadoc - - -@pytest.mark.parametrize( - ("date_from", "date_until", "expected"), - [ - (datetime.date(2024, 1, 1), datetime.date(1960, 1, 1), True), - (datetime.date(1980, 1, 1), datetime.date(2000, 6, 5), False), - (None, None, False), - (datetime.date(2024, 1, 1), 
None, False), - (None, datetime.date(2024, 1, 1), True), - (datetime.date(2024, 1, 1), datetime.date(2024, 1, 1), False), - ], -) -def test_incorrect_date_order(date_from, date_until, expected): - result = incorrect_date_order(date_from, date_until) - assert result == expected - - -@pytest.mark.parametrize( - ("model_type", "date_from", "date_until", "raises_exception"), - [ - ("dataset", datetime.date(2024, 1, 1), datetime.date(1980, 10, 1), True), - ("dataset", datetime.date(1967, 1, 1), datetime.date(1980, 1, 1), False), - ("variable", datetime.date(1999, 10, 5), datetime.date(1925, 3, 12), True), - ("variable", datetime.date(2022, 7, 24), datetime.date(2023, 2, 19), False), - ("dataset", datetime.date(1967, 1, 1), None, False), - ("variable", datetime.date(1999, 2, 2), datetime.date(1999, 2, 2), False), - ], -) -def test_write_metadata_document_validate_date_order( - model_type, - date_from, - date_until, - raises_exception, - metadata: Datadoc, -): - if model_type == "dataset": - metadata.dataset.contains_data_from = date_from - metadata.dataset.contains_data_until = date_until - if model_type == "variable": - for v in metadata.variables: - v.contains_data_from = date_from - v.contains_data_until = date_until - if raises_exception: - with pytest.raises( - ValueError, - match="contains_data_from must be the same or earlier date than contains_data_until", - ): - metadata.write_metadata_document() - else: - try: - metadata.write_metadata_document() - except ValidationError as exc: - pytest.fail(str(exc)) - - -def test_write_metadata_document_created_date( - metadata: Datadoc, -): - metadata.dataset.metadata_created_date = None - metadata.write_metadata_document() - assert metadata.dataset.metadata_created_date is not None - - -@pytest.mark.parametrize( - ("variable_date", "date_from", "date_until"), - [ - (None, datetime.date(1967, 1, 1), datetime.date(1980, 1, 1)), - ( - datetime.date(2022, 2, 2), - datetime.date(1999, 3, 3), - datetime.date(2000, 1, 4), - ), - ], -) -def test_variables_inherit_dates( - variable_date, - date_from, - date_until, - metadata: Datadoc, -): - state.metadata = metadata - metadata.dataset.contains_data_from = date_from - metadata.dataset.contains_data_until = date_until - for v in metadata.variables: - v.contains_data_from = variable_date - v.contains_data_until = variable_date - metadata.write_metadata_document() - for v in metadata.variables: - if variable_date is None: - assert v.contains_data_from == metadata.dataset.contains_data_from - assert v.contains_data_until == metadata.dataset.contains_data_until - else: - assert v.contains_data_from == variable_date - assert v.contains_data_until == variable_date - - -def test_variables_inherit_temporality_type_value(metadata: Datadoc): - assert all(v.temporality_type is None for v in metadata.variables) - metadata.dataset.temporality_type = datadoc_model.model.TemporalityTypeType( - TemporalityTypeType.FIXED.value, - ) - metadata.write_metadata_document() - assert all( - v.temporality_type == metadata.dataset.temporality_type - for v in metadata.variables - ) - - -def test_obligatory_metadata_dataset_warning(metadata: Datadoc): - state.metadata = metadata - with pytest.warns( - ObligatoryDatasetWarning, - match=OBLIGATORY_METADATA_WARNING, - ) as record: - metadata.write_metadata_document() - all_obligatory_completed = 100 - num_warnings = 2 - if metadata.percent_complete != all_obligatory_completed: - assert len(record) == num_warnings - assert issubclass(record[0].category, ObligatoryDatasetWarning) - assert 
OBLIGATORY_METADATA_WARNING in str( - record[0].message, - ) - - -def test_obligatory_metadata_variables_warning(metadata: Datadoc): - state.metadata = metadata - with pytest.warns( - ObligatoryVariableWarning, - match=OBLIGATORY_METADATA_WARNING, - ) as record: - metadata.write_metadata_document() - all_obligatory_completed = 100 - if metadata.percent_complete != all_obligatory_completed and len(record) > 1: - assert issubclass(record[1].category, ObligatoryVariableWarning) - if ( - metadata.variables_lookup["pers_id"] - and metadata.variables_lookup["pers_id"].name is None - ): - assert "[{'pers_id': ['name']}," in str( - record[1].message, - ) - - -def test_obligatory_metadata_dataset_warning_name(metadata: Datadoc): - state.metadata = metadata - metadata.dataset.name = None - with pytest.warns( - ObligatoryDatasetWarning, - match=OBLIGATORY_METADATA_WARNING, - ) as record: - metadata.write_metadata_document() - assert "name" in str( - record[0].message, - ) - # Set value 'name' for first time, a Language object is created - metadata.dataset.name = model.LanguageStringType( - [ - model.LanguageStringTypeItem(languageCode="nb", languageText="Navnet"), - ], - ) - metadata.dataset.description = None - with pytest.warns( - ObligatoryDatasetWarning, - match=OBLIGATORY_METADATA_WARNING, - ) as record2: - metadata.write_metadata_document() - assert "name" not in str(record2[0].message) - - # Remove value for 'name', value for 'name' is no longer 'None', but 'languageText' is None - metadata.dataset.name = model.LanguageStringType( - [ - model.LanguageStringTypeItem(languageCode="nb", languageText=""), - ], - ) - with pytest.warns( - ObligatoryDatasetWarning, - match=OBLIGATORY_METADATA_WARNING, - ) as record3: - metadata.write_metadata_document() - assert "name" in str(record3[0].message) - - -def test_obligatory_metadata_dataset_warning_description(metadata: Datadoc): - """Field name 'description' is a special case because it can match other field names like 'version_description'.""" - state.metadata = metadata - error_message: str - missing_obligatory_dataset = "" - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - metadata.write_metadata_document() - if issubclass(w[0].category, ObligatoryDatasetWarning): - error_message = str(w[0].message) - assert re.search(r"\bdescription\b", error_message) - - # Check that field name is removed from warning when value - metadata.dataset.description = model.LanguageStringType( - [ - model.LanguageStringTypeItem(languageCode="nb", languageText="Beskrivelse"), - ], - ) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - metadata.write_metadata_document() - if issubclass(w[0].category, ObligatoryDatasetWarning): - missing_obligatory_dataset = str(w[0].message) - assert not re.search(r"\bdescription\b", missing_obligatory_dataset) - - -def test_obligatory_metadata_dataset_warning_multiple_languages( - metadata: Datadoc, -): - state.metadata = metadata - missing_obligatory_dataset = "" - - metadata.dataset.description = model.LanguageStringType( - [ - model.LanguageStringTypeItem(languageCode="nb", languageText="Beskrivelse"), - model.LanguageStringTypeItem(languageCode="en", languageText="Description"), - ], - ) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - metadata.write_metadata_document() - if issubclass(w[0].category, ObligatoryDatasetWarning): - missing_obligatory_dataset = str(w[0].message) - assert not re.search(r"\bdescription\b", 
missing_obligatory_dataset) - - # Remove value for one language - metadata.dataset.description = model.LanguageStringType( - [ - model.LanguageStringTypeItem(languageCode="nb", languageText=""), - model.LanguageStringTypeItem(languageCode="en", languageText="Description"), - ], - ) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - metadata.write_metadata_document() - if issubclass(w[0].category, ObligatoryDatasetWarning): - missing_obligatory_dataset = str(w[0].message) - assert not re.search(r"\bdescription\b", missing_obligatory_dataset) - - # Remove value for all languages - metadata.dataset.description = model.LanguageStringType( - [ - model.LanguageStringTypeItem(languageCode="nb", languageText=""), - model.LanguageStringTypeItem(languageCode="en", languageText=""), - ], - ) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - metadata.write_metadata_document() - if issubclass(w[0].category, ObligatoryDatasetWarning): - missing_obligatory_dataset = str(w[0].message) - assert re.search(r"\bdescription\b", missing_obligatory_dataset) - - -def test_obligatory_metadata_variables_warning_name(metadata: Datadoc): - state.metadata = metadata - variable_with_name = "{'pers_id': ['name']}" - with pytest.warns( - ObligatoryVariableWarning, - match=OBLIGATORY_METADATA_WARNING, - ) as record: - metadata.write_metadata_document() - assert metadata.variables_lookup["pers_id"] is not None - assert metadata.variables_lookup["pers_id"].name is None - assert variable_with_name in str(record[1].message) - - metadata.variables_lookup["pers_id"].name = model.LanguageStringType( - [ - model.LanguageStringTypeItem(languageCode="nb", languageText="Navnet"), - ], - ) - with pytest.warns( - ObligatoryVariableWarning, - match=OBLIGATORY_METADATA_WARNING, - ) as record2: - metadata.write_metadata_document() - assert variable_with_name not in str(record2[1].message) - assert "pers_id" not in str(record2[1].message) diff --git a/tests/conftest.py b/tests/conftest.py index 06e48030..33464a5e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,34 +10,34 @@ import shutil from datetime import datetime from datetime import timezone -from pathlib import Path from typing import TYPE_CHECKING import pandas as pd import pytest from bs4 import BeautifulSoup from bs4 import ResultSet -from datadoc_model import model +from dapla_metadata.datasets import Datadoc +from dapla_metadata.datasets import model +from dapla_metadata.datasets.code_list import CodeList +from dapla_metadata.datasets.statistic_subject_mapping import StatisticSubjectMapping +from dapla_metadata.datasets.user_info import TestUserInfo from datadoc import state -from datadoc.backend.code_list import CodeList -from datadoc.backend.core import Datadoc -from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping -from datadoc.backend.user_info import TestUserInfo -from tests.backend.test_statistic_subject_mapping import ( - STATISTICAL_SUBJECT_STRUCTURE_DIR, -) - -from .utils import TEST_DATASETS_DIRECTORY + from .utils import TEST_EXISTING_METADATA_DIRECTORY -from .utils import TEST_EXISTING_METADATA_FILE_NAME -from .utils import TEST_EXISTING_METADATA_NAMING_STANDARD_FILEPATH -from .utils import TEST_NAMING_STANDARD_COMPATIBLE_DATASET from .utils import TEST_PARQUET_FILE_NAME from .utils import TEST_PARQUET_FILEPATH from .utils import TEST_RESOURCES_DIRECTORY +if TYPE_CHECKING: + from pathlib import Path + + from pytest_mock import MockerFixture + + 
+DATADOC_METADATA_MODULE = "dapla_metadata.datasets" CODE_LIST_DIR = "code_list" +STATISTICAL_SUBJECT_STRUCTURE_DIR = "statistical_subject_structure" if TYPE_CHECKING: from pytest_mock import MockerFixture @@ -62,7 +62,7 @@ def dummy_timestamp() -> datetime: @pytest.fixture() def _mock_timestamp(mocker: MockerFixture, dummy_timestamp: datetime) -> None: mocker.patch( - "datadoc.backend.core.get_timestamp_now", + DATADOC_METADATA_MODULE + ".core.get_timestamp_now", return_value=dummy_timestamp, ) @@ -70,7 +70,7 @@ def _mock_timestamp(mocker: MockerFixture, dummy_timestamp: datetime) -> None: @pytest.fixture() def _mock_user_info(mocker: MockerFixture) -> None: mocker.patch( - "datadoc.backend.user_info.get_user_info_for_current_platform", + DATADOC_METADATA_MODULE + ".user_info.get_user_info_for_current_platform", return_value=TestUserInfo(), ) @@ -89,41 +89,11 @@ def metadata( ) -@pytest.fixture() -def metadata_merged( - _mock_timestamp: None, - _mock_user_info: None, - subject_mapping_fake_statistical_structure: StatisticSubjectMapping, - tmp_path: Path, -) -> Datadoc: - target = tmp_path / TEST_NAMING_STANDARD_COMPATIBLE_DATASET - target.parent.mkdir(parents=True, exist_ok=True) - shutil.copy( - TEST_DATASETS_DIRECTORY / TEST_NAMING_STANDARD_COMPATIBLE_DATASET, - target, - ) - return Datadoc( - str(target), - str(TEST_EXISTING_METADATA_NAMING_STANDARD_FILEPATH), - statistic_subject_mapping=subject_mapping_fake_statistical_structure, - ) - - @pytest.fixture() def existing_metadata_path() -> Path: return TEST_EXISTING_METADATA_DIRECTORY -@pytest.fixture() -def existing_metadata_file(tmp_path: Path, existing_metadata_path: Path) -> Path: - # Setup by copying the file into the relevant directory - shutil.copy( - existing_metadata_path / TEST_EXISTING_METADATA_FILE_NAME, - tmp_path / TEST_EXISTING_METADATA_FILE_NAME, - ) - return tmp_path / TEST_EXISTING_METADATA_FILE_NAME - - @pytest.fixture(autouse=True) def _clear_state() -> None: """Global fixture, referred to in pytest.ini.""" @@ -164,19 +134,6 @@ def language_object( ) -@pytest.fixture() -def language_dicts(english_name: str, bokmål_name: str) -> list[dict[str, str]]: - return [ - {"languageCode": "en", "languageText": english_name}, - {"languageCode": "nb", "languageText": bokmål_name}, - ] - - -@pytest.fixture() -def existing_data_path() -> Path: - return TEST_PARQUET_FILEPATH - - @pytest.fixture() def full_dataset_state_path( path_parts_to_insert: str | list[str], @@ -233,24 +190,12 @@ def fake_statistical_structure() -> ResultSet: return BeautifulSoup(f.read(), features="xml").find_all("hovedemne") mocker.patch( - "datadoc.backend.statistic_subject_mapping.StatisticSubjectMapping._fetch_data_from_external_source", + DATADOC_METADATA_MODULE + + ".statistic_subject_mapping.StatisticSubjectMapping._fetch_data_from_external_source", functools.partial(fake_statistical_structure), ) -@pytest.fixture() -def subject_mapping_http_exception( - requests_mock, - exception_to_raise, - thread_pool_executor, -) -> StatisticSubjectMapping: - requests_mock.get( - "http://test.some.url.com", - exc=exception_to_raise, - ) - return StatisticSubjectMapping(thread_pool_executor, "http://test.some.url.com") - - @pytest.fixture() def code_list_csv_filepath_nb() -> pathlib.Path: return TEST_RESOURCES_DIRECTORY / CODE_LIST_DIR / "code_list_nb.csv" @@ -281,7 +226,8 @@ def fake_code_list() -> dict[str, pd.DataFrame]: } mocker.patch( - "datadoc.backend.code_list.CodeList._fetch_data_from_external_source", + DATADOC_METADATA_MODULE + + 
".code_list.CodeList._fetch_data_from_external_source", functools.partial(fake_code_list), ) diff --git a/tests/frontend/callbacks/test_callbacks_utils.py b/tests/frontend/callbacks/test_callbacks_utils.py index 73296f33..36a00220 100644 --- a/tests/frontend/callbacks/test_callbacks_utils.py +++ b/tests/frontend/callbacks/test_callbacks_utils.py @@ -1,6 +1,6 @@ import pytest +from dapla_metadata.datasets import model from dash import html -from datadoc_model import model from datadoc.frontend.callbacks.utils import find_existing_language_string from datadoc.frontend.callbacks.utils import render_tabs diff --git a/tests/frontend/callbacks/test_dataset_callbacks.py b/tests/frontend/callbacks/test_dataset_callbacks.py index 6999d0b3..f02a143e 100644 --- a/tests/frontend/callbacks/test_dataset_callbacks.py +++ b/tests/frontend/callbacks/test_dataset_callbacks.py @@ -11,26 +11,27 @@ import dash import dash_bootstrap_components as dbc import pytest -from datadoc_model import model +from dapla_metadata.datasets import ObligatoryDatasetWarning +from dapla_metadata.datasets import model from datadoc import enums from datadoc import state -from datadoc.backend.model_validation import ObligatoryDatasetWarning from datadoc.frontend.callbacks.dataset import accept_dataset_metadata_date_input from datadoc.frontend.callbacks.dataset import accept_dataset_metadata_input from datadoc.frontend.callbacks.dataset import dataset_control from datadoc.frontend.callbacks.dataset import open_dataset_handling from datadoc.frontend.callbacks.dataset import process_special_cases +from datadoc.frontend.constants import INVALID_DATE_ORDER +from datadoc.frontend.constants import INVALID_VALUE from datadoc.frontend.fields.display_dataset import DISPLAY_DATASET from datadoc.frontend.fields.display_dataset import ( MULTIPLE_LANGUAGE_DATASET_IDENTIFIERS, ) from datadoc.frontend.fields.display_dataset import DatasetIdentifiers -from datadoc.frontend.text import INVALID_DATE_ORDER -from datadoc.frontend.text import INVALID_VALUE if TYPE_CHECKING: - from datadoc.backend.core import Datadoc + from dapla_metadata.datasets import Datadoc + from datadoc.frontend.callbacks.utils import MetadataInputTypes DATASET_CALLBACKS_MODULE = "datadoc.frontend.callbacks.dataset" @@ -73,9 +74,9 @@ def file_path_without_dates(): ( DatasetIdentifiers.NAME, "Dataset name", - enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Dataset name", ), @@ -85,9 +86,9 @@ def file_path_without_dates(): ( DatasetIdentifiers.DESCRIPTION, "Dataset description", - enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Dataset description", ), @@ -102,9 +103,9 @@ def file_path_without_dates(): ( DatasetIdentifiers.POPULATION_DESCRIPTION, "Population description", - enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Population description", ), @@ -115,9 +116,9 @@ def file_path_without_dates(): ( DatasetIdentifiers.VERSION_DESCRIPTION, "Version description", - enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Version description", ), @@ -147,17 +148,17 @@ def file_path_without_dates(): ( DatasetIdentifiers.SPATIAL_COVERAGE_DESCRIPTION, "Spatial coverage description", - 
enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Spatial coverage description", ), - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nn", languageText="Noreg", ), - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="en", languageText="Norway", ), diff --git a/tests/frontend/callbacks/test_variables_callbacks.py b/tests/frontend/callbacks/test_variables_callbacks.py index 36e0a035..3a589e81 100644 --- a/tests/frontend/callbacks/test_variables_callbacks.py +++ b/tests/frontend/callbacks/test_variables_callbacks.py @@ -10,13 +10,12 @@ import arrow import dash_bootstrap_components as dbc import pytest -from datadoc_model import model -from datadoc_model.model import LanguageStringTypeItem +from dapla_metadata.datasets import ObligatoryVariableWarning +from dapla_metadata.datasets import model from pydantic_core import Url from datadoc import enums from datadoc import state -from datadoc.backend.model_validation import ObligatoryVariableWarning from datadoc.frontend.callbacks.variables import accept_variable_metadata_date_input from datadoc.frontend.callbacks.variables import accept_variable_metadata_input from datadoc.frontend.callbacks.variables import populate_variables_workspace @@ -30,16 +29,17 @@ set_variables_values_inherit_dataset_values, ) from datadoc.frontend.callbacks.variables import variables_control +from datadoc.frontend.constants import INVALID_DATE_ORDER +from datadoc.frontend.constants import INVALID_VALUE from datadoc.frontend.fields.display_base import get_metadata_and_stringify from datadoc.frontend.fields.display_base import get_standard_metadata from datadoc.frontend.fields.display_dataset import DatasetIdentifiers from datadoc.frontend.fields.display_variables import DISPLAY_VARIABLES from datadoc.frontend.fields.display_variables import VariableIdentifiers -from datadoc.frontend.text import INVALID_DATE_ORDER -from datadoc.frontend.text import INVALID_VALUE if TYPE_CHECKING: - from datadoc.backend.core import Datadoc + from dapla_metadata.datasets import Datadoc + from datadoc.frontend.callbacks.utils import MetadataInputTypes @@ -49,9 +49,9 @@ ( VariableIdentifiers.NAME, "Variable name", - enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Variable name", ), @@ -86,9 +86,9 @@ ( VariableIdentifiers.POPULATION_DESCRIPTION, "Population description", - enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Population description", ), @@ -98,9 +98,9 @@ ( VariableIdentifiers.COMMENT, "Comment", - enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Comment", ), @@ -130,9 +130,9 @@ ( VariableIdentifiers.INVALID_VALUE_DESCRIPTION, "Invalid value", - enums.LanguageStringType( + model.LanguageStringType( [ - enums.LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", languageText="Invalid value", ), @@ -452,7 +452,7 @@ def test_variables_values_multilanguage_inherit_dataset_values( state.metadata = metadata dataset_population_description = "Personer bosatt i Norge" dataset_population_description_language_item = [ - LanguageStringTypeItem( + model.LanguageStringTypeItem( languageCode="nb", 
languageText="Personer bosatt i Norge", ), @@ -482,7 +482,10 @@ def test_variables_values_multilanguage_can_be_changed_after_inherit_dataset_val state.metadata = metadata dataset_population_description = "Persons in Norway" dataset_population_description_language_item = [ - LanguageStringTypeItem(languageCode="en", languageText="Persons in Norway"), + model.LanguageStringTypeItem( + languageCode="en", + languageText="Persons in Norway", + ), ] dataset_identifier = DatasetIdentifiers.POPULATION_DESCRIPTION variables_identifier = VariableIdentifiers.POPULATION_DESCRIPTION @@ -503,7 +506,10 @@ def test_variables_values_multilanguage_can_be_changed_after_inherit_dataset_val variables_identifier, ) variables_language_item = [ - LanguageStringTypeItem(languageCode="en", languageText="Persons in Sweden"), + model.LanguageStringTypeItem( + languageCode="en", + languageText="Persons in Sweden", + ), ] setattr( state.metadata.variables_lookup["pers_id"], @@ -598,7 +604,7 @@ def test_variables_metadata_control_dont_return_alert(metadata: Datadoc): state.metadata.variables_lookup[val.short_name], VariableIdentifiers.NAME, model.LanguageStringType( - [LanguageStringTypeItem(languageCode="nb", languageText="Test")], + [model.LanguageStringTypeItem(languageCode="nb", languageText="Test")], ), ) setattr( diff --git a/tests/frontend/components/test_build_dataset_edit_section.py b/tests/frontend/components/test_build_dataset_edit_section.py index 69d2fe0b..d980b09e 100644 --- a/tests/frontend/components/test_build_dataset_edit_section.py +++ b/tests/frontend/components/test_build_dataset_edit_section.py @@ -3,8 +3,8 @@ import dash_bootstrap_components as dbc import pytest import ssb_dash_components as ssb # type: ignore[import-untyped] +from dapla_metadata.datasets import model from dash import html -from datadoc_model import model from datadoc.frontend.components.builders import build_dataset_edit_section from datadoc.frontend.fields.display_base import FieldTypes @@ -22,9 +22,12 @@ for m in DISPLAY_DATASET.values() if m.obligatory and m.editable - and m.identifier != DatasetIdentifiers.UNIT_TYPE.value - and m.identifier != DatasetIdentifiers.SUBJECT_FIELD.value - and m.identifier != DatasetIdentifiers.OWNER.value + and m.identifier + not in ( + DatasetIdentifiers.UNIT_TYPE.value, + DatasetIdentifiers.SUBJECT_FIELD.value, + DatasetIdentifiers.OWNER.value, + ) ] INPUT_DATA_BUILD_DATASET_SECTION = [ diff --git a/tests/frontend/components/test_build_edit_section.py b/tests/frontend/components/test_build_edit_section.py index 7b9cbb46..1a4910ea 100644 --- a/tests/frontend/components/test_build_edit_section.py +++ b/tests/frontend/components/test_build_edit_section.py @@ -3,8 +3,8 @@ import dash_bootstrap_components as dbc import pytest import ssb_dash_components as ssb # type: ignore[import-untyped] +from dapla_metadata.datasets import model from dash import html -from datadoc_model import model from datadoc.frontend.components.builders import build_edit_section from datadoc.frontend.fields.display_variables import OBLIGATORY_VARIABLES_METADATA diff --git a/tests/frontend/components/test_build_input_section.py b/tests/frontend/components/test_build_input_section.py index f7474348..1c628c38 100644 --- a/tests/frontend/components/test_build_input_section.py +++ b/tests/frontend/components/test_build_input_section.py @@ -3,7 +3,7 @@ import dash_bootstrap_components as dbc import pytest import ssb_dash_components as ssb # type: ignore[import-untyped] -from datadoc_model import model +from dapla_metadata.datasets 
import model from datadoc.frontend.components.builders import build_input_field_section from datadoc.frontend.fields.display_base import MetadataCheckboxField diff --git a/tests/frontend/fields/test_display_dataset.py b/tests/frontend/fields/test_display_dataset.py index 5a6b8249..6021291e 100644 --- a/tests/frontend/fields/test_display_dataset.py +++ b/tests/frontend/fields/test_display_dataset.py @@ -4,10 +4,8 @@ from datadoc.frontend.fields.display_base import DROPDOWN_DESELECT_OPTION from datadoc.frontend.fields.display_dataset import get_statistical_subject_options from datadoc.frontend.fields.display_dataset import get_unit_type_options -from tests.backend.test_code_list import CODE_LIST_DIR -from tests.backend.test_statistic_subject_mapping import ( - STATISTICAL_SUBJECT_STRUCTURE_DIR, -) +from tests.conftest import CODE_LIST_DIR +from tests.conftest import STATISTICAL_SUBJECT_STRUCTURE_DIR from tests.utils import TEST_RESOURCES_DIRECTORY diff --git a/tests/test_model.py b/tests/test_model.py index eab4df5b..6bcaedd7 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,7 +1,6 @@ """Verify that we are in sync with the Model.""" -from datadoc_model.model import Dataset -from datadoc_model.model import Variable +from dapla_metadata.datasets import model from datadoc.frontend.fields.display_dataset import DISPLAY_DATASET from datadoc.frontend.fields.display_dataset import DatasetIdentifiers @@ -12,7 +11,7 @@ def test_dataset_metadata_definition_parity(): """The metadata fields are currently defined in multiple places for technical reasons. We want these to always be exactly identical.""" datadoc_values = sorted([i.value for i in DatasetIdentifiers]) - model_values = sorted(Dataset().model_dump().keys()) + model_values = sorted(model.Dataset().model_dump().keys()) # TODO @Jorgen-5: Fields that are currently not supported by datadoc # noqa: TD003 model_values.remove("custom_type") @@ -24,7 +23,7 @@ def test_dataset_metadata_definition_parity(): def test_variables_metadata_definition_parity(): """The metadata fields are currently defined in multiple places for technical reasons. We want these to always be exactly identical.""" datadoc_values = sorted([i.value for i in VariableIdentifiers]) - model_values = sorted(Variable().model_dump().keys()) + model_values = sorted(model.Variable().model_dump().keys()) # TODO @Jorgen-5: Fields that are currently not supported by datadoc # noqa: TD003 model_values.remove("custom_type")
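Migration note (illustration only, not part of the patch): the test hunks above all apply the same substitution; names that used to come from datadoc_model and datadoc.backend are now imported from dapla_metadata.datasets, and multilingual values are built via its bundled model module. A minimal sketch of that pattern, using only names already shown in the hunks above; the Norwegian example text is hypothetical.

    from dapla_metadata.datasets import model

    # Multilingual metadata values are constructed from the model module,
    # replacing the old enums.LanguageStringType / datadoc_model imports.
    name = model.LanguageStringType(
        [model.LanguageStringTypeItem(languageCode="nb", languageText="Datasettnavn")],
    )

    # The parity tests compare the editor's field identifiers against the
    # model's own fields, as tests/test_model.py does above.
    dataset_fields = sorted(model.Dataset().model_dump().keys())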