Changes to owner #206

Merged
merged 12 commits into from
Mar 11, 2024
10 changes: 5 additions & 5 deletions poetry.lock

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ dash = ">=2.15.0"
pydantic = "==2.5.2"
dash-bootstrap-components = ">=1.1.0"
pandas = ">=1.4.2"
ssb-datadoc-model = "==4.2.0"
ssb-datadoc-model = "4.3.2"
dapla-toolbelt = ">=1.3.3"
gunicorn = ">=21.2.0"
flask-healthz = ">=0.0.3"
8 changes: 6 additions & 2 deletions src/datadoc/app.py
@@ -14,9 +14,9 @@

from datadoc import config
from datadoc import state
from datadoc.backend.code_list import CodeList
from datadoc.backend.datadoc_metadata import DataDocMetadata
from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping
from datadoc.backend.unit_types import UnitTypes
from datadoc.enums import SupportedLanguages
from datadoc.frontend.callbacks.register_callbacks import register_callbacks
from datadoc.frontend.components.alerts import dataset_validation_error
@@ -122,10 +122,14 @@ def collect_data_from_external_sources() -> None:
config.get_statistical_subject_source_url(),
)

state.unit_types = UnitTypes(
state.unit_types = CodeList(
config.get_unit_code(),
)

state.organisational_units = CodeList(
config.get_organisational_unit_code(),
)


def main(dataset_path: str | None = None) -> None:
"""Entrypoint when running as a script."""
@@ -16,37 +16,39 @@


@dataclass
class UnitType:
"""Data structure for the a unit type."""
class CodeListItem:
"""Data structure for a code list item."""

titles: dict[str, str]
unit_code: str
code: str

def get_title(self, language: SupportedLanguages) -> str:
"""Get the title in the given language."""
try:
return self.titles[
(
# Adjust to language codes in the UnitTypes structure.
"nb"
if language
in [
SupportedLanguages.NORSK_BOKMÅL,
SupportedLanguages.NORSK_NYNORSK,
]
else "en"
)
]
return self.titles[language]
except KeyError:
logger.exception(
"Could not find title for subject %s and language: %s",
self,
language.name,
)
return ""
try:
return self.titles[
(
"nb"
if language
in [
SupportedLanguages.NORSK_BOKMÅL,
SupportedLanguages.NORSK_NYNORSK,
]
else "en"
)
]
except KeyError:
logger.exception(
"Could not find title for subject %s and language: %s",
self,
language.name,
)
return ""


class UnitTypes(GetExternalSource):
class CodeList(GetExternalSource):
"""Class for retrieving classifications from Klass."""

def __init__(self, classification_id: int | None) -> None:
@@ -58,13 +60,9 @@ def __init__(self, classification_id: int | None) -> None:
SupportedLanguages.NORSK_BOKMÅL.value,
SupportedLanguages.ENGLISH.value,
]

self._classifications: list[UnitType] = []

self._classifications: list[CodeListItem] = []
self.classification_id = classification_id

self.classifications_dataframes: dict[str, pd.DataFrame] | None = None

super().__init__()

def _fetch_data_from_external_source(
@@ -85,7 +83,6 @@ def _fetch_data_from_external_source(
.get_codes()
.data
)

except Exception:
logger.exception(
"Exception while getting classifications from Klass",
@@ -110,10 +107,10 @@ def _extract_titles(
list_of_titles.append(titles)
return list_of_titles

def _create_unit_types_from_dataframe(
def _create_code_list_from_dataframe(
self,
classifications_dataframes: dict[SupportedLanguages, pd.DataFrame],
) -> list[UnitType]:
) -> list[CodeListItem]:
"""Method that finds the name column in the dataframe, and returns all values in a list."""
classification_names = self._extract_titles(classifications_dataframes)
classification_codes: list
@@ -128,7 +125,7 @@ def _create_unit_types_from_dataframe(
unit_types = []
for a, b in zip(classification_names, classification_codes):
unit_types.append(
UnitType(a, b),
CodeListItem(a, b),
)
return unit_types

@@ -137,7 +134,7 @@ def _get_classification_dataframe_if_loaded(self) -> bool:
if not self._classifications:
self.classifications_dataframes = self.retrieve_external_data()
if self.classifications_dataframes is not None:
self._classifications = self._create_unit_types_from_dataframe(
self._classifications = self._create_code_list_from_dataframe(
self.classifications_dataframes,
)
logger.debug(
@@ -151,7 +148,7 @@ def classifications(self) -> list[UnitType]:
return False

@property
def classifications(self) -> list[UnitType]:
def classifications(self) -> list[CodeListItem]:
"""Getter for primary subjects."""
self._get_classification_dataframe_if_loaded()
logger.debug("Got %s classifications subjects", len(self._classifications))
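
For context on the rename above: CodeListItem.get_title keeps a language fallback where both Norwegian variants resolve to the "nb" titles and everything else to "en", and the diff suggests the new version first tries an exact language key before that fallback. The snippet below is a minimal standalone sketch of that idea, not the project's code; language handling is reduced to plain strings and the example values are made up.

from dataclasses import dataclass

@dataclass
class CodeListItem:
    """Standalone stand-in for datadoc's CodeListItem: titles keyed by language code."""

    titles: dict[str, str]
    code: str

    def get_title(self, language: str) -> str:
        try:
            # Exact match on the requested language code first.
            return self.titles[language]
        except KeyError:
            # Fallback: both Norwegian variants share the "nb" titles, everything else uses "en".
            key = "nb" if language in ("nb", "nn") else "en"
            return self.titles.get(key, "")

item = CodeListItem({"nb": "Organisasjonsledd", "en": "Organisational unit"}, "83")
print(item.get_title("nn"))  # Organisasjonsledd (nynorsk falls back to bokmål)
print(item.get_title("en"))  # Organisational unit
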
23 changes: 2 additions & 21 deletions src/datadoc/backend/datadoc_metadata.py
@@ -8,7 +8,6 @@
import uuid
from typing import TYPE_CHECKING

import pydantic
from cloudpathlib import CloudPath
from cloudpathlib import GSClient
from cloudpathlib import GSPath
@@ -51,16 +50,13 @@ def __init__(
) -> None:
"""Read in a dataset if supplied, otherwise naively instantiate the class."""
self._statistic_subject_mapping = statistic_subject_mapping

self.metadata_document: pathlib.Path | CloudPath | None = None
self.container: model.MetadataContainer | None = None
self.dataset_path: pathlib.Path | CloudPath | None = None
self.short_name: str | None = None
self.dataset = model.Dataset()
self.variables: list = []

self.variables_lookup: dict[str, model.Variable] = {}

if metadata_document_path:
# In this case the user has specified an independent metadata document for editing
# without a dataset.
@@ -72,7 +68,6 @@ def __init__(
self.metadata_document = self.dataset_path.parent / (
self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
)

self.extract_metadata_from_files()

@staticmethod
@@ -123,35 +118,28 @@ def extract_metadata_from_existing_document(
try:
with document.open(mode="r", encoding="utf-8") as file:
fresh_metadata = json.load(file)
logger.info(
"Opened existing metadata file %s",
document,
)
logger.info("Opened existing metadata file %s", document)
if self.is_metadata_in_container_structure(fresh_metadata):
self.container = model.MetadataContainer.model_validate_json(
json.dumps(fresh_metadata),
)
datadoc_metadata = fresh_metadata["datadoc"]
else:
datadoc_metadata = fresh_metadata

if datadoc_metadata is None:
# In this case we've read in a file with an empty "datadoc" structure.
# A typical example of this is a file produced from a pseudonymization process.
return

datadoc_metadata = upgrade_metadata(
datadoc_metadata,
)

meta = model.DatadocMetadata.model_validate_json(
json.dumps(datadoc_metadata),
)
if meta.dataset is not None:
self.dataset = meta.dataset
if meta.variables is not None:
self.variables = meta.variables

except json.JSONDecodeError:
logger.warning(
"Could not open existing metadata file %s. \
@@ -169,14 +157,7 @@ def is_metadata_in_container_structure(
The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc.
This method returns True if the metadata is in the container structure, False otherwise.
"""
try:
model.MetadataContainer.model_validate_json(
json.dumps(metadata),
)
except pydantic.ValidationError:
return False
else:
return True
return "datadoc" in metadata

def extract_metadata_from_dataset(
self,
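
The container check above is reduced from a full pydantic validation to a key lookup. A rough standalone illustration of the new behaviour, with made-up documents, is:

def is_metadata_in_container_structure(metadata: dict) -> bool:
    # The container format nests per-type sections ("datadoc", "pseudonymization", ...)
    # under top-level keys, so the presence of "datadoc" is enough to detect it.
    return "datadoc" in metadata

container_doc = {"document_version": "0.0.1", "datadoc": {"dataset": {}}, "pseudonymization": None}
flat_doc = {"document_version": "2.2.0", "dataset": {}, "variables": []}
print(is_metadata_in_container_structure(container_doc))  # True
print(is_metadata_in_container_structure(flat_doc))       # False
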
1 change: 0 additions & 1 deletion src/datadoc/backend/external_sources/external_sources.py
@@ -21,7 +21,6 @@ def __init__(self) -> None:
Initializes the future object.
"""
self.future: concurrent.futures.Future[T | None] | None = None

executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
self.future = executor.submit(
self._fetch_data_from_external_source,
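
The change above only drops a blank line, but for readers unfamiliar with GetExternalSource, a simplified sketch of the pattern it implements (start the fetch on a single worker thread at construction, read the result later via the Future) is shown below; the example fetcher and its return value are invented.

import concurrent.futures
import time

class GetExternalSource:
    def __init__(self) -> None:
        # Kick off the (potentially slow) fetch immediately on a single worker thread.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        self.future = executor.submit(self._fetch_data_from_external_source)

    def _fetch_data_from_external_source(self) -> str:
        time.sleep(0.1)  # stand-in for the real call to an external service
        return "payload"

    def retrieve_external_data(self) -> str | None:
        # Block until the background fetch finishes and hand back its result.
        return self.future.result() if self.future else None

print(GetExternalSource().retrieve_external_data())  # payload
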
33 changes: 15 additions & 18 deletions src/datadoc/backend/model_backwards_compatibility.py
@@ -63,12 +63,20 @@ def handle_current_version(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
return supplied_metadata


def handle_version_2_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
"""Handle breaking changes for v2.1.0.

Datatype changed from LanguageStringType to str for owner
"""
data = supplied_metadata["dataset"]["owner"]
supplied_metadata["dataset"]["owner"] = str(data["nb"] or data["nn"] or data["en"])
supplied_metadata["document_version"] = "2.2.0"
return supplied_metadata


def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
"""Handle breaking changes for v1.0.0."""
datetime_fields = [
("metadata_created_date"),
("metadata_last_updated_date"),
]
datetime_fields = [("metadata_created_date"), ("metadata_last_updated_date")]
for field in datetime_fields:
if supplied_metadata["dataset"][field]:
supplied_metadata["dataset"][field] = datetime.isoformat(
@@ -77,13 +85,11 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
),
timespec="seconds",
)

if isinstance(supplied_metadata["dataset"]["data_source"], str):
supplied_metadata["dataset"]["data_source"] = LanguageStringType(
en=supplied_metadata["dataset"]["data_source"],
)
supplied_metadata["document_version"] = "2.1.0"

return supplied_metadata


@@ -102,7 +108,6 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
supplied_metadata["dataset"][new_key] = supplied_metadata["dataset"].pop(
old_key,
)

# Replace empty strings with None, empty strings are not valid for LanguageStrings values
supplied_metadata["dataset"] = {
k: None if v == "" else v for k, v in supplied_metadata["dataset"].items()
@@ -113,30 +118,22 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
# Register all the supported versions and their handlers.
# MUST be ordered from oldest to newest.
BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1)
BackwardsCompatibleVersion(
version="1.0.0",
handler=handle_version_1_0_0,
)
BackwardsCompatibleVersion(
version="2.1.0",
handler=handle_current_version,
)
BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0)
BackwardsCompatibleVersion(version="2.1.0", handler=handle_version_2_1_0)
BackwardsCompatibleVersion(version="2.2.0", handler=handle_current_version)


def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]:
"""Run the handler for this version to upgrade the document to the latest version."""
# Special case for current version, we expose the current_model_version parameter for test purposes
supplied_version = fresh_metadata[VERSION_FIELD_NAME]
start_running_handlers = False

# Run all the handlers in order from the supplied version onwards
for k, v in SUPPORTED_VERSIONS.items():
if k == supplied_version:
start_running_handlers = True
if start_running_handlers:
fresh_metadata = v.handler(fresh_metadata)

if not start_running_handlers:
raise UnknownModelVersionError(supplied_version)

return fresh_metadata
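
The new handle_version_2_1_0 step is the heart of this PR: the multilingual owner object is collapsed to a plain string, preferring bokmål, then nynorsk, then English, and the document version is bumped to 2.2.0. A worked example of that transformation (field values invented) is:

supplied_metadata = {
    "document_version": "2.1.0",
    "dataset": {"owner": {"nb": "Seksjon for befolkningsstatistikk", "nn": None, "en": None}},
}

# Same logic as handle_version_2_1_0 above.
data = supplied_metadata["dataset"]["owner"]
supplied_metadata["dataset"]["owner"] = str(data["nb"] or data["nn"] or data["en"])
supplied_metadata["document_version"] = "2.2.0"

print(supplied_metadata["dataset"]["owner"])   # Seksjon for befolkningsstatistikk
print(supplied_metadata["document_version"])   # 2.2.0
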
5 changes: 5 additions & 0 deletions src/datadoc/config.py
@@ -128,3 +128,8 @@ def get_oidc_token() -> str | None:
def get_unit_code() -> int | None:
"""The code for the Unit Type code list in Klass."""
return int(_get_config_item("DATADOC_UNIT_CODE") or 702)


def get_organisational_unit_code() -> int | None:
"""The code for the organisational units code list in Klass."""
return int(_get_config_item("DATADOC_ORGANISATIONAL_UNIT_CODE") or 83)
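
A small usage sketch of the new accessor, approximating _get_config_item with a plain environment lookup (the real helper may do more):

import os

def get_organisational_unit_code() -> int | None:
    """The code for the organisational units code list in Klass (default 83)."""
    return int(os.environ.get("DATADOC_ORGANISATIONAL_UNIT_CODE") or 83)

print(get_organisational_unit_code())  # 83 when the variable is unset
os.environ["DATADOC_ORGANISATIONAL_UNIT_CODE"] = "123"
print(get_organisational_unit_code())  # 123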