Merge pull request #206 from statisticsnorway/feat/dpmeta-62-add-klass-codes-to-owner

Changes to owner
JanhSander authored Mar 11, 2024
2 parents 518804e + 83107aa commit 9c547f8
Showing 30 changed files with 435 additions and 175 deletions.
10 changes: 5 additions & 5 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ dash = ">=2.15.0"
 pydantic = "==2.5.2"
 dash-bootstrap-components = ">=1.1.0"
 pandas = ">=1.4.2"
-ssb-datadoc-model = "==4.2.0"
+ssb-datadoc-model = "4.3.2"
 dapla-toolbelt = ">=1.3.3"
 gunicorn = ">=21.2.0"
 flask-healthz = ">=0.0.3"
8 changes: 6 additions & 2 deletions src/datadoc/app.py
@@ -14,9 +14,9 @@

 from datadoc import config
 from datadoc import state
+from datadoc.backend.code_list import CodeList
 from datadoc.backend.datadoc_metadata import DataDocMetadata
 from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping
-from datadoc.backend.unit_types import UnitTypes
 from datadoc.enums import SupportedLanguages
 from datadoc.frontend.callbacks.register_callbacks import register_callbacks
 from datadoc.frontend.components.alerts import dataset_validation_error
@@ -122,10 +122,14 @@ def collect_data_from_external_sources() -> None:
         config.get_statistical_subject_source_url(),
     )

-    state.unit_types = UnitTypes(
+    state.unit_types = CodeList(
         config.get_unit_code(),
     )

+    state.organisational_units = CodeList(
+        config.get_organisational_unit_code(),
+    )
+

 def main(dataset_path: str | None = None) -> None:
     """Entrypoint when running as a script."""
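
For orientation, a minimal sketch (editorial, not part of the diff) of what collect_data_from_external_sources now sets up, assuming the config helpers added in src/datadoc/config.py further down:

    from datadoc import config
    from datadoc import state
    from datadoc.backend.code_list import CodeList

    # Both code lists are fetched from Klass in background threads (see
    # external_sources.py below); the classification ids default to 702 and 83.
    state.unit_types = CodeList(config.get_unit_code())
    state.organisational_units = CodeList(config.get_organisational_unit_code())
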
src/datadoc/backend/{unit_types.py → code_list.py}
@@ -16,37 +16,39 @@


 @dataclass
-class UnitType:
-    """Data structure for the a unit type."""
+class CodeListItem:
+    """Data structure for a code list item."""

     titles: dict[str, str]
-    unit_code: str
+    code: str

     def get_title(self, language: SupportedLanguages) -> str:
         """Get the title in the given language."""
         try:
-            return self.titles[
-                (
-                    # Adjust to language codes in the UnitTypes structure.
-                    "nb"
-                    if language
-                    in [
-                        SupportedLanguages.NORSK_BOKMÅL,
-                        SupportedLanguages.NORSK_NYNORSK,
-                    ]
-                    else "en"
-                )
-            ]
+            return self.titles[language]
         except KeyError:
-            logger.exception(
-                "Could not find title for subject %s and language: %s",
-                self,
-                language.name,
-            )
-            return ""
+            try:
+                return self.titles[
+                    (
+                        "nb"
+                        if language
+                        in [
+                            SupportedLanguages.NORSK_BOKMÅL,
+                            SupportedLanguages.NORSK_NYNORSK,
+                        ]
+                        else "en"
+                    )
+                ]
+            except KeyError:
+                logger.exception(
+                    "Could not find title for subject %s and language: %s",
+                    self,
+                    language.name,
+                )
+                return ""


-class UnitTypes(GetExternalSource):
+class CodeList(GetExternalSource):
     """Class for retrieving classifications from Klass."""

     def __init__(self, classification_id: int | None) -> None:
@@ -58,13 +60,9 @@ def __init__(self, classification_id: int | None) -> None:
             SupportedLanguages.NORSK_BOKMÅL.value,
             SupportedLanguages.ENGLISH.value,
         ]
-
-        self._classifications: list[UnitType] = []
-
+        self._classifications: list[CodeListItem] = []
         self.classification_id = classification_id
-
         self.classifications_dataframes: dict[str, pd.DataFrame] | None = None
-
         super().__init__()

     def _fetch_data_from_external_source(
@@ -85,7 +83,6 @@ def _fetch_data_from_external_source(
                 .get_codes()
                 .data
             )
-
         except Exception:
             logger.exception(
                 "Exception while getting classifications from Klass",
@@ -110,10 +107,10 @@ def _extract_titles(
             list_of_titles.append(titles)
         return list_of_titles

-    def _create_unit_types_from_dataframe(
+    def _create_code_list_from_dataframe(
         self,
         classifications_dataframes: dict[SupportedLanguages, pd.DataFrame],
-    ) -> list[UnitType]:
+    ) -> list[CodeListItem]:
         """Method that finds the name column in the dataframe, and returns all values in a list."""
         classification_names = self._extract_titles(classifications_dataframes)
         classification_codes: list
@@ -128,7 +125,7 @@ def _create_unit_types_from_dataframe(
         unit_types = []
         for a, b in zip(classification_names, classification_codes):
             unit_types.append(
-                UnitType(a, b),
+                CodeListItem(a, b),
             )
         return unit_types

@@ -137,7 +134,7 @@ def _get_classification_dataframe_if_loaded(self) -> bool:
         if not self._classifications:
             self.classifications_dataframes = self.retrieve_external_data()
             if self.classifications_dataframes is not None:
-                self._classifications = self._create_unit_types_from_dataframe(
+                self._classifications = self._create_code_list_from_dataframe(
                     self.classifications_dataframes,
                 )
                 logger.debug(
@@ -151,7 +148,7 @@ def _get_classification_dataframe_if_loaded(self) -> bool:
         return False

     @property
-    def classifications(self) -> list[UnitType]:
+    def classifications(self) -> list[CodeListItem]:
         """Getter for primary subjects."""
         self._get_classification_dataframe_if_loaded()
         logger.debug("Got %s classifications subjects", len(self._classifications))
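
A short usage sketch of the renamed classes (editorial illustration, not from the PR):

    from datadoc.backend.code_list import CodeList
    from datadoc.backend.code_list import CodeListItem
    from datadoc.enums import SupportedLanguages

    # Constructed by hand for illustration; in the app these come from Klass.
    item = CodeListItem(titles={"nb": "Person", "en": "Person"}, code="01")

    # There is no "nn" entry in titles, so the reworked get_title falls back
    # to the "nb" title for Nynorsk.
    assert item.get_title(SupportedLanguages.NORSK_NYNORSK) == "Person"

    # Accessing .classifications triggers the lazy fetch from Klass.
    unit_types = CodeList(classification_id=702)
    for it in unit_types.classifications:
        print(it.code, it.get_title(SupportedLanguages.ENGLISH))

The nested try keeps direct language-key hits cheap while still tolerating code lists that only carry "nb" and "en" titles.
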
23 changes: 2 additions & 21 deletions src/datadoc/backend/datadoc_metadata.py
@@ -8,7 +8,6 @@
 import uuid
 from typing import TYPE_CHECKING

-import pydantic
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
@@ -51,16 +50,13 @@ def __init__(
     ) -> None:
         """Read in a dataset if supplied, otherwise naively instantiate the class."""
         self._statistic_subject_mapping = statistic_subject_mapping
-
         self.metadata_document: pathlib.Path | CloudPath | None = None
         self.container: model.MetadataContainer | None = None
         self.dataset_path: pathlib.Path | CloudPath | None = None
         self.short_name: str | None = None
         self.dataset = model.Dataset()
         self.variables: list = []
-
         self.variables_lookup: dict[str, model.Variable] = {}
-
         if metadata_document_path:
             # In this case the user has specified an independent metadata document for editing
             # without a dataset.
@@ -72,7 +68,6 @@ def __init__(
             self.metadata_document = self.dataset_path.parent / (
                 self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
             )
-
         self.extract_metadata_from_files()

     @staticmethod
@@ -123,35 +118,28 @@ def extract_metadata_from_existing_document(
         try:
             with document.open(mode="r", encoding="utf-8") as file:
                 fresh_metadata = json.load(file)
-            logger.info(
-                "Opened existing metadata file %s",
-                document,
-            )
+            logger.info("Opened existing metadata file %s", document)
             if self.is_metadata_in_container_structure(fresh_metadata):
                 self.container = model.MetadataContainer.model_validate_json(
                     json.dumps(fresh_metadata),
                 )
                 datadoc_metadata = fresh_metadata["datadoc"]
             else:
                 datadoc_metadata = fresh_metadata
-
             if datadoc_metadata is None:
                 # In this case we've read in a file with an empty "datadoc" structure.
                 # A typical example of this is a file produced from a pseudonymization process.
                 return
-
             datadoc_metadata = upgrade_metadata(
                 datadoc_metadata,
             )
-
             meta = model.DatadocMetadata.model_validate_json(
                 json.dumps(datadoc_metadata),
             )
             if meta.dataset is not None:
                 self.dataset = meta.dataset
             if meta.variables is not None:
                 self.variables = meta.variables
-
         except json.JSONDecodeError:
             logger.warning(
                 "Could not open existing metadata file %s. \
@@ -169,14 +157,7 @@ def is_metadata_in_container_structure(
         The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc.
         This method returns True if the metadata is in the container structure, False otherwise.
         """
-        try:
-            model.MetadataContainer.model_validate_json(
-                json.dumps(metadata),
-            )
-        except pydantic.ValidationError:
-            return False
-        else:
-            return True
+        return "datadoc" in metadata

     def extract_metadata_from_dataset(
         self,
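
The container test drops the full pydantic round-trip in favour of a key check. A small illustration (both documents are invented):

    # Hypothetical metadata documents, for illustration only.
    container_doc = {"datadoc": {"document_version": "2.2.0"}, "pseudonymization": None}
    plain_doc = {"document_version": "2.2.0", "dataset": {}}

    # New behaviour: the presence of a top-level "datadoc" key decides the
    # answer, even when other container fields would fail model validation.
    assert "datadoc" in container_doc
    assert "datadoc" not in plain_doc

This is cheaper, no longer misclassifies containers whose inner payload is temporarily invalid, and lets the pydantic import go.
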
1 change: 0 additions & 1 deletion src/datadoc/backend/external_sources/external_sources.py
@@ -21,7 +21,6 @@ def __init__(self) -> None:
         Initializes the future object.
         """
         self.future: concurrent.futures.Future[T | None] | None = None
-
         executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
         self.future = executor.submit(
             self._fetch_data_from_external_source,
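
For context, the pattern around this change (an editorial sketch, not the module's real code): the constructor submits the fetch to a single-worker executor, so subclasses such as CodeList only block when the result is first needed. This assumes a retrieve_external_data helper that resolves the future, as called in code_list.py above:

    import concurrent.futures

    class ExternalSourceSketch:
        """Minimal stand-in for GetExternalSource (illustration only)."""

        def __init__(self) -> None:
            executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            # Kick off the fetch immediately; callers collect the result later.
            self.future = executor.submit(self._fetch_data_from_external_source)

        def _fetch_data_from_external_source(self) -> dict:
            # Real subclasses call the Klass API here.
            return {"codes": []}

        def retrieve_external_data(self) -> dict | None:
            return self.future.result() if self.future else None
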
33 changes: 15 additions & 18 deletions src/datadoc/backend/model_backwards_compatibility.py
@@ -63,12 +63,20 @@ def handle_current_version(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
     return supplied_metadata


+def handle_version_2_1_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
+    """Handle breaking changes for v2.1.0.
+
+    Datatype changed from LanguageStringType to str for owner.
+    """
+    data = supplied_metadata["dataset"]["owner"]
+    supplied_metadata["dataset"]["owner"] = str(data["nb"] or data["nn"] or data["en"])
+    supplied_metadata["document_version"] = "2.2.0"
+    return supplied_metadata
+
+
 def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
     """Handle breaking changes for v1.0.0."""
-    datetime_fields = [
-        ("metadata_created_date"),
-        ("metadata_last_updated_date"),
-    ]
+    datetime_fields = [("metadata_created_date"), ("metadata_last_updated_date")]
     for field in datetime_fields:
         if supplied_metadata["dataset"][field]:
             supplied_metadata["dataset"][field] = datetime.isoformat(
@@ -77,13 +85,11 @@ def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
                 ),
                 timespec="seconds",
             )
-
     if isinstance(supplied_metadata["dataset"]["data_source"], str):
         supplied_metadata["dataset"]["data_source"] = LanguageStringType(
             en=supplied_metadata["dataset"]["data_source"],
         )
     supplied_metadata["document_version"] = "2.1.0"
-
     return supplied_metadata


@@ -102,7 +108,6 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
         supplied_metadata["dataset"][new_key] = supplied_metadata["dataset"].pop(
             old_key,
         )
-
     # Replace empty strings with None, empty strings are not valid for LanguageStrings values
     supplied_metadata["dataset"] = {
         k: None if v == "" else v for k, v in supplied_metadata["dataset"].items()
@@ -113,30 +118,22 @@
 # Register all the supported versions and their handlers.
 # MUST be ordered from oldest to newest.
 BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1)
-BackwardsCompatibleVersion(
-    version="1.0.0",
-    handler=handle_version_1_0_0,
-)
-BackwardsCompatibleVersion(
-    version="2.1.0",
-    handler=handle_current_version,
-)
+BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0)
+BackwardsCompatibleVersion(version="2.1.0", handler=handle_version_2_1_0)
+BackwardsCompatibleVersion(version="2.2.0", handler=handle_current_version)


 def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]:
     """Run the handler for this version to upgrade the document to the latest version."""
     # Special case for current version, we expose the current_model_version parameter for test purposes
     supplied_version = fresh_metadata[VERSION_FIELD_NAME]
     start_running_handlers = False
-
     # Run all the handlers in order from the supplied version onwards
     for k, v in SUPPORTED_VERSIONS.items():
         if k == supplied_version:
             start_running_handlers = True
         if start_running_handlers:
             fresh_metadata = v.handler(fresh_metadata)
-
     if not start_running_handlers:
         raise UnknownModelVersionError(supplied_version)
-
     return fresh_metadata
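
A worked illustration of the new handler (the sample document is invented): an owner stored as a LanguageStringType-style dict under document_version 2.1.0 is flattened to a plain string and the document is re-versioned to 2.2.0, after which upgrade_metadata hands it on to handle_current_version.

    from datadoc.backend.model_backwards_compatibility import handle_version_2_1_0

    doc = {
        "document_version": "2.1.0",
        "dataset": {
            "owner": {"nb": "Seksjon for befolkningsstatistikk", "nn": None, "en": None},
        },
    }
    upgraded = handle_version_2_1_0(doc)
    assert upgraded["dataset"]["owner"] == "Seksjon for befolkningsstatistikk"
    assert upgraded["document_version"] == "2.2.0"
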
5 changes: 5 additions & 0 deletions src/datadoc/config.py
@@ -128,3 +128,8 @@ def get_oidc_token() -> str | None:
 def get_unit_code() -> int | None:
     """The code for the Unit Type code list in Klass."""
     return int(_get_config_item("DATADOC_UNIT_CODE") or 702)
+
+
+def get_organisational_unit_code() -> int | None:
+    """The code for the organisational units code list in Klass."""
+    return int(_get_config_item("DATADOC_ORGANISATIONAL_UNIT_CODE") or 83)
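
Like DATADOC_UNIT_CODE above it, the new id can be overridden through the environment; a sketch, assuming _get_config_item reads environment variables:

    import os

    os.environ["DATADOC_ORGANISATIONAL_UNIT_CODE"] = "123"
    # get_organisational_unit_code() now returns 123; unset, it falls back to 83.
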