Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use ssb-datadoc-model v4.1.2 #120

Merged
merged 12 commits into from
Jan 10, 2024
2 changes: 1 addition & 1 deletion datadoc/app.py
Original file line number Diff line number Diff line change
@@ -10,11 +10,11 @@

import dash_bootstrap_components as dbc
from dash import Dash
from datadoc_model.Enums import SupportedLanguages
from flask_healthz import healthz

from datadoc import state
from datadoc.backend.datadoc_metadata import DataDocMetadata
from datadoc.enums import SupportedLanguages
from datadoc.frontend.callbacks.register_callbacks import register_callbacks
from datadoc.frontend.components.alerts import (
dataset_validation_error,
193 changes: 104 additions & 89 deletions datadoc/backend/datadoc_metadata.py
Original file line number Diff line number Diff line change
@@ -9,12 +9,12 @@
import uuid
from typing import TYPE_CHECKING

from datadoc_model import Model
from datadoc_model.Enums import DatasetState, SupportedLanguages
from datadoc_model import model

from datadoc.backend.dataset_parser import DatasetParser
from datadoc.backend.model_backwards_compatibility import upgrade_metadata
from datadoc.backend.storage_adapter import StorageAdapter
from datadoc.enums import DatasetState, SupportedLanguages, VariableRole
from datadoc.frontend.fields import display_dataset, display_variables
from datadoc.utils import calculate_percentage, get_timestamp_now

@@ -23,34 +23,6 @@

logger = logging.getLogger(__name__)

OBLIGATORY_DATASET_METADATA = [
m.identifier
for m in display_dataset.DISPLAY_DATASET.values()
if m.obligatory and m.editable
]

OBLIGATORY_VARIABLES_METADATA = [
m.identifier
for m in display_variables.DISPLAY_VARIABLES.values()
if m.obligatory and m.editable
]

# These don't vary at runtime so we calculate them as constants here
NUM_OBLIGATORY_DATASET_FIELDS = len(
[
k
for k in Model.DataDocDataSet().model_dump()
if k in OBLIGATORY_DATASET_METADATA
],
)
NUM_OBLIGATORY_VARIABLES_FIELDS = len(
[
k
for k in Model.DataDocVariable().model_dump()
if k in OBLIGATORY_VARIABLES_METADATA
],
)

METADATA_DOCUMENT_FILE_SUFFIX = "__DOC.json"

PLACEHOLDER_USERNAME = "default_user@ssb.no"
@@ -61,21 +33,46 @@ class DataDocMetadata:

def __init__(
self: t.Self @ DataDocMetadata,
dataset: str | None,
dataset_path: str | os.PathLike | None = None,
metadata_document_path: str | os.PathLike | None = None,
) -> None:
"""Read in a dataset if supplied, otherwise naively instantiate the class."""
self.dataset: str = dataset
if self.dataset:
self.dataset: str = dataset_path
self.metadata_document: StorageAdapter | None = None
self.container: model.MetadataContainer | None = None

self.dataset_state: DatasetState | None = None
self.short_name: str | None = None
self.current_user: str | None = None
self.meta: model.DatadocJsonSchema = model.DatadocJsonSchema(
percentage_complete=0,
dataset=model.Dataset(),
variables=[],
)

self.variables_lookup: dict[str, model.Variable] = {}

if metadata_document_path:
# In this case the user has specified an independent metadata document for editing
# without a dataset.
self.metadata_document = StorageAdapter.for_path(metadata_document_path)
self.extract_metadata_from_existing_document()

elif self.dataset:
# The short_name is set as the dataset filename without file extension
self.short_name: str = pathlib.Path(
self.dataset,
).stem # filename without file ending
).stem
self.metadata_document: StorageAdapter = StorageAdapter.for_path(
StorageAdapter.for_path(self.dataset).parent(),
)
self.metadata_document.joinpath(
self.short_name + METADATA_DOCUMENT_FILE_SUFFIX,
)
self.dataset_state: DatasetState = self.get_dataset_state(self.dataset)

self.extract_metadata_from_files()

try:
self.current_user = os.environ["JUPYTERHUB_USER"]
except KeyError:
@@ -85,18 +82,6 @@ def __init__(
self.current_user,
)

self.meta: Model.MetadataDocument = Model.MetadataDocument(
percentage_complete=0,
document_version=Model.MODEL_VERSION,
dataset=Model.DataDocDataSet(),
variables=[],
)

self.variables_lookup: dict[str, Model.DataDocVariable] = {}

if self.dataset:
self.extract_metadata_from_files()

def get_dataset_state(
self: t.Self @ DataDocMetadata,
dataset: str,
@@ -143,54 +128,77 @@ def get_dataset_version(
return None

def extract_metadata_from_files(self: t.Self @ DataDocMetadata) -> None:
"""Read metadata from a dataset.
"""Read metadata from an existing metadata document.

If a metadata document already exists, read in the metadata from that instead.
If no metadata document exists, create one from scratch by extracting metadata
from the dataset file.
"""
fresh_metadata = {}
if self.metadata_document.exists():
try:
with self.metadata_document.open(mode="r", encoding="utf-8") as file:
fresh_metadata = json.load(file)
logger.info(
"Opened existing metadata file %s",
self.metadata_document.location,
)

fresh_metadata = upgrade_metadata(fresh_metadata, Model.MODEL_VERSION)

variables_list = fresh_metadata.pop("variables", None)

self.meta.variables = [
Model.DataDocVariable(**v) for v in variables_list
]
self.meta.dataset = Model.DataDocDataSet(
**fresh_metadata.pop("dataset", None),
)
except json.JSONDecodeError:
logger.warning(
"Could not open existing metadata file %s. \
Falling back to collecting data from the dataset",
self.metadata_document.location,
exc_info=True,
)
self.extract_metadata_from_dataset()
self.extract_metadata_from_existing_document()
else:
self.extract_metadata_from_dataset()

if self.meta.dataset.id is None:
self.meta.dataset.id = uuid.uuid4()

# Set default values for variables where appropriate
v: Model.DataDocVariable
for v in self.meta.variables:
if v.variable_role is None:
v.variable_role = Model.Enums.VariableRole.MEASURE
if v.direct_person_identifying is None:
v.direct_person_identifying = False
# Set default values for variables where appropriate
v: model.Variable
for v in self.meta.variables:
if v.variable_role is None:
v.variable_role = VariableRole.MEASURE
if v.direct_person_identifying is None:
v.direct_person_identifying = False

if not self.meta.dataset.id:
self.meta.dataset.id = uuid.uuid4()

self.variables_lookup = {v.short_name: v for v in self.meta.variables}

def extract_metadata_from_existing_document(self: t.Self @ DataDocMetadata) -> None:
    """Read metadata from the document referenced by ``self.metadata_document``.

    The document is parsed as JSON. If it is in the container structure (as
    decided by ``is_metadata_in_container_structure``), the whole document is
    validated into ``self.container`` and its ``"datadoc"`` entry is taken as
    the datadoc metadata; otherwise the document itself is taken as the
    datadoc metadata. The datadoc metadata is then passed through
    ``upgrade_metadata`` and validated into ``self.meta``.

    If a ``json.JSONDecodeError`` is raised, a warning is logged and the
    method returns without loading anything. No fallback to the dataset is
    performed by this method.
    """
    try:
        with self.metadata_document.open(mode="r", encoding="utf-8") as file:
            fresh_metadata = json.load(file)
        logger.info(
            "Opened existing metadata file %s",
            self.metadata_document.location,
        )

        if self.is_metadata_in_container_structure(fresh_metadata):
            # Keep the full container so the other metadata types it may
            # carry are retained alongside the datadoc part.
            self.container = model.MetadataContainer.model_validate_json(
                json.dumps(fresh_metadata),
            )
            datadoc_metadata = fresh_metadata["datadoc"]
        else:
            datadoc_metadata = fresh_metadata

        datadoc_metadata = upgrade_metadata(
            datadoc_metadata,
        )

        self.meta = model.DatadocJsonSchema.model_validate_json(
            json.dumps(datadoc_metadata),
        )

    except json.JSONDecodeError:
        # Adjacent string literals are used instead of a backslash
        # continuation so no source indentation leaks into the message.
        logger.warning(
            "Could not parse existing metadata file %s as JSON. "
            "No metadata was loaded from it",
            self.metadata_document.location,
            exc_info=True,
        )

def is_metadata_in_container_structure(
    self: t.Self @ DataDocMetadata,
    metadata: dict,
) -> bool:
    """Report whether ``metadata`` uses the metadata 'container' structure.

    At a certain point a metadata 'container' was introduced. The container
    provides a structure for different 'types' of metadata, such as
    'datadoc', 'pseudonymization' etc.

    Args:
        metadata: The parsed metadata document.

    Returns:
        True if ``metadata`` has a ``"datadoc"`` entry that is itself a dict
        containing a ``"dataset"`` key, False otherwise. A missing, ``None``
        or non-dict ``"datadoc"`` entry yields False rather than raising.
    """
    # Guard the lookups so a null or scalar "datadoc" value is treated as
    # "not a container" instead of raising TypeError or substring-matching.
    datadoc = metadata.get("datadoc") if isinstance(metadata, dict) else None
    return isinstance(datadoc, dict) and "dataset" in datadoc

def extract_metadata_from_dataset(self: t.Self @ DataDocMetadata) -> None:
"""Obtain what metadata we can from the dataset itself.

@@ -199,7 +207,7 @@ def extract_metadata_from_dataset(self: t.Self @ DataDocMetadata) -> None:
"""
self.ds_schema = DatasetParser.for_file(self.dataset)

self.meta.dataset = Model.DataDocDataSet(
self.meta.dataset = model.Dataset(
short_name=self.short_name,
dataset_state=self.dataset_state,
version=self.get_dataset_version(self.short_name),
@@ -218,7 +226,13 @@ def write_metadata_document(self: t.Self @ DataDocMetadata) -> None:
self.meta.dataset.metadata_created_by = self.current_user
self.meta.dataset.metadata_last_updated_date = timestamp
self.meta.dataset.metadata_last_updated_by = self.current_user
self.metadata_document.write_text(self.meta.model_dump_json(indent=4))

if self.container:
self.container.datadoc = self.meta
else:
self.container = model.MetadataContainer(datadoc=self.meta)

self.metadata_document.write_text(self.container.model_dump_json(indent=4))
logger.info("Saved metadata document %s", self.metadata_document.location)

@property
@@ -229,22 +243,23 @@ def percent_complete(self: t.Self @ DataDocMetadata) -> int:
assigned. Used for a live progress bar in the UI, as well as being
saved in the datadoc as a simple quality indicator.
"""
num_all_fields = NUM_OBLIGATORY_DATASET_FIELDS
num_all_fields = len(display_dataset.OBLIGATORY_DATASET_METADATA)
num_set_fields = len(
[
k
for k, v in self.meta.dataset.model_dump().items()
if k in OBLIGATORY_DATASET_METADATA and v is not None
if k in display_dataset.OBLIGATORY_DATASET_METADATA and v is not None
],
)

for variable in self.meta.variables:
num_all_fields += NUM_OBLIGATORY_VARIABLES_FIELDS
num_all_fields += len(display_variables.OBLIGATORY_VARIABLES_METADATA)
num_set_fields += len(
[
k
for k, v in variable.model_dump().items()
if k in OBLIGATORY_VARIABLES_METADATA and v is not None
if k in display_variables.OBLIGATORY_VARIABLES_METADATA
and v is not None
],
)

33 changes: 16 additions & 17 deletions datadoc/backend/dataset_parser.py
Original file line number Diff line number Diff line change
@@ -12,12 +12,11 @@

import pandas as pd
import pyarrow.parquet as pq
from datadoc_model.Enums import Datatype
from datadoc_model.LanguageStrings import LanguageStrings
from datadoc_model.Model import DataDocVariable
from datadoc_model.model import LanguageStringType, Variable

from datadoc import state
from datadoc.backend.storage_adapter import StorageAdapter
from datadoc.enums import DataType

TDatasetParser = t.TypeVar("TDatasetParser", bound="DatasetParser")

@@ -76,14 +75,14 @@
KNOWN_BOOLEAN_TYPES = ("bool", "bool_", "boolean")


TYPE_CORRESPONDENCE: list[tuple[list[str], Datatype]] = [
(KNOWN_INTEGER_TYPES, Datatype.INTEGER),
(KNOWN_FLOAT_TYPES, Datatype.FLOAT),
(KNOWN_STRING_TYPES, Datatype.STRING),
(KNOWN_DATETIME_TYPES, Datatype.DATETIME),
(KNOWN_BOOLEAN_TYPES, Datatype.BOOLEAN),
TYPE_CORRESPONDENCE: list[tuple[list[str], DataType]] = [
(KNOWN_INTEGER_TYPES, DataType.INTEGER),
(KNOWN_FLOAT_TYPES, DataType.FLOAT),
(KNOWN_STRING_TYPES, DataType.STRING),
(KNOWN_DATETIME_TYPES, DataType.DATETIME),
(KNOWN_BOOLEAN_TYPES, DataType.BOOLEAN),
]
TYPE_MAP: dict[str:Datatype] = {}
# Flat lookup from each concrete type name listed in TYPE_CORRESPONDENCE to
# its abstract DataType. The annotation uses a comma-separated key/value pair;
# `dict[str:DataType]` is a slice subscript that type checkers reject.
TYPE_MAP: dict[str, DataType] = {}
for concrete_type, abstract_type in TYPE_CORRESPONDENCE:
    TYPE_MAP.update(dict.fromkeys(concrete_type, abstract_type))

@@ -135,7 +134,7 @@ def for_file(dataset: str) -> TDatasetParser:
return reader

@staticmethod
def transform_data_type(data_type: str) -> Datatype | None:
def transform_data_type(data_type: str) -> DataType | None:
"""Transform a concrete data type to an abstract data type.

In statistical metadata, one is not interested in how the data is
@@ -149,7 +148,7 @@ def transform_data_type(data_type: str) -> Datatype | None:
return TYPE_MAP.get(data_type.lower(), None)

@abstractmethod
def get_fields(self: t.Self @ DatasetParser) -> list[DataDocVariable]:
def get_fields(self: t.Self @ DatasetParser) -> list[Variable]:
"""Abstract method, must be implemented by subclasses."""


@@ -160,12 +159,12 @@ def __init__(self: t.Self @ DatasetParserParquet, dataset: str) -> None:
"""Use the super init method."""
super().__init__(dataset)

def get_fields(self: t.Self @ DatasetParserParquet) -> list[DataDocVariable]:
def get_fields(self: t.Self @ DatasetParserParquet) -> list[Variable]:
"""Extract the fields from this dataset."""
with self.dataset.open(mode="rb") as f:
data_table = pq.read_table(f)
return [
DataDocVariable(
Variable(
short_name=data_field.name,
data_type=self.transform_data_type(str(data_field.type)),
)
@@ -180,7 +179,7 @@ def __init__(self: t.Self @ DatasetParserSas7Bdat, dataset: str) -> None:
"""Use the super init method."""
super().__init__(dataset)

def get_fields(self: t.Self @ DatasetParserSas7Bdat) -> list[DataDocVariable]:
def get_fields(self: t.Self @ DatasetParserSas7Bdat) -> list[Variable]:
"""Extract the fields from this dataset."""
fields = []
with self.dataset.open(mode="rb") as f:
@@ -197,11 +196,11 @@ def get_fields(self: t.Self @ DatasetParserSas7Bdat) -> list[DataDocVariable]:
# Get all the values from the row and loop through them
for i, v in enumerate(row.to_numpy().tolist()[0]):
fields.append(
DataDocVariable(
Variable(
short_name=sas_reader.columns[i].name,
# Assume labels are defined in the default language (NORSK_BOKMÅL)
# If this is not correct, the user may fix it via the UI
name=LanguageStrings(
name=LanguageStringType(
**{
state.current_metadata_language: sas_reader.columns[
i
Loading