
Fetch unit types items from klass #203

Merged · 12 commits · Mar 5, 2024
210 changes: 118 additions & 92 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -26,10 +26,10 @@ Changelog = "https://github.com/statisticsnorway/datadoc/releases"
python = ">=3.10,<4.0"
pyarrow = ">=8.0.0"
dash = ">=2.15.0"
pydantic = ">2"
pydantic = "==2.5.2"
dash-bootstrap-components = ">=1.1.0"
pandas = ">=1.4.2"
ssb-datadoc-model = "==4.1.2"
ssb-datadoc-model = "==4.2.0"
dapla-toolbelt = ">=1.3.3"
gunicorn = ">=21.2.0"
flask-healthz = ">=0.0.3"
@@ -94,6 +94,7 @@ show_missing = true
fail_under = 80

[tool.mypy]
plugins = ["pydantic.mypy"]
strict = false
warn_unreachable = true
pretty = true
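Pinning pydantic to exactly 2.5.2 and bumping ssb-datadoc-model to 4.2.0 keep the app on the same pydantic-2 schema as the model package, and the new `pydantic.mypy` plugin adds static checking of model construction. A minimal sketch of what the plugin buys (the model below is illustrative, not from this repo):

```python
from pydantic import BaseModel


class ExampleDataset(BaseModel):
    """Illustrative model, not from this repo."""

    short_name: str | None = None
    version: str | None = None


ExampleDataset(short_name="person_data", version="1")  # fine at runtime and for mypy

# With the plugin (and its init_typed setting enabled), mypy reports the
# mismatch below statically; without it, the error only surfaces at runtime
# as a pydantic ValidationError.
# ExampleDataset(short_name=123)
```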
5 changes: 5 additions & 0 deletions src/datadoc/app.py
@@ -16,6 +16,7 @@
from datadoc import state
from datadoc.backend.datadoc_metadata import DataDocMetadata
from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping
from datadoc.backend.unit_types import UnitTypes
from datadoc.enums import SupportedLanguages
from datadoc.frontend.callbacks.register_callbacks import register_callbacks
from datadoc.frontend.callbacks.register_callbacks import (
@@ -138,6 +139,10 @@ def collect_data_from_external_sources() -> None:
config.get_statistical_subject_source_url(),
)

state.unit_types = UnitTypes(
config.get_unit_code(),
)


def main(dataset_path: str | None = None) -> None:
"""Entrypoint when running as a script."""
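At startup the app now builds a `UnitTypes` instance from the Klass classification id in config and parks it in module-level `state`, next to the existing `StatisticSubjectMapping`. A hedged sketch of the fetch-from-Klass pattern this wires up (the real logic lives in `src/datadoc/backend/unit_types.py`; the endpoint path and response field names below are assumptions for illustration):

```python
from datetime import date

import requests

# Public Klass API base URL; the exact path and response shape are assumed here.
KLASS_BASE_URL = "https://data.ssb.no/api/klass/v1/classifications"


def fetch_classification_names(classification_id: int) -> list[str]:
    """Return the code names of a Klass classification (illustrative sketch)."""
    response = requests.get(
        f"{KLASS_BASE_URL}/{classification_id}/codesAt",
        params={"date": date.today().isoformat()},
        timeout=10,
    )
    response.raise_for_status()
    return [item["name"] for item in response.json()["codes"]]
```

Doing the fetch once in `collect_data_from_external_sources` means callbacks can read `state.unit_types` without re-requesting Klass on every interaction.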
12 changes: 6 additions & 6 deletions src/datadoc/backend/dapla_dataset_path_info.py
@@ -13,7 +13,7 @@

import arrow

from datadoc.enums import DatasetState
from datadoc.enums import DataSetState
from datadoc.enums import SupportedLanguages

if TYPE_CHECKING:
@@ -336,7 +336,7 @@ def _extract_period_string_from_index(self, index: int) -> str | None:

def _extract_norwegian_dataset_state_path_part(
self,
dataset_state: DatasetState,
dataset_state: DataSetState,
) -> set:
norwegian_dataset_state_path_part = dataset_state.get_value_for_language(
SupportedLanguages.NORSK_BOKMÅL,
@@ -372,19 +372,19 @@ def contains_data_until(self) -> datetime.date | None:
@property
def dataset_state(
self,
) -> DatasetState | None:
) -> DataSetState | None:
"""Extract the dataset state from the path.

Examples:
>>> DaplaDatasetPathInfo('klargjorte_data/person_data_v1.parquet').dataset_state
<DatasetState.PROCESSED_DATA: 'PROCESSED_DATA'>
<DataSetState.PROCESSED_DATA: 'PROCESSED_DATA'>
>>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state
<DatasetState.OUTPUT_DATA: 'OUTPUT_DATA'>
<DataSetState.OUTPUT_DATA: 'OUTPUT_DATA'>
>>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state
None
"""
dataset_path_parts = set(self.dataset_path.parts)
for s in DatasetState:
for s in DataSetState:
# We assume that files are saved in the Norwegian language as specified by SSB.
norwegian_dataset_state_path_part_variations = (
self._extract_norwegian_dataset_state_path_part(s)
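The rename from `DatasetState` to `DataSetState` follows the enum name in ssb-datadoc-model 4.2.0, and the doctests are updated to match. A usage sketch mirroring those doctests:

```python
from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo
from datadoc.enums import DataSetState

info = DaplaDatasetPathInfo("klargjorte_data/person_data_v1.parquet")
assert info.dataset_state == DataSetState.PROCESSED_DATA

# Paths without a recognised Norwegian state folder yield None.
assert DaplaDatasetPathInfo("my_special_data/person_data_v1.parquet").dataset_state is None
```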
90 changes: 49 additions & 41 deletions src/datadoc/backend/datadoc_metadata.py
@@ -19,9 +19,8 @@
from datadoc.backend.dataset_parser import DatasetParser
from datadoc.backend.model_backwards_compatibility import upgrade_metadata
from datadoc.enums import Assessment
from datadoc.enums import DatasetState
from datadoc.enums import DatasetStatus
from datadoc.enums import VariableRole
from datadoc.enums import DataSetState
from datadoc.enums import DataSetStatus
from datadoc.frontend.fields.display_dataset import (
OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
)
@@ -49,31 +48,28 @@ def __init__(
) -> None:
"""Read in a dataset if supplied, otherwise naively instantiate the class."""
self._statistic_subject_mapping = statistic_subject_mapping
self.dataset_path = dataset_path

self.metadata_document: pathlib.Path | CloudPath | None = None
self.container: model.MetadataContainer | None = None
self.dataset: pathlib.Path | CloudPath | None = None
self.dataset_path: pathlib.Path | CloudPath | None = None
self.short_name: str | None = None
self.meta: model.DatadocJsonSchema = model.DatadocJsonSchema(
percentage_complete=0,
dataset=model.Dataset(),
variables=[],
)
self.dataset = model.Dataset()
self.variables: list = []

self.variables_lookup: dict[str, model.Variable] = {}
if metadata_document_path:
# In this case the user has specified an independent metadata document for editing
# without a dataset.
self.metadata_document = self._open_path(metadata_document_path)
self.extract_metadata_from_existing_document(self.metadata_document)
elif dataset_path:
self.dataset = self._open_path(dataset_path)
self.dataset_path = self._open_path(dataset_path)
# The short_name is set as the dataset filename without file extension
self.short_name = self.dataset.stem
self.short_name = self.dataset_path.stem
# Build the metadata document path based on the dataset path
# Example: /path/to/dataset.parquet -> /path/to/dataset__DOC.json
self.metadata_document = self.dataset.parent / (
self.dataset.stem + METADATA_DOCUMENT_FILE_SUFFIX
self.metadata_document = self.dataset_path.parent / (
self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
)
self.extract_metadata_from_files()

@@ -97,19 +93,19 @@ def extract_metadata_from_files(self) -> None:
"""
if self.metadata_document is not None and self.metadata_document.exists():
self.extract_metadata_from_existing_document(self.metadata_document)
elif self.dataset is not None:
self.extract_metadata_from_dataset(self.dataset)
self.meta.dataset.id = uuid.uuid4()
elif self.dataset_path is not None:
self.extract_metadata_from_dataset(self.dataset_path)
self.dataset.id = uuid.uuid4()
# Set default values for variables where appropriate
v: model.Variable
for v in self.meta.variables:
for v in self.variables:
if v.variable_role is None:
v.variable_role = VariableRole.MEASURE
v.variable_role = model.VariableRole.MEASURE
if v.direct_person_identifying is None:
v.direct_person_identifying = False
if not self.meta.dataset.id:
self.meta.dataset.id = uuid.uuid4()
self.variables_lookup = {v.short_name: v for v in self.meta.variables}
if not self.dataset.id:
self.dataset.id = uuid.uuid4()
self.variables_lookup = {v.short_name: v for v in self.variables}

def extract_metadata_from_existing_document(
self,
@@ -135,9 +131,15 @@ def extract_metadata_from_existing_document(
datadoc_metadata = upgrade_metadata(
datadoc_metadata,
)
self.meta = model.DatadocJsonSchema.model_validate_json(

meta = model.DatadocMetadata.model_validate_json(
json.dumps(datadoc_metadata),
)
if meta.dataset is not None:
self.dataset = meta.dataset
if meta.variables is not None:
self.variables = meta.variables

except json.JSONDecodeError:
logger.warning(
"Could not open existing metadata file %s. \
@@ -173,17 +175,17 @@ def extract_metadata_from_dataset(
dapla_dataset_path_info.statistic_short_name,
)

self.meta.dataset = model.Dataset(
self.dataset = model.Dataset(
short_name=self.short_name,
dataset_state=dapla_dataset_path_info.dataset_state,
dataset_status=DatasetStatus.DRAFT,
dataset_status=DataSetStatus.DRAFT,
assessment=self.get_assessment_by_state(
dapla_dataset_path_info.dataset_state,
),
version=dapla_dataset_path_info.dataset_version,
contains_data_from=str(dapla_dataset_path_info.contains_data_from),
contains_data_until=str(dapla_dataset_path_info.contains_data_until),
data_source_path=self.dataset,
data_source_path=self.dataset_path,
metadata_created_by=user_info.get_user_info_for_current_platform().short_email,
# TODO @mmwinther: Remove multiple_language_support once the model is updated.
# https://github.com/statisticsnorway/ssb-datadoc-model/issues/41
@@ -193,21 +195,21 @@
nn=subject_field,
),
)
self.meta.variables = self.ds_schema.get_fields()
self.variables = self.ds_schema.get_fields()

@staticmethod
def get_assessment_by_state(state: DatasetState | None) -> Assessment | None:
def get_assessment_by_state(state: DataSetState | None) -> Assessment | None:
"""Find assessment derived by dataset state."""
if state is None:
return None
match (state):
case (
DatasetState.INPUT_DATA
| DatasetState.PROCESSED_DATA
| DatasetState.STATISTICS
DataSetState.INPUT_DATA
| DataSetState.PROCESSED_DATA
| DataSetState.STATISTICS
):
return Assessment.PROTECTED
case DatasetState.OUTPUT_DATA:
case DataSetState.OUTPUT_DATA:
return Assessment.OPEN
case _:
return None
@@ -216,17 +218,23 @@ def write_metadata_document(self) -> None:
"""Write all currently known metadata to file."""
timestamp: datetime = get_timestamp_now()

if self.meta.dataset.metadata_created_date is None:
self.meta.dataset.metadata_created_date = timestamp
self.meta.dataset.metadata_last_updated_date = timestamp
self.meta.dataset.metadata_last_updated_by = (
if self.dataset.metadata_created_date is None:
self.dataset.metadata_created_date = timestamp
self.dataset.metadata_last_updated_date = timestamp
self.dataset.metadata_last_updated_by = (
user_info.get_user_info_for_current_platform().short_email
)
self.meta.dataset.file_path = str(self.dataset)
self.dataset.file_path = str(self.dataset_path)

datadoc: model.DatadocMetadata = model.DatadocMetadata(
percentage_complete=self.percent_complete,
dataset=self.dataset,
variables=self.variables,
)
if self.container:
self.container.datadoc = self.meta
self.container.datadoc = datadoc
else:
self.container = model.MetadataContainer(datadoc=self.meta)
self.container = model.MetadataContainer(datadoc=datadoc)
if self.metadata_document:
self.metadata_document.write_text(self.container.model_dump_json(indent=4))
logger.info("Saved metadata document %s", self.metadata_document)
@@ -246,11 +254,11 @@ def percent_complete(self) -> int:
num_set_fields = len(
[
k
for k, v in self.meta.dataset.model_dump().items()
for k, v in self.dataset.model_dump().items()
if k in OBLIGATORY_DATASET_METADATA_IDENTIFIERS and v is not None
],
)
for variable in self.meta.variables:
for variable in self.variables:
num_all_fields += len(OBLIGATORY_VARIABLES_METADATA)
num_set_fields += len(
[
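The net effect in this file is that the `DatadocJsonSchema` wrapper is gone: the class now holds `dataset` and `variables` as plain attributes (so the old `self.dataset` path attribute becomes `self.dataset_path`) and only assembles a `model.DatadocMetadata` at save time. A minimal sketch of the new write path (import path assumed from the ssb-datadoc-model package; field values are illustrative):

```python
from datadoc_model import model  # assumed import path for ssb-datadoc-model

dataset = model.Dataset(short_name="person_data")
variables: list[model.Variable] = []

# Assemble the document only when writing, as write_metadata_document now does.
datadoc = model.DatadocMetadata(
    percentage_complete=0,
    dataset=dataset,
    variables=variables,
)
container = model.MetadataContainer(datadoc=datadoc)
print(container.model_dump_json(indent=4))  # the payload written to *__DOC.json
```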
2 changes: 1 addition & 1 deletion src/datadoc/backend/dataset_parser.py
@@ -209,7 +209,7 @@ def get_fields(self) -> list[Variable]:
# If this is not correct, the user may fix it via the UI
name=LanguageStringType(
**{
state.current_metadata_language: sas_reader.columns[ # type: ignore [attr-defined]
state.current_metadata_language.value: sas_reader.columns[ # type: ignore [attr-defined]
i
].label,
},
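The `.value` fix passes the enum's underlying string, not the enum member itself, as the keyword name when building `LanguageStringType`. A stand-alone illustration with simplified stand-ins (the real `SupportedLanguages` lives in `datadoc.enums`):

```python
from enum import Enum


class Lang(Enum):
    """Simplified stand-in for SupportedLanguages."""

    NORSK_BOKMAAL = "nb"


def language_string_type(**kwargs: str) -> dict[str, str]:
    """Stand-in for LanguageStringType, which takes per-language keywords."""
    return kwargs


label = "Sivilstand"
print(language_string_type(**{Lang.NORSK_BOKMAAL.value: label}))  # {'nb': 'Sivilstand'}
# language_string_type(**{Lang.NORSK_BOKMAAL: label})  # TypeError: keywords must be strings
```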
4 changes: 2 additions & 2 deletions src/datadoc/backend/model_backwards_compatibility.py
@@ -82,7 +82,7 @@ def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
supplied_metadata["dataset"]["data_source"] = LanguageStringType(
en=supplied_metadata["dataset"]["data_source"],
)
supplied_metadata["document_version"] = "2.0.0"
supplied_metadata["document_version"] = "2.1.0"

return supplied_metadata

@@ -118,7 +118,7 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
handler=handle_version_1_0_0,
)
BackwardsCompatibleVersion(
version="2.0.0",
version="2.1.0",
handler=handle_current_version,
)

Expand Down