Fetch unit types items from klass (#203)
* Added a dataclass to unit types as a data structure

* Updated to the new datadoc model and changed tests to be compatible with it

* Fixed test errors and type errors

* Fixed pre-commit errors

* Fixed unit types pre-commit errors

* Fixed final pre-commit errors

* Fixed an error causing the app to stop working after loading a dataset, and changed the parsing of unit types dataframes

* Fixed unit_types tests

* Removed prints

---------

Co-authored-by: rlj <[email protected]>
Jorgen-5 and rlj authored Mar 5, 2024
1 parent 4b75527 commit 14d515f
Showing 36 changed files with 621 additions and 448 deletions.
210 changes: 118 additions & 92 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -26,10 +26,10 @@ Changelog = "https://github.com/statisticsnorway/datadoc/releases"
 python = ">=3.10,<4.0"
 pyarrow = ">=8.0.0"
 dash = ">=2.15.0"
-pydantic = ">2"
+pydantic = "==2.5.2"
 dash-bootstrap-components = ">=1.1.0"
 pandas = ">=1.4.2"
-ssb-datadoc-model = "==4.1.2"
+ssb-datadoc-model = "==4.2.0"
 dapla-toolbelt = ">=1.3.3"
 gunicorn = ">=21.2.0"
 flask-healthz = ">=0.0.3"
@@ -94,6 +94,7 @@ show_missing = true
 fail_under = 80

 [tool.mypy]
+plugins = ["pydantic.mypy"]
 strict = false
 warn_unreachable = true
 pretty = true
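A note on the [tool.mypy] addition above: the pydantic mypy plugin teaches mypy about the __init__ signatures pydantic synthesizes from model fields. A minimal sketch of the kind of mistake it can then catch (this Dataset model is a simplified stand-in, not the real ssb-datadoc-model class):

    from pydantic import BaseModel

    class Dataset(BaseModel):
        short_name: str | None = None
        version: str | None = None

    ds = Dataset(short_name="person_data", version="1")  # checked field names and types
    # ds = Dataset(short_nme="person_data")  # with the plugin, mypy flags: unexpected keyword argument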
5 changes: 5 additions & 0 deletions src/datadoc/app.py
@@ -16,6 +16,7 @@
 from datadoc import state
 from datadoc.backend.datadoc_metadata import DataDocMetadata
 from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping
+from datadoc.backend.unit_types import UnitTypes
 from datadoc.enums import SupportedLanguages
 from datadoc.frontend.callbacks.register_callbacks import register_callbacks
 from datadoc.frontend.callbacks.register_callbacks import (
@@ -138,6 +139,10 @@ def collect_data_from_external_sources() -> None:
         config.get_statistical_subject_source_url(),
     )

+    state.unit_types = UnitTypes(
+        config.get_unit_code(),
+    )
+

 def main(dataset_path: str | None = None) -> None:
     """Entrypoint when running as a script."""
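The new src/datadoc/backend/unit_types.py module itself is collapsed in this view. A minimal sketch of the shape such a class could take, fetching unit type items from Klass into a dataclass-backed list (the field names, dataframe columns, and fetch mechanics here are assumptions, not the actual implementation):

    from dataclasses import dataclass

    import pandas as pd


    @dataclass
    class UnitType:
        """One unit type item, e.g. code="01", title="Person"."""

        code: str
        title: str


    class UnitTypes:
        """Hold unit type items fetched from a Klass classification."""

        def __init__(self, classification_id: int) -> None:
            self.classification_id = classification_id
            self.units: list[UnitType] = []
            self._parse(self._fetch())

        def _fetch(self) -> pd.DataFrame:
            # The real implementation would query the Klass API for
            # self.classification_id; a static frame stands in here.
            return pd.DataFrame({"code": ["01"], "name": ["Person"]})

        def _parse(self, df: pd.DataFrame) -> None:
            # Convert each dataframe row into a typed UnitType item.
            self.units = [
                UnitType(code=row["code"], title=row["name"])
                for _, row in df.iterrows()
            ]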
12 changes: 6 additions & 6 deletions src/datadoc/backend/dapla_dataset_path_info.py
@@ -13,7 +13,7 @@

 import arrow

-from datadoc.enums import DatasetState
+from datadoc.enums import DataSetState
 from datadoc.enums import SupportedLanguages

 if TYPE_CHECKING:
@@ -336,7 +336,7 @@ def _extract_period_string_from_index(self, index: int) -> str | None:

     def _extract_norwegian_dataset_state_path_part(
         self,
-        dataset_state: DatasetState,
+        dataset_state: DataSetState,
     ) -> set:
         norwegian_dataset_state_path_part = dataset_state.get_value_for_language(
             SupportedLanguages.NORSK_BOKMÅL,
@@ -372,19 +372,19 @@ def contains_data_until(self) -> datetime.date | None:
     @property
     def dataset_state(
         self,
-    ) -> DatasetState | None:
+    ) -> DataSetState | None:
         """Extract the dataset state from the path.

         Examples:
         >>> DaplaDatasetPathInfo('klargjorte_data/person_data_v1.parquet').dataset_state
-        <DatasetState.PROCESSED_DATA: 'PROCESSED_DATA'>
+        <DataSetState.PROCESSED_DATA: 'PROCESSED_DATA'>
         >>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state
-        <DatasetState.OUTPUT_DATA: 'OUTPUT_DATA'>
+        <DataSetState.OUTPUT_DATA: 'OUTPUT_DATA'>
         >>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state
         None
         """
         dataset_path_parts = set(self.dataset_path.parts)
-        for s in DatasetState:
+        for s in DataSetState:
             # We assume that files are saved in the Norwegian language as specified by SSB.
             norwegian_dataset_state_path_part_variations = (
                 self._extract_norwegian_dataset_state_path_part(s)
90 changes: 49 additions & 41 deletions src/datadoc/backend/datadoc_metadata.py
@@ -19,9 +19,8 @@
 from datadoc.backend.dataset_parser import DatasetParser
 from datadoc.backend.model_backwards_compatibility import upgrade_metadata
 from datadoc.enums import Assessment
-from datadoc.enums import DatasetState
-from datadoc.enums import DatasetStatus
-from datadoc.enums import VariableRole
+from datadoc.enums import DataSetState
+from datadoc.enums import DataSetStatus
 from datadoc.frontend.fields.display_dataset import (
     OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
 )
@@ -49,31 +48,28 @@ def __init__(
     ) -> None:
         """Read in a dataset if supplied, otherwise naively instantiate the class."""
         self._statistic_subject_mapping = statistic_subject_mapping
-        self.dataset_path = dataset_path

         self.metadata_document: pathlib.Path | CloudPath | None = None
         self.container: model.MetadataContainer | None = None
-        self.dataset: pathlib.Path | CloudPath | None = None
+        self.dataset_path: pathlib.Path | CloudPath | None = None
         self.short_name: str | None = None
-        self.meta: model.DatadocJsonSchema = model.DatadocJsonSchema(
-            percentage_complete=0,
-            dataset=model.Dataset(),
-            variables=[],
-        )
+        self.dataset = model.Dataset()
+        self.variables: list = []

         self.variables_lookup: dict[str, model.Variable] = {}
         if metadata_document_path:
             # In this case the user has specified an independent metadata document for editing
             # without a dataset.
             self.metadata_document = self._open_path(metadata_document_path)
             self.extract_metadata_from_existing_document(self.metadata_document)
         elif dataset_path:
-            self.dataset = self._open_path(dataset_path)
+            self.dataset_path = self._open_path(dataset_path)
             # The short_name is set as the dataset filename without file extension
-            self.short_name = self.dataset.stem
+            self.short_name = self.dataset_path.stem
             # Build the metadata document path based on the dataset path
             # Example: /path/to/dataset.parquet -> /path/to/dataset__DOC.json
-            self.metadata_document = self.dataset.parent / (
-                self.dataset.stem + METADATA_DOCUMENT_FILE_SUFFIX
+            self.metadata_document = self.dataset_path.parent / (
+                self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
             )
             self.extract_metadata_from_files()

@@ -97,19 +93,19 @@ def extract_metadata_from_files(self) -> None:
         """
         if self.metadata_document is not None and self.metadata_document.exists():
             self.extract_metadata_from_existing_document(self.metadata_document)
-        elif self.dataset is not None:
-            self.extract_metadata_from_dataset(self.dataset)
-            self.meta.dataset.id = uuid.uuid4()
+        elif self.dataset_path is not None:
+            self.extract_metadata_from_dataset(self.dataset_path)
+            self.dataset.id = uuid.uuid4()
         # Set default values for variables where appropriate
         v: model.Variable
-        for v in self.meta.variables:
+        for v in self.variables:
             if v.variable_role is None:
-                v.variable_role = VariableRole.MEASURE
+                v.variable_role = model.VariableRole.MEASURE
             if v.direct_person_identifying is None:
                 v.direct_person_identifying = False
-        if not self.meta.dataset.id:
-            self.meta.dataset.id = uuid.uuid4()
-        self.variables_lookup = {v.short_name: v for v in self.meta.variables}
+        if not self.dataset.id:
+            self.dataset.id = uuid.uuid4()
+        self.variables_lookup = {v.short_name: v for v in self.variables}

     def extract_metadata_from_existing_document(
         self,
@@ -135,9 +131,15 @@ def extract_metadata_from_existing_document(
             datadoc_metadata = upgrade_metadata(
                 datadoc_metadata,
             )
-            self.meta = model.DatadocJsonSchema.model_validate_json(
+
+            meta = model.DatadocMetadata.model_validate_json(
                 json.dumps(datadoc_metadata),
             )
+            if meta.dataset is not None:
+                self.dataset = meta.dataset
+            if meta.variables is not None:
+                self.variables = meta.variables
+
         except json.JSONDecodeError:
             logger.warning(
                 "Could not open existing metadata file %s. \
@@ -173,17 +175,17 @@ def extract_metadata_from_dataset(
             dapla_dataset_path_info.statistic_short_name,
         )

-        self.meta.dataset = model.Dataset(
+        self.dataset = model.Dataset(
             short_name=self.short_name,
             dataset_state=dapla_dataset_path_info.dataset_state,
-            dataset_status=DatasetStatus.DRAFT,
+            dataset_status=DataSetStatus.DRAFT,
             assessment=self.get_assessment_by_state(
                 dapla_dataset_path_info.dataset_state,
             ),
             version=dapla_dataset_path_info.dataset_version,
             contains_data_from=str(dapla_dataset_path_info.contains_data_from),
             contains_data_until=str(dapla_dataset_path_info.contains_data_until),
-            data_source_path=self.dataset,
+            data_source_path=self.dataset_path,
             metadata_created_by=user_info.get_user_info_for_current_platform().short_email,
             # TODO @mmwinther: Remove multiple_language_support once the model is updated.
             # https://github.com/statisticsnorway/ssb-datadoc-model/issues/41
@@ -193,21 +195,21 @@
                 nn=subject_field,
             ),
         )
-        self.meta.variables = self.ds_schema.get_fields()
+        self.variables = self.ds_schema.get_fields()

     @staticmethod
-    def get_assessment_by_state(state: DatasetState | None) -> Assessment | None:
+    def get_assessment_by_state(state: DataSetState | None) -> Assessment | None:
         """Find assessment derived by dataset state."""
         if state is None:
             return None
         match (state):
             case (
-                DatasetState.INPUT_DATA
-                | DatasetState.PROCESSED_DATA
-                | DatasetState.STATISTICS
+                DataSetState.INPUT_DATA
+                | DataSetState.PROCESSED_DATA
+                | DataSetState.STATISTICS
             ):
                 return Assessment.PROTECTED
-            case DatasetState.OUTPUT_DATA:
+            case DataSetState.OUTPUT_DATA:
                 return Assessment.OPEN
             case _:
                 return None
@@ -216,17 +218,23 @@ def write_metadata_document(self) -> None:
         """Write all currently known metadata to file."""
         timestamp: datetime = get_timestamp_now()

-        if self.meta.dataset.metadata_created_date is None:
-            self.meta.dataset.metadata_created_date = timestamp
-        self.meta.dataset.metadata_last_updated_date = timestamp
-        self.meta.dataset.metadata_last_updated_by = (
+        if self.dataset.metadata_created_date is None:
+            self.dataset.metadata_created_date = timestamp
+        self.dataset.metadata_last_updated_date = timestamp
+        self.dataset.metadata_last_updated_by = (
             user_info.get_user_info_for_current_platform().short_email
         )
-        self.meta.dataset.file_path = str(self.dataset)
+        self.dataset.file_path = str(self.dataset_path)

+        datadoc: model.DatadocMetadata = model.DatadocMetadata(
+            percentage_complete=self.percent_complete,
+            dataset=self.dataset,
+            variables=self.variables,
+        )
         if self.container:
-            self.container.datadoc = self.meta
+            self.container.datadoc = datadoc
         else:
-            self.container = model.MetadataContainer(datadoc=self.meta)
+            self.container = model.MetadataContainer(datadoc=datadoc)
         if self.metadata_document:
             self.metadata_document.write_text(self.container.model_dump_json(indent=4))
             logger.info("Saved metadata document %s", self.metadata_document)
@@ -246,11 +254,11 @@ def percent_complete(self) -> int:
         num_set_fields = len(
             [
                 k
-                for k, v in self.meta.dataset.model_dump().items()
+                for k, v in self.dataset.model_dump().items()
                 if k in OBLIGATORY_DATASET_METADATA_IDENTIFIERS and v is not None
             ],
        )
-        for variable in self.meta.variables:
+        for variable in self.variables:
             num_all_fields += len(OBLIGATORY_VARIABLES_METADATA)
             num_set_fields += len(
                 [
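Taken together, the changes above replace the single self.meta (DatadocJsonSchema) container with separate self.dataset and self.variables attributes, rename the opened path attribute to self.dataset_path, and assemble a model.DatadocMetadata only when writing the document. A brief usage sketch of the refactored class (the dataset path is illustrative, and the config import is assumed from app.py's usage):

    from datadoc import config
    from datadoc.backend.datadoc_metadata import DataDocMetadata
    from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping

    subject_mapping = StatisticSubjectMapping(
        config.get_statistical_subject_source_url(),
    )
    meta = DataDocMetadata(
        subject_mapping,
        dataset_path="klargjorte_data/person_data_v1.parquet",
    )

    # Dataset metadata and variables now live directly on the instance
    # instead of under a nested `meta` container:
    meta.dataset.short_name  # "person_data_v1"
    meta.variables           # fields read from the dataset schema
    meta.percent_complete    # completeness over obligatory metadata fields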
2 changes: 1 addition & 1 deletion src/datadoc/backend/dataset_parser.py
@@ -209,7 +209,7 @@ def get_fields(self) -> list[Variable]:
                 # If this is not correct, the user may fix it via the UI
                 name=LanguageStringType(
                     **{
-                        state.current_metadata_language: sas_reader.columns[  # type: ignore [attr-defined]
+                        state.current_metadata_language.value: sas_reader.columns[  # type: ignore [attr-defined]
                             i
                         ].label,
                     },
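The one-line change above passes the enum's string value, rather than the enum member itself, as the dictionary key. A small illustration of why this matters when the dictionary is unpacked into keyword arguments (this SupportedLanguages stand-in assumes a plain Enum whose values are language codes):

    from enum import Enum

    class SupportedLanguages(Enum):
        NORSK_BOKMAAL = "nb"
        ENGLISH = "en"

    lang = SupportedLanguages.ENGLISH
    # f(**{lang: ...}) raises "keywords must be strings", because an Enum
    # member is not a str; its .value is the plain language code.
    kwargs = {lang.value: "Person identifier"}  # {"en": "Person identifier"}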
4 changes: 2 additions & 2 deletions src/datadoc/backend/model_backwards_compatibility.py
@@ -82,7 +82,7 @@ def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
     supplied_metadata["dataset"]["data_source"] = LanguageStringType(
         en=supplied_metadata["dataset"]["data_source"],
     )
-    supplied_metadata["document_version"] = "2.0.0"
+    supplied_metadata["document_version"] = "2.1.0"

     return supplied_metadata

@@ -118,7 +118,7 @@ def handle_version_0_1_1(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
     handler=handle_version_1_0_0,
 )
 BackwardsCompatibleVersion(
-    version="2.0.0",
+    version="2.1.0",
     handler=handle_current_version,
 )
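The version bump from "2.0.0" to "2.1.0" slots into the upgrade-chain pattern this module is built around: each BackwardsCompatibleVersion registers a handler, and a document is upgraded from its recorded document_version to the current one. A condensed sketch of that pattern (simplified; the real module's migration details are elided, and the registration mechanics are an assumption of this sketch):

    from collections.abc import Callable
    from dataclasses import dataclass
    from typing import Any

    SUPPORTED_VERSIONS: dict[str, "BackwardsCompatibleVersion"] = {}


    @dataclass
    class BackwardsCompatibleVersion:
        version: str
        handler: Callable[[dict[str, Any]], dict[str, Any]]

        def __post_init__(self) -> None:
            # Register this version's handler on construction.
            SUPPORTED_VERSIONS[self.version] = self


    def handle_current_version(metadata: dict[str, Any]) -> dict[str, Any]:
        return metadata  # already current, nothing to do


    def handle_version_1_0_0(metadata: dict[str, Any]) -> dict[str, Any]:
        # ...field migrations elided...
        metadata["document_version"] = "2.1.0"  # hop to the new current version
        return metadata


    BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0)
    BackwardsCompatibleVersion(version="2.1.0", handler=handle_current_version)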