Move unnecessary methods out of DataDocMetadata class (#360)
* Extract _open_path

* Move calculate_percentage

* Move default spatial coverage description

* Move get_assessment_from_state

* Mark internal methods with leading underscore
mmwinther authored Jul 4, 2024
1 parent 778b88a commit f447361
Showing 6 changed files with 136 additions and 107 deletions.
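
Taken together, the changes below move four helpers out of the DataDocMetadata class into a new module, datadoc.backend.utils, where they can be used without an instance. A minimal sketch of the resulting call sites (illustrative values, not taken verbatim from the diff):

from datadoc.backend.utils import calculate_percentage
from datadoc.backend.utils import derive_assessment_from_state
from datadoc.backend.utils import normalize_path
from datadoc.enums import DataSetState

path = normalize_path("/data/dataset.parquet")  # pathlib.Path; a gs:// URI yields a GSPath instead
assessment = derive_assessment_from_state(DataSetState.INPUT_DATA)  # Assessment.PROTECTED
pct = calculate_percentage(3, 4)  # 75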
90 changes: 22 additions & 68 deletions src/datadoc/backend/datadoc_metadata.py
@@ -4,14 +4,9 @@
 
 import json
 import logging
-import pathlib
 import uuid
 from typing import TYPE_CHECKING
 
-from cloudpathlib import CloudPath
-from cloudpathlib import GSClient
-from cloudpathlib import GSPath
-from dapla import AuthClient
 from datadoc_model import model
 
 from datadoc.backend import user_info
@@ -21,24 +16,26 @@
     is_metadata_in_container_structure,
 )
 from datadoc.backend.model_backwards_compatibility import upgrade_metadata
-from datadoc.enums import Assessment
-from datadoc.enums import DataSetState
+from datadoc.backend.utils import DEFAULT_SPATIAL_COVERAGE_DESCRIPTION
+from datadoc.backend.utils import calculate_percentage
+from datadoc.backend.utils import derive_assessment_from_state
+from datadoc.backend.utils import normalize_path
 from datadoc.enums import DataSetStatus
 from datadoc.enums import LanguageStringType
 from datadoc.enums import LanguageStringTypeItem
 from datadoc.frontend.fields.display_dataset import (
     OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
 )
 from datadoc.frontend.fields.display_variables import (
     OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS,
 )
 from datadoc.utils import METADATA_DOCUMENT_FILE_SUFFIX
-from datadoc.utils import calculate_percentage
 from datadoc.utils import get_timestamp_now
 
 if TYPE_CHECKING:
+    import pathlib
     from datetime import datetime
 
+    from cloudpathlib import CloudPath
+
     from datadoc.backend.statistic_subject_mapping import StatisticSubjectMapping
 
 logger = logging.getLogger(__name__)
@@ -64,46 +61,35 @@ def __init__(
         if metadata_document_path:
             # In this case the user has specified an independent metadata document for editing
             # without a dataset.
-            self.metadata_document = self._open_path(metadata_document_path)
+            self.metadata_document = normalize_path(metadata_document_path)
         elif dataset_path:
-            self.dataset_path = self._open_path(dataset_path)
+            self.dataset_path = normalize_path(dataset_path)
             # Build the metadata document path based on the dataset path
             # Example: /path/to/dataset.parquet -> /path/to/dataset__DOC.json
             self.metadata_document = self.dataset_path.parent / (
                 self.dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX
             )
-        self.extract_metadata_from_files()
-
-    @staticmethod
-    def _open_path(path: str) -> pathlib.Path | CloudPath:
-        """Open a given path regardless of whether it is local or cloud.
-
-        The returned path may be treated just as if it's a pathlib.Path.
-        """
-        if path.startswith(GSPath.cloud_prefix):
-            client = GSClient(credentials=AuthClient.fetch_google_credentials())
-            return GSPath(path, client=client)
-        return pathlib.Path(path)
+        self._extract_metadata_from_files()
 
     def _set_variable_uuid(self) -> None:
         for v in self.variables:
             if v.id is None:
                 v.id = uuid.uuid4()
 
-    def extract_metadata_from_files(self) -> None:
+    def _extract_metadata_from_files(self) -> None:
         """Read metadata from an existing metadata document.
 
         If no metadata document exists, create one from scratch by extracting metadata
         from the dataset file.
         """
         if self.metadata_document is not None and self.metadata_document.exists():
-            self.extract_metadata_from_existing_document(self.metadata_document)
+            self._extract_metadata_from_existing_document(self.metadata_document)
         if (
             self.dataset_path is not None
             and self.dataset == model.Dataset()
             and len(self.variables) == 0
         ):
-            self.extract_metadata_from_dataset(self.dataset_path)
+            self._extract_metadata_from_dataset(self.dataset_path)
         self.dataset.id = uuid.uuid4()
         # Set default values for variables where appropriate
         v: model.Variable
@@ -119,7 +105,7 @@ def extract_metadata_from_files(self) -> None:
             self.dataset.contains_personal_data = False
         self.variables_lookup = {v.short_name: v for v in self.variables}
 
-    def extract_metadata_from_existing_document(
+    def _extract_metadata_from_existing_document(
         self,
         document: pathlib.Path | CloudPath,
     ) -> None:
@@ -158,7 +144,7 @@ def extract_metadata_from_existing_document(
                 exc_info=True,
             )
 
-    def extract_metadata_from_dataset(
+    def _extract_metadata_from_dataset(
         self,
         dataset: pathlib.Path | CloudPath,
     ) -> None:
@@ -176,36 +162,23 @@
             short_name=dapla_dataset_path_info.dataset_short_name,
             dataset_state=dapla_dataset_path_info.dataset_state,
             dataset_status=DataSetStatus.DRAFT,
-            assessment=self.get_assessment_by_state(
-                dapla_dataset_path_info.dataset_state,
-            ),
+            assessment=(
+                derive_assessment_from_state(
+                    dapla_dataset_path_info.dataset_state,
+                )
+                if dapla_dataset_path_info.dataset_state is not None
+                else None
+            ),
             version=dapla_dataset_path_info.dataset_version,
             contains_data_from=dapla_dataset_path_info.contains_data_from,
             contains_data_until=dapla_dataset_path_info.contains_data_until,
             file_path=str(self.dataset_path),
             metadata_created_by=user_info.get_user_info_for_current_platform().short_email,
             subject_field=subject_field,
-            spatial_coverage_description=self.set_default_spatial_coverage_description(),
+            spatial_coverage_description=DEFAULT_SPATIAL_COVERAGE_DESCRIPTION,
         )
         self.variables = self.ds_schema.get_fields()
 
-    @staticmethod
-    def get_assessment_by_state(state: DataSetState | None) -> Assessment | None:
-        """Find assessment derived by dataset state."""
-        if state is None:
-            return None
-        match (state):
-            case (
-                DataSetState.INPUT_DATA
-                | DataSetState.PROCESSED_DATA
-                | DataSetState.STATISTICS
-            ):
-                return Assessment.PROTECTED
-            case DataSetState.OUTPUT_DATA:
-                return Assessment.OPEN
-            case _:
-                return None
-
     def write_metadata_document(self) -> None:
         """Write all currently known metadata to file."""
         timestamp: datetime = get_timestamp_now()
@@ -262,22 +235,3 @@ def percent_complete(self) -> int:
             ],
         )
         return calculate_percentage(num_set_fields, num_all_fields)
-
-    def set_default_spatial_coverage_description(self) -> LanguageStringType:
-        """Returns the default value 'Norge'."""
-        return LanguageStringType(
-            [
-                LanguageStringTypeItem(
-                    languageCode="nb",
-                    languageText="Norge",
-                ),
-                LanguageStringTypeItem(
-                    languageCode="nn",
-                    languageText="Noreg",
-                ),
-                LanguageStringTypeItem(
-                    languageCode="en",
-                    languageText="Norway",
-                ),
-            ],
-        )
72 changes: 72 additions & 0 deletions src/datadoc/backend/utils.py
@@ -0,0 +1,72 @@
from __future__ import annotations

import pathlib

from cloudpathlib import CloudPath
from cloudpathlib import GSClient
from cloudpathlib import GSPath
from dapla import AuthClient

from datadoc.enums import Assessment
from datadoc.enums import DataSetState
from datadoc.enums import LanguageStringType
from datadoc.enums import LanguageStringTypeItem

DEFAULT_SPATIAL_COVERAGE_DESCRIPTION = LanguageStringType(
    [
        LanguageStringTypeItem(
            languageCode="nb",
            languageText="Norge",
        ),
        LanguageStringTypeItem(
            languageCode="nn",
            languageText="Noreg",
        ),
        LanguageStringTypeItem(
            languageCode="en",
            languageText="Norway",
        ),
    ],
)


def normalize_path(path: str) -> pathlib.Path | CloudPath:
    """Obtain a pathlib compatible Path regardless of whether the file is on a filesystem or in GCS.

    Args:
        path (str): Path on a filesystem or in cloud storage

    Returns:
        pathlib.Path | CloudPath: Pathlib compatible object
    """
    if path.startswith(GSPath.cloud_prefix):
        client = GSClient(credentials=AuthClient.fetch_google_credentials())
        return GSPath(path, client=client)
    return pathlib.Path(path)


def calculate_percentage(completed: int, total: int) -> int:
    """Calculate percentage as a rounded integer."""
    return round((completed / total) * 100)


def derive_assessment_from_state(state: DataSetState) -> Assessment:
    """Derive assessment from dataset state.

    Args:
        state (DataSetState): The state of the dataset.

    Returns:
        Assessment: The derived assessment of the dataset.
    """
    match (state):
        case (
            DataSetState.INPUT_DATA
            | DataSetState.PROCESSED_DATA
            | DataSetState.STATISTICS
        ):
            return Assessment.PROTECTED
        case DataSetState.OUTPUT_DATA:
            return Assessment.OPEN
        case DataSetState.SOURCE_DATA:
            return Assessment.SENSITIVE
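
A brief usage sketch of the new module (illustrative values; resolving a gs:// path additionally requires Google credentials fetched via dapla's AuthClient):

from datadoc.backend.utils import DEFAULT_SPATIAL_COVERAGE_DESCRIPTION
from datadoc.backend.utils import calculate_percentage
from datadoc.backend.utils import derive_assessment_from_state
from datadoc.backend.utils import normalize_path
from datadoc.enums import Assessment
from datadoc.enums import DataSetState

local = normalize_path("/data/dataset.parquet")  # -> pathlib.Path
assert calculate_percentage(2, 3) == 67  # round(), so 66.67 rounds up
assert derive_assessment_from_state(DataSetState.OUTPUT_DATA) == Assessment.OPEN
# DEFAULT_SPATIAL_COVERAGE_DESCRIPTION holds "Norge"/"Noreg"/"Norway" for nb/nn/en.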
5 changes: 0 additions & 5 deletions src/datadoc/utils.py
@@ -20,11 +20,6 @@ def running_in_notebook() -> bool:
     return False
 
 
-def calculate_percentage(completed: int, total: int) -> int:
-    """Calculate percentage as a rounded integer."""
-    return round((completed / total) * 100)
-
-
 def pick_random_port() -> int:
     """Pick a random free port number.
30 changes: 1 addition & 29 deletions tests/backend/test_datadoc_metadata.py
@@ -13,8 +13,6 @@
 
 import arrow
 import pytest
-from cloudpathlib.local import LocalGSClient
-from cloudpathlib.local import LocalGSPath
 from datadoc_model.model import DatadocMetadata
 from datadoc_model.model import Dataset
 from datadoc_model.model import Variable
@@ -28,15 +26,13 @@
 from datadoc.enums import DataSetStatus
 from datadoc.enums import DataType
 from datadoc.enums import VariableRole
-from tests.utils import TEST_BUCKET_PARQUET_FILEPATH
 from tests.utils import TEST_EXISTING_METADATA_DIRECTORY
 from tests.utils import TEST_EXISTING_METADATA_FILE_NAME
 from tests.utils import TEST_PARQUET_FILEPATH
 from tests.utils import TEST_PROCESSED_DATA_POPULATION_DIRECTORY
 from tests.utils import TEST_RESOURCES_DIRECTORY
 
 if TYPE_CHECKING:
-    import os
     from collections.abc import Generator
     from datetime import datetime
@@ -246,30 +242,6 @@ def test_period_metadata_fields_saved(
     assert metadata.dataset.contains_data_until == expected_until
 
 
-@pytest.mark.parametrize(
-    ("dataset_path", "expected_type"),
-    [
-        (TEST_BUCKET_PARQUET_FILEPATH, LocalGSPath),
-        (str(TEST_PARQUET_FILEPATH), pathlib.Path),
-    ],
-)
-def test_open_file(
-    dataset_path: str,
-    expected_type: type[os.PathLike],
-    mocker,
-):
-    mocker.patch(f"{DATADOC_METADATA_MODULE}.AuthClient", autospec=True)
-    mocker.patch(f"{DATADOC_METADATA_MODULE}.GSClient", LocalGSClient)
-    mocker.patch(
-        f"{DATADOC_METADATA_MODULE}.GSPath",
-        LocalGSPath,
-    )
-    file = DataDocMetadata._open_path(  # noqa: SLF001 for testing purposes
-        dataset_path,
-    )
-    assert isinstance(file, expected_type)
-
-
 @pytest.mark.parametrize(
     ("dataset_path", "expected_type"),
     [
Expand Down Expand Up @@ -305,7 +277,7 @@ def test_dataset_status_default_value(
[
(
"kildedata",
None,
Assessment.SENSITIVE.value,
),
(
"inndata",
Expand Down
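
This parametrized-test update captures the one behavioural change in the refactor: get_assessment_by_state fell through to None for source data ("kildedata"), whereas derive_assessment_from_state maps DataSetState.SOURCE_DATA to Assessment.SENSITIVE. A minimal illustration:

from datadoc.backend.utils import derive_assessment_from_state
from datadoc.enums import Assessment
from datadoc.enums import DataSetState

assert derive_assessment_from_state(DataSetState.SOURCE_DATA) == Assessment.SENSITIVE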
41 changes: 41 additions & 0 deletions tests/backend/test_utils.py
@@ -0,0 +1,41 @@
import os
import pathlib

import pytest
from cloudpathlib.local import LocalGSClient
from cloudpathlib.local import LocalGSPath

from datadoc.backend.utils import calculate_percentage
from datadoc.backend.utils import normalize_path
from tests.utils import TEST_BUCKET_PARQUET_FILEPATH
from tests.utils import TEST_PARQUET_FILEPATH

BACKEND_UTILS_MODULE = "datadoc.backend.utils"


@pytest.mark.parametrize(
    ("dataset_path", "expected_type"),
    [
        (TEST_BUCKET_PARQUET_FILEPATH, LocalGSPath),
        (str(TEST_PARQUET_FILEPATH), pathlib.Path),
    ],
)
def test_normalize_path(
    dataset_path: str,
    expected_type: type[os.PathLike],
    mocker,
):
    mocker.patch(f"{BACKEND_UTILS_MODULE}.AuthClient", autospec=True)
    mocker.patch(f"{BACKEND_UTILS_MODULE}.GSClient", LocalGSClient)
    mocker.patch(
        f"{BACKEND_UTILS_MODULE}.GSPath",
        LocalGSPath,
    )
    file = normalize_path(  # for testing purposes
        dataset_path,
    )
    assert isinstance(file, expected_type)


def test_calculate_percentage():
    assert calculate_percentage(1, 3) == 33  # noqa: PLR2004
5 changes: 0 additions & 5 deletions tests/test_utils.py
@@ -4,7 +4,6 @@
 
 import tomli
 
-from datadoc.utils import calculate_percentage
 from datadoc.utils import get_app_version
 from datadoc.utils import running_in_notebook

@@ -13,10 +12,6 @@ def test_not_running_in_notebook():
     assert not running_in_notebook()
 
 
-def test_calculate_percentage():
-    assert calculate_percentage(1, 3) == 33  # noqa: PLR2004
-
-
 def test_get_app_version():
     with (pathlib.Path(__file__).parent.parent / "pyproject.toml").open("rb") as f:
         pyproject = tomli.load(f)
