
Commit

…into feature/DPMETA-20-path-not-in.convention
tilen1976 committed Jan 30, 2024
2 parents a9b7f11 + 097b321 commit ffc2bb4
Showing 7 changed files with 190 additions and 186 deletions.
18 changes: 18 additions & 0 deletions CONTRIBUTING.md
@@ -77,6 +77,24 @@ poetry run python
 poetry run datadoc
 ```

+## Config for local development
+
+We use a Python package called `python-dotenv` for configuration management. It supports two sources of configuration:
+
+1. Environment variables.
+1. A file called `.env` by convention.
+
+To set up for local development, follow these steps from the root of the repo:
+
+1. Create a file `src/datadoc/.env`
+1. Place the following lines in the file:
+```
+DATADOC_DASH_DEVELOPMENT_MODE=True
+DATADOC_LOG_LEVEL=debug
+```
+
+All available configuration options are defined in `src/datadoc/config.py`.

 ## How to test the project

 Run the full test suite:
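The `.env` mechanics from the CONTRIBUTING.md addition above can be sanity-checked in isolation. A minimal sketch, assuming only the `python-dotenv` package and the `.env` file described in that section (the fallback value here is illustrative, not a project default):

```python
# Minimal sketch: python-dotenv loads .env, but environment variables
# that are already set take precedence over values from the file.
import os

from dotenv import load_dotenv

load_dotenv()  # reads a .env file if one is found

log_level = os.getenv("DATADOC_LOG_LEVEL", "info")  # "debug" with the .env above
print(f"Effective log level: {log_level}")
```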
280 changes: 120 additions & 160 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion src/datadoc/app.py
@@ -118,7 +118,9 @@ def main(dataset_path: str | None = None) -> None:
             port=port,
         )
     else:
-        app.run(debug=config.get_dash_development_mode(), port=port)
+        if dev_mode := config.get_dash_development_mode():
+            logger.warning("Starting in Development Mode. NOT SUITABLE FOR PRODUCTION.")
+        app.run(debug=dev_mode, port=port)


 if __name__ == "__main__":
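For orientation, a hedged sketch of what `get_dash_development_mode` could look like given the `DATADOC_DASH_DEVELOPMENT_MODE` variable from CONTRIBUTING.md — the real implementation lives in `src/datadoc/config.py` and may differ:

```python
# Hypothetical helper, not the project's actual code.
import os

from dotenv import load_dotenv

load_dotenv()


def get_dash_development_mode() -> bool:
    """Interpret DATADOC_DASH_DEVELOPMENT_MODE as a boolean flag."""
    return os.getenv("DATADOC_DASH_DEVELOPMENT_MODE", "False").lower() in {"true", "1"}
```

Under this reading `dev_mode` is falsy by default, so the development-mode warning is only logged when the flag is explicitly enabled.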
35 changes: 21 additions & 14 deletions src/datadoc/backend/dapla_dataset_path_info.py
@@ -1,7 +1,6 @@
 """Extract info from a path following SSB's dataset naming convention."""
 from __future__ import annotations

-import contextlib
 import pathlib
 import re
 from dataclasses import dataclass
@@ -12,6 +11,7 @@

 if TYPE_CHECKING:
     import datetime
+    import os


 @dataclass
@@ -262,16 +262,11 @@ def convert_ssb_period(
 class DaplaDatasetPathInfo:
     """Extract info from a path following SSB's dataset naming convention."""

-    def __init__(self, dataset_path: str) -> None:
+    def __init__(self, dataset_path: str | os.PathLike[str]) -> None:
         """Digest the path so that it's ready for further parsing."""
         self.dataset_path = pathlib.Path(dataset_path)
         self.dataset_name_sections = self.dataset_path.stem.split("_")
-        _period_strings = self._extract_period_strings(self.dataset_name_sections)
-        self.first_period_string = _period_strings[0]
-        self.second_period_string: str | None = None
-
-        with contextlib.suppress(IndexError):
-            self.second_period_string = _period_strings[1]
+        self._period_strings = self._extract_period_strings(self.dataset_name_sections)

     @staticmethod
     def _extract_period_strings(dataset_name_sections: list[str]) -> list[str]:
@@ -306,28 +301,40 @@ def _extract_period_strings(dataset_name_sections: list[str]) -> list[str]:
         ]

     @property
-    def contains_data_from(self) -> datetime.date:
+    def contains_data_from(self) -> datetime.date | None:
         """The earliest date from which data in the dataset is relevant for."""
-        date_format = categorize_period_string(self.first_period_string)
+        try:
+            period_string = self._period_strings[0]
+            date_format = categorize_period_string(period_string)
+        except IndexError:
+            return None
+
         if isinstance(date_format, SsbDateFormat):
             """If dateformat is SSB date format return start month of ssb period."""
             period = convert_ssb_period(
-                self.first_period_string,
+                period_string,
                 "start",
                 date_format,
             )
             return arrow.get(period, date_format.arrow_pattern).floor("month").date()

         return (
-            arrow.get(self.first_period_string, date_format.arrow_pattern)
+            arrow.get(period_string, date_format.arrow_pattern)
             .floor(date_format.timeframe)
             .date()
         )

     @property
-    def contains_data_until(self) -> datetime.date:
+    def contains_data_until(self) -> datetime.date | None:
         """The latest date until which data in the dataset is relevant for."""
-        period_string = self.second_period_string or self.first_period_string
+        try:
+            period_string = self._period_strings[1]
+        except IndexError:
+            try:
+                period_string = self._period_strings[0]
+            except IndexError:
+                return None
+
         date_format = categorize_period_string(period_string)
         if isinstance(date_format, SsbDateFormat):
             """If dateformat is SSB date format return end month of ssb period."""
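As a hedged usage sketch of the new behaviour — the dataset names mirror the tests below, and the exact dates are assumptions based on the floor/ceiling logic above:

```python
# Sketch: period strings are now parsed lazily, and paths without
# period information yield None instead of raising IndexError.
from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo

info = DaplaDatasetPathInfo("varehandel_p2018Q1_p2018Q4_v1.parquet")
print(info.contains_data_from)   # assumed: 2018-01-01 (start of Q1)
print(info.contains_data_until)  # assumed: 2018-12-31 (end of Q4)

# No period information in the name: both properties are now None.
print(DaplaDatasetPathInfo("nonsens_v1.parquet").contains_data_from)  # None
```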
5 changes: 5 additions & 0 deletions src/datadoc/backend/datadoc_metadata.py
@@ -11,6 +11,7 @@
 from datadoc_model import model

 from datadoc import config
+from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo
 from datadoc.backend.dataset_parser import DatasetParser
 from datadoc.backend.model_backwards_compatibility import upgrade_metadata
 from datadoc.backend.storage_adapter import StorageAdapter
@@ -214,10 +215,14 @@ def extract_metadata_from_dataset(
         """
         self.ds_schema: DatasetParser = DatasetParser.for_file(dataset)

+        dapla_dataset_path_info = DaplaDatasetPathInfo(dataset)
+
         self.meta.dataset = model.Dataset(
             short_name=self.short_name,
             dataset_state=self.dataset_state,
             version=self.get_dataset_version(short_name),
+            contains_data_from=str(dapla_dataset_path_info.contains_data_from),
+            contains_data_until=str(dapla_dataset_path_info.contains_data_until),
             data_source_path=self.dataset,
             created_by=self.current_user,
         )
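Worth seeing concretely: the two new fields are stored as strings via `str()`. For `datetime.date` values this produces ISO 8601 text, and for the `None` case introduced above it produces the literal string `"None"`:

```python
# Illustration of the str() conversion used in the diff above.
import datetime

print(str(datetime.date(2018, 1, 1)))  # 2018-01-01  (ISO 8601)
print(str(None))                       # None  (when the path has no period info)
```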
29 changes: 23 additions & 6 deletions tests/backend/test_dapla_dataset_path_info.py
@@ -4,6 +4,7 @@
 import pytest

 from datadoc.backend.dapla_dataset_path_info import DaplaDatasetPathInfo
+from tests.utils import TEST_PARQUET_FILEPATH


 @dataclass
@@ -136,16 +137,32 @@ def test_extract_period_info_date_until(
     assert dataset_path.contains_data_until == expected_contains_data_until


-# Nonsens names, no dates path names will trigger IndexError
+@pytest.mark.parametrize(
+    "data",
+    [
+        (
+            "varehandel_p2018Q5_p2018Q4_v1.parquet",
+            "Period format p2018Q5 is not supported",
+        ),
+        (
+            "varehandel_p2018Q1_p2018H2_v1.parquet",
+            "Period format p2018H2 is not supported",
+        ),
+    ],
+)
+def test_extract_period_info_failures(data: tuple):
+    dataset_path = DaplaDatasetPathInfo(data[0])
+    with pytest.raises(NotImplementedError, match=data[1]):
+        _ = dataset_path.contains_data_from
+
+
 @pytest.mark.parametrize(
     "data",
     [
         "nonsen.data",
         "nonsens2.parquet",
         "nonsens_v1.parquet",
         "varehandel_v1.parquet",
+        TEST_PARQUET_FILEPATH.name,
     ],
 )
-def test_extract_period_info_failures_index_error(data: str):
-    with pytest.raises(IndexError):
-        DaplaDatasetPathInfo(data)
+def test_extract_period_info_no_period_info_in_path(data: str):
+    assert DaplaDatasetPathInfo(data).contains_data_from is None
5 changes: 0 additions & 5 deletions tests/backend/test_datadoc_metadata.py
@@ -200,7 +200,6 @@ def test_direct_person_identifying_default_value(metadata: DataDocMetadata):
     assert all(not v.direct_person_identifying for v in metadata.meta.variables)


-# Test with existing dataset and metadata document
 def test_save_file_path_metadata_field(
     existing_metadata_file: str,
     metadata: DataDocMetadata,
@@ -211,14 +210,10 @@ def test_save_file_path_metadata_field(
     assert saved_file_path == str(metadata.dataset)


-# Test with dataset and no metadata document
 def test_save_file_path_dataset_and_no_metadata(
     metadata: DataDocMetadata,
 ):
     metadata.write_metadata_document()
     with Path.open(Path(TEST_RESOURCES_METADATA_DOCUMENT)) as f:
         saved_file_path = json.load(f)["datadoc"]["dataset"]["file_path"]
     assert saved_file_path == str(metadata.dataset)
-
-
-# Test with metadata document and no dataset

