Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement DataValidator.apply() #368

Merged
merged 19 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 35 additions & 4 deletions nomenclature/processor/data_validator.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import logging
from pathlib import Path
import textwrap
from typing import List, Union

import yaml
from pyam import IamDataFrame
from pyam.logging import adjust_log_level

from nomenclature.definition import DataStructureDefinition
from nomenclature.error import ErrorCollector
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor import Processor
from nomenclature.processor.utils import get_relative_path

logger = logging.getLogger(__name__)


class DataValidationCriteria(IamcDataFilter):
"""Data validation criteria"""
Expand All @@ -29,14 +35,39 @@ def from_file(cls, file: Union[Path, str]) -> "DataValidator":
content = yaml.safe_load(f)
return cls(file=file, criteria_items=content)

def apply(self):
pass
def apply(self, df: IamDataFrame) -> IamDataFrame:
    """Validate the data in `df` against all criteria items.

    Parameters
    ----------
    df : IamDataFrame
        Scenario data to be validated.

    Returns
    -------
    IamDataFrame
        The unchanged `df` if all criteria items pass.

    Raises
    ------
    ValueError
        If any criteria item matches data in `df`; the failing criteria
        and the offending data rows are written to the logger.
    """
    error_list = []

    # suppress pyam's own per-call logging while collecting failures
    with adjust_log_level():
        for item in self.criteria_items:
            # `validate()` returns None on success, otherwise the failing rows
            failed_validation = df.validate(**item.criteria)
            if failed_validation is not None:
                error_list.append(
                    "  Criteria: "
                    + ", ".join(
                        [f"{key}: {value}" for key, value in item.criteria.items()]
                    )
                )
                error_list.append(
                    textwrap.indent(str(failed_validation), prefix="    ") + "\n"
                )

    if error_list:
        logger.error(
            "Failed data validation (file %s):\n%s",
            get_relative_path(self.file),
            "\n".join(error_list),
        )
        raise ValueError(
            "Data validation failed. Please check the log for details."
        )
    return df

def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
errors = ErrorCollector(description=f"in file '{self.file}'")
for data in self.criteria_items:
for criterion in self.criteria_items:
try:
data.validate_with_definition(dsd)
criterion.validate_with_definition(dsd)
except ValueError as value_error:
errors.append(value_error)
if errors:
Expand Down
11 changes: 8 additions & 3 deletions nomenclature/processor/iamc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Tuple, Any
from typing import List
from pydantic import BaseModel, field_validator

from pyam import IAMC_IDX
Expand All @@ -19,15 +19,20 @@ class IamcDataFilter(BaseModel):
def single_input_to_list(cls, v):
return v if isinstance(v, list) else [v]

@property
def criteria(self):
    """Return the filter attributes that are set (not None) as a dict."""
    return {
        key: value
        for key, value in self.model_dump().items()
        if value is not None
    }

def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
error_msg = ""

# check for filter-items that are not defined in the codelists
for dimension in IAMC_IDX:
codelist = getattr(dsd, dimension, None)
if codelist is None:
# no validation if codelist is not defined or filter-item is None
if codelist is None or getattr(self, dimension) is None:
continue
if invalid := codelist.validate_items(getattr(self, dimension, [])):
if invalid := codelist.validate_items(getattr(self, dimension)):
error_msg += (
f"The following {dimension}s are not defined in the "
f"DataStructureDefinition:\n {', '.join(invalid)}\n"
Expand Down
2 changes: 2 additions & 0 deletions tests/data/validation/definitions/variable/variable.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
unit: EJ/yr
- Primary Energy:
unit: EJ/yr
- Primary Energy|Coal:
unit: EJ/yr
- Emissions|CO2:
unit: Mt CO2/yr
- Emissions|CH4:
Expand Down
4 changes: 1 addition & 3 deletions tests/data/validation/validate_data/simple_validation.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
- region: World
variable: Final Energy
- variable: Final Energy
year: 2010
upper_bound: 2.5
lower_bound: 1

11 changes: 11 additions & 0 deletions tests/data/validation/validate_data/validate_data_fails.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# 2005 value passes the validation, but the 2010 value does not
- variable: Primary Energy
upper_bound: 5.
# variable exists only for 'scen_a'
- variable: Primary Energy|Coal
lower_bound: 2
# both upper and lower bound are triggered
- variable: Primary Energy
year: 2005
upper_bound: 1.9
lower_bound: 1.1
37 changes: 36 additions & 1 deletion tests/test_validate_data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import pytest
from conftest import TEST_DATA_DIR

Expand All @@ -12,7 +14,6 @@ def test_DataValidator_from_file():
**{
"criteria_items": [
{
"region": ["World"],
"variable": "Final Energy",
"year": [2010],
"upper_bound": 2.5,
Expand Down Expand Up @@ -57,3 +58,37 @@ def test_DataValidator_validate_with_definition_raises(dimension, match):
dimensions=[dim for dim in ["region", "variable"] if dim != dimension],
)
assert data_validator.validate_with_definition(dsd) is None


def test_DataValidator_apply_no_matching_data(simple_df):
    """`apply()` is a no-op when no data matches the validation criteria."""
    validation_file = DATA_VALIDATION_TEST_DIR / "simple_validation.yaml"
    data_validator = DataValidator.from_file(validation_file)
    # nothing in `simple_df` matches the criteria, so the object is returned unchanged
    assert data_validator.apply(simple_df) == simple_df


def test_DataValidator_apply_fails(simple_df, caplog):
    """`apply()` raises ValueError and logs all failing criteria with the offending rows."""
    data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml"
    data_validator = DataValidator.from_file(data_file)

    # Expected log output: one "Criteria: ..." section per failing criteria item,
    # each followed by the data rows that violate the bounds.
    # NOTE(review): leading whitespace inside this expected message may have been
    # lost in extraction; confirm the indentation against the actual log output.
    failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}):
Criteria: variable: ['Primary Energy'], upper_bound: 5.0
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0

Criteria: variable: ['Primary Energy|Coal'], lower_bound: 2.0
model scenario region variable unit year value
0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5

Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2005 1.0
1 model_a scen_b World Primary Energy EJ/yr 2005 2.0"""

    # validation must abort with a ValueError pointing the user to the log
    with pytest.raises(ValueError, match="Data validation failed"):
        data_validator.apply(simple_df)

    # check if the log message contains the correct information
    assert failed_validation_message in caplog.text