diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py
index dfc5158c..8afab90d 100644
--- a/nomenclature/processor/data_validator.py
+++ b/nomenclature/processor/data_validator.py
@@ -1,7 +1,11 @@
+import logging
 from pathlib import Path
+import textwrap
 from typing import List, Union
 
 import yaml
+from pyam import IamDataFrame
+from pyam.logging import adjust_log_level
 
 from nomenclature.definition import DataStructureDefinition
 from nomenclature.error import ErrorCollector
@@ -9,6 +13,8 @@
 from nomenclature.processor import Processor
 from nomenclature.processor.utils import get_relative_path
 
+logger = logging.getLogger(__name__)
+
 
 class DataValidationCriteria(IamcDataFilter):
     """Data validation criteria"""
@@ -29,14 +35,59 @@ def from_file(cls, file: Union[Path, str]) -> "DataValidator":
             content = yaml.safe_load(f)
         return cls(file=file, criteria_items=content)
 
-    def apply(self):
-        pass
+    def apply(self, df: IamDataFrame) -> IamDataFrame:
+        """Validate the timeseries data in `df` against all criteria items
+
+        Parameters
+        ----------
+        df : IamDataFrame
+            Scenario data to be validated
+
+        Returns
+        -------
+        IamDataFrame
+            The object `df` as passed, if no datapoint fails the validation
+
+        Raises
+        ------
+        ValueError
+            If any datapoint fails a validation criterion; the criteria and
+            the failing datapoints are written to the log before raising
+        """
+        error_list = []
+
+        # temporarily adjust the log level while running the validations
+        with adjust_log_level():
+            for item in self.criteria_items:
+                # `criteria` dumps the model on each access, so get it only once
+                criteria = item.criteria
+                failed_validation = df.validate(**criteria)
+                if failed_validation is None:
+                    continue
+                error_list.append(
+                    " Criteria: "
+                    + ", ".join(f"{key}: {value}" for key, value in criteria.items())
+                )
+                error_list.append(
+                    textwrap.indent(str(failed_validation), prefix=" ") + "\n"
+                )
+
+        if error_list:
+            logger.error(
+                "Failed data validation (file %s):\n%s",
+                get_relative_path(self.file),
+                "\n".join(error_list),
+            )
+            raise ValueError(
+                "Data validation failed. Please check the log for details."
+            )
+        return df
 
     def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
         errors = ErrorCollector(description=f"in file '{self.file}'")
-        for data in self.criteria_items:
+        for criterion in self.criteria_items:
             try:
-                data.validate_with_definition(dsd)
+                criterion.validate_with_definition(dsd)
             except ValueError as value_error:
                 errors.append(value_error)
         if errors:
diff --git a/nomenclature/processor/iamc.py b/nomenclature/processor/iamc.py
index 8dc3936a..163b22a0 100644
--- a/nomenclature/processor/iamc.py
+++ b/nomenclature/processor/iamc.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Any
+from typing import List
 
 from pydantic import BaseModel, field_validator
 from pyam import IAMC_IDX
@@ -19,15 +19,25 @@ class IamcDataFilter(BaseModel):
     def single_input_to_list(cls, v):
         return v if isinstance(v, list) else [v]
 
+    @property
+    def criteria(self) -> dict:
+        """Return all fields of this filter that are not None as a dictionary"""
+        return {
+            key: value
+            for key, value in self.model_dump().items()
+            if value is not None
+        }
+
     def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
         error_msg = ""
 
         # check for filter-items that are not defined in the codelists
         for dimension in IAMC_IDX:
             codelist = getattr(dsd, dimension, None)
-            if codelist is None:
+            # no validation if codelist is not defined or filter-item is None
+            if codelist is None or getattr(self, dimension) is None:
                 continue
-            if invalid := codelist.validate_items(getattr(self, dimension, [])):
+            if invalid := codelist.validate_items(getattr(self, dimension)):
                 error_msg += (
                     f"The following {dimension}s are not defined in the "
                     f"DataStructureDefinition:\n {', '.join(invalid)}\n"
diff --git a/tests/data/validation/definitions/variable/variable.yaml b/tests/data/validation/definitions/variable/variable.yaml
index 59c7edf8..0d6094ea 100644
--- a/tests/data/validation/definitions/variable/variable.yaml
+++ b/tests/data/validation/definitions/variable/variable.yaml
@@ -2,6 +2,8 @@
     unit: EJ/yr
 - Primary Energy:
     unit: EJ/yr
+- Primary Energy|Coal:
+    unit: EJ/yr
 - Emissions|CO2:
     unit: Mt CO2/yr
 - Emissions|CH4:
diff --git a/tests/data/validation/validate_data/simple_validation.yaml b/tests/data/validation/validate_data/simple_validation.yaml
index 2694b20f..a7ec5ee9 100644
--- a/tests/data/validation/validate_data/simple_validation.yaml
+++ b/tests/data/validation/validate_data/simple_validation.yaml
@@ -1,6 +1,4 @@
-  - region: World
-    variable: Final Energy
+  - variable: Final Energy
     year: 2010
     upper_bound: 2.5
     lower_bound: 1
-
diff --git a/tests/data/validation/validate_data/validate_data_fails.yaml b/tests/data/validation/validate_data/validate_data_fails.yaml
new file mode 100644
index 00000000..e576e3a4
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_data_fails.yaml
@@ -0,0 +1,11 @@
+  # 2005 value passes the validation, but the 2010 value does not
+  - variable: Primary Energy
+    upper_bound: 5.
+# variable exists only for 'scen_a'
+  - variable: Primary Energy|Coal
+    lower_bound: 2
+# both upper and lower bound are triggered
+  - variable: Primary Energy
+    year: 2005
+    upper_bound: 1.9
+    lower_bound: 1.1
diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py
index 4d339750..2e451a78 100644
--- a/tests/test_validate_data.py
+++ b/tests/test_validate_data.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import pytest
 from conftest import TEST_DATA_DIR
 
@@ -12,7 +14,6 @@ def test_DataValidator_from_file():
         **{
             "criteria_items": [
                 {
-                    "region": ["World"],
                     "variable": "Final Energy",
                     "year": [2010],
                     "upper_bound": 2.5,
@@ -57,3 +58,37 @@ def test_DataValidator_validate_with_definition_raises(dimension, match):
             dimensions=[dim for dim in ["region", "variable"] if dim != dimension],
         )
         assert data_validator.validate_with_definition(dsd) is None
+
+
+def test_DataValidator_apply_no_matching_data(simple_df):
+    data_validator = DataValidator.from_file(
+        DATA_VALIDATION_TEST_DIR / "simple_validation.yaml"
+    )
+    # no data matches validation criteria, `apply()` passes and returns unchanged object
+    assert data_validator.apply(simple_df) == simple_df
+
+
+def test_DataValidator_apply_fails(simple_df, caplog):
+    data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml"
+    data_validator = DataValidator.from_file(data_file)
+
+    failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}):
+ Criteria: variable: ['Primary Energy'], upper_bound: 5.0
+      model scenario region        variable   unit  year  value
+ 0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0
+ 1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0
+
+ Criteria: variable: ['Primary Energy|Coal'], lower_bound: 2.0
+      model scenario region             variable   unit  year  value
+ 0  model_a   scen_a  World  Primary Energy|Coal  EJ/yr  2005    0.5
+
+ Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1
+      model scenario region        variable   unit  year  value
+ 0  model_a   scen_a  World  Primary Energy  EJ/yr  2005    1.0
+ 1  model_a   scen_b  World  Primary Energy  EJ/yr  2005    2.0"""
+
+    with pytest.raises(ValueError, match="Data validation failed"):
+        data_validator.apply(simple_df)
+
+    # check if the log message contains the correct information
+    assert failed_validation_message in caplog.text