diff --git a/tests/post_training/README.md b/tests/post_training/README.md
index af2639c8ec7..975637fc79d 100644
--- a/tests/post_training/README.md
+++ b/tests/post_training/README.md
@@ -122,3 +122,26 @@ Run test with calibration dataset having batch-size=10 for all models:
 ```bash
 pytest --data=<path_to_datasets> --batch-size 10 tests/post_training/test_quantize_conformance.py
 ```
+
+## Reference data
+
+The reference data section outlines the expected format for defining reference values used during parallel testing.
+
+```yml
+<model_name>_backend_<BACKEND>:
+  metric_value: <expected metric value>
+```
+
+> [!IMPORTANT]
+> The reference file is used for parallel testing.
+> The path to the *_reference_data.yaml files is used during testing and should not be changed without updating the Jenkins scripts.
+
+### Marking tests as xfail
+
+To mark a test as expected to fail (xfail) when a validation metric does not meet expectations, add the following line to the reference data:
+
+```yml
+<model_name>_backend_<BACKEND>:
+  ...
+  metrics_xfail_reason: "Issue-<ticket id>"
+```
diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml
index 4083aab523f..5874e4c507e 100644
--- a/tests/post_training/data/wc_reference_data.yaml
+++ b/tests/post_training/data/wc_reference_data.yaml
@@ -26,9 +26,9 @@ tinyllama_data_aware_gptq_backend_OV:
   metric_value: 0.87134
   num_int4: 94
   num_int8: 124
-  atol: 0.0004 # issue 148819
+  metrics_xfail_reason: "Issue-148819"
 tinyllama_scale_estimation_per_channel_backend_OV:
   metric_value: 0.81389
   num_int4: 188
   num_int8: 124
-  atol: 0.006 # issue 148819
+  metrics_xfail_reason: "Issue-148819"
diff --git a/tests/post_training/pipelines/base.py b/tests/post_training/pipelines/base.py
index 519668a491a..426870c9e97 100644
--- a/tests/post_training/pipelines/base.py
+++ b/tests/post_training/pipelines/base.py
@@ -36,6 +36,7 @@ from tools.memory_monitor import memory_monitor_context
 
 
 DEFAULT_VAL_THREADS = 4
+METRICS_XFAIL_REASON = "metrics_xfail_reason"
 
 
 class BackendType(Enum):
@@ -307,6 +308,7 @@ def validate(self) -> None:
         if metric_value is not None and metric_value_fp32 is not None:
             self.run_info.metric_diff = round(self.run_info.metric_value - self.reference_data["metric_value_fp32"], 5)
 
+        status_msg = None
         if (
             metric_value is not None
             and metric_reference is not None
@@ -314,9 +316,13 @@
         ):
             if metric_value < metric_reference:
                 status_msg = f"Regression: Metric value is less than reference {metric_value} < {metric_reference}"
-                raise ValueError(status_msg)
             if metric_value > metric_reference:
                 status_msg = f"Improvement: Metric value is better than reference {metric_value} > {metric_reference}"
+
+        if status_msg is not None:
+            if METRICS_XFAIL_REASON in self.reference_data:
+                self.run_info.status = f"XFAIL: {self.reference_data[METRICS_XFAIL_REASON]} - {status_msg}"
+            else:
                 raise ValueError(status_msg)
 
     def run(self) -> None:
diff --git a/tests/post_training/test_quantize_conformance.py b/tests/post_training/test_quantize_conformance.py
index de3b0bc96d1..34addb8a2b5 100644
--- a/tests/post_training/test_quantize_conformance.py
+++ b/tests/post_training/test_quantize_conformance.py
@@ -366,3 +366,5 @@ def test_weight_compression(
 
     if err_msg:
         pytest.fail(err_msg)
+    if run_info.status is not None and run_info.status.startswith("XFAIL:"):
+        pytest.xfail(run_info.status)