From 28fd506c95cb762ee786d74fb7e04f9968bc9ed3 Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Tue, 28 May 2024 10:53:21 +0200
Subject: [PATCH] feat: Add `skip_example_on_any_failure` to `evaluate_runs`

Task: IL-540
---
 CHANGELOG.md                                  |  1 +
 .../evaluation/evaluator/argilla_evaluator.py |  5 +-
 .../evaluation/evaluator/base_evaluator.py    | 11 ++-
 .../evaluation/evaluator/evaluator.py         |  6 +-
 .../evaluator/incremental_evaluator.py        |  6 +-
 tests/evaluation/test_evaluator.py            | 87 +++++++++++++++++++
 6 files changed, 111 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e3a05ea4..d0fe6ad36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
 - `RougeGrader` now uses the `rouge_score`-package.
 
 ### New Features
+- Add `skip_example_on_any_failure` flag to `evaluate_runs` (defaults to True). This allows you to configure whether an example should still be evaluated even if one of its runs failed.
 - Add `how_to_implement_incremental_evaluation`.
 - Improve README.md
 - Add `export_for_viewing` to tracers to be able to export traces in a unified format similar to opentelemetry.
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
index e12624f19..6229e48ca 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
@@ -133,6 +133,7 @@ def submit(
         *run_ids: str,
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
+        skip_example_on_any_failure: bool = True,
     ) -> PartialEvaluationOverview:
         argilla_dataset_id = self._client.ensure_dataset_exists(
             self._workspace_id,
@@ -144,7 +145,9 @@
         run_overviews = self._load_run_overviews(*run_ids)
         submit_count = 0
         for example, outputs in self._retrieve_eval_logic_input(
-            run_overviews, num_examples=num_examples
+            run_overviews,
+            skip_example_on_any_failure=skip_example_on_any_failure,
+            num_examples=num_examples,
         ):
             record_sequence = self._evaluation_logic.to_record(example, *outputs)
             for record in record_sequence.records:
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py
index 875642c07..cc0d3f044 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py
@@ -246,6 +246,7 @@ def _generate_evaluation_inputs(
         self,
         examples: Iterable[Example[Input, ExpectedOutput]],
         example_outputs_for_example: Iterable[tuple[ExampleOutput[Output], ...]],
+        skip_example_on_any_failure: bool,
         num_examples: Optional[int],
     ) -> Iterable[
         Tuple[
@@ -256,7 +257,7 @@
     ]:
         current_example = 0
         for example, example_outputs in zip(examples, example_outputs_for_example):
-            if any(
+            if skip_example_on_any_failure and any(
                 isinstance(output.output, FailedExampleRun)
                 for output in example_outputs
             ):
@@ -265,6 +266,7 @@
             successful_example_outputs = [
                 cast(SuccessfulExampleOutput[Output], output)
                 for output in example_outputs
+                if not isinstance(output.output, FailedExampleRun)
             ]
 
             if num_examples and current_example >= num_examples:
@@ -279,6 +281,7 @@
     def _retrieve_eval_logic_input(
         self,
         run_overviews: set[RunOverview],
+        skip_example_on_any_failure: bool,
         num_examples: Optional[int] = None,
     ) -> Iterable[
        Tuple[
@@ -293,6 +296,7 @@ def _retrieve_eval_logic_input(
 
         Args:
             run_overviews: Run overviews to gather data from.
+            skip_example_on_any_failure: Whether to skip an example entirely if any of its runs failed.
             num_examples: Maximum amount of examples to gather. Defaults to None.
 
         Returns:
@@ -303,7 +307,10 @@
         dataset_id = next(iter(run_overviews)).dataset_id
         examples = self._retrieve_examples(dataset_id)
         return self._generate_evaluation_inputs(
-            examples, example_outputs_for_example, num_examples
+            examples,
+            example_outputs_for_example,
+            skip_example_on_any_failure,
+            num_examples,
         )
 
     def failed_evaluations(
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
index b22c21660..b539b1bef 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
@@ -97,6 +97,7 @@ def evaluate_runs(
         *run_ids: str,
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
+        skip_example_on_any_failure: bool = True,
     ) -> EvaluationOverview:
         """Evaluates all generated outputs in the run.
 
@@ -114,6 +115,7 @@
             num_examples: The number of examples which should be evaluated from the given runs.
                 Always the first n runs stored in the evaluation repository. Defaults to None.
             abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
+            skip_example_on_any_failure: Flag to skip evaluation of any example for which at least one run failed. Defaults to True.
 
         Returns:
             EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -133,7 +135,9 @@
                            args[0], eval_id, abort_on_error, *args[1]
                        ),
                        self._retrieve_eval_logic_input(
-                            run_overviews, num_examples=num_examples
+                            run_overviews,
+                            skip_example_on_any_failure=skip_example_on_any_failure,
+                            num_examples=num_examples,
                        ),
                    ),
                    desc="Evaluating",
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
index cf24f55c1..e58eebbee 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
@@ -169,10 +169,14 @@
         *run_ids: str,
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
+        skip_example_on_any_failure: bool = True,
     ) -> EvaluationOverview:
         self._evaluation_logic.set_previous_run_output_ids([])
         return super().evaluate_runs(
-            *run_ids, num_examples=num_examples, abort_on_error=abort_on_error
+            *run_ids,
+            num_examples=num_examples,
+            skip_example_on_any_failure=skip_example_on_any_failure,
+            abort_on_error=abort_on_error,
         )
 
 
diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py
index e6a5d1c33..fad8ba1cf 100644
--- a/tests/evaluation/test_evaluator.py
+++ b/tests/evaluation/test_evaluator.py
@@ -67,6 +67,26 @@ def do_evaluate_single_output(
         return DummyEvaluation(result="pass")
 
 
+class DummyPairwiseEvaluationLogic(
+    EvaluationLogic[
+        str,
+        str,
+        None,
+        DummyEvaluation,
+    ]
+):
+    def do_evaluate(
+        self,
+        example: Example[str, None],
+        *output: SuccessfulExampleOutput[str],
+    ) -> DummyEvaluation:
+        for out in output:
+            if out.output == FAIL_IN_EVAL_INPUT:
+                raise RuntimeError(output)
+
+        return DummyEvaluation(result="pass")
+
+
 class ComparisonEvaluation(BaseModel):
     is_equal: bool
 
@@ -134,11 +154,34 @@ def dummy_eval_logic() -> DummyEvaluationLogic: return DummyEvaluationLogic() +@fixture +def dummy_pairwise_eval_logic() -> DummyPairwiseEvaluationLogic: + return DummyPairwiseEvaluationLogic() + + @fixture def dummy_aggregate_logic() -> DummyAggregationLogic: return DummyAggregationLogic() +class SuccessfulDummyTask(Task[str, str]): + def do_run(self, input: str, tracer: Tracer) -> str: + return input + + +@fixture +def successful_dummy_runner( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, +) -> Runner[str, str]: + return Runner( + SuccessfulDummyTask(), + in_memory_dataset_repository, + in_memory_run_repository, + "successful-dummy-runner", + ) + + @fixture def dummy_evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, @@ -155,6 +198,22 @@ def dummy_evaluator( ) +@fixture +def dummy_pairwise_evaluator( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, + dummy_pairwise_eval_logic: DummyPairwiseEvaluationLogic, +) -> Evaluator[str, str, None, DummyEvaluation]: + return Evaluator( + in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, + "dummy-evaluator", + dummy_pairwise_eval_logic, + ) + + @fixture def dummy_aggregator( in_memory_dataset_repository: InMemoryDatasetRepository, @@ -256,6 +315,34 @@ def test_eval_runs_returns_generic_statistics( assert evaluation_overview.failed_evaluation_count == 1 +def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false( + dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_runner: Runner[str, str], + successful_dummy_runner: Runner[str, str], + in_memory_dataset_repository: InMemoryDatasetRepository, +) -> None: + examples = [ + Example(input="success", expected_output=None, id="example-1"), + Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"), + Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"), + ] + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="test-dataset" + ).id + + run_overview_with_failure = dummy_runner.run_dataset(dataset_id) + successful_run_overview = successful_dummy_runner.run_dataset(dataset_id) + + evaluation_overview = dummy_pairwise_evaluator.evaluate_runs( + run_overview_with_failure.id, + successful_run_overview.id, + skip_example_on_any_failure=False, + ) + + assert evaluation_overview.successful_evaluation_count == 2 + assert evaluation_overview.failed_evaluation_count == 1 + + def test_evaluator_aborts_on_error( dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], dummy_aggregator: Aggregator[
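A minimal usage sketch of the new flag, mirroring the fixtures in tests/evaluation/test_evaluator.py: the import paths and the EchoTask, FailingTask, and SimpleEvaluationLogic helpers below are illustrative assumptions rather than part of this patch; only the `skip_example_on_any_failure` argument to `evaluate_runs` is the API added here.

# Illustrative sketch (assumed imports and helper names). It shows that with
# skip_example_on_any_failure=False an example is still evaluated against the
# runs that succeeded, even when another run failed for that example.
from pydantic import BaseModel

from intelligence_layer.core import Task, Tracer
from intelligence_layer.evaluation import (
    EvaluationLogic,
    Evaluator,
    Example,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
    SuccessfulExampleOutput,
)


class EchoTask(Task[str, str]):
    # Returns its input unchanged, so its run always succeeds.
    def do_run(self, input: str, tracer: Tracer) -> str:
        return input


class FailingTask(Task[str, str]):
    # Raises for every example, so its run only produces failed outputs.
    def do_run(self, input: str, tracer: Tracer) -> str:
        raise RuntimeError("task failed")


class SimpleEvaluation(BaseModel):
    result: str


class SimpleEvaluationLogic(EvaluationLogic[str, str, None, SimpleEvaluation]):
    # Receives only the successful outputs for an example.
    def do_evaluate(
        self, example: Example[str, None], *output: SuccessfulExampleOutput[str]
    ) -> SimpleEvaluation:
        return SimpleEvaluation(result="pass")


dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

dataset_id = dataset_repository.create_dataset(
    examples=[Example(input="some input", expected_output=None)],
    dataset_name="demo-dataset",
).id

echo_runner = Runner(EchoTask(), dataset_repository, run_repository, "echo-runner")
failing_runner = Runner(FailingTask(), dataset_repository, run_repository, "failing-runner")

successful_run = echo_runner.run_dataset(dataset_id)
failing_run = failing_runner.run_dataset(dataset_id)

evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "demo-evaluator",
    SimpleEvaluationLogic(),
)

# With the default skip_example_on_any_failure=True this example would be skipped,
# because one of its runs failed; with False it is evaluated against the echo run.
evaluation_overview = evaluator.evaluate_runs(
    successful_run.id,
    failing_run.id,
    skip_example_on_any_failure=False,
)
print(evaluation_overview.successful_evaluation_count)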