From 28fd506c95cb762ee786d74fb7e04f9968bc9ed3 Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Tue, 28 May 2024 10:53:21 +0200
Subject: [PATCH] feat: Add `skip_example_on_any_failure` to `evaluate_runs`

Task: IL-540
---
 CHANGELOG.md                                  |  1 +
 .../evaluation/evaluator/argilla_evaluator.py |  5 +-
 .../evaluation/evaluator/base_evaluator.py    | 11 ++-
 .../evaluation/evaluator/evaluator.py         |  6 +-
 .../evaluator/incremental_evaluator.py        |  6 +-
 tests/evaluation/test_evaluator.py            | 87 +++++++++++++++++++
 6 files changed, 111 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e3a05ea4..d0fe6ad36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
 - `RougeGrader` now uses the `rouge_score`-package.
 
 ### New Features
+- Add `skip_example_on_any_failure` flag to `evaluate_runs` (defaults to True). This allows you to configure whether an example should still be evaluated even if one of its runs failed.
 - Add `how_to_implement_incremental_evaluation`.
 - Improve README.md
 - Add `export_for_viewing` to tracers to be able to export traces in a unified format similar to opentelemetry.
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
index e12624f19..6229e48ca 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
@@ -133,6 +133,7 @@ def submit(
         *run_ids: str,
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
+        skip_example_on_any_failure: bool = True,
     ) -> PartialEvaluationOverview:
         argilla_dataset_id = self._client.ensure_dataset_exists(
             self._workspace_id,
@@ -144,7 +145,9 @@
         run_overviews = self._load_run_overviews(*run_ids)
         submit_count = 0
         for example, outputs in self._retrieve_eval_logic_input(
-            run_overviews, num_examples=num_examples
+            run_overviews,
+            skip_example_on_any_failure=skip_example_on_any_failure,
+            num_examples=num_examples,
         ):
             record_sequence = self._evaluation_logic.to_record(example, *outputs)
             for record in record_sequence.records:
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py
index 875642c07..cc0d3f044 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py
@@ -246,6 +246,7 @@ def _generate_evaluation_inputs(
         self,
         examples: Iterable[Example[Input, ExpectedOutput]],
         example_outputs_for_example: Iterable[tuple[ExampleOutput[Output], ...]],
+        skip_example_on_any_failure: bool,
         num_examples: Optional[int],
     ) -> Iterable[
         Tuple[
@@ -256,7 +257,7 @@
     ]:
         current_example = 0
         for example, example_outputs in zip(examples, example_outputs_for_example):
-            if any(
+            if skip_example_on_any_failure and any(
                 isinstance(output.output, FailedExampleRun)
                 for output in example_outputs
             ):
@@ -265,6 +266,7 @@
             successful_example_outputs = [
                 cast(SuccessfulExampleOutput[Output], output)
                 for output in example_outputs
+                if not isinstance(output.output, FailedExampleRun)
             ]
 
             if num_examples and current_example >= num_examples:
@@ -279,6 +281,7 @@
     def _retrieve_eval_logic_input(
         self,
         run_overviews: set[RunOverview],
+        skip_example_on_any_failure: bool,
         num_examples: Optional[int] = None,
     ) -> Iterable[
        Tuple[
@@ -293,6 +296,7 @@ def _retrieve_eval_logic_input(
 
         Args:
             run_overviews: Run overviews to gather data from.
+            skip_example_on_any_failure: Whether to skip an example entirely if any of its runs failed.
             num_examples: Maximum amount of examples to gather. Defaults to None.
 
         Returns:
@@ -303,7 +307,10 @@
         dataset_id = next(iter(run_overviews)).dataset_id
         examples = self._retrieve_examples(dataset_id)
         return self._generate_evaluation_inputs(
-            examples, example_outputs_for_example, num_examples
+            examples,
+            example_outputs_for_example,
+            skip_example_on_any_failure,
+            num_examples,
         )
 
     def failed_evaluations(
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
index b22c21660..b539b1bef 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
@@ -97,6 +97,7 @@ def evaluate_runs(
         *run_ids: str,
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
+        skip_example_on_any_failure: bool = True,
     ) -> EvaluationOverview:
         """Evaluates all generated outputs in the run.
 
@@ -114,6 +115,7 @@
             num_examples: The number of examples which should be evaluated from the given runs.
                 Always the first n runs stored in the evaluation repository. Defaults to None.
             abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
+            skip_example_on_any_failure: Flag to skip evaluation of any example for which at least one run failed. Defaults to True.
 
         Returns:
             EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -133,7 +135,9 @@
                            args[0], eval_id, abort_on_error, *args[1]
                        ),
                        self._retrieve_eval_logic_input(
-                            run_overviews, num_examples=num_examples
+                            run_overviews,
+                            skip_example_on_any_failure=skip_example_on_any_failure,
+                            num_examples=num_examples,
                        ),
                    ),
                    desc="Evaluating",
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
index cf24f55c1..e58eebbee 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
@@ -169,10 +169,14 @@
         *run_ids: str,
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
+        skip_example_on_any_failure: bool = True,
     ) -> EvaluationOverview:
         self._evaluation_logic.set_previous_run_output_ids([])
         return super().evaluate_runs(
-            *run_ids, num_examples=num_examples, abort_on_error=abort_on_error
+            *run_ids,
+            num_examples=num_examples,
+            skip_example_on_any_failure=skip_example_on_any_failure,
+            abort_on_error=abort_on_error,
         )
 
 
diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py
index e6a5d1c33..fad8ba1cf 100644
--- a/tests/evaluation/test_evaluator.py
+++ b/tests/evaluation/test_evaluator.py
@@ -67,6 +67,26 @@ def do_evaluate_single_output(
         return DummyEvaluation(result="pass")
 
 
+class DummyPairwiseEvaluationLogic(
+    EvaluationLogic[
+        str,
+        str,
+        None,
+        DummyEvaluation,
+    ]
+):
+    def do_evaluate(
+        self,
+        example: Example[str, None],
+        *output: SuccessfulExampleOutput[str],
+    ) -> DummyEvaluation:
+        for out in output:
+            if out.output == FAIL_IN_EVAL_INPUT:
+                raise RuntimeError(output)
+
+        return DummyEvaluation(result="pass")
+
+
 class ComparisonEvaluation(BaseModel):
     is_equal: bool
 
@@ -134,11 +154,34 @@ def dummy_eval_logic() -> DummyEvaluationLogic: return DummyEvaluationLogic() +@fixture +def dummy_pairwise_eval_logic() -> DummyPairwiseEvaluationLogic: + return DummyPairwiseEvaluationLogic() + + @fixture def dummy_aggregate_logic() -> DummyAggregationLogic: return DummyAggregationLogic() +class SuccessfulDummyTask(Task[str, str]): + def do_run(self, input: str, tracer: Tracer) -> str: + return input + + +@fixture +def successful_dummy_runner( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, +) -> Runner[str, str]: + return Runner( + SuccessfulDummyTask(), + in_memory_dataset_repository, + in_memory_run_repository, + "successful-dummy-runner", + ) + + @fixture def dummy_evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, @@ -155,6 +198,22 @@ def dummy_evaluator( ) +@fixture +def dummy_pairwise_evaluator( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, + dummy_pairwise_eval_logic: DummyPairwiseEvaluationLogic, +) -> Evaluator[str, str, None, DummyEvaluation]: + return Evaluator( + in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, + "dummy-evaluator", + dummy_pairwise_eval_logic, + ) + + @fixture def dummy_aggregator( in_memory_dataset_repository: InMemoryDatasetRepository, @@ -256,6 +315,34 @@ def test_eval_runs_returns_generic_statistics( assert evaluation_overview.failed_evaluation_count == 1 +def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false( + dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_runner: Runner[str, str], + successful_dummy_runner: Runner[str, str], + in_memory_dataset_repository: InMemoryDatasetRepository, +) -> None: + examples = [ + Example(input="success", expected_output=None, id="example-1"), + Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"), + Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"), + ] + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="test-dataset" + ).id + + run_overview_with_failure = dummy_runner.run_dataset(dataset_id) + successful_run_overview = successful_dummy_runner.run_dataset(dataset_id) + + evaluation_overview = dummy_pairwise_evaluator.evaluate_runs( + run_overview_with_failure.id, + successful_run_overview.id, + skip_example_on_any_failure=False, + ) + + assert evaluation_overview.successful_evaluation_count == 2 + assert evaluation_overview.failed_evaluation_count == 1 + + def test_evaluator_aborts_on_error( dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], dummy_aggregator: Aggregator[
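A minimal usage sketch of the new flag, mirroring the fixtures in tests/evaluation/test_evaluator.py: the import paths and the EchoTask, FailingTask, and SimpleEvaluationLogic helpers below are illustrative assumptions rather than part of this patch; only the `skip_example_on_any_failure` argument to `evaluate_runs` is the API added here.

# Illustrative sketch (assumed imports and helper names). It shows that with
# skip_example_on_any_failure=False an example is still evaluated against the
# runs that succeeded, even when another run failed for that example.
from pydantic import BaseModel

from intelligence_layer.core import Task, Tracer
from intelligence_layer.evaluation import (
    EvaluationLogic,
    Evaluator,
    Example,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
    SuccessfulExampleOutput,
)


class EchoTask(Task[str, str]):
    # Returns its input unchanged, so its run always succeeds.
    def do_run(self, input: str, tracer: Tracer) -> str:
        return input


class FailingTask(Task[str, str]):
    # Raises for every example, so its run only produces failed outputs.
    def do_run(self, input: str, tracer: Tracer) -> str:
        raise RuntimeError("task failed")


class SimpleEvaluation(BaseModel):
    result: str


class SimpleEvaluationLogic(EvaluationLogic[str, str, None, SimpleEvaluation]):
    # Receives only the successful outputs for an example.
    def do_evaluate(
        self, example: Example[str, None], *output: SuccessfulExampleOutput[str]
    ) -> SimpleEvaluation:
        return SimpleEvaluation(result="pass")


dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

dataset_id = dataset_repository.create_dataset(
    examples=[Example(input="some input", expected_output=None)],
    dataset_name="demo-dataset",
).id

echo_runner = Runner(EchoTask(), dataset_repository, run_repository, "echo-runner")
failing_runner = Runner(FailingTask(), dataset_repository, run_repository, "failing-runner")

successful_run = echo_runner.run_dataset(dataset_id)
failing_run = failing_runner.run_dataset(dataset_id)

evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "demo-evaluator",
    SimpleEvaluationLogic(),
)

# With the default skip_example_on_any_failure=True this example would be skipped,
# because one of its runs failed; with False it is evaluated against the echo run.
evaluation_overview = evaluator.evaluate_runs(
    successful_run.id,
    failing_run.id,
    skip_example_on_any_failure=False,
)
print(evaluation_overview.successful_evaluation_count)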