feat: Add skip_example_on_any_failure to evaluate_runs
Task: IL-540
SebastianNiehusAA committed May 30, 2024
1 parent 757110b commit 28fd506
Showing 6 changed files with 111 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@
- `RougeGrader` now uses the `rouge_score` package.

### New Features
- Add `skip_example_on_any_failure` flag to `evaluate_runs` (defaults to `True`). This lets you configure whether an example is kept for evaluation even if it failed for one of the runs (see the usage sketch below).
- Add `how_to_implement_incremental_evaluation`.
- Improve README.md
- Add `export_for_viewing` to tracers to be able to export traces in a unified format similar to opentelemetry.
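
A minimal usage sketch of the new flag, assuming an already configured `Evaluator` instance and two stored run ids (`evaluator`, `run_a_id`, and `run_b_id` below are placeholder names, not part of this commit):

# Sketch: evaluate two runs while keeping examples that failed in only one of them.
overview = evaluator.evaluate_runs(
    run_a_id,
    run_b_id,
    skip_example_on_any_failure=False,  # keep such examples; only their successful outputs are evaluated
)
# The returned EvaluationOverview exposes the success/failure counts checked in the tests below.
print(overview.successful_evaluation_count, overview.failed_evaluation_count)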
@@ -133,6 +133,7 @@ def submit(
*run_ids: str,
num_examples: Optional[int] = None,
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
) -> PartialEvaluationOverview:
argilla_dataset_id = self._client.ensure_dataset_exists(
self._workspace_id,
@@ -144,7 +145,9 @@
run_overviews = self._load_run_overviews(*run_ids)
submit_count = 0
for example, outputs in self._retrieve_eval_logic_input(
run_overviews, num_examples=num_examples
run_overviews,
skip_example_on_any_failure=skip_example_on_any_failure,
num_examples=num_examples,
):
record_sequence = self._evaluation_logic.to_record(example, *outputs)
for record in record_sequence.records:
@@ -246,6 +246,7 @@ def _generate_evaluation_inputs(
self,
examples: Iterable[Example[Input, ExpectedOutput]],
example_outputs_for_example: Iterable[tuple[ExampleOutput[Output], ...]],
skip_example_on_any_failure: bool,
num_examples: Optional[int],
) -> Iterable[
Tuple[
@@ -256,7 +257,7 @@
current_example = 0

for example, example_outputs in zip(examples, example_outputs_for_example):
if any(
if skip_example_on_any_failure and any(
isinstance(output.output, FailedExampleRun)
for output in example_outputs
):
@@ -265,6 +266,7 @@
successful_example_outputs = [
cast(SuccessfulExampleOutput[Output], output)
for output in example_outputs
if not isinstance(output.output, FailedExampleRun)
]

if num_examples and current_example >= num_examples:
@@ -279,6 +281,7 @@
def _retrieve_eval_logic_input(
self,
run_overviews: set[RunOverview],
skip_example_on_any_failure: bool,
num_examples: Optional[int] = None,
) -> Iterable[
Tuple[
@@ -293,6 +296,7 @@
Args:
run_overviews: Run overviews to gather data from.
skip_example_on_any_failure: If True, skip an example entirely as soon as any of its runs failed.
num_examples: Maximum amount of examples to gather. Defaults to None.
Returns:
@@ -303,7 +307,10 @@
dataset_id = next(iter(run_overviews)).dataset_id
examples = self._retrieve_examples(dataset_id)
return self._generate_evaluation_inputs(
examples, example_outputs_for_example, num_examples
examples,
example_outputs_for_example,
skip_example_on_any_failure,
num_examples,
)

def failed_evaluations(
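
For reference, a self-contained sketch of the filtering rule added to `_generate_evaluation_inputs`, using simplified stand-in types: `FailedExampleRun` below is a dummy marker class and `select_outputs` a hypothetical helper, both illustrative only.

class FailedExampleRun:
    """Stand-in marker for a failed run output; not the library class."""


def select_outputs(example_outputs, skip_example_on_any_failure):
    """Return the outputs to evaluate for one example, or None to skip the example."""
    any_failed = any(isinstance(o, FailedExampleRun) for o in example_outputs)
    if skip_example_on_any_failure and any_failed:
        # Default behaviour: a single failed run disqualifies the whole example.
        return None
    # With the flag disabled, keep the example and evaluate only its successful outputs.
    return [o for o in example_outputs if not isinstance(o, FailedExampleRun)]


# One run succeeded ("ok"), the other failed for this example:
assert select_outputs(["ok", FailedExampleRun()], skip_example_on_any_failure=True) is None
assert select_outputs(["ok", FailedExampleRun()], skip_example_on_any_failure=False) == ["ok"]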
@@ -97,6 +97,7 @@ def evaluate_runs(
*run_ids: str,
num_examples: Optional[int] = None,
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
) -> EvaluationOverview:
"""Evaluates all generated outputs in the run.
@@ -114,6 +115,7 @@
num_examples: The number of examples which should be evaluated from the given runs.
Always the first n examples of the dataset. Defaults to None.
abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
skip_example_on_any_failure: Flag to skip evaluation of any example for which at least one run failed. Defaults to True.
Returns:
EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -133,7 +135,9 @@
args[0], eval_id, abort_on_error, *args[1]
),
self._retrieve_eval_logic_input(
run_overviews, num_examples=num_examples
run_overviews,
skip_example_on_any_failure=skip_example_on_any_failure,
num_examples=num_examples,
),
),
desc="Evaluating",
@@ -169,10 +169,14 @@ def evaluate_runs(
*run_ids: str,
num_examples: Optional[int] = None,
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
) -> EvaluationOverview:
self._evaluation_logic.set_previous_run_output_ids([])
return super().evaluate_runs(
*run_ids, num_examples=num_examples, abort_on_error=abort_on_error
*run_ids,
num_examples=num_examples,
skip_example_on_any_failure=skip_example_on_any_failure,
abort_on_error=abort_on_error,
)


87 changes: 87 additions & 0 deletions tests/evaluation/test_evaluator.py
@@ -67,6 +67,26 @@ def do_evaluate_single_output(
return DummyEvaluation(result="pass")


class DummyPairwiseEvaluationLogic(
EvaluationLogic[
str,
str,
None,
DummyEvaluation,
]
):
def do_evaluate(
self,
example: Example[str, None],
*output: SuccessfulExampleOutput[str],
) -> DummyEvaluation:
for out in output:
if out.output == FAIL_IN_EVAL_INPUT:
raise RuntimeError(output)

return DummyEvaluation(result="pass")


class ComparisonEvaluation(BaseModel):
is_equal: bool

@@ -134,11 +154,34 @@ def dummy_eval_logic() -> DummyEvaluationLogic:
return DummyEvaluationLogic()


@fixture
def dummy_pairwise_eval_logic() -> DummyPairwiseEvaluationLogic:
return DummyPairwiseEvaluationLogic()


@fixture
def dummy_aggregate_logic() -> DummyAggregationLogic:
return DummyAggregationLogic()


class SuccessfulDummyTask(Task[str, str]):
def do_run(self, input: str, tracer: Tracer) -> str:
return input


@fixture
def successful_dummy_runner(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
) -> Runner[str, str]:
return Runner(
SuccessfulDummyTask(),
in_memory_dataset_repository,
in_memory_run_repository,
"successful-dummy-runner",
)


@fixture
def dummy_evaluator(
in_memory_dataset_repository: InMemoryDatasetRepository,
@@ -155,6 +198,22 @@ def dummy_evaluator(
)


@fixture
def dummy_pairwise_evaluator(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
in_memory_evaluation_repository: InMemoryEvaluationRepository,
dummy_pairwise_eval_logic: DummyPairwiseEvaluationLogic,
) -> Evaluator[str, str, None, DummyEvaluation]:
return Evaluator(
in_memory_dataset_repository,
in_memory_run_repository,
in_memory_evaluation_repository,
"dummy-evaluator",
dummy_pairwise_eval_logic,
)


@fixture
def dummy_aggregator(
in_memory_dataset_repository: InMemoryDatasetRepository,
@@ -256,6 +315,34 @@ def test_eval_runs_returns_generic_statistics(
assert evaluation_overview.failed_evaluation_count == 1


def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(
dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_runner: Runner[str, str],
successful_dummy_runner: Runner[str, str],
in_memory_dataset_repository: InMemoryDatasetRepository,
) -> None:
examples = [
Example(input="success", expected_output=None, id="example-1"),
Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"),
Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"),
]
dataset_id = in_memory_dataset_repository.create_dataset(
examples=examples, dataset_name="test-dataset"
).id

run_overview_with_failure = dummy_runner.run_dataset(dataset_id)
successful_run_overview = successful_dummy_runner.run_dataset(dataset_id)

evaluation_overview = dummy_pairwise_evaluator.evaluate_runs(
run_overview_with_failure.id,
successful_run_overview.id,
skip_example_on_any_failure=False,
)

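# With skip_example_on_any_failure=False, example-2 is kept: its task failure happened
# in only one run, so it is evaluated with the remaining successful output. Example-3
# still fails inside the evaluation logic, giving 2 successful and 1 failed evaluation.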
assert evaluation_overview.successful_evaluation_count == 2
assert evaluation_overview.failed_evaluation_count == 1


def test_evaluator_aborts_on_error(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[