feat: description for run and eval (#883)
* feat: Add description argument to Evaluator.evaluate_runs
* feat: Add description argument to Runner.run_dataset
TASK: IL-417
SebastianNiehusAA authored May 30, 2024
1 parent f698c83 commit ccd859d
Showing 6 changed files with 103 additions and 47 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -25,6 +25,7 @@
- This is not supported for the `OpenTelemetryTracer` because of technical incompatibilities.
- All exported spans now contain the status of the span.
- We now support python 3.12
- Add `description` parameter to `Evaluator.evaluate_runs` and `Runner.run_dataset` to allow individual descriptions without the need to create a new `Evaluator` or `Runner`.

### Fixes
- The document index client now correctly URL-encodes document names in its queries.
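Taken together, the diffs below let callers vary the stored description per call instead of constructing a fresh `Runner` or `Evaluator`. A minimal usage sketch, assuming the in-memory repositories and the constructor signatures used in this commit's tests; `my_task`, `my_eval_logic`, and `my_examples` are hypothetical placeholders for your own task, evaluation logic, and examples:

```python
from intelligence_layer.evaluation import (
    Evaluator,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)

# Hypothetical placeholders: supply your own Task, EvaluationLogic, and Examples.
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

runner = Runner(my_task, dataset_repository, run_repository, "baseline runner")
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "baseline evaluator",
    my_eval_logic,
)

dataset_id = dataset_repository.create_dataset(
    examples=my_examples, dataset_name="demo dataset"
).id

# The new keyword argument is appended to the fixed descriptions set above.
run_overview = runner.run_dataset(dataset_id, description="prompt v2")
evaluation_overview = evaluator.evaluate_runs(run_overview.id, description="prompt v2")

print(run_overview.description)         # "baseline runner : prompt v2"
print(evaluation_overview.description)  # "baseline evaluator : prompt v2"
```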
@@ -98,6 +98,7 @@ def evaluate_runs(
num_examples: Optional[int] = None,
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
description: Optional[str] = None,
) -> EvaluationOverview:
"""Evaluates all generated outputs in the run.
@@ -116,6 +117,7 @@
Always the first n runs stored in the evaluation repository. Defaults to None.
abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
skip_example_on_any_failure: Flag to skip evaluation on any example for which at least one run fails. Defaults to True.
description: Optional description of the evaluation. Defaults to None.
Returns:
EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -150,14 +152,17 @@ def evaluate_runs(
)

successful_evaluation_count = len(example_evaluations) - failed_evaluation_count
full_description = (
self.description + " : " + description if description else self.description
)
overview = EvaluationOverview(
run_overviews=frozenset(run_overviews),
id=eval_id,
start_date=start,
end_date=utc_now(),
successful_evaluation_count=successful_evaluation_count,
failed_evaluation_count=failed_evaluation_count,
description=self.description,
description=full_description,
)
self._evaluation_repository.store_evaluation_overview(overview)

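The per-call description is not stored verbatim; it is appended to the evaluator's own description with a `" : "` separator, and omitted entirely when no description is passed. A distilled sketch of that rule (illustration only, not library code), mirroring the `full_description` expression added above:

```python
from typing import Optional


def combine_descriptions(base: str, extra: Optional[str]) -> str:
    # Same expression as the `full_description` assignment above: the
    # conditional binds last, so the concatenation only happens when
    # `extra` is truthy; otherwise the fixed description is used as-is.
    return base + " : " + extra if extra else base


assert combine_descriptions("baseline evaluator", None) == "baseline evaluator"
assert combine_descriptions("baseline evaluator", "prompt v2") == "baseline evaluator : prompt v2"
```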
@@ -170,13 +170,15 @@ def evaluate_runs(
num_examples: Optional[int] = None,
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
description: Optional[str] = None,
) -> EvaluationOverview:
self._evaluation_logic.set_previous_run_output_ids([])
return super().evaluate_runs(
*run_ids,
num_examples=num_examples,
skip_example_on_any_failure=skip_example_on_any_failure,
abort_on_error=abort_on_error,
description=description,
)


7 changes: 6 additions & 1 deletion src/intelligence_layer/evaluation/run/runner.py
@@ -77,6 +77,7 @@ def run_dataset(
num_examples: Optional[int] = None,
abort_on_error: bool = False,
max_workers: int = 10,
description: Optional[str] = None,
) -> RunOverview:
"""Generates all outputs for the provided dataset.
@@ -90,6 +91,7 @@
Always the first n examples will be taken.
abort_on_error: Flag to abort all runs when an error occurs. Defaults to False.
max_workers: Number of examples that can be evaluated concurrently. Defaults to 10.
description: An optional description of the run. Defaults to None.
Returns:
An overview of the run. Outputs will not be returned but instead stored in the
@@ -140,14 +142,17 @@ def run(
run_id=run_id, example_id=example_id, output=output
),
)
full_description = (
self.description + " : " + description if description else self.description
)
run_overview = RunOverview(
dataset_id=dataset_id,
id=run_id,
start=start,
end=utc_now(),
failed_example_count=failed_count,
successful_example_count=successful_count,
description=self.description,
description=full_description,
)
self._run_repository.store_run_overview(run_overview)
return run_overview
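`Runner.run_dataset` applies the same rule on the run side. A short sketch of the observable behavior, reusing the hypothetical `runner` and `dataset_id` from the earlier snippet and mirroring the new tests in `tests/evaluation/test_runner.py`:

```python
# Without a per-run description, the overview carries the Runner's own description.
overview = runner.run_dataset(dataset_id)
assert overview.description == runner.description

# With one, both parts end up in the stored RunOverview.
overview = runner.run_dataset(dataset_id, description="temperature sweep")
assert runner.description in overview.description
assert "temperature sweep" in overview.description
```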
92 changes: 48 additions & 44 deletions tests/evaluation/test_evaluator.py
@@ -4,15 +4,7 @@
from pydantic import BaseModel
from pytest import fixture

from intelligence_layer.core import (
InMemoryTaskSpan,
InMemoryTracer,
Input,
NoOpTracer,
Output,
Task,
Tracer,
)
from intelligence_layer.core import Input, Output, Task, Tracer
from intelligence_layer.evaluation import (
AggregatedEvaluation,
AggregationLogic,
@@ -241,6 +233,14 @@ def dataset_id(
).id


@fixture
def run_id(
dataset_id: str,
dummy_runner: Runner[str, str],
) -> str:
return dummy_runner.run_dataset(dataset_id).id


@fixture
def good_dataset_id(
sequence_good_examples: Iterable[Example[str, None]],
@@ -315,6 +315,37 @@ def test_eval_runs_returns_generic_statistics(
assert evaluation_overview.failed_evaluation_count == 1


def test_eval_runs_works_without_description(
run_id: str,
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
in_memory_evaluation_repository: InMemoryEvaluationRepository,
dummy_eval_logic: DummyEvaluationLogic,
) -> None:
evaluator = Evaluator(
in_memory_dataset_repository,
in_memory_run_repository,
in_memory_evaluation_repository,
"",
dummy_eval_logic,
)
evaluation_overview = evaluator.evaluate_runs(run_id)

assert evaluation_overview.description == evaluator.description


def test_eval_runs_uses_correct_description(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str
) -> None:
eval_description = "My evaluation description"
evaluation_overview = dummy_evaluator.evaluate_runs(
run_id, description=eval_description
)

assert dummy_evaluator.description in evaluation_overview.description
assert eval_description in evaluation_overview.description


def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(
dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_runner: Runner[str, str],
@@ -345,50 +376,25 @@ def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(

def test_evaluator_aborts_on_error(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dummy_runner: Runner[str, str],
dataset_id: str,
run_id: str,
) -> None:
run_overview = dummy_runner.run_dataset(dataset_id)

with pytest.raises(RuntimeError):
dummy_evaluator.evaluate_runs(run_overview.id, abort_on_error=True)


def test_eval_and_aggregate_runs_uses_passed_tracer(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dataset_id: str,
dummy_runner: Runner[str, str],
) -> None:
in_memory_tracer = InMemoryTracer()
run_overview = dummy_runner.run_dataset(dataset_id, in_memory_tracer)
evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
dummy_aggregator.aggregate_evaluation(evaluation_overview.id)

entries = in_memory_tracer.entries
assert len(entries) == 3
assert all([isinstance(e, InMemoryTaskSpan) for e in entries])
dummy_evaluator.evaluate_runs(run_id, abort_on_error=True)


def test_eval_and_aggregate_runs_stores_example_evaluations(
dummy_runner: Runner[str, str],
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dataset_id: str,
run_id: str,
) -> None:
evaluation_repository = dummy_evaluator._evaluation_repository
dataset_repository = dummy_evaluator._dataset_repository
examples = list(dataset_repository.examples(dataset_id, str, type(None)))

run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
assert next(iter(aggregation_overview.evaluation_overviews)) == evaluation_overview

@@ -416,13 +422,12 @@ def test_eval_and_aggregate_runs_stores_example_evaluations(


def test_failed_evaluations_returns_only_failed_evaluations(
dummy_runner: Runner[str, str],
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dataset_id: str,
run_id: str,
sequence_examples: Iterable[Example[str, None]],
) -> None:
run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
failed_evaluations = list(
dummy_evaluator.failed_evaluations(evaluation_overview.id)
)
@@ -475,13 +480,12 @@ def test_eval_and_aggregate_runs_stores_aggregated_results(
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dummy_runner: Runner[str, str],
dataset_id: str,
run_id: str,
) -> None:
aggregation_repository = dummy_aggregator._aggregation_repository

run_overview = dummy_runner.run_dataset(dataset_id)
evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
loaded_evaluation_run_overview = aggregation_repository.aggregation_overview(
aggregation_overview.id, DummyAggregatedEvaluationWithResultList
41 changes: 40 additions & 1 deletion tests/evaluation/test_runner.py
@@ -2,7 +2,7 @@

import pytest

from intelligence_layer.core import InMemoryTracer
from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer
from intelligence_layer.evaluation import (
Example,
InMemoryDatasetRepository,
@@ -42,6 +42,41 @@ def test_runner_runs_dataset(
assert failed_runs[0].example.id == examples[1].id


def test_runner_works_without_description(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
sequence_examples: Iterable[Example[str, None]],
) -> None:
examples = list(sequence_examples)
task = DummyTask()
runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "")

dataset_id = in_memory_dataset_repository.create_dataset(
examples=examples, dataset_name=""
).id
overview = runner.run_dataset(dataset_id)
assert overview.description is runner.description


def test_runner_has_correct_description(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
sequence_examples: Iterable[Example[str, None]],
) -> None:
examples = list(sequence_examples)
task = DummyTask()
runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo")

dataset_id = in_memory_dataset_repository.create_dataset(
examples=examples, dataset_name=""
).id
run_description = "bar"
overview = runner.run_dataset(dataset_id, description=run_description)

assert runner.description in overview.description
assert run_description in overview.description


def test_runner_aborts_on_error(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
@@ -83,3 +118,7 @@ def test_runner_runs_n_examples(
assert overview.successful_example_count == 1
assert overview_with_tracer.successful_example_count == 1
assert overview_with_tracer.failed_example_count == 0

entries = tracer.entries
assert len(entries) == 1
assert all([isinstance(e, InMemoryTaskSpan) for e in entries])
