feat: description for run and eval (#883)
* feat: Add description argument to Evaluator.evaluate_runs
* feat: Add description argument to Runner.run_dataset
TASK: IL-417
SebastianNiehusAA authored May 30, 2024
1 parent f698c83 commit ccd859d
Showing 6 changed files with 103 additions and 47 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -25,6 +25,7 @@
- This is not supported for the `OpenTelemetryTracer` because of technical incompatibilities.
- All exported spans now contain the status of the span.
- We now support python 3.12
- Add `description` parameter to `Evaluator.evaluate_runs` and `Runner.run_dataset` to allow individual descriptions without the need to create a new `Evaluator` or `Runner`.

### Fixes
- The document index client now correctly URL-encodes document names in its queries.
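Taken together, the diffs below let callers vary the stored description per call instead of constructing a fresh `Runner` or `Evaluator`. A minimal usage sketch, assuming the in-memory repositories and the constructor signatures used in this commit's tests; `my_task`, `my_eval_logic`, and `my_examples` are hypothetical placeholders for your own task, evaluation logic, and examples:

```python
from intelligence_layer.evaluation import (
    Evaluator,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)

# Hypothetical placeholders: supply your own Task, EvaluationLogic, and Examples.
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

runner = Runner(my_task, dataset_repository, run_repository, "baseline runner")
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "baseline evaluator",
    my_eval_logic,
)

dataset_id = dataset_repository.create_dataset(
    examples=my_examples, dataset_name="demo dataset"
).id

# The new keyword argument is appended to the fixed descriptions set above.
run_overview = runner.run_dataset(dataset_id, description="prompt v2")
evaluation_overview = evaluator.evaluate_runs(run_overview.id, description="prompt v2")

print(run_overview.description)         # "baseline runner : prompt v2"
print(evaluation_overview.description)  # "baseline evaluator : prompt v2"
```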
@@ -98,6 +98,7 @@ def evaluate_runs(
num_examples: Optional[int] = None,
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
description: Optional[str] = None,
) -> EvaluationOverview:
"""Evaluates all generated outputs in the run.
@@ -116,6 +117,7 @@
Always the first n runs stored in the evaluation repository. Defaults to None.
abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
skip_example_on_any_failure: Flag to skip evaluation on any example for which at least one run fails. Defaults to True.
description: Optional description of the evaluation. Defaults to None.
Returns:
EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -150,14 +152,17 @@ def evaluate_runs(
)

successful_evaluation_count = len(example_evaluations) - failed_evaluation_count
full_description = (
self.description + " : " + description if description else self.description
)
overview = EvaluationOverview(
run_overviews=frozenset(run_overviews),
id=eval_id,
start_date=start,
end_date=utc_now(),
successful_evaluation_count=successful_evaluation_count,
failed_evaluation_count=failed_evaluation_count,
description=self.description,
description=full_description,
)
self._evaluation_repository.store_evaluation_overview(overview)

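The per-call description is not stored verbatim; it is appended to the evaluator's own description with a `" : "` separator, and omitted entirely when no description is passed. A distilled sketch of that rule (illustration only, not library code), mirroring the `full_description` expression added above:

```python
from typing import Optional


def combine_descriptions(base: str, extra: Optional[str]) -> str:
    # Same expression as the `full_description` assignment above: the
    # conditional binds last, so the concatenation only happens when
    # `extra` is truthy; otherwise the fixed description is used as-is.
    return base + " : " + extra if extra else base


assert combine_descriptions("baseline evaluator", None) == "baseline evaluator"
assert combine_descriptions("baseline evaluator", "prompt v2") == "baseline evaluator : prompt v2"
```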
@@ -170,13 +170,15 @@ def evaluate_runs(
num_examples: Optional[int] = None,
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
description: Optional[str] = None,
) -> EvaluationOverview:
self._evaluation_logic.set_previous_run_output_ids([])
return super().evaluate_runs(
*run_ids,
num_examples=num_examples,
skip_example_on_any_failure=skip_example_on_any_failure,
abort_on_error=abort_on_error,
description=description,
)


7 changes: 6 additions & 1 deletion src/intelligence_layer/evaluation/run/runner.py
@@ -77,6 +77,7 @@ def run_dataset(
num_examples: Optional[int] = None,
abort_on_error: bool = False,
max_workers: int = 10,
description: Optional[str] = None,
) -> RunOverview:
"""Generates all outputs for the provided dataset.
@@ -90,6 +91,7 @@
Always the first n examples will be taken.
abort_on_error: Flag to abort all runs when an error occurs. Defaults to False.
max_workers: Number of examples that can be evaluated concurrently. Defaults to 10.
description: An optional description of the run. Defaults to None.
Returns:
An overview of the run. Outputs will not be returned but instead stored in the
@@ -140,14 +142,17 @@ def run(
run_id=run_id, example_id=example_id, output=output
),
)
full_description = (
self.description + " : " + description if description else self.description
)
run_overview = RunOverview(
dataset_id=dataset_id,
id=run_id,
start=start,
end=utc_now(),
failed_example_count=failed_count,
successful_example_count=successful_count,
description=self.description,
description=full_description,
)
self._run_repository.store_run_overview(run_overview)
return run_overview
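`Runner.run_dataset` applies the same rule on the run side. A short sketch of the observable behavior, reusing the hypothetical `runner` and `dataset_id` from the earlier snippet and mirroring the new tests in `tests/evaluation/test_runner.py`:

```python
# Without a per-run description, the overview carries the Runner's own description.
overview = runner.run_dataset(dataset_id)
assert overview.description == runner.description

# With one, both parts end up in the stored RunOverview.
overview = runner.run_dataset(dataset_id, description="temperature sweep")
assert runner.description in overview.description
assert "temperature sweep" in overview.description
```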
92 changes: 48 additions & 44 deletions tests/evaluation/test_evaluator.py
@@ -4,15 +4,7 @@
from pydantic import BaseModel
from pytest import fixture

from intelligence_layer.core import (
InMemoryTaskSpan,
InMemoryTracer,
Input,
NoOpTracer,
Output,
Task,
Tracer,
)
from intelligence_layer.core import Input, Output, Task, Tracer
from intelligence_layer.evaluation import (
AggregatedEvaluation,
AggregationLogic,
@@ -241,6 +233,14 @@ def dataset_id(
).id


@fixture
def run_id(
dataset_id: str,
dummy_runner: Runner[str, str],
) -> str:
return dummy_runner.run_dataset(dataset_id).id


@fixture
def good_dataset_id(
sequence_good_examples: Iterable[Example[str, None]],
@@ -315,6 +315,37 @@ def test_eval_runs_returns_generic_statistics(
assert evaluation_overview.failed_evaluation_count == 1


def test_eval_runs_works_without_description(
run_id: str,
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
in_memory_evaluation_repository: InMemoryEvaluationRepository,
dummy_eval_logic: DummyEvaluationLogic,
) -> None:
evaluator = Evaluator(
in_memory_dataset_repository,
in_memory_run_repository,
in_memory_evaluation_repository,
"",
dummy_eval_logic,
)
evaluation_overview = evaluator.evaluate_runs(run_id)

assert evaluation_overview.description == evaluator.description


def test_eval_runs_uses_correct_description(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str
) -> None:
eval_description = "My evaluation description"
evaluation_overview = dummy_evaluator.evaluate_runs(
run_id, description=eval_description
)

assert dummy_evaluator.description in evaluation_overview.description
assert eval_description in evaluation_overview.description


def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(
dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_runner: Runner[str, str],
@@ -345,50 +376,25 @@ def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(

def test_evaluator_aborts_on_error(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dummy_runner: Runner[str, str],
dataset_id: str,
run_id: str,
) -> None:
run_overview = dummy_runner.run_dataset(dataset_id)

with pytest.raises(RuntimeError):
dummy_evaluator.evaluate_runs(run_overview.id, abort_on_error=True)


def test_eval_and_aggregate_runs_uses_passed_tracer(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dataset_id: str,
dummy_runner: Runner[str, str],
) -> None:
in_memory_tracer = InMemoryTracer()
run_overview = dummy_runner.run_dataset(dataset_id, in_memory_tracer)
evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
dummy_aggregator.aggregate_evaluation(evaluation_overview.id)

entries = in_memory_tracer.entries
assert len(entries) == 3
assert all([isinstance(e, InMemoryTaskSpan) for e in entries])
dummy_evaluator.evaluate_runs(run_id, abort_on_error=True)


def test_eval_and_aggregate_runs_stores_example_evaluations(
dummy_runner: Runner[str, str],
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dataset_id: str,
run_id: str,
) -> None:
evaluation_repository = dummy_evaluator._evaluation_repository
dataset_repository = dummy_evaluator._dataset_repository
examples = list(dataset_repository.examples(dataset_id, str, type(None)))

run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
assert next(iter(aggregation_overview.evaluation_overviews)) == evaluation_overview

@@ -416,13 +422,12 @@ def test_eval_and_aggregate_runs_stores_example_evaluations(


def test_failed_evaluations_returns_only_failed_evaluations(
dummy_runner: Runner[str, str],
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dataset_id: str,
run_id: str,
sequence_examples: Iterable[Example[str, None]],
) -> None:
run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
failed_evaluations = list(
dummy_evaluator.failed_evaluations(evaluation_overview.id)
)
@@ -475,13 +480,12 @@ def test_eval_and_aggregate_runs_stores_aggregated_results(
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dummy_runner: Runner[str, str],
dataset_id: str,
run_id: str,
) -> None:
aggregation_repository = dummy_aggregator._aggregation_repository

run_overview = dummy_runner.run_dataset(dataset_id)
evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
loaded_evaluation_run_overview = aggregation_repository.aggregation_overview(
aggregation_overview.id, DummyAggregatedEvaluationWithResultList
41 changes: 40 additions & 1 deletion tests/evaluation/test_runner.py
@@ -2,7 +2,7 @@

import pytest

from intelligence_layer.core import InMemoryTracer
from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer
from intelligence_layer.evaluation import (
Example,
InMemoryDatasetRepository,
@@ -42,6 +42,41 @@ def test_runner_runs_dataset(
assert failed_runs[0].example.id == examples[1].id


def test_runner_works_without_description(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
sequence_examples: Iterable[Example[str, None]],
) -> None:
examples = list(sequence_examples)
task = DummyTask()
runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "")

dataset_id = in_memory_dataset_repository.create_dataset(
examples=examples, dataset_name=""
).id
overview = runner.run_dataset(dataset_id)
assert overview.description is runner.description


def test_runner_has_correct_description(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
sequence_examples: Iterable[Example[str, None]],
) -> None:
examples = list(sequence_examples)
task = DummyTask()
runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo")

dataset_id = in_memory_dataset_repository.create_dataset(
examples=examples, dataset_name=""
).id
run_description = "bar"
overview = runner.run_dataset(dataset_id, description=run_description)

assert runner.description in overview.description
assert run_description in overview.description


def test_runner_aborts_on_error(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
@@ -83,3 +118,7 @@ def test_runner_runs_n_examples(
assert overview.successful_example_count == 1
assert overview_with_tracer.successful_example_count == 1
assert overview_with_tracer.failed_example_count == 0

entries = tracer.entries
assert len(entries) == 1
assert all([isinstance(e, InMemoryTaskSpan) for e in entries])
