diff --git a/CHANGELOG.md b/CHANGELOG.md
index c8b3c62b7..3139f3b9d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,7 @@
    - This is not supported for the `OpenTelemetryTracer` because of technical incompatibilities.
 - All exported spans now contain the status of the span.
 - We now support python 3.12
+- Add `description` parameter to `Evaluator.evaluate_runs` and `Runner.run_dataset` to allow individual descriptions without the need to create a new `Evaluator` or `Runner`.
 
 ### Fixes
 - The document index client now correctly URL-encodes document names in its queries.
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
index b539b1bef..c0f07852d 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py
@@ -98,6 +98,7 @@ def evaluate_runs(
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
         skip_example_on_any_failure: bool = True,
+        description: Optional[str] = None,
     ) -> EvaluationOverview:
         """Evaluates all generated outputs in the run.
 
@@ -116,6 +117,7 @@
                 Always the first n runs stored in the evaluation repository. Defaults to None.
             abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
             skip_example_on_any_failure: Flag to skip evaluation on any example for which at least one run fails. Defaults to True.
+            description: Optional description of the evaluation. Defaults to None.
 
         Returns:
             EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -150,6 +152,9 @@
         )
         successful_evaluation_count = len(example_evaluations) - failed_evaluation_count
+        full_description = (
+            self.description + " : " + description if description else self.description
+        )
 
         overview = EvaluationOverview(
             run_overviews=frozenset(run_overviews),
             id=eval_id,
@@ -157,7 +162,7 @@
             end_date=utc_now(),
             successful_evaluation_count=successful_evaluation_count,
             failed_evaluation_count=failed_evaluation_count,
-            description=self.description,
+            description=full_description,
         )
 
         self._evaluation_repository.store_evaluation_overview(overview)
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
index e58eebbee..18f62afd9 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
@@ -170,6 +170,7 @@ def evaluate_runs(
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
         skip_example_on_any_failure: bool = True,
+        description: Optional[str] = None,
     ) -> EvaluationOverview:
         self._evaluation_logic.set_previous_run_output_ids([])
         return super().evaluate_runs(
@@ -177,6 +178,7 @@
             num_examples=num_examples,
             skip_example_on_any_failure=skip_example_on_any_failure,
             abort_on_error=abort_on_error,
+            description=description,
         )
 
 
diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py
index a45b670fd..6f45d82e2 100644
--- a/src/intelligence_layer/evaluation/run/runner.py
+++ b/src/intelligence_layer/evaluation/run/runner.py
@@ -77,6 +77,7 @@ def run_dataset(
         num_examples: Optional[int] = None,
         abort_on_error: bool = False,
         max_workers: int = 10,
+        description: Optional[str] = None,
     ) -> RunOverview:
         """Generates all outputs for the provided dataset.
 
@@ -90,6 +91,7 @@ def run_dataset(
                 Always the first n examples will be taken.
             abort_on_error: Flag to abort all run when an error occurs. Defaults to False.
             max_workers: Number of examples that can be evaluated concurrently. Defaults to 10.
+            description: An optional description of the run. Defaults to None.
 
         Returns:
             An overview of the run. Outputs will not be returned but instead stored in the
@@ -140,6 +142,9 @@ def run(
                     run_id=run_id, example_id=example_id, output=output
                 ),
             )
+        full_description = (
+            self.description + " : " + description if description else self.description
+        )
         run_overview = RunOverview(
             dataset_id=dataset_id,
             id=run_id,
@@ -147,7 +152,7 @@
             end=utc_now(),
             failed_example_count=failed_count,
             successful_example_count=successful_count,
-            description=self.description,
+            description=full_description,
         )
         self._run_repository.store_run_overview(run_overview)
         return run_overview
diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py
index fad8ba1cf..3b18f36c7 100644
--- a/tests/evaluation/test_evaluator.py
+++ b/tests/evaluation/test_evaluator.py
@@ -4,15 +4,7 @@
 from pydantic import BaseModel
 from pytest import fixture
 
-from intelligence_layer.core import (
-    InMemoryTaskSpan,
-    InMemoryTracer,
-    Input,
-    NoOpTracer,
-    Output,
-    Task,
-    Tracer,
-)
+from intelligence_layer.core import Input, Output, Task, Tracer
 from intelligence_layer.evaluation import (
     AggregatedEvaluation,
     AggregationLogic,
@@ -241,6 +233,14 @@ def dataset_id(
     ).id
 
 
+@fixture
+def run_id(
+    dataset_id: str,
+    dummy_runner: Runner[str, str],
+) -> str:
+    return dummy_runner.run_dataset(dataset_id).id
+
+
 @fixture
 def good_dataset_id(
     sequence_good_examples: Iterable[Example[str, None]],
@@ -315,6 +315,37 @@ def test_eval_runs_returns_generic_statistics(
     assert evaluation_overview.failed_evaluation_count == 1
 
 
+def test_eval_runs_works_without_description(
+    run_id: str,
+    in_memory_dataset_repository: InMemoryDatasetRepository,
+    in_memory_run_repository: InMemoryRunRepository,
+    in_memory_evaluation_repository: InMemoryEvaluationRepository,
+    dummy_eval_logic: DummyEvaluationLogic,
+) -> None:
+    evaluator = Evaluator(
+        in_memory_dataset_repository,
+        in_memory_run_repository,
+        in_memory_evaluation_repository,
+        "",
+        dummy_eval_logic,
+    )
+    evaluation_overview = evaluator.evaluate_runs(run_id)
+
+    assert evaluation_overview.description == evaluator.description
+
+
+def test_eval_runs_uses_correct_description(
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str
+) -> None:
+    eval_description = "My evaluation description"
+    evaluation_overview = dummy_evaluator.evaluate_runs(
+        run_id, description=eval_description
+    )
+
+    assert dummy_evaluator.description in evaluation_overview.description
+    assert eval_description in evaluation_overview.description
+
+
 def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(
     dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_runner: Runner[str, str],
@@ -345,50 +376,25 @@
 
 def test_evaluator_aborts_on_error(
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
-    dummy_aggregator: Aggregator[
-        DummyEvaluation, DummyAggregatedEvaluationWithResultList
-    ],
-    dummy_runner: Runner[str, str],
-    dataset_id: str,
+    run_id: str,
 ) -> None:
-    run_overview = dummy_runner.run_dataset(dataset_id)
-
     with pytest.raises(RuntimeError):
-        dummy_evaluator.evaluate_runs(run_overview.id, abort_on_error=True)
-
-
-def test_eval_and_aggregate_runs_uses_passed_tracer(
-    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
-    dummy_aggregator: Aggregator[
-        DummyEvaluation, DummyAggregatedEvaluationWithResultList
-    ],
-    dataset_id: str,
-    dummy_runner: Runner[str, str],
-) -> None:
-    in_memory_tracer = InMemoryTracer()
-    run_overview = dummy_runner.run_dataset(dataset_id, in_memory_tracer)
-    evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
-    dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
-
-    entries = in_memory_tracer.entries
-    assert len(entries) == 3
-    assert all([isinstance(e, InMemoryTaskSpan) for e in entries])
+        dummy_evaluator.evaluate_runs(run_id, abort_on_error=True)
 
 
 def test_eval_and_aggregate_runs_stores_example_evaluations(
-    dummy_runner: Runner[str, str],
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_aggregator: Aggregator[
         DummyEvaluation, DummyAggregatedEvaluationWithResultList
     ],
     dataset_id: str,
+    run_id: str,
 ) -> None:
     evaluation_repository = dummy_evaluator._evaluation_repository
     dataset_repository = dummy_evaluator._dataset_repository
     examples = list(dataset_repository.examples(dataset_id, str, type(None)))
 
-    run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
-    evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
     aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
 
     assert next(iter(aggregation_overview.evaluation_overviews)) == evaluation_overview
@@ -416,13 +422,12 @@
 
 
 def test_failed_evaluations_returns_only_failed_evaluations(
-    dummy_runner: Runner[str, str],
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dataset_id: str,
+    run_id: str,
     sequence_examples: Iterable[Example[str, None]],
 ) -> None:
-    run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
-    evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
     failed_evaluations = list(
         dummy_evaluator.failed_evaluations(evaluation_overview.id)
     )
@@ -475,13 +480,12 @@ def test_eval_and_aggregate_runs_stores_aggregated_results(
     dummy_aggregator: Aggregator[
         DummyEvaluation, DummyAggregatedEvaluationWithResultList
     ],
-    dummy_runner: Runner[str, str],
     dataset_id: str,
+    run_id: str,
 ) -> None:
     aggregation_repository = dummy_aggregator._aggregation_repository
 
-    run_overview = dummy_runner.run_dataset(dataset_id)
-    evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
     aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
     loaded_evaluation_run_overview = aggregation_repository.aggregation_overview(
         aggregation_overview.id, DummyAggregatedEvaluationWithResultList
diff --git a/tests/evaluation/test_runner.py b/tests/evaluation/test_runner.py
index d07021b0b..dee8260ae 100644
--- a/tests/evaluation/test_runner.py
+++ b/tests/evaluation/test_runner.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from intelligence_layer.core import InMemoryTracer
+from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer
 from intelligence_layer.evaluation import (
     Example,
     InMemoryDatasetRepository,
@@ -42,6 +42,41 @@ def test_runner_runs_dataset(
     assert failed_runs[0].example.id == examples[1].id
 
 
+def test_runner_works_without_description(
+    in_memory_dataset_repository: InMemoryDatasetRepository,
+    in_memory_run_repository: InMemoryRunRepository,
+    sequence_examples: Iterable[Example[str, None]],
+) -> None:
+    examples = list(sequence_examples)
+    task = DummyTask()
+    runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "")
+
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=examples, dataset_name=""
+    ).id
+    overview = runner.run_dataset(dataset_id)
+    assert overview.description is runner.description
+
+
+def test_runner_has_correct_description(
+    in_memory_dataset_repository: InMemoryDatasetRepository,
+    in_memory_run_repository: InMemoryRunRepository,
+    sequence_examples: Iterable[Example[str, None]],
+) -> None:
+    examples = list(sequence_examples)
+    task = DummyTask()
+    runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo")
+
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=examples, dataset_name=""
+    ).id
+    run_description = "bar"
+    overview = runner.run_dataset(dataset_id, description=run_description)
+
+    assert runner.description in overview.description
+    assert run_description in overview.description
+
+
 def test_runner_aborts_on_error(
     in_memory_dataset_repository: InMemoryDatasetRepository,
     in_memory_run_repository: InMemoryRunRepository,
@@ -83,3 +118,7 @@ def test_runner_runs_n_examples(
     assert overview.successful_example_count == 1
     assert overview_with_tracer.successful_example_count == 1
     assert overview_with_tracer.failed_example_count == 0
+
+    entries = tracer.entries
+    assert len(entries) == 1
+    assert all([isinstance(e, InMemoryTaskSpan) for e in entries])
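
Usage sketch (not part of the diff): how the new call-level `description` is meant to combine with the description given at construction time, following the `self.description + " : " + description` concatenation introduced above. Import paths and signatures mirror the test modules in this diff; `task`, `evaluation_logic`, and `examples` are hypothetical placeholders, and zero-argument construction of the in-memory repositories is assumed.

from intelligence_layer.evaluation import (
    Evaluator,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)

# Placeholders: substitute a concrete Task, evaluation logic, and example list.
task = ...
evaluation_logic = ...
examples = [...]

dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

dataset_id = dataset_repository.create_dataset(
    examples=examples, dataset_name="description-demo"
).id

# The per-call description is appended to the Runner's own description.
runner = Runner(task, dataset_repository, run_repository, "my-task")
run_overview = runner.run_dataset(dataset_id, description="baseline prompt")
assert run_overview.description == "my-task : baseline prompt"

# Same pattern for the Evaluator; omitting `description` keeps the previous behaviour
# of using only the description passed to the constructor.
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "my-eval",
    evaluation_logic,
)
evaluation_overview = evaluator.evaluate_runs(
    run_overview.id, description="first pass"
)
assert evaluation_overview.description == "my-eval : first pass"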