diff --git a/CHANGELOG.md b/CHANGELOG.md index 61648a0ea..6b9b1c13b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ - `DefaultArgillaClient` -> `ArgillaWrapperClient` - `Question` -> `argilla.RatingQuestion`, `options` -> `values` and it takes only a list - `Field` -> `argilla.TextField` + - Add `description` parameter to `Aggregator.aggregate_evaluation` to allow individual descriptions without the need to create a new `Aggregator`. This was missing from the previous release. + ### Fixes - Reinitializing different `AlephAlphaModel` instances and retrieving their tokenizer should now consume a lot less memory. diff --git a/src/documentation/parameter_optimization.ipynb b/src/documentation/parameter_optimization.ipynb index b74db13fa..7c40fa098 100644 --- a/src/documentation/parameter_optimization.ipynb +++ b/src/documentation/parameter_optimization.ipynb @@ -159,6 +159,13 @@ " evaluation_repository,\n", " EXPERIMENT_NAME,\n", " DummyEvaluationLogic(),\n", + ")\n", + "\n", + "aggregator = Aggregator(\n", + " evaluation_repository,\n", + " aggregation_repository,\n", + " EXPERIMENT_NAME,\n", + " DummyAggregationLogic(),\n", ")" ] }, @@ -179,6 +186,9 @@ "source": [ "# Definition of parameters\n", "model_list = [\"model a\", \"model b\", \"model c\"]\n", + "label = \"dummy_label\"\n", + "labels = {label}\n", + "\n", "prompt_list = [\n", " \"A nice story starts with:\",\n", " \"Some kind of prompt\",\n", @@ -188,28 +198,24 @@ "\n", "# Loop over all combinations of parameters and run the `Task` for each combination.\n", "# Note, that this can be **very** expensive for large sets of parameters.\n", - "for model, prompt in itertools.product(model_list, prompt_list):\n", + "for i, (model, prompt) in enumerate(itertools.product(model_list, prompt_list)):\n", " dummy_task = DummyTask(model=model, prompt=prompt)\n", "\n", " # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n", " metadata = dict({\"model\": model, \"prompt\": prompt})\n", - " description = \"Evaluate dummy task\"\n", + " description = f\"Evaluate dummy task {i}\"\n", " runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n", " run_overview = runner.run_dataset(\n", - " dataset.id, metadata=metadata, description=description\n", + " dataset.id, metadata=metadata, description=description, labels=labels\n", " )\n", "\n", " eval_overview = evaluator.evaluate_runs(\n", - " run_overview.id, metadata=metadata, description=description\n", + " run_overview.id, metadata=metadata, description=description, labels=labels\n", " )\n", "\n", - " aggregator = Aggregator(\n", - " evaluation_repository,\n", - " aggregation_repository,\n", - " EXPERIMENT_NAME,\n", - " DummyAggregationLogic(),\n", - " )\n", - " aggregator.aggregate_evaluation(eval_overview.id, metadata=metadata)" + " aggregator.aggregate_evaluation(\n", + " eval_overview.id, metadata=metadata, description=description, labels=labels\n", + " )" ] }, { @@ -228,13 +234,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`. Filtering can also be done on labels and/or metadata.\n", + "# Retrieve all aggregations and filter them by desired criteria, i.e., the label `dummy_label`. 
Filtering can also be done on description and/or metadata.\n", "aggregations_of_interest = [\n", " overview\n", " for overview in aggregation_repository.aggregation_overviews(\n", " aggregation_type=DummyAggregatedEvaluation\n", " )\n", - " if overview.description == EXPERIMENT_NAME\n", + " if label in overview.labels\n", "]\n", "\n", "# Convert the desired aggregation into a pandas dataframe\n", @@ -311,7 +317,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/intelligence_layer/evaluation/aggregation/aggregator.py b/src/intelligence_layer/evaluation/aggregation/aggregator.py index b93da6ba0..d46d81e6d 100644 --- a/src/intelligence_layer/evaluation/aggregation/aggregator.py +++ b/src/intelligence_layer/evaluation/aggregation/aggregator.py @@ -3,6 +3,7 @@ from functools import cached_property from typing import ( Generic, + Optional, TypeVar, cast, final, @@ -186,6 +187,7 @@ def evaluation_type(self) -> type[Evaluation]: def aggregate_evaluation( self, *eval_ids: str, + description: Optional[str] = None, labels: set[str] | None = None, metadata: SerializableDict | None = None, ) -> AggregationOverview[AggregatedEvaluation]: @@ -196,6 +198,7 @@ def aggregate_evaluation( Args: eval_ids: An overview of the evaluation to be aggregated. Does not include actual evaluations as these will be retrieved from the repository. + description: Optional description of the aggregation. Defaults to None. labels: A list of labels for filtering. Defaults to an empty list. metadata: A dict for additional information about the aggregation overview. Defaults to an empty dict. @@ -240,6 +243,9 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview: cast(Iterable[Evaluation], successful_evaluations) ) + full_description = ( + self.description + " : " + description if description else self.description + ) aggregation_overview = AggregationOverview( evaluation_overviews=frozenset(evaluation_overviews), id=str(uuid4()), @@ -247,7 +253,7 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview: end=utc_now(), successful_evaluation_count=successful_evaluations.included_count(), crashed_during_evaluation_count=successful_evaluations.excluded_count(), - description=self.description, + description=full_description, statistics=statistics, labels=labels, metadata=metadata, diff --git a/tests/evaluation/evaluation/test_evaluator.py b/tests/evaluation/evaluation/test_evaluator_and_aggregator.py similarity index 95% rename from tests/evaluation/evaluation/test_evaluator.py rename to tests/evaluation/evaluation/test_evaluator_and_aggregator.py index ee133cf43..8cd8570ea 100644 --- a/tests/evaluation/evaluation/test_evaluator.py +++ b/tests/evaluation/evaluation/test_evaluator_and_aggregator.py @@ -352,6 +352,37 @@ def test_eval_runs_uses_correct_description( assert eval_description in evaluation_overview.description +def test_aggregation_runs_works_without_description( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList + ], + run_id: str, +) -> None: + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + + assert aggregation_overview.description == dummy_aggregator.description + + +def test_aggregation_runs_uses_correct_description( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + 
dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList + ], + run_id: str, +) -> None: + aggregation_description = "My aggregation description" + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + + aggregation_overview = dummy_aggregator.aggregate_evaluation( + evaluation_overview.id, description=aggregation_description + ) + + assert dummy_aggregator.description in aggregation_overview.description + assert aggregation_description in aggregation_overview.description + + def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false( dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation], dummy_runner: Runner[str, str],
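
A minimal usage sketch of the new `description` parameter on `Aggregator.aggregate_evaluation`, assuming the repositories, `EXPERIMENT_NAME`, `DummyAggregationLogic`, and `eval_overview` are set up as in `parameter_optimization.ipynb`; the import path mirrors the module touched above, and the per-call description string is illustrative only, not part of this change.

```python
from intelligence_layer.evaluation.aggregation.aggregator import Aggregator

# evaluation_repository, aggregation_repository, EXPERIMENT_NAME,
# DummyAggregationLogic and eval_overview are assumed to be set up as in
# src/documentation/parameter_optimization.ipynb.
aggregator = Aggregator(
    evaluation_repository,
    aggregation_repository,
    EXPERIMENT_NAME,  # serves as the aggregator-level description
    DummyAggregationLogic(),
)

# Each call can now carry its own description; the resulting overview's
# description is composed as "<aggregator description> : <call description>".
aggregation_overview = aggregator.aggregate_evaluation(
    eval_overview.id,
    description="prompt sweep, combination 3",  # illustrative per-call description
    labels={"dummy_label"},
    metadata={"model": "model a", "prompt": "Some kind of prompt"},
)

assert EXPERIMENT_NAME in aggregation_overview.description
assert "prompt sweep, combination 3" in aggregation_overview.description
```

Because per-call descriptions now make `overview.description` vary between iterations, the notebook filters aggregation overviews by the `dummy_label` label instead of comparing `overview.description == EXPERIMENT_NAME`, as reflected in the notebook hunk above.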