feat: add description to aggregate method (#913)
* added description to aggregate_evaluation

* updated notebook to check for label instead of description

* updated changelog

* pulled creation of aggregator out of the for loop

* renamed test file which contains tests for evaluator and aggregator
JohannesWesch authored Jun 17, 2024
1 parent 5f0b04c commit 8301c7f
Showing 4 changed files with 60 additions and 15 deletions.
CHANGELOG.md: 2 additions & 0 deletions
@@ -12,6 +12,8 @@
 - `DefaultArgillaClient` -> `ArgillaWrapperClient`
 - `Question` -> `argilla.RatingQuestion`, `options` -> `values` and it takes only a list
 - `Field` -> `argilla.TextField`
+- Add `description` parameter to `Aggregator.aggregate_evaluation` to allow individual descriptions without the need to create a new `Aggregator`. This was missing from the previous release.
+
 
 ### Fixes
 - Reinitializing different `AlephAlphaModel` instances and retrieving their tokenizer should now consume a lot less memory.
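
Illustrative only (not part of the diff): a minimal sketch of the new call shape, where `aggregator` and `eval_overview` stand in for the objects created in the notebook below, and the description, label, and metadata values are made up:

aggregation_overview = aggregator.aggregate_evaluation(
    eval_overview.id,
    description="run 3",  # the new optional parameter from this commit
    labels={"dummy_label"},
    metadata={"model": "model a"},
)
# The overview's description becomes the Aggregator's own description,
# suffixed with " : run 3" when a per-call description is given.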
src/documentation/parameter_optimization.ipynb: 20 additions & 14 deletions
@@ -159,6 +159,13 @@
 "    evaluation_repository,\n",
 "    EXPERIMENT_NAME,\n",
 "    DummyEvaluationLogic(),\n",
-")"
+")\n",
+"\n",
+"aggregator = Aggregator(\n",
+"    evaluation_repository,\n",
+"    aggregation_repository,\n",
+"    EXPERIMENT_NAME,\n",
+"    DummyAggregationLogic(),\n",
+")"
 ]
 },
@@ -179,6 +186,9 @@
 "source": [
 "# Definition of parameters\n",
 "model_list = [\"model a\", \"model b\", \"model c\"]\n",
+"label = \"dummy_label\"\n",
+"labels = {label}\n",
+"\n",
 "prompt_list = [\n",
 "    \"A nice story starts with:\",\n",
 "    \"Some kind of prompt\",\n",
@@ -188,28 +198,24 @@
 "\n",
 "# Loop over all combinations of parameters and run the `Task` for each combination.\n",
 "# Note, that this can be **very** expensive for large sets of parameters.\n",
-"for model, prompt in itertools.product(model_list, prompt_list):\n",
+"for i, (model, prompt) in enumerate(itertools.product(model_list, prompt_list)):\n",
 "    dummy_task = DummyTask(model=model, prompt=prompt)\n",
 "\n",
 "    # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n",
 "    metadata = dict({\"model\": model, \"prompt\": prompt})\n",
-"    description = \"Evaluate dummy task\"\n",
+"    description = f\"Evaluate dummy task {i}\"\n",
 "    runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n",
 "    run_overview = runner.run_dataset(\n",
-"        dataset.id, metadata=metadata, description=description\n",
+"        dataset.id, metadata=metadata, description=description, labels=labels\n",
 "    )\n",
 "\n",
 "    eval_overview = evaluator.evaluate_runs(\n",
-"        run_overview.id, metadata=metadata, description=description\n",
+"        run_overview.id, metadata=metadata, description=description, labels=labels\n",
 "    )\n",
 "\n",
-"    aggregator = Aggregator(\n",
-"        evaluation_repository,\n",
-"        aggregation_repository,\n",
-"        EXPERIMENT_NAME,\n",
-"        DummyAggregationLogic(),\n",
-"    )\n",
-"    aggregator.aggregate_evaluation(eval_overview.id, metadata=metadata)"
+"    aggregator.aggregate_evaluation(\n",
+"        eval_overview.id, metadata=metadata, description=description, labels=labels\n",
+"    )"
 ]
 },
 {
@@ -228,13 +234,13 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`. Filtering can also be done on labels and/or metadata.\n",
+"# Retrieve all aggregations and filter them by desired criteria, i.e., the label `dummy_label`. Filtering can also be done on description and/or metadata.\n",
 "aggregations_of_interest = [\n",
 "    overview\n",
 "    for overview in aggregation_repository.aggregation_overviews(\n",
 "        aggregation_type=DummyAggregatedEvaluation\n",
 "    )\n",
-"    if overview.description == EXPERIMENT_NAME\n",
+"    if label in overview.labels\n",
 "]\n",
 "\n",
 "# Convert the desired aggregation into a pandas dataframe\n",
@@ -311,7 +317,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.8"
+"version": "3.12.2"
 }
 },
 "nbformat": 4,
src/intelligence_layer/evaluation/aggregation/aggregator.py: 7 additions & 1 deletion
@@ -3,6 +3,7 @@
 from functools import cached_property
 from typing import (
     Generic,
+    Optional,
     TypeVar,
     cast,
     final,
@@ -186,6 +187,7 @@ def evaluation_type(self) -> type[Evaluation]:
     def aggregate_evaluation(
         self,
         *eval_ids: str,
+        description: Optional[str] = None,
         labels: set[str] | None = None,
         metadata: SerializableDict | None = None,
     ) -> AggregationOverview[AggregatedEvaluation]:
@@ -196,6 +198,7 @@ def aggregate_evaluation(
         Args:
             eval_ids: An overview of the evaluation to be aggregated. Does not include
                 actual evaluations as these will be retrieved from the repository.
+            description: Optional description of the aggregation. Defaults to None.
             labels: A list of labels for filtering. Defaults to an empty list.
             metadata: A dict for additional information about the aggregation overview. Defaults to an empty dict.
@@ -240,14 +243,17 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview:
             cast(Iterable[Evaluation], successful_evaluations)
         )
 
+        full_description = (
+            self.description + " : " + description if description else self.description
+        )
         aggregation_overview = AggregationOverview(
             evaluation_overviews=frozenset(evaluation_overviews),
             id=str(uuid4()),
             start=start,
             end=utc_now(),
             successful_evaluation_count=successful_evaluations.included_count(),
             crashed_during_evaluation_count=successful_evaluations.excluded_count(),
-            description=self.description,
+            description=full_description,
             statistics=statistics,
             labels=labels,
             metadata=metadata,
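The new description handling concatenates the Aggregator's own description with the per-call one. A minimal sketch of that behavior; `compose_description` is a hypothetical helper that only isolates the expression from the diff above:

from typing import Optional


def compose_description(base: str, description: Optional[str]) -> str:
    # Mirrors the expression added in aggregate_evaluation: the conditional
    # binds looser than +, so the joined string is returned only when a
    # per-call description is given.
    return base + " : " + description if description else base


assert compose_description("dummy-aggregator", None) == "dummy-aggregator"
assert compose_description("dummy-aggregator", "run 3") == "dummy-aggregator : run 3"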
Renamed test file (contains tests for evaluator and aggregator): 31 additions & 0 deletions
@@ -352,6 +352,37 @@ def test_eval_runs_uses_correct_description(
     assert eval_description in evaluation_overview.description
 
 
+def test_aggregation_runs_works_without_description(
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
+    dummy_aggregator: Aggregator[
+        DummyEvaluation, DummyAggregatedEvaluationWithResultList
+    ],
+    run_id: str,
+) -> None:
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
+    aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
+
+    assert aggregation_overview.description == dummy_aggregator.description
+
+
+def test_aggregation_runs_uses_correct_description(
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
+    dummy_aggregator: Aggregator[
+        DummyEvaluation, DummyAggregatedEvaluationWithResultList
+    ],
+    run_id: str,
+) -> None:
+    aggregation_description = "My aggregation description"
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
+
+    aggregation_overview = dummy_aggregator.aggregate_evaluation(
+        evaluation_overview.id, description=aggregation_description
+    )
+
+    assert dummy_aggregator.description in aggregation_overview.description
+    assert aggregation_description in aggregation_overview.description
+
+
 def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(
     dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_runner: Runner[str, str],
