feat: add description to aggregate method (#913)
* added description to aggregate_evaluation

* updated notebook to check for label instead of description

* updated changelog

* pulled creation of aggregator out of the for loop

* renamed test file which contains tests for evaluator and aggregator
JohannesWesch authored Jun 17, 2024
1 parent 5f0b04c commit 8301c7f
Showing 4 changed files with 60 additions and 15 deletions.
CHANGELOG.md: 2 additions & 0 deletions
@@ -12,6 +12,8 @@
 - `DefaultArgillaClient` -> `ArgillaWrapperClient`
 - `Question` -> `argilla.RatingQuestion`, `options` -> `values` and it takes only a list
 - `Field` -> `argilla.TextField`
+- Add `description` parameter to `Aggregator.aggregate_evaluation` to allow individual descriptions without the need to create a new `Aggregator`. This was missing from the previous release.
+
 
 ### Fixes
 - Reinitializing different `AlephAlphaModel` instances and retrieving their tokenizer should now consume a lot less memory.
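
Illustrative only (not part of the diff): a minimal sketch of the new call shape, where `aggregator` and `eval_overview` stand in for the objects created in the notebook below, and the description, label, and metadata values are made up:

aggregation_overview = aggregator.aggregate_evaluation(
    eval_overview.id,
    description="run 3",  # the new optional parameter from this commit
    labels={"dummy_label"},
    metadata={"model": "model a"},
)
# The overview's description becomes the Aggregator's own description,
# suffixed with " : run 3" when a per-call description is given.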
src/documentation/parameter_optimization.ipynb: 20 additions & 14 deletions
@@ -159,6 +159,13 @@
 "    evaluation_repository,\n",
 "    EXPERIMENT_NAME,\n",
 "    DummyEvaluationLogic(),\n",
-")"
+")\n",
+"\n",
+"aggregator = Aggregator(\n",
+"    evaluation_repository,\n",
+"    aggregation_repository,\n",
+"    EXPERIMENT_NAME,\n",
+"    DummyAggregationLogic(),\n",
+")"
 ]
 },
@@ -179,6 +186,9 @@
 "source": [
 "# Definition of parameters\n",
 "model_list = [\"model a\", \"model b\", \"model c\"]\n",
+"label = \"dummy_label\"\n",
+"labels = {label}\n",
+"\n",
 "prompt_list = [\n",
 "    \"A nice story starts with:\",\n",
 "    \"Some kind of prompt\",\n",
@@ -188,28 +198,24 @@
 "\n",
 "# Loop over all combinations of parameters and run the `Task` for each combination.\n",
 "# Note, that this can be **very** expensive for large sets of parameters.\n",
-"for model, prompt in itertools.product(model_list, prompt_list):\n",
+"for i, (model, prompt) in enumerate(itertools.product(model_list, prompt_list)):\n",
 "    dummy_task = DummyTask(model=model, prompt=prompt)\n",
 "\n",
 "    # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n",
 "    metadata = dict({\"model\": model, \"prompt\": prompt})\n",
-"    description = \"Evaluate dummy task\"\n",
+"    description = f\"Evaluate dummy task {i}\"\n",
 "    runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n",
 "    run_overview = runner.run_dataset(\n",
-"        dataset.id, metadata=metadata, description=description\n",
+"        dataset.id, metadata=metadata, description=description, labels=labels\n",
 "    )\n",
 "\n",
 "    eval_overview = evaluator.evaluate_runs(\n",
-"        run_overview.id, metadata=metadata, description=description\n",
+"        run_overview.id, metadata=metadata, description=description, labels=labels\n",
 "    )\n",
 "\n",
-"    aggregator = Aggregator(\n",
-"        evaluation_repository,\n",
-"        aggregation_repository,\n",
-"        EXPERIMENT_NAME,\n",
-"        DummyAggregationLogic(),\n",
-"    )\n",
-"    aggregator.aggregate_evaluation(eval_overview.id, metadata=metadata)"
+"    aggregator.aggregate_evaluation(\n",
+"        eval_overview.id, metadata=metadata, description=description, labels=labels\n",
+"    )"
 ]
 },
 {
@@ -228,13 +234,13 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`. Filtering can also be done on labels and/or metadata.\n",
+"# Retrieve all aggregations and filter them by desired criteria, i.e., the label `dummy_label`. Filtering can also be done on description and/or metadata.\n",
 "aggregations_of_interest = [\n",
 "    overview\n",
 "    for overview in aggregation_repository.aggregation_overviews(\n",
 "        aggregation_type=DummyAggregatedEvaluation\n",
 "    )\n",
-"    if overview.description == EXPERIMENT_NAME\n",
+"    if label in overview.labels\n",
 "]\n",
 "\n",
 "# Convert the desired aggregation into a pandas dataframe\n",
@@ -311,7 +317,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.8"
+"version": "3.12.2"
 }
 },
 "nbformat": 4,
src/intelligence_layer/evaluation/aggregation/aggregator.py: 7 additions & 1 deletion
@@ -3,6 +3,7 @@
 from functools import cached_property
 from typing import (
     Generic,
+    Optional,
     TypeVar,
     cast,
     final,
@@ -186,6 +187,7 @@ def evaluation_type(self) -> type[Evaluation]:
     def aggregate_evaluation(
         self,
         *eval_ids: str,
+        description: Optional[str] = None,
         labels: set[str] | None = None,
         metadata: SerializableDict | None = None,
     ) -> AggregationOverview[AggregatedEvaluation]:
@@ -196,6 +198,7 @@ def aggregate_evaluation(
         Args:
             eval_ids: An overview of the evaluation to be aggregated. Does not include
                 actual evaluations as these will be retrieved from the repository.
+            description: Optional description of the aggregation. Defaults to None.
             labels: A list of labels for filtering. Defaults to an empty list.
             metadata: A dict for additional information about the aggregation overview. Defaults to an empty dict.
@@ -240,14 +243,17 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview:
             cast(Iterable[Evaluation], successful_evaluations)
         )
 
+        full_description = (
+            self.description + " : " + description if description else self.description
+        )
         aggregation_overview = AggregationOverview(
             evaluation_overviews=frozenset(evaluation_overviews),
             id=str(uuid4()),
             start=start,
             end=utc_now(),
             successful_evaluation_count=successful_evaluations.included_count(),
             crashed_during_evaluation_count=successful_evaluations.excluded_count(),
-            description=self.description,
+            description=full_description,
             statistics=statistics,
             labels=labels,
             metadata=metadata,
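The new description handling concatenates the Aggregator's own description with the per-call one. A minimal sketch of that behavior; `compose_description` is a hypothetical helper that only isolates the expression from the diff above:

from typing import Optional


def compose_description(base: str, description: Optional[str]) -> str:
    # Mirrors the expression added in aggregate_evaluation: the conditional
    # binds looser than +, so the joined string is returned only when a
    # per-call description is given.
    return base + " : " + description if description else base


assert compose_description("dummy-aggregator", None) == "dummy-aggregator"
assert compose_description("dummy-aggregator", "run 3") == "dummy-aggregator : run 3"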
Renamed test file (contains tests for evaluator and aggregator): 31 additions & 0 deletions
@@ -352,6 +352,37 @@ def test_eval_runs_uses_correct_description(
     assert eval_description in evaluation_overview.description
 
 
+def test_aggregation_runs_works_without_description(
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
+    dummy_aggregator: Aggregator[
+        DummyEvaluation, DummyAggregatedEvaluationWithResultList
+    ],
+    run_id: str,
+) -> None:
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
+    aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
+
+    assert aggregation_overview.description == dummy_aggregator.description
+
+
+def test_aggregation_runs_uses_correct_description(
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
+    dummy_aggregator: Aggregator[
+        DummyEvaluation, DummyAggregatedEvaluationWithResultList
+    ],
+    run_id: str,
+) -> None:
+    aggregation_description = "My aggregation description"
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
+
+    aggregation_overview = dummy_aggregator.aggregate_evaluation(
+        evaluation_overview.id, description=aggregation_description
+    )
+
+    assert dummy_aggregator.description in aggregation_overview.description
+    assert aggregation_description in aggregation_overview.description
+
+
 def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(
     dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_runner: Runner[str, str],
