feat: add description to aggregate method #913

Merged · 5 commits · Jun 17, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,8 @@
- `DefaultArgillaClient` -> `ArgillaWrapperClient`
- `Question` -> `argilla.RatingQuestion`, `options` -> `values` and it takes only a list
- `Field` -> `argilla.TextField`
- Add `description` parameter to `Aggregator.aggregate_evaluation` to allow individual descriptions without the need to create a new `Aggregator`. This was missing from the previous release.


### Fixes
- Reinitializing different `AlephAlphaModel` instances and retrieving their tokenizer should now consume a lot less memory.
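To make the new CHANGELOG entry concrete, here is a minimal sketch of how a single `Aggregator` can now produce several aggregations, each with its own description. The repository instances, `EXPERIMENT_NAME`, and `DummyAggregationLogic` are assumed to be set up as in the notebook touched by this PR; the loop variable and description strings are purely illustrative.

```python
# Minimal sketch (assumes repositories and DummyAggregationLogic as in the notebook).
aggregator = Aggregator(
    evaluation_repository,
    aggregation_repository,
    EXPERIMENT_NAME,
    DummyAggregationLogic(),
)

for eval_overview in evaluation_overviews:  # hypothetical list of EvaluationOverviews
    # One Aggregator instance, but an individual description per aggregation.
    aggregator.aggregate_evaluation(
        eval_overview.id,
        description=f"aggregation for evaluation {eval_overview.id}",
    )
```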
34 changes: 20 additions & 14 deletions src/documentation/parameter_optimization.ipynb
@@ -159,6 +159,13 @@
" evaluation_repository,\n",
" EXPERIMENT_NAME,\n",
" DummyEvaluationLogic(),\n",
")\n",
"\n",
"aggregator = Aggregator(\n",
" evaluation_repository,\n",
" aggregation_repository,\n",
" EXPERIMENT_NAME,\n",
" DummyAggregationLogic(),\n",
")"
]
},
@@ -179,6 +186,9 @@
"source": [
"# Definition of parameters\n",
"model_list = [\"model a\", \"model b\", \"model c\"]\n",
"label = \"dummy_label\"\n",
"labels = {label}\n",
"\n",
"prompt_list = [\n",
" \"A nice story starts with:\",\n",
" \"Some kind of prompt\",\n",
@@ -188,28 +198,24 @@
"\n",
"# Loop over all combinations of parameters and run the `Task` for each combination.\n",
"# Note, that this can be **very** expensive for large sets of parameters.\n",
"for model, prompt in itertools.product(model_list, prompt_list):\n",
"for i, (model, prompt) in enumerate(itertools.product(model_list, prompt_list)):\n",
" dummy_task = DummyTask(model=model, prompt=prompt)\n",
"\n",
" # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n",
" metadata = dict({\"model\": model, \"prompt\": prompt})\n",
" description = \"Evaluate dummy task\"\n",
" description = f\"Evaluate dummy task {i}\"\n",
" runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n",
" run_overview = runner.run_dataset(\n",
" dataset.id, metadata=metadata, description=description\n",
" dataset.id, metadata=metadata, description=description, labels=labels\n",
" )\n",
"\n",
" eval_overview = evaluator.evaluate_runs(\n",
" run_overview.id, metadata=metadata, description=description\n",
" run_overview.id, metadata=metadata, description=description, labels=labels\n",
" )\n",
"\n",
" aggregator = Aggregator(\n",
" evaluation_repository,\n",
" aggregation_repository,\n",
" EXPERIMENT_NAME,\n",
" DummyAggregationLogic(),\n",
" )\n",
" aggregator.aggregate_evaluation(eval_overview.id, metadata=metadata)"
" aggregator.aggregate_evaluation(\n",
" eval_overview.id, metadata=metadata, description=description, labels=labels\n",
" )"
]
},
{
@@ -228,13 +234,13 @@
"metadata": {},
"outputs": [],
"source": [
"# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`. Filtering can also be done on labels and/or metadata.\n",
"# Retrieve all aggregations and filter them by desired criteria, i.e., the label `dummy_label`. Filtering can also be done on description and/or metadata.\n",
"aggregations_of_interest = [\n",
" overview\n",
" for overview in aggregation_repository.aggregation_overviews(\n",
" aggregation_type=DummyAggregatedEvaluation\n",
" )\n",
" if overview.description == EXPERIMENT_NAME\n",
" if label in overview.labels\n",
"]\n",
"\n",
"# Convert the desired aggregation into a pandas dataframe\n",
@@ -311,7 +317,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.12.2"
}
},
"nbformat": 4,
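As the updated notebook cell above notes, aggregation overviews can be filtered by description or metadata as well as by label. Below is a hedged variation of the same list comprehension, assuming the repositories and types from the notebook and that `overview.metadata` behaves like a plain dict (or `None`); the filter value is illustrative.

```python
# Sketch: filter aggregation overviews by run metadata instead of labels
# (names taken from the notebook; metadata layout is an assumption).
aggregations_for_model_a = [
    overview
    for overview in aggregation_repository.aggregation_overviews(
        aggregation_type=DummyAggregatedEvaluation
    )
    if (overview.metadata or {}).get("model") == "model a"
]
```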
8 changes: 7 additions & 1 deletion src/intelligence_layer/evaluation/aggregation/aggregator.py
@@ -3,6 +3,7 @@
from functools import cached_property
from typing import (
Generic,
Optional,
TypeVar,
cast,
final,
@@ -186,6 +187,7 @@ def evaluation_type(self) -> type[Evaluation]:
def aggregate_evaluation(
self,
*eval_ids: str,
description: Optional[str] = None,
labels: set[str] | None = None,
metadata: SerializableDict | None = None,
) -> AggregationOverview[AggregatedEvaluation]:
@@ -196,6 +198,7 @@ def aggregate_evaluation(
Args:
eval_ids: An overview of the evaluation to be aggregated. Does not include
actual evaluations as these will be retrieved from the repository.
description: Optional description of the aggregation. Defaults to None.
labels: A list of labels for filtering. Defaults to an empty list.
metadata: A dict for additional information about the aggregation overview. Defaults to an empty dict.

@@ -240,14 +243,17 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview:
cast(Iterable[Evaluation], successful_evaluations)
)

full_description = (
self.description + " : " + description if description else self.description
)
aggregation_overview = AggregationOverview(
evaluation_overviews=frozenset(evaluation_overviews),
id=str(uuid4()),
start=start,
end=utc_now(),
successful_evaluation_count=successful_evaluations.included_count(),
crashed_during_evaluation_count=successful_evaluations.excluded_count(),
description=self.description,
description=full_description,
statistics=statistics,
labels=labels,
metadata=metadata,
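The composed description above prefixes the call-specific text with the `Aggregator`'s own description. A standalone sketch of the resulting string, with illustrative values standing in for `self.description` and the `description` argument:

```python
# Illustrative only: mirrors the composition logic in aggregate_evaluation above.
aggregator_description = "dummy-experiment"   # stands in for self.description
call_description = "Evaluate dummy task 3"    # stands in for the description argument

full_description = (
    aggregator_description + " : " + call_description
    if call_description
    else aggregator_description
)
print(full_description)  # -> "dummy-experiment : Evaluate dummy task 3"
```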
@@ -352,6 +352,37 @@ def test_eval_runs_uses_correct_description(
assert eval_description in evaluation_overview.description


def test_aggregation_runs_works_without_description(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
run_id: str,
) -> None:
evaluation_overview = dummy_evaluator.evaluate_runs(run_id)
aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)

assert aggregation_overview.description == dummy_aggregator.description


def test_aggregation_runs_uses_correct_description(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
run_id: str,
) -> None:
aggregation_description = "My aggregation description"
evaluation_overview = dummy_evaluator.evaluate_runs(run_id)

aggregation_overview = dummy_aggregator.aggregate_evaluation(
evaluation_overview.id, description=aggregation_description
)

assert dummy_aggregator.description in aggregation_overview.description
assert aggregation_description in aggregation_overview.description


def test_eval_runs_keeps_example_for_eval_if_skip_flag_is_false(
dummy_pairwise_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_runner: Runner[str, str],