feat: aggregation of aggregation (#885)

* feat: add aggregation_overviews_to_pandas function for comparing aggregations, and tutorial notebook

TASK: IL-419

Co-authored-by: Sebastian Niehus <sebastian.niehus@ext.aleph-alpha.com>

1 parent e5a4a77 · commit 31df7b3
Showing 8 changed files with 507 additions and 17 deletions.
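
Before the full diff, a minimal sketch of the workflow the new helper enables. The names used here (`aggregation_repository`, `DummyAggregatedEvaluation`, `EXPERIMENT_NAME`) are the example objects from the tutorial notebook below and stand in for whatever aggregation setup is already in place.

from intelligence_layer.evaluation import aggregation_overviews_to_pandas

# Collect the aggregation overviews produced for one experiment
# (repository, aggregation type, and EXPERIMENT_NAME come from the tutorial below).
aggregations_of_interest = [
    overview
    for overview in aggregation_repository.aggregation_overviews(
        aggregation_type=DummyAggregatedEvaluation
    )
    if overview.description.startswith(EXPERIMENT_NAME)
]

# The new helper flattens the overviews into one pandas DataFrame
# (one row per overview, aggregation fields as columns) for side-by-side comparison.
aggregations_df = aggregation_overviews_to_pandas(aggregations_of_interest)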
@@ -0,0 +1,314 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0",
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import random\n",
"import string\n",
"from typing import Iterable\n",
"\n",
"from pydantic import BaseModel\n",
"\n",
"from intelligence_layer.core import Input, Task, TaskSpan\n",
"from intelligence_layer.evaluation import (\n",
"    AggregationLogic,\n",
"    Aggregator,\n",
"    Evaluation,\n",
"    Evaluator,\n",
"    Example,\n",
"    ExpectedOutput,\n",
"    InMemoryAggregationRepository,\n",
"    InMemoryDatasetRepository,\n",
"    InMemoryEvaluationRepository,\n",
"    InMemoryRunRepository,\n",
"    Runner,\n",
"    SingleOutputEvaluationLogic,\n",
"    aggregation_overviews_to_pandas,\n",
")\n",
"\n",
"\n",
"class DummyTask(Task[str, str]):\n",
"    def __init__(self, model: str, prompt: str):\n",
"        self.model = model\n",
"        self.prompt = prompt\n",
"\n",
"    def do_run(self, input: str, task_span: TaskSpan) -> str:\n",
"        wordlist = [\n",
"            \"apple\",\n",
"            \"banana\",\n",
"            \"car\",\n",
"            \"dog\",\n",
"            \"elephant\",\n",
"            \"fish\",\n",
"            \"goat\",\n",
"            \"hat\",\n",
"            \"igloo\",\n",
"            \"jacket\",\n",
"        ]\n",
"        sentences = [\n",
"            \"Once upon a time,\",\n",
"            \"In a land far, far away,\",\n",
"            \"Suddenly,\",\n",
"            \"One day,\",\n",
"            \"In the morning,\",\n",
"        ]\n",
"\n",
" random.seed(hash(input) + hash(self.model)) # Set the seed based on the prompt\n", | ||
"\n", | ||
" story = self.prompt + \" \"\n", | ||
" for _ in range(10):\n", | ||
" sentence = random.choice(sentences)\n", | ||
" word = random.choice(wordlist)\n", | ||
" story += sentence + \" \" + word + \" \"\n", | ||
" return story\n", | ||
"\n", | ||
"\n", | ||
"class DummyEvaluation(BaseModel):\n", | ||
" text_length: int\n", | ||
" normalized_capital_count: float\n", | ||
"\n", | ||
"\n", | ||
"class DummyEvaluationLogic(\n", | ||
" SingleOutputEvaluationLogic[str, str, None, DummyEvaluation]\n", | ||
"):\n", | ||
" def do_evaluate_single_output(\n", | ||
" self, example: Example[Input, ExpectedOutput], output: str\n", | ||
" ) -> Evaluation:\n", | ||
" return DummyEvaluation(\n", | ||
" text_length=len(output),\n", | ||
" normalized_capital_count=sum(c.isupper() for c in output) / len(output),\n", | ||
" )\n", | ||
"\n", | ||
"\n", | ||
"class DummyAggregatedEvaluation(BaseModel):\n", | ||
" avg_length: float\n", | ||
" avg_normalized_capital_count: float\n", | ||
"\n", | ||
"\n", | ||
"class DummyAggregationLogic(\n", | ||
" AggregationLogic[DummyEvaluation, DummyAggregatedEvaluation]\n", | ||
"):\n", | ||
" def aggregate(\n", | ||
" self, evaluations: Iterable[DummyEvaluation]\n", | ||
" ) -> DummyAggregatedEvaluation:\n", | ||
" eval_list = list(evaluations)\n", | ||
" avg_length = sum([s.text_length for s in eval_list]) / len(eval_list)\n", | ||
" avg_normalized_capital_count = sum(\n", | ||
" [s.normalized_capital_count for s in eval_list]\n", | ||
" ) / len(eval_list)\n", | ||
" return DummyAggregatedEvaluation(\n", | ||
" avg_length=avg_length,\n", | ||
" avg_normalized_capital_count=avg_normalized_capital_count,\n", | ||
" )\n", | ||
"\n", | ||
"\n", | ||
"# Initialize the dataset\n", | ||
"random.seed(42)\n", | ||
"examples = [\n", | ||
" Example(\n", | ||
" input=\"\".join(random.choices(string.ascii_letters, k=random.randint(1, 50))),\n", | ||
" expected_output=None,\n", | ||
" )\n", | ||
" for i in range(10)\n", | ||
"]\n", | ||
"\n", | ||
"dataset_repository = InMemoryDatasetRepository()\n", | ||
"dataset = dataset_repository.create_dataset(\n", | ||
" examples=examples, dataset_name=\"my-dataset\"\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "1", | ||
"metadata": {}, | ||
"source": [ | ||
"# Optimizing Tasks by Comparing Aggregations\n", | ||
"\n", | ||
"In this tutorial we demonstrate how to optimize a given `Task` that depends on a `model` and a `prompt` parameter. This is done by evaluating each combination of parameters for the tasks separately and then comparing the resulting aggregations.\n", | ||
"\n", | ||
"In this scenario, our task does not depend on an LLM, for the sake of execution speed. However, the demonstrated principles generalize to other use cases.\n", | ||
"\n", | ||
"## Setup\n", | ||
"\n", | ||
"We assume the dataset, the `Task` and `DatasetRepository` are already given, so we can start by instantiating the remaining repositories and our `Evaluator`. The `EXPERIMENT_NAME` will later be used to identify the aggregations of interest. Therefore, we pass it to the `Evaluator` and later to the `Runner` and `Aggregator`. " | ||
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {},
"outputs": [],
"source": [
"EXPERIMENT_NAME = \"compare-tasks\"\n",
"\n",
"# The `DatasetRepository` is named `dataset_repository`\n",
"run_repository = InMemoryRunRepository()\n",
"evaluation_repository = InMemoryEvaluationRepository()\n",
"aggregation_repository = InMemoryAggregationRepository()\n",
"\n",
"evaluator = Evaluator(\n",
"    dataset_repository,\n",
"    run_repository,\n",
"    evaluation_repository,\n",
"    EXPERIMENT_NAME,\n",
"    DummyEvaluationLogic(),\n",
")"
]
},
{
"cell_type": "markdown",
"id": "3",
"metadata": {},
"source": [
"## Running Experiments for Different Configurations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4",
"metadata": {},
"outputs": [],
"source": [
"# Definition of parameters\n",
"model_list = [\"model a\", \"model b\", \"model c\"]\n",
"prompt_list = [\n",
"    \"A nice story starts with:\",\n",
"    \"Some kind of prompt\",\n",
"    \"No prompt at all\",\n",
"    \"OPTIMIZING PROMPTS IS HARD TO DO\",\n",
"]\n",
"\n",
"# Loop over all combinations of parameters and run the `Task` for each combination.\n",
"# Note, that this can be **very** expensive for large sets of parameters.\n", | ||
"for model, prompt in itertools.product(model_list, prompt_list):\n", | ||
" dummy_task = DummyTask(model=model, prompt=prompt)\n", | ||
"\n", | ||
" # The description and the Experiment will later be used to identify the run parameters. Take special note of the delimiter '|'.\n", | ||
" description = f\"|{model}|{prompt}|\"\n", | ||
" runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n", | ||
" run_overview = runner.run_dataset(dataset.id, description=description)\n", | ||
"\n", | ||
" eval_overview = evaluator.evaluate_runs(run_overview.id, description=description)\n", | ||
"\n", | ||
" aggregator = Aggregator(\n", | ||
" evaluation_repository,\n", | ||
" aggregation_repository,\n", | ||
" EXPERIMENT_NAME + \":\" + description,\n", | ||
" DummyAggregationLogic(),\n", | ||
" )\n", | ||
" aggregator.aggregate_evaluation(eval_overview.id)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "5", | ||
"metadata": {}, | ||
"source": [ | ||
"## Comparison of Different Configurations\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "6", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`.\n", | ||
"aggregations_of_interest = [\n", | ||
" overview\n", | ||
" for overview in aggregation_repository.aggregation_overviews(\n", | ||
" aggregation_type=DummyAggregatedEvaluation\n", | ||
" )\n", | ||
" if overview.description.startswith(EXPERIMENT_NAME)\n", | ||
"]\n", | ||
"\n", | ||
"# Convert the desired aggregation into a pandas dataframe\n", | ||
"formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)" | ||
]
},
{
"cell_type": "markdown",
"id": "7",
"metadata": {},
"source": [
" The following steps are very specific to the experiment setup, mostly depending on standard pandas knowledge. They are just one example of how one might analyze the configurations." | ||
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8",
"metadata": {},
"outputs": [],
"source": [
"aggregation_fields = list(DummyAggregatedEvaluation.model_fields.keys())\n",
"formated_aggregations = formated_aggregations[[\"description\"] + aggregation_fields]\n",
"formated_aggregations[[\"model\", \"prompt\"]] = formated_aggregations[\n",
"    \"description\"\n",
"].str.split(\"|\", expand=True)[[1, 2]]\n",
"formated_aggregations.drop(columns=\"description\", inplace=True)\n",
"\n",
"display(\n",
"    formated_aggregations.sort_values(\n",
"        by=\"avg_normalized_capital_count\", ascending=False\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9",
"metadata": {},
"outputs": [],
"source": [
"formated_aggregations.pivot(\n",
"    index=\"model\", columns=\"prompt\", values=\"avg_normalized_capital_count\"\n",
").plot(kind=\"box\", rot=90, title=\"avg_normalized_capital_count\")\n",
"formated_aggregations.pivot(index=\"prompt\", columns=\"model\", values=\"avg_length\").plot(\n",
"    kind=\"box\", title=\"avg_length\"\n",
")\n",
"pass"
]
},
{
"cell_type": "markdown",
"id": "10",
"metadata": {},
"source": [
"With these results, it's easy to see which prompt is best to optimize our score! The model on the other hand does not seem to have a big impact on our metrics." | ||
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
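
As a possible follow-up to the notebook's sorting and plotting steps, the best configuration can also be picked programmatically from the same DataFrame. A small sketch using only standard pandas, assuming the `formated_aggregations` frame with the `model`, `prompt`, and metric columns built above:

# Row with the highest average normalized capital count across all runs
best = formated_aggregations.loc[
    formated_aggregations["avg_normalized_capital_count"].idxmax()
]
print(f"Best prompt: {best['prompt']!r} (model: {best['model']!r})")

# Mean metric per prompt, to check how little the model choice matters
print(formated_aggregations.groupby("prompt")["avg_normalized_capital_count"].mean())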