From 77b5b8b0363e04ed1590e8e509258f54cad65d0b Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Wed, 22 May 2024 12:08:12 +0200
Subject: [PATCH] WIP: feat: Add more code to How-to run incremental eval

TASK: IL-313
---
 src/documentation/how_tos/example_data.py     | 16 +++
 ...complete_incremental_evaluation_flow.ipynb | 98 ++++++++++++++++---
 2 files changed, 100 insertions(+), 14 deletions(-)

diff --git a/src/documentation/how_tos/example_data.py b/src/documentation/how_tos/example_data.py
index 24353db7f..5affd555b 100644
--- a/src/documentation/how_tos/example_data.py
+++ b/src/documentation/how_tos/example_data.py
@@ -20,6 +20,7 @@
 from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
     ComparisonEvaluation,
     EloEvaluationLogic,
+    IncrementalEvaluationLogic,
     Matches,
     MatchOutcome,
 )
@@ -48,6 +49,21 @@ def do_evaluate(
         )
 
 
+class DummyIncrementalEvaluationLogic(
+    IncrementalEvaluationLogic[str, str, str, DummyEvaluation]
+):
+    def do_incremental_evaluate(
+        self,
+        example: Example[str, str],
+        outputs: list[SuccessfulExampleOutput[str]],
+        already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
+    ) -> DummyEvaluation:
+        output_str = "(" + ", ".join(o.output for o in outputs) + ")"
+        return DummyEvaluation(
+            eval=f"{example.input}, {example.expected_output}, {output_str} -> evaluation"
+        )
+
+
 class DummyEloEvaluationLogic(EloEvaluationLogic[str, str, str]):
     def grade(
         self,
diff --git a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb b/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
index b537a227c..409fdafd4 100644
--- a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
+++ b/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
@@ -6,10 +6,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from documentation.how_tos.example_data import DummyEloEvaluationLogic, example_data\n",
+    "from evaluation.conftest import DummyTask\n",
+    "\n",
+    "from documentation.how_tos.example_data import (\n",
+    "    DummyAggregationLogic,\n",
+    "    DummyExample,\n",
+    "    DummyIncrementalEvaluationLogic,\n",
+    ")\n",
     "from intelligence_layer.evaluation import (\n",
+    "    Aggregator,\n",
     "    IncrementalEvaluator,\n",
+    "    InMemoryAggregationRepository,\n",
     "    InMemoryEvaluationRepository,\n",
+    "    InMemoryRunRepository,\n",
+    "    Runner,\n",
+    ")\n",
+    "from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (\n",
+    "    InMemoryDatasetRepository,\n",
     ")"
   ]
  },
@@ -23,7 +36,14 @@
     " - run multiple tasks and configurations on the same dataset\n",
     " - perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation\n",
     " - run aggregation on these evaluations\n",
-    " - "
+    " - \n",
+    " \n",
+    "# How to implement incremental evaluations\n",
+    "1. Run your tasks on the datasets you want to evaluate (see [here](./how_to_run_a_task_on_a_dataset.ipynb)).\n",
+    "    - When evaluating multiple runs, all of them need to use the same input and output data types.\n",
+    "2. Initialize the `IncrementalEvaluator` with the repositories and an `IncrementalEvaluationLogic` that is specific to your use case.\n",
+    "3. Run the evaluator to evaluate all examples and create a single `EvaluationOverview`.\n",
+    "4. (Optional) Save the evaluation id for later use."
   ]
  },
  {
@@ -32,36 +52,86 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Step 0 Define ne\n",
-    "\n",
-    "\n",
-    "my_example_data = example_data()\n",
-    "print()\n",
-    "run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
+    "# Preparation\n",
+    "examples = [\n",
+    "    DummyExample(input=\"input1\", expected_output=\"expected_output1\", data=\"data1\")\n",
+    "]\n",
     "\n",
     "# Step 1\n",
-    "dataset_repository = my_example_data.dataset_repository\n",
-    "run_repository = my_example_data.run_repository\n",
+    "dataset_repository = InMemoryDatasetRepository()\n",
+    "my_dataset = dataset_repository.create_dataset(examples, \"MyDataset\")\n",
+    "\n",
+    "run_repository = InMemoryRunRepository()\n",
     "evaluation_repository = InMemoryEvaluationRepository()\n",
-    "evaluation_logic = DummyEloEvaluationLogic()\n",
+    "evaluation_logic = DummyIncrementalEvaluationLogic()\n",
+    "\n",
+    "aggregation_repository = InMemoryAggregationRepository()\n",
+    "aggregation_logic = DummyAggregationLogic()\n",
+    "\n",
+    "first_task = DummyTask()\n",
+    "second_task = DummyTask()\n",
+    "\n",
+    "first_runner = Runner(first_task, dataset_repository, run_repository, \"MyFirstRun\")\n",
+    "second_runner = Runner(second_task, dataset_repository, run_repository, \"MySecondRun\")\n",
+    "\n",
+    "first_run_overview = first_runner.run_dataset(my_dataset.id)\n",
+    "second_run_overview = second_runner.run_dataset(my_dataset.id)\n",
+    "\n",
+    "print(first_run_overview.id)\n",
+    "print(second_run_overview.id)\n",
+    "\n",
     "\n",
     "# Step 2\n",
+    "run_overview_ids_for_first_evaluation = []\n",
+    "for run_overview in run_repository.run_overviews():\n",
+    "    if run_overview.description == \"MyFirstRun\":\n",
+    "        run_overview_ids_for_first_evaluation.append(run_overview.id)\n",
+    "\n",
     "evaluator = IncrementalEvaluator(\n",
     "    dataset_repository,\n",
     "    run_repository,\n",
     "    evaluation_repository,\n",
-    "    \"My dummy evaluation\",\n",
+    "    \"My incremental evaluation\",\n",
     "    evaluation_logic,\n",
     ")\n",
+    "evaluation_overview_first_task = evaluator.evaluate_runs(first_run_overview.id)\n",
     "\n",
-    "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n",
+    "# Step 2.5: Aggregate the first evaluation\n",
+    "aggregator = Aggregator(\n",
+    "    evaluation_repository, aggregation_repository, \"MyAggregator\", aggregation_logic\n",
+    ")\n",
+    "first_aggregation_overview = aggregator.aggregate_evaluation(\n",
+    "    *evaluation_repository.evaluation_overview_ids()\n",
+    ")\n",
+    "print(first_aggregation_overview)\n",
     "\n",
     "# Step 3\n",
-    "print(evaluation_overview.id)"
+    "previous_evaluation_ids = evaluation_repository.evaluation_overview_ids()\n",
+    "evaluator.evaluate_additional_runs(\n",
+    "    *run_repository.run_overview_ids(), previous_evaluation_ids=previous_evaluation_ids\n",
+    ")\n",
+    "\n",
+    "# Step 4: Aggregate all evaluations\n",
+    "second_aggregation_overview = aggregator.aggregate_evaluation(\n",
+    "    *evaluation_repository.evaluation_overview_ids()\n",
+    ")\n",
+    "print(second_aggregation_overview)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
    "name": "python"
   }
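Note on the new logic (a reviewer illustration, not part of the patch): the contract behind `do_incremental_evaluate` is that outputs from newly added runs arrive in `outputs`, while outputs that earlier evaluations already covered arrive in `already_evaluated_outputs`, so an `IncrementalEvaluationLogic` only has to grade what is new. The sketch below is a self-contained toy mirroring that signature; `ToyOutput` and the string grading are hypothetical stand-ins, not intelligence_layer code.

from dataclasses import dataclass


@dataclass
class ToyOutput:
    # Hypothetical stand-in for SuccessfulExampleOutput[str].
    run_id: str
    output: str


def do_incremental_evaluate(
    example_input: str,
    outputs: list[ToyOutput],
    already_evaluated_outputs: list[list[ToyOutput]],
) -> str:
    # Grade only the newly added outputs; results for the outputs in
    # already_evaluated_outputs were produced by previous evaluations
    # and are merely counted here instead of being recomputed.
    new = ", ".join(o.output for o in outputs)
    reused = sum(len(group) for group in already_evaluated_outputs)
    return f"{example_input}: graded ({new}), reused {reused} earlier result(s)"


# First evaluation (evaluate_runs): every output is new.
print(do_incremental_evaluate("input1", [ToyOutput("run-1", "out-1")], []))

# Incremental evaluation (evaluate_additional_runs): only run-2 is new,
# run-1 was already covered by the first evaluation.
print(
    do_incremental_evaluate(
        "input1",
        [ToyOutput("run-2", "out-2")],
        [[ToyOutput("run-1", "out-1")]],
    )
)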