From 77b5b8b0363e04ed1590e8e509258f54cad65d0b Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Wed, 22 May 2024 12:08:12 +0200
Subject: [PATCH] WIP: feat: Add more code to How-to run incremental eval

TASK: IL-313
---
 src/documentation/how_tos/example_data.py     | 16 +++
 ...complete_incremental_evaluation_flow.ipynb | 98 ++++++++++++++++---
 2 files changed, 100 insertions(+), 14 deletions(-)

diff --git a/src/documentation/how_tos/example_data.py b/src/documentation/how_tos/example_data.py
index 24353db7f..5affd555b 100644
--- a/src/documentation/how_tos/example_data.py
+++ b/src/documentation/how_tos/example_data.py
@@ -20,6 +20,7 @@
 from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
     ComparisonEvaluation,
     EloEvaluationLogic,
+    IncrementalEvaluationLogic,
     Matches,
     MatchOutcome,
 )
@@ -48,6 +49,21 @@ def do_evaluate(
         )
 
 
+class DummyIncrementalEvaluationLogic(
+    IncrementalEvaluationLogic[str, str, str, DummyEvaluation]
+):
+    def do_incremental_evaluate(
+        self,
+        example: Example[str, str],
+        outputs: list[SuccessfulExampleOutput[str]],
+        already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
+    ) -> DummyEvaluation:
+        output_str = "(" + ", ".join(o.output for o in outputs) + ")"
+        return DummyEvaluation(
+            eval=f"{example.input}, {example.expected_output}, {output_str} -> evaluation"
+        )
+
+
 class DummyEloEvaluationLogic(EloEvaluationLogic[str, str, str]):
     def grade(
         self,
diff --git a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb b/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
index b537a227c..409fdafd4 100644
--- a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
+++ b/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
@@ -6,10 +6,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from documentation.how_tos.example_data import DummyEloEvaluationLogic, example_data\n",
+    "from evaluation.conftest import DummyTask\n",
+    "\n",
+    "from documentation.how_tos.example_data import (\n",
+    "    DummyAggregationLogic,\n",
+    "    DummyExample,\n",
+    "    DummyIncrementalEvaluationLogic,\n",
+    ")\n",
     "from intelligence_layer.evaluation import (\n",
+    "    Aggregator,\n",
     "    IncrementalEvaluator,\n",
+    "    InMemoryAggregationRepository,\n",
     "    InMemoryEvaluationRepository,\n",
+    "    InMemoryRunRepository,\n",
+    "    Runner,\n",
+    ")\n",
+    "from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (\n",
+    "    InMemoryDatasetRepository,\n",
     ")"
   ]
  },
@@ -23,7 +36,14 @@
     " - run multiple tasks and configurations on the same dataset\n",
     " - perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation\n",
     " - run aggregation on these evaluations\n",
-    " - "
+    " - \n",
+    " \n",
+    "# How to implement incremental evaluations\n",
+    "1. Run your tasks on the datasets you want to evaluate (see [here](./how_to_run_a_task_on_a_dataset.ipynb)).\n",
+    "    - When evaluating multiple runs, all of them need to use the same input and output data types.\n",
+    "2. Initialize the `IncrementalEvaluator` with the repositories and an `IncrementalEvaluationLogic` that is specific to your use case.\n",
+    "3. Run the evaluator to evaluate all examples and create a single `EvaluationOverview`.\n",
+    "4. (Optional) Save the evaluation id for later use."
   ]
  },
  {
@@ -32,36 +52,86 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Step 0 Define ne\n",
-    "\n",
-    "\n",
-    "my_example_data = example_data()\n",
-    "print()\n",
-    "run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
+    "# Preparation\n",
+    "examples = [\n",
+    "    DummyExample(input=\"input1\", expected_output=\"expected_output1\", data=\"data1\")\n",
+    "]\n",
     "\n",
     "# Step 1\n",
-    "dataset_repository = my_example_data.dataset_repository\n",
-    "run_repository = my_example_data.run_repository\n",
+    "dataset_repository = InMemoryDatasetRepository()\n",
+    "my_dataset = dataset_repository.create_dataset(examples, \"MyDataset\")\n",
+    "\n",
+    "run_repository = InMemoryRunRepository()\n",
     "evaluation_repository = InMemoryEvaluationRepository()\n",
-    "evaluation_logic = DummyEloEvaluationLogic()\n",
+    "evaluation_logic = DummyIncrementalEvaluationLogic()\n",
+    "\n",
+    "aggregation_repository = InMemoryAggregationRepository()\n",
+    "aggregation_logic = DummyAggregationLogic()\n",
+    "\n",
+    "first_task = DummyTask()\n",
+    "second_task = DummyTask()\n",
+    "\n",
+    "first_runner = Runner(first_task, dataset_repository, run_repository, \"MyFirstRun\")\n",
+    "second_runner = Runner(second_task, dataset_repository, run_repository, \"MySecondRun\")\n",
+    "\n",
+    "first_run_overview = first_runner.run_dataset(my_dataset.id)\n",
+    "second_run_overview = second_runner.run_dataset(my_dataset.id)\n",
+    "\n",
+    "print(first_run_overview.id)\n",
+    "print(second_run_overview.id)\n",
+    "\n",
     "\n",
     "# Step 2\n",
+    "run_overview_ids_for_first_evaluation = []\n",
+    "for run_overview in run_repository.run_overviews():\n",
+    "    if run_overview.description == \"MyFirstRun\":\n",
+    "        run_overview_ids_for_first_evaluation.append(run_overview.id)\n",
+    "\n",
     "evaluator = IncrementalEvaluator(\n",
     "    dataset_repository,\n",
     "    run_repository,\n",
     "    evaluation_repository,\n",
-    "    \"My dummy evaluation\",\n",
+    "    \"My incremental evaluation\",\n",
     "    evaluation_logic,\n",
     ")\n",
+    "evaluation_overview_first_task = evaluator.evaluate_runs(first_run_overview.id)\n",
     "\n",
-    "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n",
+    "# Step 2.5: Aggregate the first evaluation\n",
+    "aggregator = Aggregator(\n",
+    "    evaluation_repository, aggregation_repository, \"MyAggregator\", aggregation_logic\n",
+    ")\n",
+    "first_aggregation_overview = aggregator.aggregate_evaluation(\n",
+    "    *evaluation_repository.evaluation_overview_ids()\n",
+    ")\n",
+    "print(first_aggregation_overview)\n",
     "\n",
     "# Step 3\n",
-    "print(evaluation_overview.id)"
+    "previous_evaluation_ids = evaluation_repository.evaluation_overview_ids()\n",
+    "evaluator.evaluate_additional_runs(\n",
+    "    *run_repository.run_overview_ids(), previous_evaluation_ids=previous_evaluation_ids\n",
+    ")\n",
+    "\n",
+    "# Step 4: Aggregate all evaluations\n",
+    "second_aggregation_overview = aggregator.aggregate_evaluation(\n",
+    "    *evaluation_repository.evaluation_overview_ids()\n",
+    ")\n",
+    "print(second_aggregation_overview)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
    "name": "python"
   }
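Note on the new logic (a reviewer illustration, not part of the patch): the contract behind `do_incremental_evaluate` is that outputs from newly added runs arrive in `outputs`, while outputs that earlier evaluations already covered arrive in `already_evaluated_outputs`, so an `IncrementalEvaluationLogic` only has to grade what is new. The sketch below is a self-contained toy mirroring that signature; `ToyOutput` and the string grading are hypothetical stand-ins, not intelligence_layer code.

from dataclasses import dataclass


@dataclass
class ToyOutput:
    # Hypothetical stand-in for SuccessfulExampleOutput[str].
    run_id: str
    output: str


def do_incremental_evaluate(
    example_input: str,
    outputs: list[ToyOutput],
    already_evaluated_outputs: list[list[ToyOutput]],
) -> str:
    # Grade only the newly added outputs; results for the outputs in
    # already_evaluated_outputs were produced by previous evaluations
    # and are merely counted here instead of being recomputed.
    new = ", ".join(o.output for o in outputs)
    reused = sum(len(group) for group in already_evaluated_outputs)
    return f"{example_input}: graded ({new}), reused {reused} earlier result(s)"


# First evaluation (evaluate_runs): every output is new.
print(do_incremental_evaluate("input1", [ToyOutput("run-1", "out-1")], []))

# Incremental evaluation (evaluate_additional_runs): only run-2 is new,
# run-1 was already covered by the first evaluation.
print(
    do_incremental_evaluate(
        "input1",
        [ToyOutput("run-2", "out-2")],
        [[ToyOutput("run-1", "out-1")]],
    )
)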