From 0d5e77aa8dc9ee7fd0ac5a3978102e41024128d2 Mon Sep 17 00:00:00 2001
From: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com>
Date: Thu, 4 Apr 2024 11:52:19 +0200
Subject: [PATCH] Il 367 label counts on failed classify eval (#695)

Warnings and easier retrieval of failed examples in SingleLabelClassify

---------

Co-authored-by: niklas.finken
Co-authored-by: Sebastian Niehus
Co-authored-by: Johannes Wesch
---
 CHANGELOG.md                                 |  1 +
 src/examples/user_journey.ipynb              | 71 +++++++--------
 src/intelligence_layer/use_cases/__init__.py |  1 -
 .../use_cases/classify/classify.py           | 91 ++++++++++++-----
 .../classify/test_prompt_based_classify.py   | 30 ++++++
 5 files changed, 132 insertions(+), 62 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 31e55d21d..0edfeda9b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Breaking Changes
 
 ### New Features
+- feature: Add SingleLabelClassifyFailedExampleIterator for easy retrieval of failed examples.
 - feature: Error information is printed to the console on failed runs and evaluations.
 - feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
 - feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
diff --git a/src/examples/user_journey.ipynb b/src/examples/user_journey.ipynb
index 97a5fb5f0..1dc569768 100644
--- a/src/examples/user_journey.ipynb
+++ b/src/examples/user_journey.ipynb
@@ -22,9 +22,7 @@
     "    ClassifyInput,\n",
     "    PromptBasedClassify,\n",
     "    SingleLabelClassifyAggregationLogic,\n",
-    "    SingleLabelClassifyEvaluation,\n",
     "    SingleLabelClassifyEvaluationLogic,\n",
-    "    SingleLabelClassifyOutput,\n",
     ")\n",
     "import json\n",
     "\n",
@@ -144,6 +142,28 @@
     "labeled_examples"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The Intelligence Layer offers support to run task evaluations.\n",
+    "\n",
+    "First, we have to create a dataset inside a repository.\n",
+    "There are different repositories (that persist datasets in different ways), but an `InMemoryDatasetRepository` will do for now.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"data/classify_examples.json\", \"r\") as file:\n",
+    "    labeled_examples: list[dict[str, str]] = json.load(file)\n",
+    "\n",
+    "labeled_examples"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -297,38 +317,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def get_failed_examples(run_id: str, eval_id: str, dataset_id: str, first_n: int):\n",
-    "    overview = [\n",
-    "        {\n",
-    "            \"input\": example.input,\n",
-    "            \"expected_output\": example.expected_output,\n",
-    "            \"result\": sorted(\n",
-    "                list(\n",
-    "                    next(\n",
-    "                        example_output\n",
-    "                        for example_output in run_repository.example_outputs(\n",
-    "                            run_id, SingleLabelClassifyOutput\n",
-    "                        )\n",
-    "                        if example_output.example_id == example.id\n",
-    "                    ).output.scores.items()\n",
-    "                ),\n",
-    "                key=lambda i: i[1],\n",
-    "                reverse=True,\n",
-    "            )[0],\n",
-    "            \"eval\": evaluation_repository.example_evaluation(\n",
-    "                evaluation_id=eval_id,\n",
-    "                example_id=example.id,\n",
-    "                evaluation_type=SingleLabelClassifyEvaluation,\n",
-    "            ).result,\n",
-    "        }\n",
-    "        for example in dataset_repository.examples(\n",
-    "            dataset_id=dataset_id, input_type=ClassifyInput, expected_output_type=str\n",
-    "        )\n",
-    "    ]\n",
-    "    return [example for example in overview if not example[\"eval\"].correct][:first_n]\n",
-    "\n",
+    "from intelligence_layer.use_cases.classify.classify import (\n",
+    "    SingleLabelClassifyFailedExampleIterator,\n",
+    ")\n",
     "\n",
-    "get_failed_examples(run_overview.id, eval_overview.id, dataset_id, 3)"
+    "failed_example_iterator = SingleLabelClassifyFailedExampleIterator(\n",
+    "    dataset_repository, run_repository, evaluation_repository\n",
+    ")\n",
+    "list(failed_example_iterator.get_examples(eval_overview.id))"
    ]
   },
   {
@@ -512,12 +508,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "get_failed_examples(\n",
-    "    run_overview_prompt_adjusted.id,\n",
-    "    eval_overview_prompt_adjusted.id,\n",
-    "    cleaned_dataset_id,\n",
-    "    3,\n",
-    ")"
+    "list(failed_example_iterator.get_examples(eval_overview_prompt_adjusted.id))"
    ]
   },
   {
@@ -546,7 +537,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.7"
+   "version": "3.11.4"
   }
  },
 "nbformat": 4,
diff --git a/src/intelligence_layer/use_cases/__init__.py b/src/intelligence_layer/use_cases/__init__.py
index 85f233daa..de0947313 100644
--- a/src/intelligence_layer/use_cases/__init__.py
+++ b/src/intelligence_layer/use_cases/__init__.py
@@ -15,7 +15,6 @@
     MultiLabelClassifyEvaluationLogic as MultiLabelClassifyEvaluationLogic,
 )
 from .classify.classify import MultiLabelClassifyOutput as MultiLabelClassifyOutput
-from .classify.classify import PerformanceScores as PerformanceScores
 from .classify.classify import Probability as Probability
 from .classify.classify import (
     SingleLabelClassifyAggregationLogic as SingleLabelClassifyAggregationLogic,
diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py
index 562252160..59e2ab376 100644
--- a/src/intelligence_layer/use_cases/classify/classify.py
+++ b/src/intelligence_layer/use_cases/classify/classify.py
@@ -1,3 +1,4 @@
+import warnings
 from collections import defaultdict
 from typing import Iterable, Mapping, NewType, Sequence
 
@@ -6,10 +7,15 @@
 from intelligence_layer.core import TextChunk
 from intelligence_layer.evaluation import (
     AggregationLogic,
+    DatasetRepository,
+    EvaluationRepository,
     Example,
     MeanAccumulator,
+    RepositoryNavigator,
+    RunRepository,
     SingleOutputEvaluationLogic,
 )
+from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation
 
 Probability = NewType("Probability", float)
 
@@ -69,20 +75,6 @@ class SingleLabelClassifyEvaluation(BaseModel):
     expected_label_missing: bool
 
 
-class PerformanceScores(BaseModel):
-    """The relevant metrics resulting from a confusion matrix in a classification run.
-
-    Attributes:
-        precision: Proportion of correctly predicted classes to all predicted classes.
-        recall: Proportion of correctly predicted classes to all expected classes.
-        f1: Aggregated performance, formally the harmonic mean of precision and recall.
-    """
-
-    precision: float
-    recall: float
-    f1: float
-
-
 class AggregatedLabelInfo(BaseModel):
     expected_count: int
     predicted_count: int
@@ -124,6 +116,11 @@ def aggregate(
             confusion_matrix[(evaluation.predicted, evaluation.expected)] += 1
             by_label[evaluation.predicted]["predicted"] += 1
             by_label[evaluation.expected]["expected"] += 1
+
+        if len(missing_labels) > 0:
+            warn_message = "[WARNING] There were examples with expected labels missing in the evaluation inputs. For a detailed list, see the 'statistics.missing_labels' field of the returned `AggregationOverview`."
+            warnings.warn(warn_message, RuntimeWarning)
+
         return AggregatedSingleLabelClassifyEvaluation(
             percentage_correct=acc.extract(),
             confusion_matrix=confusion_matrix,
@@ -154,6 +151,10 @@ def do_evaluate_single_output(
         sorted_classes = sorted(
             output.scores.items(), key=lambda item: item[1], reverse=True
         )
+        if example.expected_output not in example.input.labels:
+            warn_message = f"[WARNING] Example with ID '{example.id}' has expected label '{example.expected_output}', which is not part of the example's input labels."
+            warnings.warn(warn_message, RuntimeWarning)
+
         predicted = sorted_classes[0][0]
         if predicted == example.expected_output:
             correct = True
@@ -167,6 +168,40 @@ def do_evaluate_single_output(
         )
 
 
+class SingleLabelClassifyFailedExampleIterator:
+    def __init__(
+        self,
+        dataset_repository: DatasetRepository,
+        run_repository: RunRepository,
+        evaluation_repository: EvaluationRepository,
+    ):
+        self.repository_navigator = RepositoryNavigator(
+            dataset_repository, run_repository, evaluation_repository
+        )
+
+    # TODO: Add test
+    def get_examples(
+        self, evaluation_overview_id: str, first_n: int = 0
+    ) -> Iterable[Example[ClassifyInput, str]]:
+        evaluation_lineages = self.repository_navigator.evaluation_lineages(
+            evaluation_id=evaluation_overview_id,
+            input_type=ClassifyInput,
+            expected_output_type=str,
+            output_type=SingleLabelClassifyOutput,
+            evaluation_type=SingleLabelClassifyEvaluation,
+        )
+        count_yielded = 0
+        for lineage in evaluation_lineages:
+            if first_n != 0 and count_yielded >= first_n:
+                break
+            if (
+                isinstance(lineage.evaluation.result, FailedExampleEvaluation)
+                or not lineage.evaluation.result.correct
+            ):
+                count_yielded += 1
+                yield lineage.example
+
+
 class MultiLabelClassifyEvaluation(BaseModel):
     """The evaluation of a single multi-label classification example.
 
@@ -183,6 +218,20 @@ class MultiLabelClassifyEvaluation(BaseModel):
     fn: frozenset[str]
 
 
+class MultiLabelClassifyMetrics(BaseModel):
+    """The relevant metrics resulting from a confusion matrix in a classification run.
+
+    Attributes:
+        precision: Proportion of correctly predicted classes to all predicted classes.
+        recall: Proportion of correctly predicted classes to all expected classes.
+        f1: Aggregated performance, formally the harmonic mean of precision and recall.
+    """
+
+    precision: float
+    recall: float
+    f1: float
+
+
 class AggregatedMultiLabelClassifyEvaluation(BaseModel):
     """The aggregated evaluation of a multi-label classify dataset.
 
@@ -193,9 +242,9 @@ class AggregatedMultiLabelClassifyEvaluation(BaseModel):
 
     """
 
-    class_metrics: Mapping[str, PerformanceScores]
-    micro_avg: PerformanceScores
-    macro_avg: PerformanceScores
+    class_metrics: Mapping[str, MultiLabelClassifyMetrics]
+    micro_avg: MultiLabelClassifyMetrics
+    macro_avg: MultiLabelClassifyMetrics
 
 
 class MultiLabelClassifyAggregationLogic(
@@ -243,7 +292,7 @@ def aggregate(
                 else 0
             )
 
-            class_metrics[label] = PerformanceScores(
+            class_metrics[label] = MultiLabelClassifyMetrics(
                 precision=precision, recall=recall, f1=f1
             )
 
@@ -255,19 +304,19 @@ def aggregate(
             sum_f1 += f1
 
         try:
-            micro_avg = PerformanceScores(
+            micro_avg = MultiLabelClassifyMetrics(
                 precision=sum_tp / (sum_tp + sum_fp),
                 recall=sum_tp / (sum_tp + sum_fn),
                 f1=(2 * (sum_tp / (sum_tp + sum_fp)) * (sum_tp / (sum_tp + sum_fn)))
                 / ((sum_tp / (sum_tp + sum_fp)) + (sum_tp / (sum_tp + sum_fn))),
             )
         except ZeroDivisionError:
-            micro_avg = PerformanceScores(
+            micro_avg = MultiLabelClassifyMetrics(
                 precision=0,
                 recall=0,
                 f1=0,
             )
-        macro_avg = PerformanceScores(
+        macro_avg = MultiLabelClassifyMetrics(
             precision=sum_precision / len(class_metrics),
             recall=sum_recall / len(class_metrics),
             f1=sum_f1 / len(class_metrics),
diff --git a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/use_cases/classify/test_prompt_based_classify.py
index a54b58bb4..35e6c8cc6 100644
--- a/tests/use_cases/classify/test_prompt_based_classify.py
+++ b/tests/use_cases/classify/test_prompt_based_classify.py
@@ -1,5 +1,6 @@
 from typing import Sequence
 
+import pytest
 from pytest import fixture
 
 from intelligence_layer.core import InMemoryTracer, NoOpTracer, TextChunk
@@ -216,6 +217,35 @@ def test_can_evaluate_classify(
     assert evaluation.correct is True
 
 
+def test_classify_warns_on_missing_label(
+    in_memory_dataset_repository: InMemoryDatasetRepository,
+    classify_runner: Runner[ClassifyInput, SingleLabelClassifyOutput],
+    in_memory_evaluation_repository: InMemoryEvaluationRepository,
+    classify_evaluator: Evaluator[
+        ClassifyInput,
+        SingleLabelClassifyOutput,
+        Sequence[str],
+        SingleLabelClassifyEvaluation,
+    ],
+    prompt_based_classify: PromptBasedClassify,
+) -> None:
+    example = Example(
+        input=ClassifyInput(
+            chunk=TextChunk("This is good"),
+            labels=frozenset({"positive", "negative"}),
+        ),
+        expected_output="SomethingElse",
+    )
+
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=[example], dataset_name="test-dataset"
+    ).id
+
+    run_overview = classify_runner.run_dataset(dataset_id)
+
+    pytest.warns(RuntimeWarning, classify_evaluator.evaluate_runs, run_overview.id)
+
+
 def test_can_aggregate_evaluations(
     classify_evaluator: Evaluator[
         ClassifyInput,
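Below is a minimal usage sketch of the new SingleLabelClassifyFailedExampleIterator added by this patch, in the spirit of the updated user_journey.ipynb cell. It assumes that dataset_repository, run_repository, evaluation_repository and eval_overview already exist, as they do in that notebook; the printed fields are illustrative only, not part of the change.

from intelligence_layer.use_cases.classify.classify import (
    SingleLabelClassifyFailedExampleIterator,
)

# Wire the iterator to the same repositories that were used for the dataset,
# the run, and the evaluation (in the notebook these are the InMemory* variants).
failed_example_iterator = SingleLabelClassifyFailedExampleIterator(
    dataset_repository, run_repository, evaluation_repository
)

# Inspect only the first three failed examples; first_n=0 (the default) yields all of them.
for example in failed_example_iterator.get_examples(eval_overview.id, first_n=3):
    print(example.id, example.input.chunk, "->", example.expected_output)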