From 0d5e77aa8dc9ee7fd0ac5a3978102e41024128d2 Mon Sep 17 00:00:00 2001
From: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com>
Date: Thu, 4 Apr 2024 11:52:19 +0200
Subject: [PATCH] Il 367 label counts on failed classify eval (#695)

Warnings and easier retrieval of failed examples in SingleLabelClassify

---------

Co-authored-by: niklas.finken
Co-authored-by: Sebastian Niehus
Co-authored-by: Johannes Wesch
---
 CHANGELOG.md                                 |  1 +
 src/examples/user_journey.ipynb              | 71 +++++++--------
 src/intelligence_layer/use_cases/__init__.py |  1 -
 .../use_cases/classify/classify.py           | 91 ++++++++++++-----
 .../classify/test_prompt_based_classify.py   | 30 ++++++
 5 files changed, 132 insertions(+), 62 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 31e55d21d..0edfeda9b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Breaking Changes
 
 ### New Features
+- feature: Add SingleLabelClassifyFailedExampleIterator for easy retrieval of failed examples.
 - feature: Error information is printed to the console on failed runs and evaluations.
 - feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
 - feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
diff --git a/src/examples/user_journey.ipynb b/src/examples/user_journey.ipynb
index 97a5fb5f0..1dc569768 100644
--- a/src/examples/user_journey.ipynb
+++ b/src/examples/user_journey.ipynb
@@ -22,9 +22,7 @@
     "    ClassifyInput,\n",
     "    PromptBasedClassify,\n",
     "    SingleLabelClassifyAggregationLogic,\n",
-    "    SingleLabelClassifyEvaluation,\n",
     "    SingleLabelClassifyEvaluationLogic,\n",
-    "    SingleLabelClassifyOutput,\n",
     ")\n",
     "import json\n",
     "\n",
@@ -144,6 +142,28 @@
     "labeled_examples"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The Intelligence Layer offers support to run task evaluations.\n",
+    "\n",
+    "First, we have to create a dataset inside a repository.\n",
+    "There are different repositories (that persist datasets in different ways), but an `InMemoryDatasetRepository` will do for now.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"data/classify_examples.json\", \"r\") as file:\n",
+    "    labeled_examples: list[dict[str, str]] = json.load(file)\n",
+    "\n",
+    "labeled_examples"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -297,38 +317,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def get_failed_examples(run_id: str, eval_id: str, dataset_id: str, first_n: int):\n",
-    "    overview = [\n",
-    "        {\n",
-    "            \"input\": example.input,\n",
-    "            \"expected_output\": example.expected_output,\n",
-    "            \"result\": sorted(\n",
-    "                list(\n",
-    "                    next(\n",
-    "                        example_output\n",
-    "                        for example_output in run_repository.example_outputs(\n",
-    "                            run_id, SingleLabelClassifyOutput\n",
-    "                        )\n",
-    "                        if example_output.example_id == example.id\n",
-    "                    ).output.scores.items()\n",
-    "                ),\n",
-    "                key=lambda i: i[1],\n",
-    "                reverse=True,\n",
-    "            )[0],\n",
-    "            \"eval\": evaluation_repository.example_evaluation(\n",
-    "                evaluation_id=eval_id,\n",
-    "                example_id=example.id,\n",
-    "                evaluation_type=SingleLabelClassifyEvaluation,\n",
-    "            ).result,\n",
-    "        }\n",
-    "        for example in dataset_repository.examples(\n",
-    "            dataset_id=dataset_id, input_type=ClassifyInput, expected_output_type=str\n",
-    "        )\n",
-    "    ]\n",
-    "    return [example for example in overview if not example[\"eval\"].correct][:first_n]\n",
-    "\n",
+    "from intelligence_layer.use_cases.classify.classify import (\n",
+    "    SingleLabelClassifyFailedExampleIterator,\n",
+    ")\n",
     "\n",
-    "get_failed_examples(run_overview.id, eval_overview.id, dataset_id, 3)"
+    "failed_example_iterator = SingleLabelClassifyFailedExampleIterator(\n",
+    "    dataset_repository, run_repository, evaluation_repository\n",
+    ")\n",
+    "list(failed_example_iterator.get_examples(eval_overview.id))"
    ]
   },
   {
@@ -512,12 +508,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "get_failed_examples(\n",
-    "    run_overview_prompt_adjusted.id,\n",
-    "    eval_overview_prompt_adjusted.id,\n",
-    "    cleaned_dataset_id,\n",
-    "    3,\n",
-    ")"
+    "list(failed_example_iterator.get_examples(eval_overview_prompt_adjusted.id))"
    ]
   },
   {
@@ -546,7 +537,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.7"
+   "version": "3.11.4"
   }
  },
 "nbformat": 4,
diff --git a/src/intelligence_layer/use_cases/__init__.py b/src/intelligence_layer/use_cases/__init__.py
index 85f233daa..de0947313 100644
--- a/src/intelligence_layer/use_cases/__init__.py
+++ b/src/intelligence_layer/use_cases/__init__.py
@@ -15,7 +15,6 @@
     MultiLabelClassifyEvaluationLogic as MultiLabelClassifyEvaluationLogic,
 )
 from .classify.classify import MultiLabelClassifyOutput as MultiLabelClassifyOutput
-from .classify.classify import PerformanceScores as PerformanceScores
 from .classify.classify import Probability as Probability
 from .classify.classify import (
     SingleLabelClassifyAggregationLogic as SingleLabelClassifyAggregationLogic,
diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py
index 562252160..59e2ab376 100644
--- a/src/intelligence_layer/use_cases/classify/classify.py
+++ b/src/intelligence_layer/use_cases/classify/classify.py
@@ -1,3 +1,4 @@
+import warnings
 from collections import defaultdict
 from typing import Iterable, Mapping, NewType, Sequence
 
@@ -6,10 +7,15 @@
 from intelligence_layer.core import TextChunk
 from intelligence_layer.evaluation import (
     AggregationLogic,
+    DatasetRepository,
+    EvaluationRepository,
     Example,
     MeanAccumulator,
+    RepositoryNavigator,
+    RunRepository,
     SingleOutputEvaluationLogic,
 )
+from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation
 
 Probability = NewType("Probability", float)
 
@@ -69,20 +75,6 @@ class SingleLabelClassifyEvaluation(BaseModel):
     expected_label_missing: bool
 
 
-class PerformanceScores(BaseModel):
-    """The relevant metrics resulting from a confusion matrix in a classification run.
-
-    Attributes:
-        precision: Proportion of correctly predicted classes to all predicted classes.
-        recall: Proportion of correctly predicted classes to all expected classes.
-        f1: Aggregated performance, formally the harmonic mean of precision and recall.
-    """
-
-    precision: float
-    recall: float
-    f1: float
-
-
 class AggregatedLabelInfo(BaseModel):
     expected_count: int
     predicted_count: int
@@ -124,6 +116,11 @@ def aggregate(
             confusion_matrix[(evaluation.predicted, evaluation.expected)] += 1
             by_label[evaluation.predicted]["predicted"] += 1
             by_label[evaluation.expected]["expected"] += 1
+
+        if len(missing_labels) > 0:
+            warn_message = "[WARNING] There were examples with expected labels missing in the evaluation inputs. For a detailed list, see the 'statistics.missing_labels' field of the returned `AggregationOverview`."
+            warnings.warn(warn_message, RuntimeWarning)
+
         return AggregatedSingleLabelClassifyEvaluation(
             percentage_correct=acc.extract(),
             confusion_matrix=confusion_matrix,
@@ -154,6 +151,10 @@ def do_evaluate_single_output(
         sorted_classes = sorted(
             output.scores.items(), key=lambda item: item[1], reverse=True
         )
+        if example.expected_output not in example.input.labels:
+            warn_message = f"[WARNING] Example with ID '{example.id}' has expected label '{example.expected_output}', which is not part of the example's input labels."
+            warnings.warn(warn_message, RuntimeWarning)
+
         predicted = sorted_classes[0][0]
         if predicted == example.expected_output:
             correct = True
@@ -167,6 +168,40 @@ def do_evaluate_single_output(
         )
 
 
+class SingleLabelClassifyFailedExampleIterator:
+    def __init__(
+        self,
+        dataset_repository: DatasetRepository,
+        run_repository: RunRepository,
+        evaluation_repository: EvaluationRepository,
+    ):
+        self.repository_navigator = RepositoryNavigator(
+            dataset_repository, run_repository, evaluation_repository
+        )
+
+    # TODO: Add test
+    def get_examples(
+        self, evaluation_overview_id: str, first_n: int = 0
+    ) -> Iterable[Example[ClassifyInput, str]]:
+        evaluation_lineages = self.repository_navigator.evaluation_lineages(
+            evaluation_id=evaluation_overview_id,
+            input_type=ClassifyInput,
+            expected_output_type=str,
+            output_type=SingleLabelClassifyOutput,
+            evaluation_type=SingleLabelClassifyEvaluation,
+        )
+        count_yielded = 0
+        for lineage in evaluation_lineages:
+            if first_n != 0 and count_yielded >= first_n:
+                break
+            if (
+                isinstance(lineage.evaluation.result, FailedExampleEvaluation)
+                or not lineage.evaluation.result.correct
+            ):
+                count_yielded += 1
+                yield lineage.example
+
+
 class MultiLabelClassifyEvaluation(BaseModel):
     """The evaluation of a single multi-label classification example.
 
@@ -183,6 +218,20 @@ class MultiLabelClassifyEvaluation(BaseModel):
     fn: frozenset[str]
 
 
+class MultiLabelClassifyMetrics(BaseModel):
+    """The relevant metrics resulting from a confusion matrix in a classification run.
+
+    Attributes:
+        precision: Proportion of correctly predicted classes to all predicted classes.
+        recall: Proportion of correctly predicted classes to all expected classes.
+        f1: Aggregated performance, formally the harmonic mean of precision and recall.
+    """
+
+    precision: float
+    recall: float
+    f1: float
+
+
 class AggregatedMultiLabelClassifyEvaluation(BaseModel):
     """The aggregated evaluation of a multi-label classify dataset.
 
@@ -193,9 +242,9 @@ class AggregatedMultiLabelClassifyEvaluation(BaseModel):
 
     """
 
-    class_metrics: Mapping[str, PerformanceScores]
-    micro_avg: PerformanceScores
-    macro_avg: PerformanceScores
+    class_metrics: Mapping[str, MultiLabelClassifyMetrics]
+    micro_avg: MultiLabelClassifyMetrics
+    macro_avg: MultiLabelClassifyMetrics
 
 
 class MultiLabelClassifyAggregationLogic(
@@ -243,7 +292,7 @@ def aggregate(
                 else 0
             )
 
-            class_metrics[label] = PerformanceScores(
+            class_metrics[label] = MultiLabelClassifyMetrics(
                 precision=precision, recall=recall, f1=f1
             )
 
@@ -255,19 +304,19 @@ def aggregate(
             sum_f1 += f1
 
         try:
-            micro_avg = PerformanceScores(
+            micro_avg = MultiLabelClassifyMetrics(
                 precision=sum_tp / (sum_tp + sum_fp),
                 recall=sum_tp / (sum_tp + sum_fn),
                 f1=(2 * (sum_tp / (sum_tp + sum_fp)) * (sum_tp / (sum_tp + sum_fn)))
                 / ((sum_tp / (sum_tp + sum_fp)) + (sum_tp / (sum_tp + sum_fn))),
             )
         except ZeroDivisionError:
-            micro_avg = PerformanceScores(
+            micro_avg = MultiLabelClassifyMetrics(
                 precision=0,
                 recall=0,
                 f1=0,
             )
-        macro_avg = PerformanceScores(
+        macro_avg = MultiLabelClassifyMetrics(
             precision=sum_precision / len(class_metrics),
             recall=sum_recall / len(class_metrics),
             f1=sum_f1 / len(class_metrics),
diff --git a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/use_cases/classify/test_prompt_based_classify.py
index a54b58bb4..35e6c8cc6 100644
--- a/tests/use_cases/classify/test_prompt_based_classify.py
+++ b/tests/use_cases/classify/test_prompt_based_classify.py
@@ -1,5 +1,6 @@
 from typing import Sequence
 
+import pytest
 from pytest import fixture
 
 from intelligence_layer.core import InMemoryTracer, NoOpTracer, TextChunk
@@ -216,6 +217,35 @@ def test_can_evaluate_classify(
     assert evaluation.correct is True
 
 
+def test_classify_warns_on_missing_label(
+    in_memory_dataset_repository: InMemoryDatasetRepository,
+    classify_runner: Runner[ClassifyInput, SingleLabelClassifyOutput],
+    in_memory_evaluation_repository: InMemoryEvaluationRepository,
+    classify_evaluator: Evaluator[
+        ClassifyInput,
+        SingleLabelClassifyOutput,
+        Sequence[str],
+        SingleLabelClassifyEvaluation,
+    ],
+    prompt_based_classify: PromptBasedClassify,
+) -> None:
+    example = Example(
+        input=ClassifyInput(
+            chunk=TextChunk("This is good"),
+            labels=frozenset({"positive", "negative"}),
+        ),
+        expected_output="SomethingElse",
+    )
+
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=[example], dataset_name="test-dataset"
+    ).id
+
+    run_overview = classify_runner.run_dataset(dataset_id)
+
+    pytest.warns(RuntimeWarning, classify_evaluator.evaluate_runs, run_overview.id)
+
+
 def test_can_aggregate_evaluations(
     classify_evaluator: Evaluator[
         ClassifyInput,
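Below is a minimal usage sketch of the new SingleLabelClassifyFailedExampleIterator added by this patch, in the spirit of the updated user_journey.ipynb cell. It assumes that dataset_repository, run_repository, evaluation_repository and eval_overview already exist, as they do in that notebook; the printed fields are illustrative only, not part of the change.

from intelligence_layer.use_cases.classify.classify import (
    SingleLabelClassifyFailedExampleIterator,
)

# Wire the iterator to the same repositories that were used for the dataset,
# the run, and the evaluation (in the notebook these are the InMemory* variants).
failed_example_iterator = SingleLabelClassifyFailedExampleIterator(
    dataset_repository, run_repository, evaluation_repository
)

# Inspect only the first three failed examples; first_n=0 (the default) yields all of them.
for example in failed_example_iterator.get_examples(eval_overview.id, first_n=3):
    print(example.id, example.input.chunk, "->", example.expected_output)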