diff --git a/CHANGELOG.md b/CHANGELOG.md index 0edfeda9b..f3fe99fe2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ ### Breaking Changes ### New Features -- feature: Add SingleLabelClassifyFailedExampleIterator for easy retrieval of failed examples. +- feature: Add sorted_scores property to `SingleLabelClassifyOutput`. - feature: Error information is printed to the console on failed runs and evaluations. - feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object - feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs. diff --git a/src/examples/user_journey.ipynb b/src/examples/user_journey.ipynb index a7d4ea6ae..f4c2e4eb7 100644 --- a/src/examples/user_journey.ipynb +++ b/src/examples/user_journey.ipynb @@ -19,6 +19,7 @@ " Runner,\n", " evaluation_lineages_to_pandas,\n", ")\n", + "from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation\n", "from intelligence_layer.use_cases import (\n", " ClassifyInput,\n", " PromptBasedClassify,\n", @@ -27,7 +28,6 @@ ")\n", "import json\n", "\n", - "\n", "load_dotenv()" ] }, @@ -110,7 +110,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Hmm, we have some results, but they aren't really legible (yet)." 
+ "Hmm, we have some results, but they aren't really legible (yet).\n", + "So let's look at the sorted individual results for more clarity: " ] }, { @@ -119,15 +120,16 @@ "metadata": {}, "outputs": [], "source": [ - "[sorted(list(o.scores.items()), key=lambda i: i[1], reverse=True)[0] for o in outputs]" + "[o.sorted_scores for o in outputs]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "It appears that the Finance Department can fix my laptop and the Comms people can reward free credits...\n", - "We probably have to do some finetuning of our classification approach.\n", + "For the first example 'Finance' gets the highest score, while for the second example 'Communications' is the clear winner.\n", + "This suggests that the Finance Department can fix my laptop and the Comms people can reward free credits ... Not very likely.\n", + "We probably have to do some fine-tuning of our classification approach.\n", "\n", "However, let's first make sure that this evidence is not anecdotal.\n", "For this, we need to do some eval. 
Luckily, we have by now got access to a few more examples...\n" @@ -313,11 +315,6 @@ "metadata": {}, "outputs": [], "source": [ - "# from intelligence_layer.evaluation import evaluation_lineages_to_pandas\n", - "\n", - "\n", - "from intelligence_layer.evaluation import FailedExampleEvaluation\n", - "\n", "passed_lineages = [\n", " lineage\n", " for lineage in evaluator.evaluation_lineages(eval_overview.id)\n", diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py index 7b109adeb..d3dcb89e6 100644 --- a/src/intelligence_layer/use_cases/classify/classify.py +++ b/src/intelligence_layer/use_cases/classify/classify.py @@ -39,6 +39,10 @@ class SingleLabelClassifyOutput(BaseModel): scores: Mapping[str, Probability] + @property + def sorted_scores(self) -> list[tuple[str, Probability]]: + return sorted(self.scores.items(), key=lambda item: item[1], reverse=True) + class MultiLabelClassifyOutput(BaseModel): """Output for a multi label classification task. @@ -143,14 +147,11 @@ def do_evaluate_single_output( example: Example[ClassifyInput, str], output: SingleLabelClassifyOutput, ) -> SingleLabelClassifyEvaluation: - sorted_classes = sorted( - output.scores.items(), key=lambda item: item[1], reverse=True - ) if example.expected_output not in example.input.labels: warn_message = f"[WARNING] Example with ID '{example.id}' has expected label '{example.expected_output}', which is not part of the example's input labels." warnings.warn(warn_message, RuntimeWarning) - predicted = sorted_classes[0][0] + predicted = output.sorted_scores[0][0] if predicted == example.expected_output: correct = True else: