IL-367: Label counts on failed classify eval (#695)
Warnings and easier retrieval of failed examples in SingleLabelClassify

---------

Co-authored-by: niklas.finken <[email protected]>
Co-authored-by: Sebastian Niehus <[email protected]>
Co-authored-by: Johannes Wesch <[email protected]>
4 people authored Apr 4, 2024
1 parent 881e59d commit 0d5e77a
Showing 5 changed files with 132 additions and 62 deletions.
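
The centerpiece of the change is the new SingleLabelClassifyFailedExampleIterator, which wires a dataset, run, and evaluation repository into a RepositoryNavigator and yields the examples whose evaluation either failed or was scored as incorrect. A minimal usage sketch, assuming the repositories and an `eval_overview` from an earlier run/evaluation already exist (mirroring the updated notebook below):

from intelligence_layer.use_cases.classify.classify import (
    SingleLabelClassifyFailedExampleIterator,
)

# The same repositories used for creating the dataset, running the task and evaluating it.
failed_example_iterator = SingleLabelClassifyFailedExampleIterator(
    dataset_repository, run_repository, evaluation_repository
)

# Lazily yields failed examples; first_n=3 stops after the first three (the default 0 means "all").
first_failures = list(failed_example_iterator.get_examples(eval_overview.id, first_n=3))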
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -5,6 +5,7 @@
### Breaking Changes

### New Features
- feature: Add SingleLabelClassifyFailedExampleIterator for easy retrieval of failed examples.
- feature: Error information is printed to the console on failed runs and evaluations.
- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
- feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
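
For context, a sketch of how the `abort_on_error` flag from the entry above might be called (the keyword name is taken from the changelog; the runner object and whether the flag is keyword-only are assumptions):

# Hypothetical call: stop at the first failing example instead of recording the error and continuing.
run_overview = classify_runner.run_dataset(dataset_id, abort_on_error=True)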
71 changes: 31 additions & 40 deletions src/examples/user_journey.ipynb
@@ -22,9 +22,7 @@
" ClassifyInput,\n",
" PromptBasedClassify,\n",
" SingleLabelClassifyAggregationLogic,\n",
" SingleLabelClassifyEvaluation,\n",
" SingleLabelClassifyEvaluationLogic,\n",
" SingleLabelClassifyOutput,\n",
")\n",
"import json\n",
"\n",
@@ -144,6 +142,28 @@
"labeled_examples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Intelligence layer offers support to run task evaluations.\n",
"\n",
"First, we have to create a dataset inside a repository.\n",
"There are different repositories (that persist datasets in different ways), but an `InMemoryDatasetRepository` will do for now.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"data/classify_examples.json\", \"r\") as file:\n",
" labeled_examples: list[dict[str, str]] = json.load(file)\n",
"\n",
"labeled_examples"
]
},
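
The next notebook step (not shown in this hunk) turns the labeled examples into a dataset inside the repository, as the markdown cell above describes. A minimal sketch of that step, assuming the `Example`/`create_dataset` API used in the tests further down; the import path of `InMemoryDatasetRepository` and the concrete texts, labels, and dataset name are assumptions:

from intelligence_layer.core import TextChunk
from intelligence_layer.evaluation import Example  # Example is re-exported here (see the classify.py imports)
from intelligence_layer.evaluation import InMemoryDatasetRepository  # assumed import path

dataset_repository = InMemoryDatasetRepository()
example = Example(
    input=ClassifyInput(  # ClassifyInput is already imported in the notebook's first cell
        chunk=TextChunk("I would like to cancel my subscription."),  # hypothetical input text
        labels=frozenset({"Finance", "Sales", "Communication"}),  # hypothetical label set
    ),
    expected_output="Finance",
)
dataset_id = dataset_repository.create_dataset(
    examples=[example],
    dataset_name="user-journey-classify",  # hypothetical dataset name
).id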
{
"cell_type": "markdown",
"metadata": {},
@@ -297,38 +317,14 @@
"metadata": {},
"outputs": [],
"source": [
"def get_failed_examples(run_id: str, eval_id: str, dataset_id: str, first_n: int):\n",
" overview = [\n",
" {\n",
" \"input\": example.input,\n",
" \"expected_output\": example.expected_output,\n",
" \"result\": sorted(\n",
" list(\n",
" next(\n",
" example_output\n",
" for example_output in run_repository.example_outputs(\n",
" run_id, SingleLabelClassifyOutput\n",
" )\n",
" if example_output.example_id == example.id\n",
" ).output.scores.items()\n",
" ),\n",
" key=lambda i: i[1],\n",
" reverse=True,\n",
" )[0],\n",
" \"eval\": evaluation_repository.example_evaluation(\n",
" evaluation_id=eval_id,\n",
" example_id=example.id,\n",
" evaluation_type=SingleLabelClassifyEvaluation,\n",
" ).result,\n",
" }\n",
" for example in dataset_repository.examples(\n",
" dataset_id=dataset_id, input_type=ClassifyInput, expected_output_type=str\n",
" )\n",
" ]\n",
" return [example for example in overview if not example[\"eval\"].correct][:first_n]\n",
"\n",
"from intelligence_layer.use_cases.classify.classify import (\n",
" SingleLabelClassifyFailedExampleIterator,\n",
")\n",
"\n",
"get_failed_examples(run_overview.id, eval_overview.id, dataset_id, 3)"
"failed_example_iterator = SingleLabelClassifyFailedExampleIterator(\n",
" dataset_repository, run_repository, evaluation_repository\n",
")\n",
"list(failed_example_iterator.get_examples(eval_overview.id))"
]
},
{
@@ -512,12 +508,7 @@
"metadata": {},
"outputs": [],
"source": [
"get_failed_examples(\n",
" run_overview_prompt_adjusted.id,\n",
" eval_overview_prompt_adjusted.id,\n",
" cleaned_dataset_id,\n",
" 3,\n",
")"
"list(failed_example_iterator.get_examples(eval_overview_prompt_adjusted.id))"
]
},
{
@@ -546,7 +537,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
"version": "3.11.4"
}
},
"nbformat": 4,
1 change: 0 additions & 1 deletion src/intelligence_layer/use_cases/__init__.py
@@ -15,7 +15,6 @@
MultiLabelClassifyEvaluationLogic as MultiLabelClassifyEvaluationLogic,
)
from .classify.classify import MultiLabelClassifyOutput as MultiLabelClassifyOutput
from .classify.classify import PerformanceScores as PerformanceScores
from .classify.classify import Probability as Probability
from .classify.classify import (
SingleLabelClassifyAggregationLogic as SingleLabelClassifyAggregationLogic,
91 changes: 70 additions & 21 deletions src/intelligence_layer/use_cases/classify/classify.py
@@ -1,3 +1,4 @@
import warnings
from collections import defaultdict
from typing import Iterable, Mapping, NewType, Sequence

@@ -6,10 +7,15 @@
from intelligence_layer.core import TextChunk
from intelligence_layer.evaluation import (
AggregationLogic,
DatasetRepository,
EvaluationRepository,
Example,
MeanAccumulator,
RepositoryNavigator,
RunRepository,
SingleOutputEvaluationLogic,
)
from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation

Probability = NewType("Probability", float)

@@ -69,20 +75,6 @@ class SingleLabelClassifyEvaluation(BaseModel):
expected_label_missing: bool


class PerformanceScores(BaseModel):
"""The relevant metrics resulting from a confusion matrix in a classification run.
Attributes:
precision: Proportion of correctly predicted classes to all predicted classes.
recall: Proportion of correctly predicted classes to all expected classes.
f1: Aggregated performance, formally the harmonic mean of precision and recall.
"""

precision: float
recall: float
f1: float


class AggregatedLabelInfo(BaseModel):
expected_count: int
predicted_count: int
@@ -124,6 +116,11 @@ def aggregate(
confusion_matrix[(evaluation.predicted, evaluation.expected)] += 1
by_label[evaluation.predicted]["predicted"] += 1
by_label[evaluation.expected]["expected"] += 1

if len(missing_labels) > 0:
warn_message = "[WARNING] There were examples with expected labels missing in the evaluation inputs. For a detailed list, see the 'statistics.missing_labels' field of the returned `AggregationOverview`."
warnings.warn(warn_message, RuntimeWarning)

return AggregatedSingleLabelClassifyEvaluation(
percentage_correct=acc.extract(),
confusion_matrix=confusion_matrix,
@@ -154,6 +151,10 @@ def do_evaluate_single_output(
sorted_classes = sorted(
output.scores.items(), key=lambda item: item[1], reverse=True
)
if example.expected_output not in example.input.labels:
warn_message = f"[WARNING] Example with ID '{example.id}' has expected label '{example.expected_output}', which is not part of the example's input labels."
warnings.warn(warn_message, RuntimeWarning)

predicted = sorted_classes[0][0]
if predicted == example.expected_output:
correct = True
@@ -167,6 +168,40 @@ def do_evaluate_single_output(
)


class SingleLabelClassifyFailedExampleIterator:
def __init__(
self,
dataset_repository: DatasetRepository,
run_repository: RunRepository,
evaluation_repository: EvaluationRepository,
):
self.repository_navigator = RepositoryNavigator(
dataset_repository, run_repository, evaluation_repository
)

# TODO: Add test
def get_examples(
self, evaluation_overview_id: str, first_n: int = 0
) -> Iterable[Example[ClassifyInput, str]]:
evaluation_lineages = self.repository_navigator.evaluation_lineages(
evaluation_id=evaluation_overview_id,
input_type=ClassifyInput,
expected_output_type=str,
output_type=SingleLabelClassifyOutput,
evaluation_type=SingleLabelClassifyEvaluation,
)
count_yielded = 0
for lineage in evaluation_lineages:
if first_n != 0 and count_yielded >= first_n:
break
if (
isinstance(lineage.evaluation.result, FailedExampleEvaluation)
or not lineage.evaluation.result.correct
):
count_yielded += 1
yield lineage.example


class MultiLabelClassifyEvaluation(BaseModel):
"""The evaluation of a single multi-label classification example.
@@ -183,6 +218,20 @@
fn: frozenset[str]


class MultiLabelClassifyMetrics(BaseModel):
"""The relevant metrics resulting from a confusion matrix in a classification run.
Attributes:
precision: Proportion of correctly predicted classes to all predicted classes.
recall: Proportion of correctly predicted classes to all expected classes.
f1: Aggregated performance, formally the harmonic mean of precision and recall.
"""

precision: float
recall: float
f1: float


class AggregatedMultiLabelClassifyEvaluation(BaseModel):
"""The aggregated evaluation of a multi-label classify dataset.
@@ -193,9 +242,9 @@
"""

class_metrics: Mapping[str, PerformanceScores]
micro_avg: PerformanceScores
macro_avg: PerformanceScores
class_metrics: Mapping[str, MultiLabelClassifyMetrics]
micro_avg: MultiLabelClassifyMetrics
macro_avg: MultiLabelClassifyMetrics


class MultiLabelClassifyAggregationLogic(
@@ -243,7 +292,7 @@ def aggregate(
else 0
)

class_metrics[label] = PerformanceScores(
class_metrics[label] = MultiLabelClassifyMetrics(
precision=precision, recall=recall, f1=f1
)

@@ -255,19 +304,19 @@ def aggregate(
sum_f1 += f1

try:
micro_avg = PerformanceScores(
micro_avg = MultiLabelClassifyMetrics(
precision=sum_tp / (sum_tp + sum_fp),
recall=sum_tp / (sum_tp + sum_fn),
f1=(2 * (sum_tp / (sum_tp + sum_fp)) * (sum_tp / (sum_tp + sum_fn)))
/ ((sum_tp / (sum_tp + sum_fp)) + (sum_tp / (sum_tp + sum_fn))),
)
except ZeroDivisionError:
micro_avg = PerformanceScores(
micro_avg = MultiLabelClassifyMetrics(
precision=0,
recall=0,
f1=0,
)
macro_avg = PerformanceScores(
macro_avg = MultiLabelClassifyMetrics(
precision=sum_precision / len(class_metrics),
recall=sum_recall / len(class_metrics),
f1=sum_f1 / len(class_metrics),
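
As a side note on the renamed metrics class: a small worked example of the micro- vs. macro-averaging performed in the aggregation above, with purely illustrative counts:

# Illustrative per-class confusion counts (not taken from any real run).
tp = {"positive": 8, "negative": 3}
fp = {"positive": 2, "negative": 1}
fn = {"positive": 1, "negative": 4}

# Per-class precision/recall/f1, as stored in class_metrics.
per_class = {}
for label in tp:
    precision = tp[label] / (tp[label] + fp[label])
    recall = tp[label] / (tp[label] + fn[label])
    per_class[label] = (precision, recall, 2 * precision * recall / (precision + recall))

# Micro average: pool the counts first, then compute once -> 11/14 precision, 11/16 recall.
micro_precision = sum(tp.values()) / (sum(tp.values()) + sum(fp.values()))
micro_recall = sum(tp.values()) / (sum(tp.values()) + sum(fn.values()))

# Macro average: unweighted mean of the per-class values -> (0.8 + 0.75) / 2 = 0.775 precision.
macro_precision = sum(p for p, _, _ in per_class.values()) / len(per_class)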
30 changes: 30 additions & 0 deletions tests/use_cases/classify/test_prompt_based_classify.py
@@ -1,5 +1,6 @@
from typing import Sequence

import pytest
from pytest import fixture

from intelligence_layer.core import InMemoryTracer, NoOpTracer, TextChunk
@@ -216,6 +217,35 @@ def test_can_evaluate_classify(
assert evaluation.correct is True


def test_classify_warns_on_missing_label(
in_memory_dataset_repository: InMemoryDatasetRepository,
classify_runner: Runner[ClassifyInput, SingleLabelClassifyOutput],
in_memory_evaluation_repository: InMemoryEvaluationRepository,
classify_evaluator: Evaluator[
ClassifyInput,
SingleLabelClassifyOutput,
Sequence[str],
SingleLabelClassifyEvaluation,
],
prompt_based_classify: PromptBasedClassify,
) -> None:
example = Example(
input=ClassifyInput(
chunk=TextChunk("This is good"),
labels=frozenset({"positive", "negative"}),
),
expected_output="SomethingElse",
)

dataset_id = in_memory_dataset_repository.create_dataset(
examples=[example], dataset_name="test-dataset"
).id

run_overview = classify_runner.run_dataset(dataset_id)

pytest.warns(RuntimeWarning, classify_evaluator.evaluate_runs, run_overview.id)


def test_can_aggregate_evaluations(
classify_evaluator: Evaluator[
ClassifyInput,
