From baca0efab49ef7ddac3bac111edd7c79ec643ee2 Mon Sep 17 00:00:00 2001 From: Merlin Kallenborn Date: Wed, 5 Jun 2024 16:45:18 +0200 Subject: [PATCH 1/7] feat: Add labels and metadata fields to dataset and overview classes TASK: IL-547 --- .gitignore | 3 ++ .../evaluation/aggregation/aggregator.py | 10 +++- .../evaluation/aggregation/domain.py | 11 ++++ .../evaluation/dataset/dataset_repository.py | 5 ++ .../evaluation/dataset/domain.py | 16 +++++- .../dataset/file_dataset_repository.py | 5 +- .../dataset/in_memory_dataset_repository.py | 5 +- .../single_huggingface_dataset_repository.py | 3 ++ .../evaluation/evaluation/domain.py | 14 +++++ .../evaluation/evaluator/argilla_evaluator.py | 7 +++ .../evaluation/evaluator/base_evaluator.py | 1 + .../evaluation/evaluator/evaluator.py | 7 +++ .../evaluator/incremental_evaluator.py | 13 ++++- .../evaluation/run/domain.py | 10 ++++ .../evaluation/run/runner.py | 8 +++ tests/conftest.py | 4 ++ .../evaluation/dataset/test_dataset_domain.py | 20 +++++++ .../dataset/test_dataset_repository.py | 53 +++++++++++++++++++ .../test_async_evaluation_repository.py | 4 ++ .../evaluation/test_elo_evaluation_logic.py | 2 + .../evaluation/test_evaluation_repository.py | 2 + tests/evaluation/evaluation/test_evaluator.py | 51 ++++++++++++++++++ ...t_instruct_comparison_argilla_evaluator.py | 32 +++++++++++ .../test_repository_navigator.py | 2 + tests/evaluation/run/test_run_repository.py | 2 + tests/evaluation/run/test_runner.py | 41 ++++++++++++++ 26 files changed, 325 insertions(+), 6 deletions(-) create mode 100644 tests/evaluation/dataset/test_dataset_domain.py diff --git a/.gitignore b/.gitignore index 0caf99001..7476d43b5 100644 --- a/.gitignore +++ b/.gitignore @@ -248,3 +248,6 @@ fabric.properties # End of https://www.toptal.com/developers/gitignore/api/intellij+all .python-version + +src/documentation/human-eval-data/datasets* +src/documentation/human-eval-data/runs/* diff --git a/src/intelligence_layer/evaluation/aggregation/aggregator.py b/src/intelligence_layer/evaluation/aggregation/aggregator.py index 29b9053f7..028204fbd 100644 --- a/src/intelligence_layer/evaluation/aggregation/aggregator.py +++ b/src/intelligence_layer/evaluation/aggregation/aggregator.py @@ -11,6 +11,7 @@ ) from uuid import uuid4 +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import utc_now from intelligence_layer.evaluation.aggregation.aggregation_repository import ( AggregationRepository, @@ -181,7 +182,10 @@ def evaluation_type(self) -> type[Evaluation]: @final def aggregate_evaluation( - self, *eval_ids: str + self, + *eval_ids: str, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> AggregationOverview[AggregatedEvaluation]: """Aggregates all evaluations into an overview that includes high-level statistics. @@ -190,6 +194,8 @@ def aggregate_evaluation( Args: eval_ids: An overview of the evaluation to be aggregated. Does not include actual evaluations as these will be retrieved from the repository. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the aggregation overview. Default to an empty dict. Returns: An overview of the aggregated evaluation. 
@@ -237,6 +243,8 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview: crashed_during_evaluation_count=successful_evaluations.excluded_count(), description=self.description, statistics=statistics, + labels=labels, + metadata=metadata, ) self._aggregation_repository.store_aggregation_overview(aggregation_overview) return aggregation_overview diff --git a/src/intelligence_layer/evaluation/aggregation/domain.py b/src/intelligence_layer/evaluation/aggregation/domain.py index 46a996dd2..01f810e07 100644 --- a/src/intelligence_layer/evaluation/aggregation/domain.py +++ b/src/intelligence_layer/evaluation/aggregation/domain.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, SerializeAsAny +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.evaluation.evaluation.domain import ( EvaluationFailed, EvaluationOverview, @@ -31,6 +32,9 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True) run_ids: IDs of all :class:`RunOverview`s from all linked :class:`EvaluationOverview`s. description: A short description. statistics: Aggregated statistics of the run. Whatever is returned by :meth:`Evaluator.aggregate` + labels: Labels for filtering aggregation. Defaults to empty list. + metadata: Additional information about the aggregation. Defaults to empty dict. + """ evaluation_overviews: frozenset[EvaluationOverview] @@ -41,6 +45,8 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True) crashed_during_evaluation_count: int description: str statistics: SerializeAsAny[AggregatedEvaluation] + labels: set[str] = set() + metadata: dict[str, JsonSerializable] = dict() @property def run_ids(self) -> Sequence[str]: @@ -74,6 +80,8 @@ def __str__(self) -> str: f"Successful example count = {self.successful_evaluation_count}\n" f"Count of examples crashed during evaluation = {self.failed_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" ) res += f"IDs of aggregated Evaluation Overviews = {[evaluation_overview.id for evaluation_overview in self.evaluation_overviews]}\n" @@ -84,3 +92,6 @@ def __str__(self) -> str: res += "}\n" return res + + def __hash__(self) -> int: + return hash(self.id) diff --git a/src/intelligence_layer/evaluation/dataset/dataset_repository.py b/src/intelligence_layer/evaluation/dataset/dataset_repository.py index bbd79db22..7421f6e72 100644 --- a/src/intelligence_layer/evaluation/dataset/dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/dataset_repository.py @@ -2,6 +2,7 @@ from collections.abc import Iterable from typing import Optional +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import Input from intelligence_layer.evaluation.dataset.domain import ( Dataset, @@ -22,6 +23,8 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> Dataset: """Creates a dataset from given :class:`Example`s and returns the ID of that dataset. @@ -29,6 +32,8 @@ def create_dataset( examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset. dataset_name: A name for the dataset. id: The dataset ID. If `None`, an ID will be generated. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the dataset. 
Default to an empty dict. Returns: The created :class:`Dataset`. diff --git a/src/intelligence_layer/evaluation/dataset/domain.py b/src/intelligence_layer/evaluation/dataset/domain.py index c32c36d33..d2353e4e5 100644 --- a/src/intelligence_layer/evaluation/dataset/domain.py +++ b/src/intelligence_layer/evaluation/dataset/domain.py @@ -4,7 +4,10 @@ from pydantic import BaseModel, Field from rich.tree import Tree -from intelligence_layer.connectors.base.json_serializable import SerializableDict +from intelligence_layer.connectors.base.json_serializable import ( + JsonSerializable, + SerializableDict, +) from intelligence_layer.core.task import Input from intelligence_layer.core.tracer.tracer import PydanticSerializable @@ -60,13 +63,22 @@ class Dataset(BaseModel): Attributes: id: Dataset ID. name: A short name of the dataset. + label: Labels for filtering datasets. Defaults to empty list. + metadata: Additional information about the dataset. Defaults to empty dict. """ id: str = Field(default_factory=lambda: str(uuid4())) name: str + labels: set[str] = set() + metadata: dict[str, JsonSerializable] = dict() def __repr__(self) -> str: return self.__str__() def __str__(self) -> str: - return f"Dataset ID = {self.id}\nName = {self.name}\n" + return ( + f"Dataset ID = {self.id}\n" + f"Name = {self.name}\n" + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}" + ) diff --git a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py index 67e734bb8..dae839294 100644 --- a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py @@ -5,6 +5,7 @@ from fsspec.implementations.local import LocalFileSystem # type: ignore +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import Input, JsonSerializer, PydanticSerializable from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -31,8 +32,10 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> Dataset: - dataset = Dataset(name=dataset_name) + dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata) if id is not None: dataset.id = id diff --git a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py index 94206c2be..ab63eb99d 100644 --- a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py @@ -1,6 +1,7 @@ from collections.abc import Iterable, Sequence from typing import Optional, cast +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import Input, PydanticSerializable from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -24,8 +25,10 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> Dataset: - dataset = Dataset(name=dataset_name) + dataset = Dataset(name=dataset_name, 
labels=labels, metadata=metadata) if id is not None: dataset.id = id if dataset.id in self._datasets_and_examples: diff --git a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py index 4bc79c9d8..4f8666d7c 100644 --- a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py @@ -5,6 +5,7 @@ from datasets import DatasetDict, IterableDataset, IterableDatasetDict from pydantic import BaseModel +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core.task import Input from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -33,6 +34,8 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> Dataset: raise NotImplementedError diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py index a7688cac9..944da3628 100644 --- a/src/intelligence_layer/evaluation/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/evaluation/domain.py @@ -5,6 +5,7 @@ from pydantic import BaseModel, SerializeAsAny from rich.tree import Tree +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.evaluation.run.domain import RunOverview Evaluation = TypeVar("Evaluation", bound=BaseModel, covariant=True) @@ -81,6 +82,8 @@ class PartialEvaluationOverview(BaseModel, frozen=True): start_date: datetime submitted_evaluation_count: int description: str + labels: set[str] + metadata: dict[str, JsonSerializable] def __repr__(self) -> str: return self.__str__() @@ -100,6 +103,8 @@ def __str__(self) -> str: f"Start time = {self.start_date}\n" f"Submitted Evaluations = {self.submitted_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" f"{run_overview_str}" ) @@ -116,6 +121,8 @@ class EvaluationOverview(BaseModel, frozen=True): failed_evaluation_count: Number of examples that produced an error during evaluation. Note: failed runs are skipped in the evaluation and therefore not counted as failures description: human-readable for the evaluator that created the evaluation. + labels: Labels for filtering evaluation. Defaults to empty list. + metadata: Additional information about the evaluation. Defaults to empty dict. 
""" run_overviews: frozenset[RunOverview] @@ -125,6 +132,8 @@ class EvaluationOverview(BaseModel, frozen=True): successful_evaluation_count: int failed_evaluation_count: int description: str + labels: set[str] + metadata: dict[str, JsonSerializable] def __repr__(self) -> str: return self.__str__() @@ -146,9 +155,14 @@ def __str__(self) -> str: f"Successful examples = {self.successful_evaluation_count}\n" f"Failed examples = {self.failed_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" f"{run_overview_str}" ) + def __hash__(self) -> int: + return hash(self.id) + class EvaluationFailed(Exception): def __init__(self, evaluation_id: str, failed_count: int) -> None: diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py index 87e870c33..2c455b79c 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py @@ -16,6 +16,7 @@ RatingQuestion, RecordData, ) +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -138,6 +139,8 @@ def submit( dataset_name: Optional[str] = None, abort_on_error: bool = False, skip_example_on_any_failure: bool = True, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> PartialEvaluationOverview: argilla_dataset_id = self._client.create_dataset( self._workspace_id, @@ -179,6 +182,8 @@ def submit( start_date=datetime.now(), submitted_evaluation_count=submit_count, description=self.description, + labels=labels, + metadata=metadata, ) self._evaluation_repository.store_partial_evaluation_overview(partial_overview) @@ -227,6 +232,8 @@ def retrieve( successful_evaluation_count=len(evaluations), failed_evaluation_count=num_not_yet_evaluated_evals + num_failed_evaluations, + labels=partial_overview.labels, + metadata=partial_overview.metadata, ) self._evaluation_repository.store_evaluation_overview(overview) return overview diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py index 44bf5eae6..d464da457 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py @@ -202,6 +202,7 @@ def _load_run_overviews(self, *run_ids: str) -> set[RunOverview]: run_overview = self._run_repository.run_overview(run_id) if not run_overview: raise ValueError(f"No RunOverview found for run-id: {run_id}") + run_overviews.add(run_overview) return run_overviews diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py index cb399cb5b..f5f6f5773 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py @@ -4,6 +4,7 @@ from tqdm import tqdm +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import Input, Output, utc_now from 
intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -99,6 +100,8 @@ def evaluate_runs( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, description: Optional[str] = None, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> EvaluationOverview: """Evaluates all generated outputs in the run. @@ -118,6 +121,8 @@ def evaluate_runs( abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False. skip_example_on_any_failure: Flag to skip evaluation on any example for which at least one run fails. Defaults to True. description: Optional description of the evaluation. Defaults to None. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the evaluation overview. Default to an empty dict. Returns: EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be @@ -162,6 +167,8 @@ def evaluate_runs( successful_evaluation_count=successful_evaluation_count, failed_evaluation_count=failed_evaluation_count, description=full_description, + labels=labels, + metadata=metadata, ) self._evaluation_repository.store_evaluation_overview(overview) diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py index c4cb678df..578c83e5f 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py @@ -6,6 +6,7 @@ from pydantic import BaseModel +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import Input, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -123,6 +124,8 @@ def evaluate_additional_runs( previous_evaluation_ids: Optional[list[str]] = None, num_examples: Optional[int] = None, abort_on_error: bool = False, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> EvaluationOverview: """Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_id`. @@ -141,6 +144,8 @@ def evaluate_additional_runs( num_examples: The number of examples which should be evaluated from the given runs. Always the first n runs stored in the evaluation repository. Defaults to None. abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the evaluation overview. Default to an empty dict. Returns: EvaluationOverview: An overview of the evaluation. 
Individual :class:`Evaluation`s will not be @@ -160,7 +165,11 @@ def evaluate_additional_runs( self._evaluation_logic.set_previous_run_output_ids(previous_run_ids) return super().evaluate_runs( - *run_ids, num_examples=num_examples, abort_on_error=abort_on_error + *run_ids, + num_examples=num_examples, + abort_on_error=abort_on_error, + labels=labels, + metadata=metadata, ) def evaluate_runs( @@ -170,6 +179,8 @@ def evaluate_runs( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, description: Optional[str] = None, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> EvaluationOverview: self._evaluation_logic.set_previous_run_output_ids([]) return super().evaluate_runs( diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py index bcf80653d..11526391e 100644 --- a/src/intelligence_layer/evaluation/run/domain.py +++ b/src/intelligence_layer/evaluation/run/domain.py @@ -5,6 +5,7 @@ from pydantic import BaseModel from rich.tree import Tree +from intelligence_layer.connectors import JsonSerializable from intelligence_layer.core.task import Output @@ -100,6 +101,8 @@ class RunOverview(BaseModel, frozen=True): failed_example_count: The number of examples where an exception was raised when running the task. successful_example_count: The number of examples that where successfully run. description: Human-readable of the runner that run the task. + labels: Labels for filtering runs. Defaults to empty list. + metadata: Additional information about the run. Defaults to empty dict. """ dataset_id: str @@ -109,6 +112,8 @@ class RunOverview(BaseModel, frozen=True): failed_example_count: int successful_example_count: int description: str + labels: set[str] + metadata: dict[str, JsonSerializable] def __repr__(self) -> str: return self.__str__() @@ -122,4 +127,9 @@ def __str__(self) -> str: f"Failed example count = {self.failed_example_count}\n" f"Successful example count = {self.successful_example_count}\n" f'Description = "{self.description}"\n' + f'Labels = "{self.labels}"\n' + f'Metadata = "{self.metadata}"\n' ) + + def __hash__(self) -> int: + return hash(self.id) diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py index 56cc689c8..35e950096 100644 --- a/src/intelligence_layer/evaluation/run/runner.py +++ b/src/intelligence_layer/evaluation/run/runner.py @@ -8,6 +8,7 @@ from pydantic import JsonValue from tqdm import tqdm +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import ( CompositeTracer, Input, @@ -81,6 +82,8 @@ def run_dataset( max_workers: int = 10, description: Optional[str] = None, trace_examples_individually: bool = True, + labels: set[str] = set(), + metadata: dict[str, JsonSerializable] = dict(), ) -> RunOverview: """Generates all outputs for the provided dataset. @@ -97,6 +100,8 @@ def run_dataset( max_workers: Number of examples that can be evaluated concurrently. Defaults to 10. description: An optional description of the run. Defaults to None. trace_examples_individually: Flag to create individual tracers for each example. Defaults to True. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the run overview. Default to an empty dict. Returns: An overview of the run. 
Outputs will not be returned but instead stored in the @@ -157,6 +162,7 @@ def run( full_description = ( self.description + " : " + description if description else self.description ) + run_overview = RunOverview( dataset_id=dataset_id, id=run_id, @@ -165,6 +171,8 @@ def run( failed_example_count=failed_count, successful_example_count=successful_count, description=full_description, + labels=labels, + metadata=metadata, ) self._run_repository.store_run_overview(run_overview) return run_overview diff --git a/tests/conftest.py b/tests/conftest.py index b3c8669e1..40c95d000 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -146,6 +146,8 @@ def run_overview() -> RunOverview: failed_example_count=0, successful_example_count=3, description="test run overview 1", + labels=set(), + metadata=dict(), ) @@ -166,4 +168,6 @@ def evaluation_overview( failed_evaluation_count=1, run_overviews=frozenset([run_overview]), description="test evaluation overview 1", + labels=set(), + metadata=dict(), ) diff --git a/tests/evaluation/dataset/test_dataset_domain.py b/tests/evaluation/dataset/test_dataset_domain.py new file mode 100644 index 000000000..f8968c403 --- /dev/null +++ b/tests/evaluation/dataset/test_dataset_domain.py @@ -0,0 +1,20 @@ +from intelligence_layer.evaluation import Dataset + + +def test_default_values_are_set() -> None: + dataset = Dataset(name="Test") + + assert dataset.id is not None + assert len(dataset.metadata) == 0 + assert len(dataset.labels) == 0 + + +def test_default_values_are_not_changed() -> None: + modified_dataset = Dataset(name="Modified Dataset") + modified_dataset.labels.add("test_label") + modified_dataset.metadata.update({"key": "value"}) + + default_dataset = Dataset(name="Default Dataset") + + assert modified_dataset.labels != default_dataset.labels + assert modified_dataset.metadata != default_dataset.metadata diff --git a/tests/evaluation/dataset/test_dataset_repository.py b/tests/evaluation/dataset/test_dataset_repository.py index 963782f78..db00dce03 100644 --- a/tests/evaluation/dataset/test_dataset_repository.py +++ b/tests/evaluation/dataset/test_dataset_repository.py @@ -2,11 +2,13 @@ from pathlib import Path from typing import Any from unittest.mock import patch +from uuid import uuid4 import pytest from fsspec.implementations.memory import MemoryFileSystem # type: ignore from pytest import FixtureRequest, fixture, mark, raises +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.evaluation import ( DatasetRepository, Example, @@ -71,6 +73,57 @@ def test_dataset_repository_with_custom_id( assert dataset.id == "my-custom-dataset-id" +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_dataset_repository_create_dataset_sets_default_values( + repository_fixture: str, + request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], +) -> None: + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) + + assert dataset.id is not None + assert dataset.name == "test-dataset" + assert dataset.labels == set() + assert dataset.metadata == dict() + + +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_dataset_repository_create_dataset_explicit_values_overwrite_defaults( + repository_fixture: str, + request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, 
DummyStringOutput], +) -> None: + expected_id = str(uuid4()) + expected_name = "test_name" + expected_labels = set(["test_label"]) + expected_metadata: dict[str, JsonSerializable] = dict({"test_key": "test_value"}) + + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], + dataset_name=expected_name, + id=expected_id, + labels=expected_labels, + metadata=expected_metadata, + ) + + assert dataset.id == expected_id + assert dataset.name == expected_name + assert dataset.labels == expected_labels + assert dataset.metadata == expected_metadata + + @mark.parametrize( "repository_fixture", test_repository_fixtures, diff --git a/tests/evaluation/evaluation/test_async_evaluation_repository.py b/tests/evaluation/evaluation/test_async_evaluation_repository.py index 16bfc8697..fc94d3fd0 100644 --- a/tests/evaluation/evaluation/test_async_evaluation_repository.py +++ b/tests/evaluation/evaluation/test_async_evaluation_repository.py @@ -41,6 +41,8 @@ def partial_evaluation_overviews( run_overviews=frozenset([run_overview]), submitted_evaluation_count=10, description="test evaluation overview", + labels=set(), + metadata=dict(), ) ) return evaluation_overviews @@ -56,6 +58,8 @@ def partial_evaluation_overview( run_overviews=frozenset([run_overview]), submitted_evaluation_count=10, description="test evaluation overview", + labels=set(), + metadata=dict(), ) diff --git a/tests/evaluation/evaluation/test_elo_evaluation_logic.py b/tests/evaluation/evaluation/test_elo_evaluation_logic.py index e46ee1e19..0b4e2faca 100644 --- a/tests/evaluation/evaluation/test_elo_evaluation_logic.py +++ b/tests/evaluation/evaluation/test_elo_evaluation_logic.py @@ -150,6 +150,8 @@ def qa_setup( failed_example_count=0, successful_example_count=len(qa_outputs), description="runner", + labels=set(), + metadata=dict(), ) ) return run_ids diff --git a/tests/evaluation/evaluation/test_evaluation_repository.py b/tests/evaluation/evaluation/test_evaluation_repository.py index 7db3777c4..3948b5a9f 100644 --- a/tests/evaluation/evaluation/test_evaluation_repository.py +++ b/tests/evaluation/evaluation/test_evaluation_repository.py @@ -70,6 +70,8 @@ def evaluation_overviews(run_overview: RunOverview) -> Iterable[EvaluationOvervi failed_evaluation_count=1, run_overviews=frozenset([run_overview]), description="test evaluation overview 1", + labels=set(), + metadata={}, ) ) return evaluation_overviews diff --git a/tests/evaluation/evaluation/test_evaluator.py b/tests/evaluation/evaluation/test_evaluator.py index 92bbf783f..0ac17d14c 100644 --- a/tests/evaluation/evaluation/test_evaluator.py +++ b/tests/evaluation/evaluation/test_evaluator.py @@ -5,6 +5,7 @@ from pydantic import BaseModel from pytest import fixture +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import Input, Output, Task, Tracer from intelligence_layer.core.tracer.in_memory_tracer import ( InMemoryTaskSpan, @@ -709,3 +710,53 @@ def test_eval_raises_error_if_examples_and_example_outputs_dont_match( num_examples=None, ) ) +def test_evaluator_evaluate_runs_sets_default_values( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str +) -> None: + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + assert evaluation_overview.labels == set() + assert evaluation_overview.metadata == dict() + + +def test_evaluator_evaluate_runs_specific_values_overwrite_defaults( + 
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str +) -> None: + expected_labels = set(["test_label"]) + expected_metadata: dict[str, JsonSerializable] = dict({"test_key": "test-value"}) + evaluation_overview = dummy_evaluator.evaluate_runs( + run_id, labels=expected_labels, metadata=expected_metadata + ) + assert evaluation_overview.labels == expected_labels + assert evaluation_overview.metadata == expected_metadata + + +def test_aggregate_evaluation_set_default_labels_metadata_values( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList + ], + run_id: str, +) -> None: + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + + assert aggregation_overview.labels == set() + assert aggregation_overview.metadata == dict() + + +def test_aggregate_evaluation_specific_values_overwrite_defaults( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList + ], + run_id: str, +) -> None: + expected_labels = set(["test_label"]) + expected_metadata: dict[str, JsonSerializable] = dict({"test_key": "test-value"}) + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + aggregation_overview = dummy_aggregator.aggregate_evaluation( + evaluation_overview.id, labels=expected_labels, metadata=expected_metadata + ) + + assert aggregation_overview.labels == expected_labels + assert aggregation_overview.metadata == expected_metadata diff --git a/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py index 014e40b41..fae966d89 100644 --- a/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -14,6 +14,7 @@ Question, RecordData, ) +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import CompleteOutput, InstructInput, utc_now from intelligence_layer.evaluation import ( Aggregator, @@ -162,6 +163,10 @@ def create_dummy_runs( failed_example_count=0, successful_example_count=1, description="runner", + labels=set(["test-label"]), + metadata=dict( + {"test_key": "test_value"}, + ), ) ) @@ -277,3 +282,30 @@ def test_elo_calculating_works_as_expected() -> None: elo.calculate(comeback_matches) assert elo.ratings[player2] > elo.ratings[player1] + + +def test_retrieve_argilla_evaluation_overview_has_submitted_partial_evaluation_overview_labels_metadata( + evaluator: ArgillaEvaluator[ + InstructInput, CompleteOutput, None, ComparisonEvaluation + ], + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + any_instruct_output: CompleteOutput, +) -> None: + run_count = 10 + run_ids = [f"{i}" for i in range(run_count)] + dataset_id = create_dummy_dataset(in_memory_dataset_repository) + create_dummy_runs( + in_memory_run_repository, any_instruct_output, run_ids, dataset_id + ) + + expected_labels = set(["test-label"]) + expected_metadata: dict[str, JsonSerializable] = dict({"test_key": "test_value"}) + + partial_overview = evaluator.submit( + *run_ids, labels=expected_labels, metadata=expected_metadata + ) + evaluation_overview = evaluator.retrieve(partial_overview.id) + + assert partial_overview.labels == 
evaluation_overview.labels + assert partial_overview.metadata == evaluation_overview.metadata diff --git a/tests/evaluation/infrastructure/test_repository_navigator.py b/tests/evaluation/infrastructure/test_repository_navigator.py index 8272ee5b0..3f4a5d9df 100644 --- a/tests/evaluation/infrastructure/test_repository_navigator.py +++ b/tests/evaluation/infrastructure/test_repository_navigator.py @@ -476,6 +476,8 @@ def test_aggregation_overviews_to_pandas_works_with_eval_overviews() -> None: successful_evaluation_count=1, failed_evaluation_count=1, description="", + labels=set(), + metadata=dict(), ) overview = AggregationOverview( evaluation_overviews=frozenset([eval_overview]), diff --git a/tests/evaluation/run/test_run_repository.py b/tests/evaluation/run/test_run_repository.py index 50fe9da98..3bdba5ea8 100644 --- a/tests/evaluation/run/test_run_repository.py +++ b/tests/evaluation/run/test_run_repository.py @@ -30,6 +30,8 @@ def run_overviews() -> Sequence[RunOverview]: failed_example_count=0, successful_example_count=1, description="test run overview", + labels=set(), + metadata=dict(), ) run_overviews.append(run_overview) return run_overviews diff --git a/tests/evaluation/run/test_runner.py b/tests/evaluation/run/test_runner.py index 0ebd0ba68..ee111947f 100644 --- a/tests/evaluation/run/test_runner.py +++ b/tests/evaluation/run/test_runner.py @@ -2,6 +2,7 @@ import pytest +from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer from intelligence_layer.evaluation import ( Example, @@ -122,3 +123,43 @@ def test_runner_runs_n_examples( entries = tracer.entries assert len(entries) == 1 assert all([isinstance(e, InMemoryTaskSpan) for e in entries]) + + +def test_runner_run_overview_has_default_metadata_and_labels( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + sequence_examples: Iterable[Example[str, None]], +) -> None: + examples = list(sequence_examples) + task = DummyTask() + runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo") + + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="" + ).id + + overview = runner.run_dataset(dataset_id) + + assert overview.metadata == dict() + assert overview.labels == set() + + +def test_runner_run_overview_has_specified_metadata_and_labels( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + sequence_examples: Iterable[Example[str, None]], +) -> None: + run_labels = set(["test-label"]) + run_metadata: dict[str, JsonSerializable] = dict({"test_key": "test-value"}) + + examples = list(sequence_examples) + task = DummyTask() + runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo") + + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="" + ).id + overview = runner.run_dataset(dataset_id, labels=run_labels, metadata=run_metadata) + + assert overview.metadata == run_metadata + assert overview.labels == run_labels From 7052efc40524dcd91b5d1ce977fe6aef51359ff0 Mon Sep 17 00:00:00 2001 From: Max Hammer Date: Wed, 12 Jun 2024 12:31:40 +0200 Subject: [PATCH 2/7] feat: use metadata in parameter_optimization notebook and additional ones TASK: IL-547 --- .../how_to_aggregate_evaluations.ipynb | 10 ++++-- .../how_tos/how_to_create_a_dataset.ipynb | 8 +++-- .../how_tos/how_to_evaluate_runs.ipynb | 8 +++-- 
.../parameter_optimization.ipynb | 31 +++++++++++-------- 4 files changed, 37 insertions(+), 20 deletions(-) diff --git a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb index 2dadcce47..d64431a19 100644 --- a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb +++ b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb @@ -57,10 +57,14 @@ " \"MyAggregationDescription\",\n", " aggregation_logic,\n", ")\n", - "aggregation_overview = aggregator.aggregate_evaluation(*evaluation_ids)\n", + "aggregation_overview = aggregator.aggregate_evaluation(\n", + " *evaluation_ids, labels=set([\"label_a\"]), metadata=dict({\"key\": \"value\"})\n", + ")\n", "\n", "# Step 3\n", - "print(aggregation_overview.id)" + "print(aggregation_overview.id)\n", + "print(aggregation_overview.labels)\n", + "print(aggregation_overview.metadata)" ] } ], @@ -80,7 +84,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/how_to_create_a_dataset.ipynb b/src/documentation/how_tos/how_to_create_a_dataset.ipynb index 83ccc91f0..f760274e4 100644 --- a/src/documentation/how_tos/how_to_create_a_dataset.ipynb +++ b/src/documentation/how_tos/how_to_create_a_dataset.ipynb @@ -67,10 +67,14 @@ "dataset = dataset_repository.create_dataset(\n", " examples=examples,\n", " dataset_name=\"StoryDataset\",\n", + " labels=set([\"label1\", \"label2\"]),\n", + " metadata=dict({\"key_a\": [\"a\", \"b\"], \"key_b\": \"value\"}),\n", ")\n", "\n", "# Step 4\n", - "print(dataset.id)" + "print(dataset.id)\n", + "print(dataset.labels)\n", + "print(dataset.metadata)" ] } ], @@ -90,7 +94,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/how_to_evaluate_runs.ipynb b/src/documentation/how_tos/how_to_evaluate_runs.ipynb index 91d05c023..ff39f511c 100644 --- a/src/documentation/how_tos/how_to_evaluate_runs.ipynb +++ b/src/documentation/how_tos/how_to_evaluate_runs.ipynb @@ -57,10 +57,14 @@ " evaluation_logic,\n", ")\n", "\n", - "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n", + "evaluation_overview = evaluator.evaluate_runs(\n", + " *run_ids, labels=set({\"label\"}), metadata=dict({\"key\": \"value\"})\n", + ")\n", "\n", "# Step 4\n", - "print(evaluation_overview.id)" + "print(evaluation_overview.id)\n", + "print(evaluation_overview.metadata)\n", + "print(evaluation_overview.labels)" ] } ], diff --git a/src/documentation/parameter_optimization.ipynb b/src/documentation/parameter_optimization.ipynb index 5eb833ae5..cf4f69457 100644 --- a/src/documentation/parameter_optimization.ipynb +++ b/src/documentation/parameter_optimization.ipynb @@ -12,6 +12,7 @@ "import string\n", "from collections.abc import Iterable\n", "\n", + "import pandas as pd\n", "from pydantic import BaseModel\n", "\n", "from intelligence_layer.core import Input, Task, TaskSpan\n", @@ -191,20 +192,25 @@ "for model, prompt in itertools.product(model_list, prompt_list):\n", " dummy_task = DummyTask(model=model, prompt=prompt)\n", "\n", - " # The description and the Experiment will later be used to identify the run parameters. 
Take special note of the delimiter '|'.\n", + " # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n", + " metadata = dict({\"model\": model, \"prompt\": prompt})\n", " description = f\"|{model}|{prompt}|\"\n", " runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n", - " run_overview = runner.run_dataset(dataset.id, description=description)\n", + " run_overview = runner.run_dataset(\n", + " dataset.id, metadata=metadata, description=description\n", + " )\n", "\n", - " eval_overview = evaluator.evaluate_runs(run_overview.id, description=description)\n", + " eval_overview = evaluator.evaluate_runs(\n", + " run_overview.id, metadata=metadata, description=description\n", + " )\n", "\n", " aggregator = Aggregator(\n", " evaluation_repository,\n", " aggregation_repository,\n", - " EXPERIMENT_NAME + \":\" + description,\n", + " EXPERIMENT_NAME,\n", " DummyAggregationLogic(),\n", " )\n", - " aggregator.aggregate_evaluation(eval_overview.id)" + " aggregator.aggregate_evaluation(eval_overview.id, metadata=metadata)" ] }, { @@ -229,7 +235,7 @@ " for overview in aggregation_repository.aggregation_overviews(\n", " aggregation_type=DummyAggregatedEvaluation\n", " )\n", - " if overview.description.startswith(EXPERIMENT_NAME)\n", + " if overview.description == EXPERIMENT_NAME\n", "]\n", "\n", "# Convert the desired aggregation into a pandas dataframe\n", @@ -252,12 +258,11 @@ "outputs": [], "source": [ "aggregation_fields = list(DummyAggregatedEvaluation.model_fields.keys())\n", - "formated_aggregations = formated_aggregations[[\"description\", *aggregation_fields]]\n", - "formated_aggregations[[\"model\", \"prompt\"]] = formated_aggregations[\n", - " \"description\"\n", - "].str.split(\"|\", expand=True)[[1, 2]]\n", - "formated_aggregations.drop(columns=\"description\", inplace=True)\n", - "\n", + "formated_aggregations = formated_aggregations[[\"metadata\"] + aggregation_fields]\n", + "# Flatten the metadata dict into columns\n", + "flattened_metadata = pd.json_normalize(formated_aggregations[\"metadata\"])\n", + "formated_aggregations = pd.concat([formated_aggregations, flattened_metadata], axis=1)\n", + "formated_aggregations.drop(columns=[\"metadata\"], inplace=True)\n", "display(\n", " formated_aggregations.sort_values(\n", " by=\"avg_normalized_capital_count\", ascending=False\n", @@ -306,7 +311,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.11.8" } }, "nbformat": 4, From b3ed5bcbc83d0cc3f884de0176797f826615337d Mon Sep 17 00:00:00 2001 From: Merlin Kallenborn Date: Thu, 13 Jun 2024 10:45:06 +0200 Subject: [PATCH 3/7] feat: Add unwrapping of metadata to `aggregation_overviews_to_pandas`-function. 
TASK: IL-547 --- .../parameter_optimization.ipynb | 18 ++++---- .../infrastructure/repository_navigator.py | 8 ++++ .../test_repository_navigator.py | 42 +++++++++++++++++++ 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/documentation/parameter_optimization.ipynb b/src/documentation/parameter_optimization.ipynb index cf4f69457..b74db13fa 100644 --- a/src/documentation/parameter_optimization.ipynb +++ b/src/documentation/parameter_optimization.ipynb @@ -12,7 +12,6 @@ "import string\n", "from collections.abc import Iterable\n", "\n", - "import pandas as pd\n", "from pydantic import BaseModel\n", "\n", "from intelligence_layer.core import Input, Task, TaskSpan\n", @@ -194,7 +193,7 @@ "\n", " # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n", " metadata = dict({\"model\": model, \"prompt\": prompt})\n", - " description = f\"|{model}|{prompt}|\"\n", + " description = \"Evaluate dummy task\"\n", " runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n", " run_overview = runner.run_dataset(\n", " dataset.id, metadata=metadata, description=description\n", @@ -229,7 +228,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`.\n", + "# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`. Filtering can also be done on labels and/or metadata.\n", "aggregations_of_interest = [\n", " overview\n", " for overview in aggregation_repository.aggregation_overviews(\n", @@ -239,7 +238,10 @@ "]\n", "\n", "# Convert the desired aggregation into a pandas dataframe\n", - "formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)" + "formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)\n", + "\n", + "# Print all columns to check for columns of interest\n", + "formated_aggregations.columns" ] }, { @@ -258,11 +260,9 @@ "outputs": [], "source": [ "aggregation_fields = list(DummyAggregatedEvaluation.model_fields.keys())\n", - "formated_aggregations = formated_aggregations[[\"metadata\"] + aggregation_fields]\n", - "# Flatten the metadata dict into columns\n", - "flattened_metadata = pd.json_normalize(formated_aggregations[\"metadata\"])\n", - "formated_aggregations = pd.concat([formated_aggregations, flattened_metadata], axis=1)\n", - "formated_aggregations.drop(columns=[\"metadata\"], inplace=True)\n", + "# Filter for columns of interest\n", + "formated_aggregations = formated_aggregations[[\"model\", \"prompt\", *aggregation_fields]]\n", + "\n", "display(\n", " formated_aggregations.sort_values(\n", " by=\"avg_normalized_capital_count\", ascending=False\n", diff --git a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py index 4d3b8650f..93cd4dfce 100644 --- a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py +++ b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py @@ -144,6 +144,7 @@ def aggregation_overviews_to_pandas( aggregation_overviews: Sequence[AggregationOverview[AggregatedEvaluation]], unwrap_statistics: bool = True, strict: bool = True, + unwrap_metadata: bool = True, ) -> pd.DataFrame: """Converts aggregation overviews to a pandas table for easier comparison. 
@@ -152,6 +153,8 @@ def aggregation_overviews_to_pandas( unwrap_statistics: Unwrap the `statistics` field in the overviews into separate columns. Defaults to True. strict: Allow only overviews with exactly equal `statistics` types. Defaults to True. + unwrap_metadata: Unwrap the `metadata` field in the overviews into separate columns. + Defaults to True. Returns: A pandas :class:`DataFrame` containing an overview per row with fields as columns. @@ -173,6 +176,11 @@ def aggregation_overviews_to_pandas( df = df.join(pd.DataFrame(df["statistics"].to_list())).drop( columns=["statistics"] ) + if unwrap_metadata and "metadata" in df.columns: + df = pd.concat([df, pd.json_normalize(df["metadata"])], axis=1).drop( # type: ignore + columns=["metadata"] + ) + return df diff --git a/tests/evaluation/infrastructure/test_repository_navigator.py b/tests/evaluation/infrastructure/test_repository_navigator.py index 3f4a5d9df..8cc93a105 100644 --- a/tests/evaluation/infrastructure/test_repository_navigator.py +++ b/tests/evaluation/infrastructure/test_repository_navigator.py @@ -466,6 +466,48 @@ class AggregationDummy2(BaseModel): assert "statistics" not in df.columns +def test_aggregation_overviews_to_pandas_unwrap_metadata() -> None: + # given + + overview = AggregationOverview( + evaluation_overviews=frozenset([]), + id="aggregation-id", + start=utc_now(), + end=utc_now(), + successful_evaluation_count=5, + crashed_during_evaluation_count=3, + description="dummy-evaluator", + statistics=AggregationDummy(), + labels=set(), + metadata=dict({"model": "model_a", "prompt": "prompt_a"}), + ) + overview2 = AggregationOverview( + evaluation_overviews=frozenset([]), + id="aggregation-id2", + start=utc_now(), + end=utc_now(), + successful_evaluation_count=5, + crashed_during_evaluation_count=3, + description="dummy-evaluator", + statistics=AggregationDummy(), + labels=set(), + metadata=dict( + {"model": "model_a", "prompt": "prompt_a", "different_column": "value"} + ), + ) + + df = aggregation_overviews_to_pandas( + [overview, overview2], unwrap_metadata=True, strict=False + ) + + assert "model" in df.columns + assert "prompt" in df.columns + assert "different_column" in df.columns + assert "metadata" not in df.columns + assert all(df["model"] == "model_a") + assert all(df["prompt"] == "prompt_a") + + def test_aggregation_overviews_to_pandas_works_with_eval_overviews() -> None: # given eval_overview = EvaluationOverview( From 4899fba1a2f7aea7ef61bfbfa3e9558c44973ff5 Mon Sep 17 00:00:00 2001 From: Merlin Kallenborn Date: Thu, 13 Jun 2024 11:16:34 +0200 Subject: [PATCH 4/7] fix: Make labels and metadata mutuable defaults TASK:IL-547 --- .../evaluation/aggregation/aggregator.py | 8 ++++++-- .../evaluation/dataset/dataset_repository.py | 4 ++-- .../dataset/file_dataset_repository.py | 8 ++++++-- .../dataset/in_memory_dataset_repository.py | 8 ++++++-- .../single_huggingface_dataset_repository.py | 4 ++-- .../evaluation/evaluator/argilla_evaluator.py | 8 ++++++-- .../evaluation/evaluation/evaluator/evaluator.py | 8 ++++++-- .../evaluator/incremental_evaluator.py | 16 ++++++++++++---- src/intelligence_layer/evaluation/run/runner.py | 8 ++++++-- tests/evaluation/evaluation/test_evaluator.py | 2 ++ .../infrastructure/test_repository_navigator.py | 4 +++- 11 files changed, 57 insertions(+), 21 deletions(-) diff --git a/src/intelligence_layer/evaluation/aggregation/aggregator.py b/src/intelligence_layer/evaluation/aggregation/aggregator.py index 028204fbd..3cf232f12 100644 --- 
a/src/intelligence_layer/evaluation/aggregation/aggregator.py +++ b/src/intelligence_layer/evaluation/aggregation/aggregator.py @@ -184,8 +184,8 @@ def evaluation_type(self) -> type[Evaluation]: def aggregate_evaluation( self, *eval_ids: str, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: set[str] | None = None, + metadata: dict[str, JsonSerializable] | None = None, ) -> AggregationOverview[AggregatedEvaluation]: """Aggregates all evaluations into an overview that includes high-level statistics. @@ -200,6 +200,10 @@ def aggregate_evaluation( Returns: An overview of the aggregated evaluation. """ + if metadata is None: + metadata = dict() + if labels is None: + labels = set() def load_eval_overview(evaluation_id: str) -> EvaluationOverview: evaluation_overview = self._evaluation_repository.evaluation_overview( diff --git a/src/intelligence_layer/evaluation/dataset/dataset_repository.py b/src/intelligence_layer/evaluation/dataset/dataset_repository.py index 7421f6e72..ead03f2a3 100644 --- a/src/intelligence_layer/evaluation/dataset/dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/dataset_repository.py @@ -23,8 +23,8 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: set[str] | None = None, + metadata: dict[str, JsonSerializable] | None = None, ) -> Dataset: """Creates a dataset from given :class:`Example`s and returns the ID of that dataset. diff --git a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py index dae839294..43dc8b125 100644 --- a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py @@ -32,9 +32,13 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: set[str] | None = None, + metadata: dict[str, JsonSerializable] | None = None, ) -> Dataset: + if metadata is None: + metadata = dict() + if labels is None: + labels = set() dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata) if id is not None: dataset.id = id diff --git a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py index ab63eb99d..bdf2993ae 100644 --- a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py @@ -25,9 +25,13 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: set[str] | None = None, + metadata: dict[str, JsonSerializable] | None = None, ) -> Dataset: + if metadata is None: + metadata = dict() + if labels is None: + labels = set() dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata) if id is not None: dataset.id = id diff --git a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py index 4f8666d7c..504a507d7 100644 --- 
a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py @@ -34,8 +34,8 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: set[str] | None = None, + metadata: dict[str, JsonSerializable] | None = None, ) -> Dataset: raise NotImplementedError diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py index 2c455b79c..51a81f9e9 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py @@ -139,9 +139,13 @@ def submit( dataset_name: Optional[str] = None, abort_on_error: bool = False, skip_example_on_any_failure: bool = True, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: Optional[set[str]] = None, + metadata: Optional[dict[str, JsonSerializable]] = None, ) -> PartialEvaluationOverview: + if metadata is None: + metadata = dict() + if labels is None: + labels = set() argilla_dataset_id = self._client.create_dataset( self._workspace_id, dataset_name if dataset_name else str(uuid4()), diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py index f5f6f5773..097b25b4a 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py @@ -100,8 +100,8 @@ def evaluate_runs( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, description: Optional[str] = None, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: Optional[set[str]] = None, + metadata: Optional[dict[str, JsonSerializable]] = None, ) -> EvaluationOverview: """Evaluates all generated outputs in the run. @@ -129,6 +129,10 @@ def evaluate_runs( returned but instead stored in the :class:`EvaluationRepository` provided in the __init__. """ + if metadata is None: + metadata = dict() + if labels is None: + labels = set() start = utc_now() run_overviews = self._load_run_overviews(*run_ids) eval_id = self._evaluation_repository.initialize_evaluation() diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py index 578c83e5f..bac9815bd 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py @@ -124,8 +124,8 @@ def evaluate_additional_runs( previous_evaluation_ids: Optional[list[str]] = None, num_examples: Optional[int] = None, abort_on_error: bool = False, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: Optional[set[str]] = None, + metadata: Optional[dict[str, JsonSerializable]] = None, ) -> EvaluationOverview: """Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_id`. @@ -152,6 +152,10 @@ def evaluate_additional_runs( returned but instead stored in the :class:`EvaluationRepository` provided in the __init__. 
""" + if metadata is None: + metadata = dict() + if labels is None: + labels = set() previous_run_ids = [] previous_evaluation_ids = previous_evaluation_ids or [] @@ -179,9 +183,13 @@ def evaluate_runs( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, description: Optional[str] = None, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: set[str] | None = None, + metadata: dict[str, JsonSerializable] | None = None, ) -> EvaluationOverview: + if metadata is None: + metadata = dict() + if labels is None: + labels = set() self._evaluation_logic.set_previous_run_output_ids([]) return super().evaluate_runs( *run_ids, diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py index 35e950096..aaa4e5069 100644 --- a/src/intelligence_layer/evaluation/run/runner.py +++ b/src/intelligence_layer/evaluation/run/runner.py @@ -82,8 +82,8 @@ def run_dataset( max_workers: int = 10, description: Optional[str] = None, trace_examples_individually: bool = True, - labels: set[str] = set(), - metadata: dict[str, JsonSerializable] = dict(), + labels: Optional[set[str]] = None, + metadata: Optional[dict[str, JsonSerializable]] = None, ) -> RunOverview: """Generates all outputs for the provided dataset. @@ -107,6 +107,10 @@ def run_dataset( An overview of the run. Outputs will not be returned but instead stored in the :class:`RunRepository` provided in the __init__. """ + if labels is None: + labels = set() + if metadata is None: + metadata = dict() def run( example: Example[Input, ExpectedOutput], diff --git a/tests/evaluation/evaluation/test_evaluator.py b/tests/evaluation/evaluation/test_evaluator.py index 0ac17d14c..a7abf69ae 100644 --- a/tests/evaluation/evaluation/test_evaluator.py +++ b/tests/evaluation/evaluation/test_evaluator.py @@ -710,6 +710,8 @@ def test_eval_raises_error_if_examples_and_example_outputs_dont_match( num_examples=None, ) ) + + def test_evaluator_evaluate_runs_sets_default_values( dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str ) -> None: diff --git a/tests/evaluation/infrastructure/test_repository_navigator.py b/tests/evaluation/infrastructure/test_repository_navigator.py index 8cc93a105..ddfc98ed7 100644 --- a/tests/evaluation/infrastructure/test_repository_navigator.py +++ b/tests/evaluation/infrastructure/test_repository_navigator.py @@ -437,7 +437,9 @@ def test_aggregation_overviews_to_pandas(length: int) -> None: # given overview = create_aggregation_overview(AggregationDummy()) # when - df = aggregation_overviews_to_pandas([overview] * length, unwrap_statistics=False) + df = aggregation_overviews_to_pandas( + [overview] * length, unwrap_statistics=False, unwrap_metadata=False + ) # then assert len(df) == length assert set(AggregationOverview.model_fields.keys()) == set(df.columns) From 6674bc77427f1518fbce8f3d8aa1cb9ef9304278 Mon Sep 17 00:00:00 2001 From: Sebastian Niehus Date: Thu, 13 Jun 2024 12:12:46 +0200 Subject: [PATCH 5/7] fix: Minor fixes and replacements TASK:IL-547 --- .../evaluation/aggregation/aggregator.py | 6 ++++-- .../evaluation/aggregation/domain.py | 6 ++++-- .../evaluation/dataset/dataset_repository.py | 6 ++++-- src/intelligence_layer/evaluation/dataset/domain.py | 3 +-- .../evaluation/dataset/file_dataset_repository.py | 4 ++-- .../dataset/in_memory_dataset_repository.py | 6 ++++-- .../dataset/single_huggingface_dataset_repository.py | 4 ++-- .../evaluation/evaluation/domain.py | 6 +++--- 
.../evaluation/evaluator/argilla_evaluator.py | 6 ++++-- .../evaluation/evaluation/evaluator/evaluator.py | 6 ++++-- .../evaluation/evaluator/incremental_evaluator.py | 8 +++++--- src/intelligence_layer/evaluation/run/domain.py | 4 ++-- src/intelligence_layer/evaluation/run/runner.py | 6 ++++-- tests/evaluation/dataset/test_dataset_repository.py | 8 +++++--- tests/evaluation/evaluation/test_evaluator.py | 12 +++++++----- .../test_instruct_comparison_argilla_evaluator.py | 10 ++++++---- tests/evaluation/run/test_runner.py | 8 +++++--- 17 files changed, 66 insertions(+), 43 deletions(-) diff --git a/src/intelligence_layer/evaluation/aggregation/aggregator.py b/src/intelligence_layer/evaluation/aggregation/aggregator.py index 3cf232f12..ab8e518e8 100644 --- a/src/intelligence_layer/evaluation/aggregation/aggregator.py +++ b/src/intelligence_layer/evaluation/aggregation/aggregator.py @@ -11,7 +11,9 @@ ) from uuid import uuid4 -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import utc_now from intelligence_layer.evaluation.aggregation.aggregation_repository import ( AggregationRepository, @@ -185,7 +187,7 @@ def aggregate_evaluation( self, *eval_ids: str, labels: set[str] | None = None, - metadata: dict[str, JsonSerializable] | None = None, + metadata: SerializableDict | None = None, ) -> AggregationOverview[AggregatedEvaluation]: """Aggregates all evaluations into an overview that includes high-level statistics. diff --git a/src/intelligence_layer/evaluation/aggregation/domain.py b/src/intelligence_layer/evaluation/aggregation/domain.py index 01f810e07..70ffda668 100644 --- a/src/intelligence_layer/evaluation/aggregation/domain.py +++ b/src/intelligence_layer/evaluation/aggregation/domain.py @@ -4,7 +4,9 @@ from pydantic import BaseModel, SerializeAsAny -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.evaluation.evaluation.domain import ( EvaluationFailed, EvaluationOverview, @@ -46,7 +48,7 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True) description: str statistics: SerializeAsAny[AggregatedEvaluation] labels: set[str] = set() - metadata: dict[str, JsonSerializable] = dict() + metadata: SerializableDict = dict() @property def run_ids(self) -> Sequence[str]: diff --git a/src/intelligence_layer/evaluation/dataset/dataset_repository.py b/src/intelligence_layer/evaluation/dataset/dataset_repository.py index ead03f2a3..e480e83cf 100644 --- a/src/intelligence_layer/evaluation/dataset/dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/dataset_repository.py @@ -2,7 +2,9 @@ from collections.abc import Iterable from typing import Optional -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input from intelligence_layer.evaluation.dataset.domain import ( Dataset, @@ -24,7 +26,7 @@ def create_dataset( dataset_name: str, id: str | None = None, labels: set[str] | None = None, - metadata: dict[str, JsonSerializable] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: """Creates a dataset from given :class:`Example`s and returns the ID of that dataset. 
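The hunks above repeatedly replace defaults such as `labels: set[str] = set()` and `metadata: ... = dict()` with `None` defaults that are normalized at the top of the function body. This is the standard Python guard against mutable default arguments, which are evaluated once at definition time and then shared by every call that relies on the default. A minimal, self-contained sketch of the pitfall and of the pattern these hunks adopt (the function names are illustrative, not from the repository):

    from typing import Optional

    # Pitfall: the default container is created once, when the function is
    # defined, and is then shared by every call that relies on the default.
    def add_label_shared(label: str, labels: set[str] = set()) -> set[str]:
        labels.add(label)
        return labels

    # Pattern used in the hunks above: default to None and create a fresh
    # container per call inside the function body.
    def add_label_fresh(label: str, labels: Optional[set[str]] = None) -> set[str]:
        if labels is None:
            labels = set()
        labels.add(label)
        return labels

    print(add_label_shared("a"))  # {'a'}
    print(add_label_shared("b"))  # {'a', 'b'}  <- state leaked between calls
    print(add_label_fresh("a"))   # {'a'}
    print(add_label_fresh("b"))   # {'b'}
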
diff --git a/src/intelligence_layer/evaluation/dataset/domain.py b/src/intelligence_layer/evaluation/dataset/domain.py index d2353e4e5..3d6b4cf9d 100644 --- a/src/intelligence_layer/evaluation/dataset/domain.py +++ b/src/intelligence_layer/evaluation/dataset/domain.py @@ -5,7 +5,6 @@ from rich.tree import Tree from intelligence_layer.connectors.base.json_serializable import ( - JsonSerializable, SerializableDict, ) from intelligence_layer.core.task import Input @@ -70,7 +69,7 @@ class Dataset(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) name: str labels: set[str] = set() - metadata: dict[str, JsonSerializable] = dict() + metadata: SerializableDict = dict() def __repr__(self) -> str: return self.__str__() diff --git a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py index 43dc8b125..85575c8ad 100644 --- a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py @@ -5,7 +5,7 @@ from fsspec.implementations.local import LocalFileSystem # type: ignore -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core import Input, JsonSerializer, PydanticSerializable from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -33,7 +33,7 @@ def create_dataset( dataset_name: str, id: str | None = None, labels: set[str] | None = None, - metadata: dict[str, JsonSerializable] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: if metadata is None: metadata = dict() diff --git a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py index bdf2993ae..2ca418cf9 100644 --- a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py @@ -1,7 +1,9 @@ from collections.abc import Iterable, Sequence from typing import Optional, cast -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, PydanticSerializable from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -26,7 +28,7 @@ def create_dataset( dataset_name: str, id: str | None = None, labels: set[str] | None = None, - metadata: dict[str, JsonSerializable] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: if metadata is None: metadata = dict() diff --git a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py index 504a507d7..4608a248f 100644 --- a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py @@ -5,7 +5,7 @@ from datasets import DatasetDict, IterableDataset, IterableDatasetDict from pydantic import BaseModel -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from 
intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core.task import Input from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -35,7 +35,7 @@ def create_dataset( dataset_name: str, id: str | None = None, labels: set[str] | None = None, - metadata: dict[str, JsonSerializable] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: raise NotImplementedError diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py index 944da3628..34bb56e05 100644 --- a/src/intelligence_layer/evaluation/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/evaluation/domain.py @@ -5,7 +5,7 @@ from pydantic import BaseModel, SerializeAsAny from rich.tree import Tree -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.evaluation.run.domain import RunOverview Evaluation = TypeVar("Evaluation", bound=BaseModel, covariant=True) @@ -83,7 +83,7 @@ class PartialEvaluationOverview(BaseModel, frozen=True): submitted_evaluation_count: int description: str labels: set[str] - metadata: dict[str, JsonSerializable] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() @@ -133,7 +133,7 @@ class EvaluationOverview(BaseModel, frozen=True): failed_evaluation_count: int description: str labels: set[str] - metadata: dict[str, JsonSerializable] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py index 51a81f9e9..ee49bfaa8 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py @@ -16,7 +16,9 @@ RatingQuestion, RecordData, ) -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -140,7 +142,7 @@ def submit( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, labels: Optional[set[str]] = None, - metadata: Optional[dict[str, JsonSerializable]] = None, + metadata: Optional[SerializableDict] = None, ) -> PartialEvaluationOverview: if metadata is None: metadata = dict() diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py index 097b25b4a..bc435b3d0 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py @@ -4,7 +4,9 @@ from tqdm import tqdm -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output, utc_now from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from 
intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -101,7 +103,7 @@ def evaluate_runs( skip_example_on_any_failure: bool = True, description: Optional[str] = None, labels: Optional[set[str]] = None, - metadata: Optional[dict[str, JsonSerializable]] = None, + metadata: Optional[SerializableDict] = None, ) -> EvaluationOverview: """Evaluates all generated outputs in the run. diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py index bac9815bd..b60da37d6 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py @@ -6,7 +6,9 @@ from pydantic import BaseModel -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -125,7 +127,7 @@ def evaluate_additional_runs( num_examples: Optional[int] = None, abort_on_error: bool = False, labels: Optional[set[str]] = None, - metadata: Optional[dict[str, JsonSerializable]] = None, + metadata: Optional[SerializableDict] = None, ) -> EvaluationOverview: """Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_id`. @@ -184,7 +186,7 @@ def evaluate_runs( skip_example_on_any_failure: bool = True, description: Optional[str] = None, labels: set[str] | None = None, - metadata: dict[str, JsonSerializable] | None = None, + metadata: SerializableDict | None = None, ) -> EvaluationOverview: if metadata is None: metadata = dict() diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py index 11526391e..fe40f1be1 100644 --- a/src/intelligence_layer/evaluation/run/domain.py +++ b/src/intelligence_layer/evaluation/run/domain.py @@ -5,7 +5,7 @@ from pydantic import BaseModel from rich.tree import Tree -from intelligence_layer.connectors import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core.task import Output @@ -113,7 +113,7 @@ class RunOverview(BaseModel, frozen=True): successful_example_count: int description: str labels: set[str] - metadata: dict[str, JsonSerializable] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py index aaa4e5069..17fb20d5a 100644 --- a/src/intelligence_layer/evaluation/run/runner.py +++ b/src/intelligence_layer/evaluation/run/runner.py @@ -8,7 +8,9 @@ from pydantic import JsonValue from tqdm import tqdm -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import ( CompositeTracer, Input, @@ -83,7 +85,7 @@ def run_dataset( description: Optional[str] = None, trace_examples_individually: bool = True, labels: Optional[set[str]] = None, - metadata: Optional[dict[str, JsonSerializable]] = None, + metadata: Optional[SerializableDict] = None, ) -> RunOverview: """Generates all outputs for 
the provided dataset. diff --git a/tests/evaluation/dataset/test_dataset_repository.py b/tests/evaluation/dataset/test_dataset_repository.py index db00dce03..c92fc73cd 100644 --- a/tests/evaluation/dataset/test_dataset_repository.py +++ b/tests/evaluation/dataset/test_dataset_repository.py @@ -8,7 +8,9 @@ from fsspec.implementations.memory import MemoryFileSystem # type: ignore from pytest import FixtureRequest, fixture, mark, raises -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.evaluation import ( DatasetRepository, Example, @@ -105,8 +107,8 @@ def test_dataset_repository_create_dataset_explicit_values_overwrite_defaults( ) -> None: expected_id = str(uuid4()) expected_name = "test_name" - expected_labels = set(["test_label"]) - expected_metadata: dict[str, JsonSerializable] = dict({"test_key": "test_value"}) + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test_value"}) dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) diff --git a/tests/evaluation/evaluation/test_evaluator.py b/tests/evaluation/evaluation/test_evaluator.py index a7abf69ae..ee133cf43 100644 --- a/tests/evaluation/evaluation/test_evaluator.py +++ b/tests/evaluation/evaluation/test_evaluator.py @@ -5,7 +5,9 @@ from pydantic import BaseModel from pytest import fixture -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output, Task, Tracer from intelligence_layer.core.tracer.in_memory_tracer import ( InMemoryTaskSpan, @@ -723,8 +725,8 @@ def test_evaluator_evaluate_runs_sets_default_values( def test_evaluator_evaluate_runs_specific_values_overwrite_defaults( dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str ) -> None: - expected_labels = set(["test_label"]) - expected_metadata: dict[str, JsonSerializable] = dict({"test_key": "test-value"}) + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test-value"}) evaluation_overview = dummy_evaluator.evaluate_runs( run_id, labels=expected_labels, metadata=expected_metadata ) @@ -753,8 +755,8 @@ def test_aggregate_evaluation_specific_values_overwrite_defaults( ], run_id: str, ) -> None: - expected_labels = set(["test_label"]) - expected_metadata: dict[str, JsonSerializable] = dict({"test_key": "test-value"}) + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test-value"}) evaluation_overview = dummy_evaluator.evaluate_runs(run_id) aggregation_overview = dummy_aggregator.aggregate_evaluation( evaluation_overview.id, labels=expected_labels, metadata=expected_metadata diff --git a/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py index fae966d89..d30e42688 100644 --- a/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -14,7 +14,9 @@ Question, RecordData, ) -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import CompleteOutput, 
InstructInput, utc_now from intelligence_layer.evaluation import ( Aggregator, @@ -163,7 +165,7 @@ def create_dummy_runs( failed_example_count=0, successful_example_count=1, description="runner", - labels=set(["test-label"]), + labels={"test-label"}, metadata=dict( {"test_key": "test_value"}, ), @@ -299,8 +301,8 @@ def test_retrieve_argilla_evaluation_overview_has_submitted_partial_evaluation_o in_memory_run_repository, any_instruct_output, run_ids, dataset_id ) - expected_labels = set(["test-label"]) - expected_metadata: dict[str, JsonSerializable] = dict({"test_key": "test_value"}) + expected_labels = {"test-label"} + expected_metadata: SerializableDict = dict({"test_key": "test_value"}) partial_overview = evaluator.submit( *run_ids, labels=expected_labels, metadata=expected_metadata diff --git a/tests/evaluation/run/test_runner.py b/tests/evaluation/run/test_runner.py index ee111947f..f3e32eb06 100644 --- a/tests/evaluation/run/test_runner.py +++ b/tests/evaluation/run/test_runner.py @@ -2,7 +2,9 @@ import pytest -from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer from intelligence_layer.evaluation import ( Example, @@ -149,8 +151,8 @@ def test_runner_run_overview_has_specified_metadata_and_labels( in_memory_run_repository: InMemoryRunRepository, sequence_examples: Iterable[Example[str, None]], ) -> None: - run_labels = set(["test-label"]) - run_metadata: dict[str, JsonSerializable] = dict({"test_key": "test-value"}) + run_labels = {"test-label"} + run_metadata: SerializableDict = dict({"test_key": "test-value"}) examples = list(sequence_examples) task = DummyTask() From 0e1858025d31d589a4c0965858f9b9b727e48681 Mon Sep 17 00:00:00 2001 From: MerlinKallenbornAA <166396684+MerlinKallenbornAA@users.noreply.github.com> Date: Thu, 13 Jun 2024 13:35:23 +0200 Subject: [PATCH 6/7] Update src/intelligence_layer/evaluation/dataset/dataset_repository.py Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com> --- src/intelligence_layer/evaluation/dataset/dataset_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intelligence_layer/evaluation/dataset/dataset_repository.py b/src/intelligence_layer/evaluation/dataset/dataset_repository.py index e480e83cf..999aab760 100644 --- a/src/intelligence_layer/evaluation/dataset/dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/dataset_repository.py @@ -35,7 +35,7 @@ def create_dataset( dataset_name: A name for the dataset. id: The dataset ID. If `None`, an ID will be generated. labels: A list of labels for filtering. Defaults to an empty list. - metadata: A dict for additional information about the dataset. Default to an empty dict. + metadata: A dict for additional information about the dataset. Defaults to an empty dict. Returns: The created :class:`Dataset`. 
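PATCH 5/7 above swaps the spelled-out `dict[str, JsonSerializable]` annotation for the `SerializableDict` alias imported from `intelligence_layer.connectors.base.json_serializable`. The alias definition itself is not part of this series; judging from the one-for-one replacement it presumably names a dict of JSON-serializable values. A hedged sketch of such an alias and of the new `create_dataset` keywords in use — only the `create_dataset` signature is taken from the diff; the alias body, the `Example` constructor arguments, the import location, and the no-argument `InMemoryDatasetRepository()` call are assumptions:

    from typing import Union

    from intelligence_layer.evaluation import Example, InMemoryDatasetRepository

    # Assumed shape of the alias; the real definition lives in
    # intelligence_layer.connectors.base.json_serializable and is not shown here.
    JsonSerializable = Union[
        None, bool, int, float, str,
        list["JsonSerializable"], dict[str, "JsonSerializable"],
    ]
    SerializableDict = dict[str, JsonSerializable]

    labels: set[str] = {"smoke-test"}
    metadata: SerializableDict = {"owner": "eval-team", "revision": 3}

    # Hypothetical usage of the repository method typed in the hunks above.
    repository = InMemoryDatasetRepository()
    dataset = repository.create_dataset(
        examples=[Example(input="What is 1 + 1?", expected_output="2")],
        dataset_name="arithmetic-smoke-test",
        labels=labels,
        metadata=metadata,
    )
    print(dataset.labels, dataset.metadata)
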
From df59b6abe828f543ed8809306cc23a1501160b6b Mon Sep 17 00:00:00 2001 From: MerlinKallenbornAA <166396684+MerlinKallenbornAA@users.noreply.github.com> Date: Thu, 13 Jun 2024 13:36:27 +0200 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com> --- src/intelligence_layer/evaluation/aggregation/aggregator.py | 2 +- .../evaluation/evaluation/evaluator/evaluator.py | 2 +- .../evaluation/evaluation/evaluator/incremental_evaluator.py | 2 +- src/intelligence_layer/evaluation/run/runner.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/intelligence_layer/evaluation/aggregation/aggregator.py b/src/intelligence_layer/evaluation/aggregation/aggregator.py index ab8e518e8..b93da6ba0 100644 --- a/src/intelligence_layer/evaluation/aggregation/aggregator.py +++ b/src/intelligence_layer/evaluation/aggregation/aggregator.py @@ -197,7 +197,7 @@ def aggregate_evaluation( eval_ids: An overview of the evaluation to be aggregated. Does not include actual evaluations as these will be retrieved from the repository. labels: A list of labels for filtering. Defaults to an empty list. - metadata: A dict for additional information about the aggregation overview. Default to an empty dict. + metadata: A dict for additional information about the aggregation overview. Defaults to an empty dict. Returns: An overview of the aggregated evaluation. diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py index bc435b3d0..a4c61f702 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py @@ -124,7 +124,7 @@ def evaluate_runs( skip_example_on_any_failure: Flag to skip evaluation on any example for which at least one run fails. Defaults to True. description: Optional description of the evaluation. Defaults to None. labels: A list of labels for filtering. Defaults to an empty list. - metadata: A dict for additional information about the evaluation overview. Default to an empty dict. + metadata: A dict for additional information about the evaluation overview. Defaults to an empty dict. Returns: EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py index b60da37d6..84f1134d7 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py @@ -147,7 +147,7 @@ def evaluate_additional_runs( Always the first n runs stored in the evaluation repository. Defaults to None. abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False. labels: A list of labels for filtering. Defaults to an empty list. - metadata: A dict for additional information about the evaluation overview. Default to an empty dict. + metadata: A dict for additional information about the evaluation overview. Defaults to an empty dict. Returns: EvaluationOverview: An overview of the evaluation. 
Individual :class:`Evaluation`s will not be diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py index 17fb20d5a..20d96d790 100644 --- a/src/intelligence_layer/evaluation/run/runner.py +++ b/src/intelligence_layer/evaluation/run/runner.py @@ -103,7 +103,7 @@ def run_dataset( description: An optional description of the run. Defaults to None. trace_examples_individually: Flag to create individual tracers for each example. Defaults to True. labels: A list of labels for filtering. Defaults to an empty list. - metadata: A dict for additional information about the run overview. Default to an empty dict. + metadata: A dict for additional information about the run overview. Defaults to an empty dict. Returns: An overview of the run. Outputs will not be returned but instead stored in the