diff --git a/.gitignore b/.gitignore index 0caf99001..7476d43b5 100644 --- a/.gitignore +++ b/.gitignore @@ -248,3 +248,6 @@ fabric.properties # End of https://www.toptal.com/developers/gitignore/api/intellij+all .python-version + +src/documentation/human-eval-data/datasets* +src/documentation/human-eval-data/runs/* diff --git a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb index 2dadcce47..d64431a19 100644 --- a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb +++ b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb @@ -57,10 +57,14 @@ " \"MyAggregationDescription\",\n", " aggregation_logic,\n", ")\n", - "aggregation_overview = aggregator.aggregate_evaluation(*evaluation_ids)\n", + "aggregation_overview = aggregator.aggregate_evaluation(\n", + " *evaluation_ids, labels=set([\"label_a\"]), metadata=dict({\"key\": \"value\"})\n", + ")\n", "\n", "# Step 3\n", - "print(aggregation_overview.id)" + "print(aggregation_overview.id)\n", + "print(aggregation_overview.labels)\n", + "print(aggregation_overview.metadata)" ] } ], @@ -80,7 +84,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/how_to_create_a_dataset.ipynb b/src/documentation/how_tos/how_to_create_a_dataset.ipynb index 83ccc91f0..f760274e4 100644 --- a/src/documentation/how_tos/how_to_create_a_dataset.ipynb +++ b/src/documentation/how_tos/how_to_create_a_dataset.ipynb @@ -67,10 +67,14 @@ "dataset = dataset_repository.create_dataset(\n", " examples=examples,\n", " dataset_name=\"StoryDataset\",\n", + " labels=set([\"label1\", \"label2\"]),\n", + " metadata=dict({\"key_a\": [\"a\", \"b\"], \"key_b\": \"value\"}),\n", ")\n", "\n", "# Step 4\n", - "print(dataset.id)" + "print(dataset.id)\n", + "print(dataset.labels)\n", + "print(dataset.metadata)" ] } ], @@ -90,7 +94,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/how_to_evaluate_runs.ipynb b/src/documentation/how_tos/how_to_evaluate_runs.ipynb index 91d05c023..ff39f511c 100644 --- a/src/documentation/how_tos/how_to_evaluate_runs.ipynb +++ b/src/documentation/how_tos/how_to_evaluate_runs.ipynb @@ -57,10 +57,14 @@ " evaluation_logic,\n", ")\n", "\n", - "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n", + "evaluation_overview = evaluator.evaluate_runs(\n", + " *run_ids, labels=set({\"label\"}), metadata=dict({\"key\": \"value\"})\n", + ")\n", "\n", "# Step 4\n", - "print(evaluation_overview.id)" + "print(evaluation_overview.id)\n", + "print(evaluation_overview.metadata)\n", + "print(evaluation_overview.labels)" ] } ], diff --git a/src/documentation/parameter_optimization.ipynb b/src/documentation/parameter_optimization.ipynb index 5eb833ae5..b74db13fa 100644 --- a/src/documentation/parameter_optimization.ipynb +++ b/src/documentation/parameter_optimization.ipynb @@ -191,20 +191,25 @@ "for model, prompt in itertools.product(model_list, prompt_list):\n", " dummy_task = DummyTask(model=model, prompt=prompt)\n", "\n", - " # The description and the Experiment will later be used to identify the run parameters. 
Take special note of the delimiter '|'.\n", - " description = f\"|{model}|{prompt}|\"\n", + " # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n", + " metadata = dict({\"model\": model, \"prompt\": prompt})\n", + " description = \"Evaluate dummy task\"\n", " runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n", - " run_overview = runner.run_dataset(dataset.id, description=description)\n", + " run_overview = runner.run_dataset(\n", + " dataset.id, metadata=metadata, description=description\n", + " )\n", "\n", - " eval_overview = evaluator.evaluate_runs(run_overview.id, description=description)\n", + " eval_overview = evaluator.evaluate_runs(\n", + " run_overview.id, metadata=metadata, description=description\n", + " )\n", "\n", " aggregator = Aggregator(\n", " evaluation_repository,\n", " aggregation_repository,\n", - " EXPERIMENT_NAME + \":\" + description,\n", + " EXPERIMENT_NAME,\n", " DummyAggregationLogic(),\n", " )\n", - " aggregator.aggregate_evaluation(eval_overview.id)" + " aggregator.aggregate_evaluation(eval_overview.id, metadata=metadata)" ] }, { @@ -223,17 +228,20 @@ "metadata": {}, "outputs": [], "source": [ - "# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`.\n", + "# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`. Filtering can also be done on labels and/or metadata.\n", "aggregations_of_interest = [\n", " overview\n", " for overview in aggregation_repository.aggregation_overviews(\n", " aggregation_type=DummyAggregatedEvaluation\n", " )\n", - " if overview.description.startswith(EXPERIMENT_NAME)\n", + " if overview.description == EXPERIMENT_NAME\n", "]\n", "\n", "# Convert the desired aggregation into a pandas dataframe\n", - "formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)" + "formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)\n", + "\n", + "# Print all columns to check for columns of interest\n", + "formated_aggregations.columns" ] }, { @@ -252,11 +260,8 @@ "outputs": [], "source": [ "aggregation_fields = list(DummyAggregatedEvaluation.model_fields.keys())\n", - "formated_aggregations = formated_aggregations[[\"description\", *aggregation_fields]]\n", - "formated_aggregations[[\"model\", \"prompt\"]] = formated_aggregations[\n", - " \"description\"\n", - "].str.split(\"|\", expand=True)[[1, 2]]\n", - "formated_aggregations.drop(columns=\"description\", inplace=True)\n", + "# Filter for columns of interest\n", + "formated_aggregations = formated_aggregations[[\"model\", \"prompt\", *aggregation_fields]]\n", "\n", "display(\n", " formated_aggregations.sort_values(\n", @@ -306,7 +311,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/src/intelligence_layer/evaluation/aggregation/aggregator.py b/src/intelligence_layer/evaluation/aggregation/aggregator.py index 29b9053f7..b93da6ba0 100644 --- a/src/intelligence_layer/evaluation/aggregation/aggregator.py +++ b/src/intelligence_layer/evaluation/aggregation/aggregator.py @@ -11,6 +11,9 @@ ) from uuid import uuid4 +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import utc_now from intelligence_layer.evaluation.aggregation.aggregation_repository import ( AggregationRepository, @@ -181,7 +184,10 @@ def 
evaluation_type(self) -> type[Evaluation]: @final def aggregate_evaluation( - self, *eval_ids: str + self, + *eval_ids: str, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> AggregationOverview[AggregatedEvaluation]: """Aggregates all evaluations into an overview that includes high-level statistics. @@ -190,10 +196,16 @@ def aggregate_evaluation( Args: eval_ids: An overview of the evaluation to be aggregated. Does not include actual evaluations as these will be retrieved from the repository. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the aggregation overview. Defaults to an empty dict. Returns: An overview of the aggregated evaluation. """ + if metadata is None: + metadata = dict() + if labels is None: + labels = set() def load_eval_overview(evaluation_id: str) -> EvaluationOverview: evaluation_overview = self._evaluation_repository.evaluation_overview( @@ -237,6 +249,8 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview: crashed_during_evaluation_count=successful_evaluations.excluded_count(), description=self.description, statistics=statistics, + labels=labels, + metadata=metadata, ) self._aggregation_repository.store_aggregation_overview(aggregation_overview) return aggregation_overview diff --git a/src/intelligence_layer/evaluation/aggregation/domain.py b/src/intelligence_layer/evaluation/aggregation/domain.py index 46a996dd2..70ffda668 100644 --- a/src/intelligence_layer/evaluation/aggregation/domain.py +++ b/src/intelligence_layer/evaluation/aggregation/domain.py @@ -4,6 +4,9 @@ from pydantic import BaseModel, SerializeAsAny +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.evaluation.evaluation.domain import ( EvaluationFailed, EvaluationOverview, @@ -31,6 +34,9 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True) run_ids: IDs of all :class:`RunOverview`s from all linked :class:`EvaluationOverview`s. description: A short description. statistics: Aggregated statistics of the run. Whatever is returned by :meth:`Evaluator.aggregate` + labels: Labels for filtering aggregation. Defaults to empty list. + metadata: Additional information about the aggregation. Defaults to empty dict. 
+ """ evaluation_overviews: frozenset[EvaluationOverview] @@ -41,6 +47,8 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True) crashed_during_evaluation_count: int description: str statistics: SerializeAsAny[AggregatedEvaluation] + labels: set[str] = set() + metadata: SerializableDict = dict() @property def run_ids(self) -> Sequence[str]: @@ -74,6 +82,8 @@ def __str__(self) -> str: f"Successful example count = {self.successful_evaluation_count}\n" f"Count of examples crashed during evaluation = {self.failed_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" ) res += f"IDs of aggregated Evaluation Overviews = {[evaluation_overview.id for evaluation_overview in self.evaluation_overviews]}\n" @@ -84,3 +94,6 @@ def __str__(self) -> str: res += "}\n" return res + + def __hash__(self) -> int: + return hash(self.id) diff --git a/src/intelligence_layer/evaluation/dataset/dataset_repository.py b/src/intelligence_layer/evaluation/dataset/dataset_repository.py index bbd79db22..999aab760 100644 --- a/src/intelligence_layer/evaluation/dataset/dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/dataset_repository.py @@ -2,6 +2,9 @@ from collections.abc import Iterable from typing import Optional +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input from intelligence_layer.evaluation.dataset.domain import ( Dataset, @@ -22,6 +25,8 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: """Creates a dataset from given :class:`Example`s and returns the ID of that dataset. @@ -29,6 +34,8 @@ def create_dataset( examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset. dataset_name: A name for the dataset. id: The dataset ID. If `None`, an ID will be generated. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the dataset. Defaults to an empty dict. Returns: The created :class:`Dataset`. diff --git a/src/intelligence_layer/evaluation/dataset/domain.py b/src/intelligence_layer/evaluation/dataset/domain.py index c32c36d33..3d6b4cf9d 100644 --- a/src/intelligence_layer/evaluation/dataset/domain.py +++ b/src/intelligence_layer/evaluation/dataset/domain.py @@ -4,7 +4,9 @@ from pydantic import BaseModel, Field from rich.tree import Tree -from intelligence_layer.connectors.base.json_serializable import SerializableDict +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core.task import Input from intelligence_layer.core.tracer.tracer import PydanticSerializable @@ -60,13 +62,22 @@ class Dataset(BaseModel): Attributes: id: Dataset ID. name: A short name of the dataset. + label: Labels for filtering datasets. Defaults to empty list. + metadata: Additional information about the dataset. Defaults to empty dict. 
""" id: str = Field(default_factory=lambda: str(uuid4())) name: str + labels: set[str] = set() + metadata: SerializableDict = dict() def __repr__(self) -> str: return self.__str__() def __str__(self) -> str: - return f"Dataset ID = {self.id}\nName = {self.name}\n" + return ( + f"Dataset ID = {self.id}\n" + f"Name = {self.name}\n" + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}" + ) diff --git a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py index 67e734bb8..85575c8ad 100644 --- a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py @@ -5,6 +5,7 @@ from fsspec.implementations.local import LocalFileSystem # type: ignore +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core import Input, JsonSerializer, PydanticSerializable from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -31,8 +32,14 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: - dataset = Dataset(name=dataset_name) + if metadata is None: + metadata = dict() + if labels is None: + labels = set() + dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata) if id is not None: dataset.id = id diff --git a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py index 94206c2be..2ca418cf9 100644 --- a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py @@ -1,6 +1,9 @@ from collections.abc import Iterable, Sequence from typing import Optional, cast +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, PydanticSerializable from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -24,8 +27,14 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: - dataset = Dataset(name=dataset_name) + if metadata is None: + metadata = dict() + if labels is None: + labels = set() + dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata) if id is not None: dataset.id = id if dataset.id in self._datasets_and_examples: diff --git a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py index 4bc79c9d8..4608a248f 100644 --- a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py @@ -5,6 +5,7 @@ from datasets import DatasetDict, IterableDataset, IterableDatasetDict from pydantic import BaseModel +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core.task import Input from intelligence_layer.evaluation.dataset.dataset_repository import 
DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -33,6 +34,8 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: raise NotImplementedError diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py index a7688cac9..34bb56e05 100644 --- a/src/intelligence_layer/evaluation/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/evaluation/domain.py @@ -5,6 +5,7 @@ from pydantic import BaseModel, SerializeAsAny from rich.tree import Tree +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.evaluation.run.domain import RunOverview Evaluation = TypeVar("Evaluation", bound=BaseModel, covariant=True) @@ -81,6 +82,8 @@ class PartialEvaluationOverview(BaseModel, frozen=True): start_date: datetime submitted_evaluation_count: int description: str + labels: set[str] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() @@ -100,6 +103,8 @@ def __str__(self) -> str: f"Start time = {self.start_date}\n" f"Submitted Evaluations = {self.submitted_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" f"{run_overview_str}" ) @@ -116,6 +121,8 @@ class EvaluationOverview(BaseModel, frozen=True): failed_evaluation_count: Number of examples that produced an error during evaluation. Note: failed runs are skipped in the evaluation and therefore not counted as failures description: human-readable for the evaluator that created the evaluation. + labels: Labels for filtering evaluation. Defaults to empty list. + metadata: Additional information about the evaluation. Defaults to empty dict. 
""" run_overviews: frozenset[RunOverview] @@ -125,6 +132,8 @@ class EvaluationOverview(BaseModel, frozen=True): successful_evaluation_count: int failed_evaluation_count: int description: str + labels: set[str] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() @@ -146,9 +155,14 @@ def __str__(self) -> str: f"Successful examples = {self.successful_evaluation_count}\n" f"Failed examples = {self.failed_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" f"{run_overview_str}" ) + def __hash__(self) -> int: + return hash(self.id) + class EvaluationFailed(Exception): def __init__(self, evaluation_id: str, failed_count: int) -> None: diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py index 87e870c33..ee49bfaa8 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py @@ -16,6 +16,9 @@ RatingQuestion, RecordData, ) +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -138,7 +141,13 @@ def submit( dataset_name: Optional[str] = None, abort_on_error: bool = False, skip_example_on_any_failure: bool = True, + labels: Optional[set[str]] = None, + metadata: Optional[SerializableDict] = None, ) -> PartialEvaluationOverview: + if metadata is None: + metadata = dict() + if labels is None: + labels = set() argilla_dataset_id = self._client.create_dataset( self._workspace_id, dataset_name if dataset_name else str(uuid4()), @@ -179,6 +188,8 @@ def submit( start_date=datetime.now(), submitted_evaluation_count=submit_count, description=self.description, + labels=labels, + metadata=metadata, ) self._evaluation_repository.store_partial_evaluation_overview(partial_overview) @@ -227,6 +238,8 @@ def retrieve( successful_evaluation_count=len(evaluations), failed_evaluation_count=num_not_yet_evaluated_evals + num_failed_evaluations, + labels=partial_overview.labels, + metadata=partial_overview.metadata, ) self._evaluation_repository.store_evaluation_overview(overview) return overview diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py index 44bf5eae6..d464da457 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py @@ -202,6 +202,7 @@ def _load_run_overviews(self, *run_ids: str) -> set[RunOverview]: run_overview = self._run_repository.run_overview(run_id) if not run_overview: raise ValueError(f"No RunOverview found for run-id: {run_id}") + run_overviews.add(run_overview) return run_overviews diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py index cb399cb5b..a4c61f702 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py @@ -4,6 +4,9 @@ from tqdm import tqdm +from 
intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output, utc_now from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -99,6 +102,8 @@ def evaluate_runs( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, description: Optional[str] = None, + labels: Optional[set[str]] = None, + metadata: Optional[SerializableDict] = None, ) -> EvaluationOverview: """Evaluates all generated outputs in the run. @@ -118,12 +123,18 @@ def evaluate_runs( abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False. skip_example_on_any_failure: Flag to skip evaluation on any example for which at least one run fails. Defaults to True. description: Optional description of the evaluation. Defaults to None. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the evaluation overview. Defaults to an empty dict. Returns: EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be returned but instead stored in the :class:`EvaluationRepository` provided in the __init__. """ + if metadata is None: + metadata = dict() + if labels is None: + labels = set() start = utc_now() run_overviews = self._load_run_overviews(*run_ids) eval_id = self._evaluation_repository.initialize_evaluation() @@ -162,6 +173,8 @@ def evaluate_runs( successful_evaluation_count=successful_evaluation_count, failed_evaluation_count=failed_evaluation_count, description=full_description, + labels=labels, + metadata=metadata, ) self._evaluation_repository.store_evaluation_overview(overview) diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py index c4cb678df..84f1134d7 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py @@ -6,6 +6,9 @@ from pydantic import BaseModel +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -123,6 +126,8 @@ def evaluate_additional_runs( previous_evaluation_ids: Optional[list[str]] = None, num_examples: Optional[int] = None, abort_on_error: bool = False, + labels: Optional[set[str]] = None, + metadata: Optional[SerializableDict] = None, ) -> EvaluationOverview: """Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_id`. @@ -141,12 +146,18 @@ def evaluate_additional_runs( num_examples: The number of examples which should be evaluated from the given runs. Always the first n runs stored in the evaluation repository. Defaults to None. abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the evaluation overview. Defaults to an empty dict. Returns: EvaluationOverview: An overview of the evaluation. 
Individual :class:`Evaluation`s will not be returned but instead stored in the :class:`EvaluationRepository` provided in the __init__. """ + if metadata is None: + metadata = dict() + if labels is None: + labels = set() previous_run_ids = [] previous_evaluation_ids = previous_evaluation_ids or [] @@ -160,7 +171,11 @@ def evaluate_additional_runs( self._evaluation_logic.set_previous_run_output_ids(previous_run_ids) return super().evaluate_runs( - *run_ids, num_examples=num_examples, abort_on_error=abort_on_error + *run_ids, + num_examples=num_examples, + abort_on_error=abort_on_error, + labels=labels, + metadata=metadata, ) def evaluate_runs( @@ -170,7 +185,13 @@ def evaluate_runs( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, description: Optional[str] = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> EvaluationOverview: + if metadata is None: + metadata = dict() + if labels is None: + labels = set() self._evaluation_logic.set_previous_run_output_ids([]) return super().evaluate_runs( *run_ids, diff --git a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py index 4d3b8650f..93cd4dfce 100644 --- a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py +++ b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py @@ -144,6 +144,7 @@ def aggregation_overviews_to_pandas( aggregation_overviews: Sequence[AggregationOverview[AggregatedEvaluation]], unwrap_statistics: bool = True, strict: bool = True, + unwrap_metadata: bool = True, ) -> pd.DataFrame: """Converts aggregation overviews to a pandas table for easier comparison. @@ -152,6 +153,8 @@ def aggregation_overviews_to_pandas( unwrap_statistics: Unwrap the `statistics` field in the overviews into separate columns. Defaults to True. strict: Allow only overviews with exactly equal `statistics` types. Defaults to True. + unwrap_metadata: Unwrap the `metadata` field in the overviews into separate columns. + Defaults to True. Returns: A pandas :class:`DataFrame` containing an overview per row with fields as columns. @@ -173,6 +176,11 @@ def aggregation_overviews_to_pandas( df = df.join(pd.DataFrame(df["statistics"].to_list())).drop( columns=["statistics"] ) + if unwrap_metadata and "metadata" in df.columns: + df = pd.concat([df, pd.json_normalize(df["metadata"])], axis=1).drop( # type: ignore + columns=["metadata"] + ) + return df diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py index bcf80653d..fe40f1be1 100644 --- a/src/intelligence_layer/evaluation/run/domain.py +++ b/src/intelligence_layer/evaluation/run/domain.py @@ -5,6 +5,7 @@ from pydantic import BaseModel from rich.tree import Tree +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core.task import Output @@ -100,6 +101,8 @@ class RunOverview(BaseModel, frozen=True): failed_example_count: The number of examples where an exception was raised when running the task. successful_example_count: The number of examples that where successfully run. description: Human-readable of the runner that run the task. + labels: Labels for filtering runs. Defaults to empty list. + metadata: Additional information about the run. Defaults to empty dict. 
""" dataset_id: str @@ -109,6 +112,8 @@ class RunOverview(BaseModel, frozen=True): failed_example_count: int successful_example_count: int description: str + labels: set[str] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() @@ -122,4 +127,9 @@ def __str__(self) -> str: f"Failed example count = {self.failed_example_count}\n" f"Successful example count = {self.successful_example_count}\n" f'Description = "{self.description}"\n' + f'Labels = "{self.labels}"\n' + f'Metadata = "{self.metadata}"\n' ) + + def __hash__(self) -> int: + return hash(self.id) diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py index 56cc689c8..20d96d790 100644 --- a/src/intelligence_layer/evaluation/run/runner.py +++ b/src/intelligence_layer/evaluation/run/runner.py @@ -8,6 +8,9 @@ from pydantic import JsonValue from tqdm import tqdm +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import ( CompositeTracer, Input, @@ -81,6 +84,8 @@ def run_dataset( max_workers: int = 10, description: Optional[str] = None, trace_examples_individually: bool = True, + labels: Optional[set[str]] = None, + metadata: Optional[SerializableDict] = None, ) -> RunOverview: """Generates all outputs for the provided dataset. @@ -97,11 +102,17 @@ def run_dataset( max_workers: Number of examples that can be evaluated concurrently. Defaults to 10. description: An optional description of the run. Defaults to None. trace_examples_individually: Flag to create individual tracers for each example. Defaults to True. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the run overview. Defaults to an empty dict. Returns: An overview of the run. Outputs will not be returned but instead stored in the :class:`RunRepository` provided in the __init__. 
""" + if labels is None: + labels = set() + if metadata is None: + metadata = dict() def run( example: Example[Input, ExpectedOutput], @@ -157,6 +168,7 @@ def run( full_description = ( self.description + " : " + description if description else self.description ) + run_overview = RunOverview( dataset_id=dataset_id, id=run_id, @@ -165,6 +177,8 @@ def run( failed_example_count=failed_count, successful_example_count=successful_count, description=full_description, + labels=labels, + metadata=metadata, ) self._run_repository.store_run_overview(run_overview) return run_overview diff --git a/tests/conftest.py b/tests/conftest.py index b3c8669e1..40c95d000 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -146,6 +146,8 @@ def run_overview() -> RunOverview: failed_example_count=0, successful_example_count=3, description="test run overview 1", + labels=set(), + metadata=dict(), ) @@ -166,4 +168,6 @@ def evaluation_overview( failed_evaluation_count=1, run_overviews=frozenset([run_overview]), description="test evaluation overview 1", + labels=set(), + metadata=dict(), ) diff --git a/tests/evaluation/dataset/test_dataset_domain.py b/tests/evaluation/dataset/test_dataset_domain.py new file mode 100644 index 000000000..f8968c403 --- /dev/null +++ b/tests/evaluation/dataset/test_dataset_domain.py @@ -0,0 +1,20 @@ +from intelligence_layer.evaluation import Dataset + + +def test_default_values_are_set() -> None: + dataset = Dataset(name="Test") + + assert dataset.id is not None + assert len(dataset.metadata) == 0 + assert len(dataset.labels) == 0 + + +def test_default_values_are_not_changed() -> None: + modified_dataset = Dataset(name="Modified Dataset") + modified_dataset.labels.add("test_label") + modified_dataset.metadata.update({"key": "value"}) + + default_dataset = Dataset(name="Default Dataset") + + assert modified_dataset.labels != default_dataset.labels + assert modified_dataset.metadata != default_dataset.metadata diff --git a/tests/evaluation/dataset/test_dataset_repository.py b/tests/evaluation/dataset/test_dataset_repository.py index 963782f78..c92fc73cd 100644 --- a/tests/evaluation/dataset/test_dataset_repository.py +++ b/tests/evaluation/dataset/test_dataset_repository.py @@ -2,11 +2,15 @@ from pathlib import Path from typing import Any from unittest.mock import patch +from uuid import uuid4 import pytest from fsspec.implementations.memory import MemoryFileSystem # type: ignore from pytest import FixtureRequest, fixture, mark, raises +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.evaluation import ( DatasetRepository, Example, @@ -71,6 +75,57 @@ def test_dataset_repository_with_custom_id( assert dataset.id == "my-custom-dataset-id" +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_dataset_repository_create_dataset_sets_default_values( + repository_fixture: str, + request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], +) -> None: + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) + + assert dataset.id is not None + assert dataset.name == "test-dataset" + assert dataset.labels == set() + assert dataset.metadata == dict() + + +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_dataset_repository_create_dataset_explicit_values_overwrite_defaults( + repository_fixture: 
str, + request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], +) -> None: + expected_id = str(uuid4()) + expected_name = "test_name" + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test_value"}) + + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], + dataset_name=expected_name, + id=expected_id, + labels=expected_labels, + metadata=expected_metadata, + ) + + assert dataset.id == expected_id + assert dataset.name == expected_name + assert dataset.labels == expected_labels + assert dataset.metadata == expected_metadata + + @mark.parametrize( "repository_fixture", test_repository_fixtures, diff --git a/tests/evaluation/evaluation/test_async_evaluation_repository.py b/tests/evaluation/evaluation/test_async_evaluation_repository.py index 16bfc8697..fc94d3fd0 100644 --- a/tests/evaluation/evaluation/test_async_evaluation_repository.py +++ b/tests/evaluation/evaluation/test_async_evaluation_repository.py @@ -41,6 +41,8 @@ def partial_evaluation_overviews( run_overviews=frozenset([run_overview]), submitted_evaluation_count=10, description="test evaluation overview", + labels=set(), + metadata=dict(), ) ) return evaluation_overviews @@ -56,6 +58,8 @@ def partial_evaluation_overview( run_overviews=frozenset([run_overview]), submitted_evaluation_count=10, description="test evaluation overview", + labels=set(), + metadata=dict(), ) diff --git a/tests/evaluation/evaluation/test_elo_evaluation_logic.py b/tests/evaluation/evaluation/test_elo_evaluation_logic.py index e46ee1e19..0b4e2faca 100644 --- a/tests/evaluation/evaluation/test_elo_evaluation_logic.py +++ b/tests/evaluation/evaluation/test_elo_evaluation_logic.py @@ -150,6 +150,8 @@ def qa_setup( failed_example_count=0, successful_example_count=len(qa_outputs), description="runner", + labels=set(), + metadata=dict(), ) ) return run_ids diff --git a/tests/evaluation/evaluation/test_evaluation_repository.py b/tests/evaluation/evaluation/test_evaluation_repository.py index 7db3777c4..3948b5a9f 100644 --- a/tests/evaluation/evaluation/test_evaluation_repository.py +++ b/tests/evaluation/evaluation/test_evaluation_repository.py @@ -70,6 +70,8 @@ def evaluation_overviews(run_overview: RunOverview) -> Iterable[EvaluationOvervi failed_evaluation_count=1, run_overviews=frozenset([run_overview]), description="test evaluation overview 1", + labels=set(), + metadata={}, ) ) return evaluation_overviews diff --git a/tests/evaluation/evaluation/test_evaluator.py b/tests/evaluation/evaluation/test_evaluator.py index 92bbf783f..ee133cf43 100644 --- a/tests/evaluation/evaluation/test_evaluator.py +++ b/tests/evaluation/evaluation/test_evaluator.py @@ -5,6 +5,9 @@ from pydantic import BaseModel from pytest import fixture +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output, Task, Tracer from intelligence_layer.core.tracer.in_memory_tracer import ( InMemoryTaskSpan, @@ -709,3 +712,55 @@ def test_eval_raises_error_if_examples_and_example_outputs_dont_match( num_examples=None, ) ) + + +def test_evaluator_evaluate_runs_sets_default_values( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str +) -> None: + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + assert evaluation_overview.labels == set() + assert evaluation_overview.metadata == dict() + + +def 
test_evaluator_evaluate_runs_specific_values_overwrite_defaults( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str +) -> None: + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test-value"}) + evaluation_overview = dummy_evaluator.evaluate_runs( + run_id, labels=expected_labels, metadata=expected_metadata + ) + assert evaluation_overview.labels == expected_labels + assert evaluation_overview.metadata == expected_metadata + + +def test_aggregate_evaluation_set_default_labels_metadata_values( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList + ], + run_id: str, +) -> None: + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + + assert aggregation_overview.labels == set() + assert aggregation_overview.metadata == dict() + + +def test_aggregate_evaluation_specific_values_overwrite_defaults( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList + ], + run_id: str, +) -> None: + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test-value"}) + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + aggregation_overview = dummy_aggregator.aggregate_evaluation( + evaluation_overview.id, labels=expected_labels, metadata=expected_metadata + ) + + assert aggregation_overview.labels == expected_labels + assert aggregation_overview.metadata == expected_metadata diff --git a/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py index 014e40b41..d30e42688 100644 --- a/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -14,6 +14,9 @@ Question, RecordData, ) +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import CompleteOutput, InstructInput, utc_now from intelligence_layer.evaluation import ( Aggregator, @@ -162,6 +165,10 @@ def create_dummy_runs( failed_example_count=0, successful_example_count=1, description="runner", + labels={"test-label"}, + metadata=dict( + {"test_key": "test_value"}, + ), ) ) @@ -277,3 +284,30 @@ def test_elo_calculating_works_as_expected() -> None: elo.calculate(comeback_matches) assert elo.ratings[player2] > elo.ratings[player1] + + +def test_retrieve_argilla_evaluation_overview_has_submitted_partial_evaluation_overview_labels_metadata( + evaluator: ArgillaEvaluator[ + InstructInput, CompleteOutput, None, ComparisonEvaluation + ], + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + any_instruct_output: CompleteOutput, +) -> None: + run_count = 10 + run_ids = [f"{i}" for i in range(run_count)] + dataset_id = create_dummy_dataset(in_memory_dataset_repository) + create_dummy_runs( + in_memory_run_repository, any_instruct_output, run_ids, dataset_id + ) + + expected_labels = {"test-label"} + expected_metadata: SerializableDict = dict({"test_key": "test_value"}) + + partial_overview = evaluator.submit( + *run_ids, labels=expected_labels, metadata=expected_metadata + ) + evaluation_overview = evaluator.retrieve(partial_overview.id) + + assert 
partial_overview.labels == evaluation_overview.labels + assert partial_overview.metadata == evaluation_overview.metadata diff --git a/tests/evaluation/infrastructure/test_repository_navigator.py b/tests/evaluation/infrastructure/test_repository_navigator.py index 8272ee5b0..ddfc98ed7 100644 --- a/tests/evaluation/infrastructure/test_repository_navigator.py +++ b/tests/evaluation/infrastructure/test_repository_navigator.py @@ -437,7 +437,9 @@ def test_aggregation_overviews_to_pandas(length: int) -> None: # given overview = create_aggregation_overview(AggregationDummy()) # when - df = aggregation_overviews_to_pandas([overview] * length, unwrap_statistics=False) + df = aggregation_overviews_to_pandas( + [overview] * length, unwrap_statistics=False, unwrap_metadata=False + ) # then assert len(df) == length assert set(AggregationOverview.model_fields.keys()) == set(df.columns) @@ -466,6 +468,48 @@ class AggregationDummy2(BaseModel): assert "statistics" not in df.columns +def test_aggregation_overviews_to_pandas_unwrap_metadata() -> None: + # given + + overview = AggregationOverview( + evaluation_overviews=frozenset([]), + id="aggregation-id", + start=utc_now(), + end=utc_now(), + successful_evaluation_count=5, + crashed_during_evaluation_count=3, + description="dummy-evaluator", + statistics=AggregationDummy(), + labels=set(), + metadata=dict({"model": "model_a", "prompt": "prompt_a"}), + ) + overview2 = AggregationOverview( + evaluation_overviews=frozenset([]), + id="aggregation-id2", + start=utc_now(), + end=utc_now(), + successful_evaluation_count=5, + crashed_during_evaluation_count=3, + description="dummy-evaluator", + statistics=AggregationDummy(), + labels=set(), + metadata=dict( + {"model": "model_a", "prompt": "prompt_a", "different_column": "value"} + ), + ) + + df = aggregation_overviews_to_pandas( + [overview, overview2], unwrap_metadata=True, strict=False + ) + + assert "model" in df.columns + assert "prompt" in df.columns + assert "different_column" in df.columns + assert "metadata" not in df.columns + assert all(df["model"] == "model_a") + assert all(df["prompt"] == "prompt_a") + + def test_aggregation_overviews_to_pandas_works_with_eval_overviews() -> None: # given eval_overview = EvaluationOverview( @@ -476,6 +520,8 @@ def test_aggregation_overviews_to_pandas_works_with_eval_overviews() -> None: successful_evaluation_count=1, failed_evaluation_count=1, description="", + labels=set(), + metadata=dict(), ) overview = AggregationOverview( evaluation_overviews=frozenset([eval_overview]), diff --git a/tests/evaluation/run/test_run_repository.py b/tests/evaluation/run/test_run_repository.py index 50fe9da98..3bdba5ea8 100644 --- a/tests/evaluation/run/test_run_repository.py +++ b/tests/evaluation/run/test_run_repository.py @@ -30,6 +30,8 @@ def run_overviews() -> Sequence[RunOverview]: failed_example_count=0, successful_example_count=1, description="test run overview", + labels=set(), + metadata=dict(), ) run_overviews.append(run_overview) return run_overviews diff --git a/tests/evaluation/run/test_runner.py b/tests/evaluation/run/test_runner.py index 0ebd0ba68..f3e32eb06 100644 --- a/tests/evaluation/run/test_runner.py +++ b/tests/evaluation/run/test_runner.py @@ -2,6 +2,9 @@ import pytest +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer from intelligence_layer.evaluation import ( Example, @@ -122,3 +125,43 @@ def test_runner_runs_n_examples( entries = tracer.entries assert 
len(entries) == 1 assert all([isinstance(e, InMemoryTaskSpan) for e in entries]) + + +def test_runner_run_overview_has_default_metadata_and_labels( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + sequence_examples: Iterable[Example[str, None]], +) -> None: + examples = list(sequence_examples) + task = DummyTask() + runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo") + + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="" + ).id + + overview = runner.run_dataset(dataset_id) + + assert overview.metadata == dict() + assert overview.labels == set() + + +def test_runner_run_overview_has_specified_metadata_and_labels( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + sequence_examples: Iterable[Example[str, None]], +) -> None: + run_labels = {"test-label"} + run_metadata: SerializableDict = dict({"test_key": "test-value"}) + + examples = list(sequence_examples) + task = DummyTask() + runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo") + + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="" + ).id + overview = runner.run_dataset(dataset_id, labels=run_labels, metadata=run_metadata) + + assert overview.metadata == run_metadata + assert overview.labels == run_labels
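Reviewer note — usage sketch, not part of the patch: the changes above thread optional `labels` (a set of string tags) and `metadata` (a JSON-serializable dict) through datasets, runs, evaluations, and aggregations, and teach `aggregation_overviews_to_pandas` to unwrap metadata into columns. The sketch below exercises the new fields end to end. It assumes that `Example`, `InMemoryDatasetRepository`, `AggregationOverview`, and `aggregation_overviews_to_pandas` are re-exported from `intelligence_layer.evaluation` (as the notebooks and tests in this patch suggest), and it uses a stand-in pydantic statistics model instead of a real aggregation logic.

# Minimal sketch of the labels/metadata fields added in this patch.
# Assumptions: the names below are re-exported from `intelligence_layer.evaluation`;
# `DummyStatistics` is a stand-in for a real aggregated-evaluation model.
from pydantic import BaseModel

from intelligence_layer.core import utc_now
from intelligence_layer.evaluation import (
    AggregationOverview,
    Example,
    InMemoryDatasetRepository,
    aggregation_overviews_to_pandas,
)

# 1. Datasets now carry labels and metadata alongside their examples.
dataset_repository = InMemoryDatasetRepository()
dataset = dataset_repository.create_dataset(
    examples=[Example(input="What is 1+1?", expected_output="2")],
    dataset_name="demo-dataset",
    labels={"smoke-test"},
    metadata={"model": "model_a", "prompt": "prompt_a"},
)
print(dataset.labels, dataset.metadata)


# 2. Aggregation overviews store the same fields; with the new
#    `unwrap_metadata=True` (the default), every metadata key becomes
#    its own dataframe column, alongside the unwrapped statistics.
class DummyStatistics(BaseModel):
    score: float = 0.5


overview = AggregationOverview(
    evaluation_overviews=frozenset(),
    id="aggregation-id",
    start=utc_now(),
    end=utc_now(),
    successful_evaluation_count=1,
    crashed_during_evaluation_count=0,
    description="demo",
    statistics=DummyStatistics(),
    labels={"smoke-test"},
    metadata={"model": "model_a", "prompt": "prompt_a"},
)
df = aggregation_overviews_to_pandas([overview], unwrap_metadata=True)
print(df[["model", "prompt", "score"]])

With metadata unwrapped into columns (and labels available for set-membership checks), experiment parameters can be filtered directly, which is what lets parameter_optimization.ipynb drop the old '|'-delimited description encoding and its string-splitting step.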