feat: Add labels and metadata fields to dataset and overview classes
TASK: IL-547
MerlinKallenbornAA authored and maxhammeralephalpha committed Jun 11, 2024
1 parent 40b617a commit 236bf8b
Showing 26 changed files with 327 additions and 6 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -248,3 +248,6 @@ fabric.properties

# End of https://www.toptal.com/developers/gitignore/api/intellij+all
.python-version

src/documentation/human-eval-data/datasets*
src/documentation/human-eval-data/runs/*
10 changes: 9 additions & 1 deletion src/intelligence_layer/evaluation/aggregation/aggregator.py
@@ -14,6 +14,7 @@
)
from uuid import uuid4

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import utc_now
from intelligence_layer.evaluation.aggregation.aggregation_repository import (
AggregationRepository,
@@ -182,7 +183,10 @@ def evaluation_type(self) -> type[Evaluation]:

@final
def aggregate_evaluation(
self, *eval_ids: str
self,
*eval_ids: str,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> AggregationOverview[AggregatedEvaluation]:
"""Aggregates all evaluations into an overview that includes high-level statistics.
@@ -191,6 +195,8 @@
Args:
eval_ids: IDs of the evaluation overviews to be aggregated. The actual evaluations are not
passed in; they are retrieved from the repository.
labels: A set of labels for filtering. Defaults to an empty set.
metadata: A dict for additional information about the aggregation overview. Defaults to an empty dict.
Returns:
An overview of the aggregated evaluation.
@@ -238,6 +244,8 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview:
crashed_during_evaluation_count=successful_evaluations.excluded_count(),
description=self.description,
statistics=statistics,
labels=labels,
metadata=metadata,
)
self._aggregation_repository.store_aggregation_overview(aggregation_overview)
return aggregation_overview
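
A minimal usage sketch of the extended aggregate_evaluation signature (not part of the diff); the configured Aggregator instance and the stored evaluation ID are assumed to already exist, as elsewhere in the library.

# Usage sketch: `aggregator` is an already configured Aggregator instance.
aggregation_overview = aggregator.aggregate_evaluation(
    "my-evaluation-id",
    labels={"release-candidate"},                       # new: set[str] used for filtering
    metadata={"ticket": "IL-547", "reviewer": "jane"},  # new: JSON-serializable dict
)
print(aggregation_overview.labels, aggregation_overview.metadata)
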
11 changes: 11 additions & 0 deletions src/intelligence_layer/evaluation/aggregation/domain.py
@@ -3,6 +3,7 @@

from pydantic import BaseModel, SerializeAsAny

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.evaluation.evaluation.domain import (
EvaluationFailed,
EvaluationOverview,
@@ -30,6 +31,9 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True)
run_ids: IDs of all :class:`RunOverview`s from all linked :class:`EvaluationOverview`s.
description: A short description.
statistics: Aggregated statistics of the run. Whatever is returned by :meth:`Evaluator.aggregate`
labels: Labels for filtering aggregations. Defaults to an empty set.
metadata: Additional information about the aggregation. Defaults to an empty dict.
"""

evaluation_overviews: frozenset[EvaluationOverview]
@@ -40,6 +44,8 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True)
crashed_during_evaluation_count: int
description: str
statistics: SerializeAsAny[AggregatedEvaluation]
labels: set[str] = set()
metadata: dict[str, JsonSerializable] = dict()

@property
def run_ids(self) -> Sequence[str]:
@@ -73,6 +79,8 @@ def __str__(self) -> str:
f"Successful example count = {self.successful_evaluation_count}\n"
f"Count of examples crashed during evaluation = {self.failed_evaluation_count}\n"
f'Description = "{self.description}"\n'
f"Labels = {self.labels}\n"
f"Metadata = {self.metadata}\n"
)

res += f"IDs of aggregated Evaluation Overviews = {[evaluation_overview.id for evaluation_overview in self.evaluation_overviews]}\n"
@@ -83,3 +91,6 @@ def __str__(self) -> str:
res += "}\n"

return res

def __hash__(self) -> int:
return hash(self.id)
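
The explicit __hash__ is needed because the added metadata dict would break the default field-based hash of a frozen pydantic model; hashing by id keeps overviews usable in sets and frozensets. A self-contained sketch of the pattern with a stand-in model (the OverviewStub name is hypothetical):

from pydantic import BaseModel


class OverviewStub(BaseModel, frozen=True):
    # Stand-in for AggregationOverview: same labels/metadata/__hash__ pattern.
    id: str
    labels: set[str] = set()
    metadata: dict = dict()

    def __hash__(self) -> int:
        # Hash by id only; the metadata dict is unhashable, so a field-based
        # hash would raise TypeError.
        return hash(self.id)


a = OverviewStub(id="agg-1", metadata={"ticket": "IL-547"})
b = OverviewStub(id="agg-1", metadata={"ticket": "IL-547"})
print(hash(a) == hash(b), a in {b})  # True True
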
@@ -1,6 +1,7 @@
from abc import ABC, abstractmethod
from typing import Iterable, Optional

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import Input
from intelligence_layer.evaluation.dataset.domain import (
Dataset,
@@ -21,13 +22,17 @@ def create_dataset(
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> Dataset:
"""Creates a dataset from given :class:`Example`s and returns the ID of that dataset.
Args:
examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset.
dataset_name: A name for the dataset.
id: The dataset ID. If `None`, an ID will be generated.
labels: A set of labels for filtering. Defaults to an empty set.
metadata: A dict for additional information about the dataset. Defaults to an empty dict.
Returns:
The created :class:`Dataset`.
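
A hedged usage sketch of the extended create_dataset interface, written against the InMemoryDatasetRepository that appears later in this diff; the Example(input=..., expected_output=...) constructor and the top-level import path are assumptions taken from the wider library.

from intelligence_layer.evaluation import InMemoryDatasetRepository  # import path assumed
from intelligence_layer.evaluation.dataset.domain import Example

repository = InMemoryDatasetRepository()
dataset = repository.create_dataset(
    examples=[Example(input="What is 2 + 2?", expected_output="4")],
    dataset_name="smoke-test",
    labels={"arithmetic", "smoke"},               # new keyword, defaults to an empty set
    metadata={"ticket": "IL-547", "version": 1},  # new keyword, defaults to an empty dict
)
print(dataset.id, dataset.labels, dataset.metadata)
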
16 changes: 14 additions & 2 deletions src/intelligence_layer/evaluation/dataset/domain.py
@@ -4,7 +4,10 @@
from pydantic import BaseModel, Field
from rich.tree import Tree

from intelligence_layer.connectors.base.json_serializable import SerializableDict
from intelligence_layer.connectors.base.json_serializable import (
JsonSerializable,
SerializableDict,
)
from intelligence_layer.core.task import Input
from intelligence_layer.core.tracer.tracer import PydanticSerializable

@@ -60,13 +63,22 @@ class Dataset(BaseModel):
Attributes:
id: Dataset ID.
name: A short name of the dataset.
labels: Labels for filtering datasets. Defaults to an empty set.
metadata: Additional information about the dataset. Defaults to an empty dict.
"""

id: str = Field(default_factory=lambda: str(uuid4()))
name: str
labels: set[str] = set()
metadata: dict[str, JsonSerializable] = dict()

def __repr__(self) -> str:
return self.__str__()

def __str__(self) -> str:
return f"Dataset ID = {self.id}\nName = {self.name}\n"
return (
f"Dataset ID = {self.id}\n"
f"Name = {self.name}\n"
f"Labels = {self.labels}\n"
f"Metadata = {self.metadata}"
)
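
The new fields can also be set when constructing the model directly; a short sketch using the import path shown elsewhere in this diff:

from intelligence_layer.evaluation.dataset.domain import Dataset

dataset = Dataset(
    name="golden-summaries",
    labels={"summarization", "german"},
    metadata={"source": "internal-corpus", "version": 2},
)
print(dataset)  # __str__ now includes the Labels = ... and Metadata = ... lines
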
@@ -3,6 +3,7 @@

from fsspec.implementations.local import LocalFileSystem # type: ignore

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import Input, JsonSerializer, PydanticSerializable
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import (
@@ -29,8 +30,10 @@ def create_dataset(
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> Dataset:
dataset = Dataset(name=dataset_name)
dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata)
if id is not None:
dataset.id = id

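
One caveat, stated as a general Python observation rather than a confirmed issue in this codebase: the labels: set[str] = set() and metadata: dict[...] = dict() function defaults used throughout this commit are created once at definition time and shared across calls, so a callee that mutated them would leak state between calls (pydantic field defaults such as Dataset.labels are copied per instance and are unaffected). A defensive variant, purely as a sketch with a hypothetical helper name:

from typing import Optional

from intelligence_layer.connectors.base.json_serializable import JsonSerializable


def create_dataset_defensively(
    dataset_name: str,
    labels: Optional[set[str]] = None,
    metadata: Optional[dict[str, JsonSerializable]] = None,
) -> None:
    # Hypothetical helper, not part of the commit: build fresh containers per
    # call instead of reusing one shared default instance.
    labels = set() if labels is None else labels
    metadata = dict() if metadata is None else metadata
    ...
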
@@ -1,5 +1,6 @@
from typing import Iterable, Optional, Sequence, Tuple, cast

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import Input, PydanticSerializable
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import (
@@ -23,8 +24,10 @@ def create_dataset(
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> Dataset:
dataset = Dataset(name=dataset_name)
dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata)
if id is not None:
dataset.id = id
if dataset.id in self._datasets_and_examples:
@@ -4,6 +4,7 @@
from datasets import DatasetDict, IterableDataset, IterableDatasetDict
from pydantic import BaseModel

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core.task import Input
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import (
@@ -32,6 +33,8 @@ def create_dataset(
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> Dataset:
raise NotImplementedError

14 changes: 14 additions & 0 deletions src/intelligence_layer/evaluation/evaluation/domain.py
@@ -5,6 +5,7 @@
from pydantic import BaseModel, SerializeAsAny
from rich.tree import Tree

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.evaluation.run.domain import RunOverview

Evaluation = TypeVar("Evaluation", bound=BaseModel, covariant=True)
@@ -81,6 +82,8 @@ class PartialEvaluationOverview(BaseModel, frozen=True):
start_date: datetime
submitted_evaluation_count: int
description: str
labels: set[str]
metadata: dict[str, JsonSerializable]

def __repr__(self) -> str:
return self.__str__()
@@ -100,6 +103,8 @@ def __str__(self) -> str:
f"Start time = {self.start_date}\n"
f"Submitted Evaluations = {self.submitted_evaluation_count}\n"
f'Description = "{self.description}"\n'
f"Labels = {self.labels}\n"
f"Metadata = {self.metadata}\n"
f"{run_overview_str}"
)

@@ -116,6 +121,8 @@ class EvaluationOverview(BaseModel, frozen=True):
failed_evaluation_count: Number of examples that produced an error during evaluation.
Note: failed runs are skipped in the evaluation and therefore not counted as failures
description: Human-readable description of the evaluator that created the evaluation.
labels: Labels for filtering the evaluation.
metadata: Additional information about the evaluation.
"""

run_overviews: frozenset[RunOverview]
Expand All @@ -125,6 +132,8 @@ class EvaluationOverview(BaseModel, frozen=True):
successful_evaluation_count: int
failed_evaluation_count: int
description: str
labels: set[str]
metadata: dict[str, JsonSerializable]

def __repr__(self) -> str:
return self.__str__()
@@ -146,9 +155,14 @@ def __str__(self) -> str:
f"Successful examples = {self.successful_evaluation_count}\n"
f"Failed examples = {self.failed_evaluation_count}\n"
f'Description = "{self.description}"\n'
f"Labels = {self.labels}\n"
f"Metadata = {self.metadata}\n"
f"{run_overview_str}"
)

def __hash__(self) -> int:
return hash(self.id)


class EvaluationFailed(Exception):
def __init__(self, evaluation_id: str, failed_count: int) -> None:
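
Note that, unlike Dataset and AggregationOverview, the two overview models above declare labels and metadata without defaults, so constructing them now requires both arguments. A tiny illustration with stand-in models (hypothetical names):

from pydantic import BaseModel, ValidationError


class RequiredFields(BaseModel, frozen=True):
    id: str
    labels: set[str]  # required, as in EvaluationOverview / PartialEvaluationOverview
    metadata: dict    # required


class DefaultedFields(BaseModel, frozen=True):
    id: str
    labels: set[str] = set()  # defaulted, as in Dataset / AggregationOverview
    metadata: dict = dict()


DefaultedFields(id="ok")  # fine, defaults apply
try:
    RequiredFields(id="boom")  # missing labels and metadata
except ValidationError as error:
    print(error.error_count())  # 2
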
@@ -14,6 +14,7 @@
Question,
RecordData,
)
from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
@@ -136,6 +137,8 @@ def submit(
dataset_name: Optional[str] = None,
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> PartialEvaluationOverview:
argilla_dataset_id = self._client.create_dataset(
self._workspace_id,
@@ -177,6 +180,8 @@
start_date=datetime.now(),
submitted_evaluation_count=submit_count,
description=self.description,
labels=labels,
metadata=metadata,
)

self._evaluation_repository.store_partial_evaluation_overview(partial_overview)
@@ -225,6 +230,8 @@ def retrieve(
successful_evaluation_count=len(evaluations),
failed_evaluation_count=num_not_yet_evaluated_evals
+ num_failed_evaluations,
labels=partial_overview.labels,
metadata=partial_overview.metadata,
)
self._evaluation_repository.store_evaluation_overview(overview)
return overview
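
A sketch of the intended submit/retrieve flow; the configured ArgillaEvaluator instance and the exact retrieve signature are not shown in this diff, so both are assumptions.

# Sketch only: `argilla_evaluator` and the retrieve() call shape are assumed.
partial_overview = argilla_evaluator.submit(
    "run-id-1",
    labels={"human-eval"},
    metadata={"ticket": "IL-547"},
)
# ... human annotators submit their judgements in Argilla ...
overview = argilla_evaluator.retrieve(partial_overview.id)
# Labels and metadata given at submit time are carried over to the final overview.
assert overview.labels == partial_overview.labels
assert overview.metadata == partial_overview.metadata
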
@@ -199,6 +199,7 @@ def _load_run_overviews(self, *run_ids: str) -> set[RunOverview]:
run_overview = self._run_repository.run_overview(run_id)
if not run_overview:
raise ValueError(f"No RunOverview found for run-id: {run_id}")

run_overviews.add(run_overview)
return run_overviews

@@ -4,6 +4,7 @@

from tqdm import tqdm

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import Input, Output, utc_now
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
@@ -99,6 +100,8 @@ def evaluate_runs(
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
description: Optional[str] = None,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> EvaluationOverview:
"""Evaluates all generated outputs in the run.
@@ -118,6 +121,8 @@ def evaluate_runs(
abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
skip_example_on_any_failure: Flag to skip evaluation on any example for which at least one run fails. Defaults to True.
description: Optional description of the evaluation. Defaults to None.
labels: A set of labels for filtering. Defaults to an empty set.
metadata: A dict for additional information about the evaluation overview. Defaults to an empty dict.
Returns:
EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -163,6 +168,8 @@ def evaluate_runs(
successful_evaluation_count=successful_evaluation_count,
failed_evaluation_count=failed_evaluation_count,
description=full_description,
labels=labels,
metadata=metadata,
)
self._evaluation_repository.store_evaluation_overview(overview)

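
Usage sketch for the extended evaluate_runs (not part of the diff); the configured evaluator and the run ID are assumed to exist.

# Sketch only: `evaluator` is an already configured Evaluator instance.
evaluation_overview = evaluator.evaluate_runs(
    "run-id-1",
    labels={"regression-suite"},
    metadata={"ticket": "IL-547", "git_commit": "236bf8b"},
)
print(evaluation_overview.labels, evaluation_overview.metadata)
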
@@ -5,6 +5,7 @@

from pydantic import BaseModel

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import Input, Output
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
@@ -123,6 +124,8 @@ def evaluate_additional_runs(
previous_evaluation_ids: Optional[list[str]] = None,
num_examples: Optional[int] = None,
abort_on_error: bool = False,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> EvaluationOverview:
"""Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_id`.
@@ -141,6 +144,8 @@
num_examples: The number of examples which should be evaluated from the given runs.
Always the first n runs stored in the evaluation repository. Defaults to None.
abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
labels: A set of labels for filtering. Defaults to an empty set.
metadata: A dict for additional information about the evaluation overview. Defaults to an empty dict.
Returns:
EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -161,7 +166,11 @@

self._evaluation_logic.set_previous_run_output_ids(previous_run_ids)
return super().evaluate_runs(
*run_ids, num_examples=num_examples, abort_on_error=abort_on_error
*run_ids,
num_examples=num_examples,
abort_on_error=abort_on_error,
labels=labels,
metadata=metadata,
)

def evaluate_runs(
Expand All @@ -171,6 +180,8 @@ def evaluate_runs(
abort_on_error: bool = False,
skip_example_on_any_failure: bool = True,
description: Optional[str] = None,
labels: set[str] = set(),
metadata: dict[str, JsonSerializable] = dict(),
) -> EvaluationOverview:
self._evaluation_logic.set_previous_run_output_ids([])
return super().evaluate_runs(
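
The incremental case follows the same pattern, with evaluate_additional_runs forwarding both keywords to evaluate_runs; a sketch with an assumed evaluator instance and placeholder IDs:

# Sketch only: `incremental_evaluator` and the IDs are placeholders.
overview = incremental_evaluator.evaluate_additional_runs(
    "new-run-id",
    previous_evaluation_ids=["earlier-evaluation-id"],
    labels={"incremental"},
    metadata={"ticket": "IL-547"},
)
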