IL-547 Add metadata to overviews #899

Merged
7 commits merged on Jun 13, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -248,3 +248,6 @@ fabric.properties

# End of https://www.toptal.com/developers/gitignore/api/intellij+all
.python-version

src/documentation/human-eval-data/datasets*
src/documentation/human-eval-data/runs/*
10 changes: 7 additions & 3 deletions src/documentation/how_tos/how_to_aggregate_evaluations.ipynb
@@ -57,10 +57,14 @@
" \"MyAggregationDescription\",\n",
" aggregation_logic,\n",
")\n",
"aggregation_overview = aggregator.aggregate_evaluation(*evaluation_ids)\n",
"aggregation_overview = aggregator.aggregate_evaluation(\n",
" *evaluation_ids, labels=set([\"label_a\"]), metadata=dict({\"key\": \"value\"})\n",
")\n",
"\n",
"# Step 3\n",
"print(aggregation_overview.id)"
"print(aggregation_overview.id)\n",
"print(aggregation_overview.labels)\n",
"print(aggregation_overview.metadata)"
]
}
],
@@ -80,7 +84,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.11.8"
}
},
"nbformat": 4,
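Beyond printing them right after aggregation, the new `labels` could be used to pick stored aggregation overviews back out of a repository. A minimal sketch, assuming an `aggregation_repository` and aggregation type set up as in the how-to notebooks; the helper function and its names are illustrative, not part of this PR.

```python
# Sketch: select stored aggregation overviews by the labels added in this PR.
# `aggregation_repository` and `aggregation_type` are placeholders for objects
# configured as in the how-to notebook above.
def overviews_with_label(aggregation_repository, aggregation_type, label: str) -> list:
    """Return all aggregation overviews that carry the given label."""
    return [
        overview
        for overview in aggregation_repository.aggregation_overviews(
            aggregation_type=aggregation_type
        )
        if label in overview.labels
    ]

# e.g. overviews_with_label(aggregation_repository, MyAggregatedEvaluation, "label_a")
```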
8 changes: 6 additions & 2 deletions src/documentation/how_tos/how_to_create_a_dataset.ipynb
@@ -67,10 +67,14 @@
"dataset = dataset_repository.create_dataset(\n",
" examples=examples,\n",
" dataset_name=\"StoryDataset\",\n",
" labels=set([\"label1\", \"label2\"]),\n",
" metadata=dict({\"key_a\": [\"a\", \"b\"], \"key_b\": \"value\"}),\n",
")\n",
"\n",
"# Step 4\n",
"print(dataset.id)"
"print(dataset.id)\n",
"print(dataset.labels)\n",
"print(dataset.metadata)"
]
}
],
@@ -90,7 +94,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.11.8"
}
},
"nbformat": 4,
8 changes: 6 additions & 2 deletions src/documentation/how_tos/how_to_evaluate_runs.ipynb
@@ -57,10 +57,14 @@
" evaluation_logic,\n",
")\n",
"\n",
"evaluation_overview = evaluator.evaluate_runs(*run_ids)\n",
"evaluation_overview = evaluator.evaluate_runs(\n",
" *run_ids, labels=set({\"label\"}), metadata=dict({\"key\": \"value\"})\n",
")\n",
"\n",
"# Step 4\n",
"print(evaluation_overview.id)"
"print(evaluation_overview.id)\n",
"print(evaluation_overview.metadata)\n",
"print(evaluation_overview.labels)"
]
}
],
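As the notebook's print statements suggest, the labels and metadata passed to `evaluate_runs` travel with the returned overview. A small sketch under that assumption; `evaluator` and `run_ids` are set up as in the notebook, and the label and key below are illustrative.

```python
# Sketch: labels/metadata passed to evaluate_runs are carried on the returned overview.
# `evaluator` and `run_ids` come from the notebook setup; values here are illustrative.
evaluation_overview = evaluator.evaluate_runs(
    *run_ids, labels={"nightly"}, metadata={"dataset_version": "v2"}
)
assert "nightly" in evaluation_overview.labels
assert evaluation_overview.metadata["dataset_version"] == "v2"
```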
35 changes: 20 additions & 15 deletions src/documentation/parameter_optimization.ipynb
@@ -191,20 +191,25 @@
"for model, prompt in itertools.product(model_list, prompt_list):\n",
" dummy_task = DummyTask(model=model, prompt=prompt)\n",
"\n",
" # The description and the Experiment will later be used to identify the run parameters. Take special note of the delimiter '|'.\n",
" description = f\"|{model}|{prompt}|\"\n",
" # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n",
" metadata = dict({\"model\": model, \"prompt\": prompt})\n",
" description = \"Evaluate dummy task\"\n",
" runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n",
" run_overview = runner.run_dataset(dataset.id, description=description)\n",
" run_overview = runner.run_dataset(\n",
" dataset.id, metadata=metadata, description=description\n",
" )\n",
"\n",
" eval_overview = evaluator.evaluate_runs(run_overview.id, description=description)\n",
" eval_overview = evaluator.evaluate_runs(\n",
" run_overview.id, metadata=metadata, description=description\n",
" )\n",
"\n",
" aggregator = Aggregator(\n",
" evaluation_repository,\n",
" aggregation_repository,\n",
" EXPERIMENT_NAME + \":\" + description,\n",
" EXPERIMENT_NAME,\n",
" DummyAggregationLogic(),\n",
" )\n",
" aggregator.aggregate_evaluation(eval_overview.id)"
" aggregator.aggregate_evaluation(eval_overview.id, metadata=metadata)"
]
},
{
@@ -223,17 +228,20 @@
"metadata": {},
"outputs": [],
"source": [
"# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`.\n",
"# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`. Filtering can also be done on labels and/or metadata.\n",
"aggregations_of_interest = [\n",
" overview\n",
" for overview in aggregation_repository.aggregation_overviews(\n",
" aggregation_type=DummyAggregatedEvaluation\n",
" )\n",
" if overview.description.startswith(EXPERIMENT_NAME)\n",
" if overview.description == EXPERIMENT_NAME\n",
"]\n",
"\n",
"# Convert the desired aggregation into a pandas dataframe\n",
"formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)"
"formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)\n",
"\n",
"# Print all columns to check for columns of interest\n",
"formated_aggregations.columns"
]
},
{
@@ -252,11 +260,8 @@
"outputs": [],
"source": [
"aggregation_fields = list(DummyAggregatedEvaluation.model_fields.keys())\n",
"formated_aggregations = formated_aggregations[[\"description\", *aggregation_fields]]\n",
"formated_aggregations[[\"model\", \"prompt\"]] = formated_aggregations[\n",
" \"description\"\n",
"].str.split(\"|\", expand=True)[[1, 2]]\n",
"formated_aggregations.drop(columns=\"description\", inplace=True)\n",
"# Filter for columns of interest\n",
"formated_aggregations = formated_aggregations[[\"model\", \"prompt\", *aggregation_fields]]\n",
"\n",
"display(\n",
" formated_aggregations.sort_values(\n",
@@ -306,7 +311,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
"version": "3.11.8"
}
},
"nbformat": 4,
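Since model and prompt now travel through `metadata` instead of a delimiter-encoded description, the flattened dataframe can be sliced by those columns directly. A small sketch, assuming `aggregation_overviews_to_pandas` exposes the metadata keys as the `model` and `prompt` columns used above; the concrete model value is a placeholder.

```python
# Sketch: slice the flattened aggregations by the metadata written in the experiment loop.
# Uses `formated_aggregations` and `aggregation_fields` from the cells above; the value
# "model_a" is an illustrative placeholder for one entry of model_list.
best_for_model = (
    formated_aggregations[formated_aggregations["model"] == "model_a"]
    .sort_values(by=aggregation_fields, ascending=False)
    .head(3)
)
display(best_for_model)
```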
14 changes: 13 additions & 1 deletion src/intelligence_layer/evaluation/aggregation/aggregator.py
@@ -11,6 +11,7 @@
)
from uuid import uuid4

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import utc_now
from intelligence_layer.evaluation.aggregation.aggregation_repository import (
AggregationRepository,
@@ -181,7 +182,10 @@ def evaluation_type(self) -> type[Evaluation]:

@final
def aggregate_evaluation(
self, *eval_ids: str
self,
*eval_ids: str,
labels: set[str] | None = None,
metadata: dict[str, JsonSerializable] | None = None,
) -> AggregationOverview[AggregatedEvaluation]:
"""Aggregates all evaluations into an overview that includes high-level statistics.

@@ -190,10 +194,16 @@
Args:
eval_ids: An overview of the evaluation to be aggregated. Does not include
actual evaluations as these will be retrieved from the repository.
labels: A set of labels for filtering. Defaults to an empty set.
metadata: A dict with additional information about the aggregation overview. Defaults to an empty dict.

Returns:
An overview of the aggregated evaluation.
"""
if metadata is None:
metadata = dict()
if labels is None:
labels = set()

def load_eval_overview(evaluation_id: str) -> EvaluationOverview:
evaluation_overview = self._evaluation_repository.evaluation_overview(
@@ -237,6 +247,8 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview:
crashed_during_evaluation_count=successful_evaluations.excluded_count(),
description=self.description,
statistics=statistics,
labels=labels,
metadata=metadata,
)
self._aggregation_repository.store_aggregation_overview(aggregation_overview)
return aggregation_overview
11 changes: 11 additions & 0 deletions src/intelligence_layer/evaluation/aggregation/domain.py
@@ -4,6 +4,7 @@

from pydantic import BaseModel, SerializeAsAny

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.evaluation.evaluation.domain import (
EvaluationFailed,
EvaluationOverview,
@@ -31,6 +32,9 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True)
run_ids: IDs of all :class:`RunOverview`s from all linked :class:`EvaluationOverview`s.
description: A short description.
statistics: Aggregated statistics of the run. Whatever is returned by :meth:`Evaluator.aggregate`
labels: Labels for filtering the aggregation. Defaults to an empty set.
metadata: Additional information about the aggregation. Defaults to an empty dict.

"""

evaluation_overviews: frozenset[EvaluationOverview]
@@ -41,6 +45,8 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True)
crashed_during_evaluation_count: int
description: str
statistics: SerializeAsAny[AggregatedEvaluation]
labels: set[str] = set()
metadata: dict[str, JsonSerializable] = dict()

@property
def run_ids(self) -> Sequence[str]:
@@ -74,6 +80,8 @@ def __str__(self) -> str:
f"Successful example count = {self.successful_evaluation_count}\n"
f"Count of examples crashed during evaluation = {self.failed_evaluation_count}\n"
f'Description = "{self.description}"\n'
f"Labels = {self.labels}\n"
f"Metadata = {self.metadata}\n"
)

res += f"IDs of aggregated Evaluation Overviews = {[evaluation_overview.id for evaluation_overview in self.evaluation_overviews]}\n"
@@ -84,3 +92,6 @@ def __str__(self) -> str:
res += "}\n"

return res

def __hash__(self) -> int:
return hash(self.id)
@@ -2,6 +2,7 @@
from collections.abc import Iterable
from typing import Optional

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import Input
from intelligence_layer.evaluation.dataset.domain import (
Dataset,
@@ -22,13 +23,17 @@ def create_dataset(
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] | None = None,
metadata: dict[str, JsonSerializable] | None = None,
) -> Dataset:
"""Creates a dataset from given :class:`Example`s and returns the ID of that dataset.

Args:
examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset.
dataset_name: A name for the dataset.
id: The dataset ID. If `None`, an ID will be generated.
labels: A set of labels for filtering. Defaults to an empty set.
metadata: A dict with additional information about the dataset. Defaults to an empty dict.

Returns:
The created :class:`Dataset`.
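A minimal sketch of creating a dataset with the new arguments. The import paths and the `Example` field names are assumptions inferred from this diff, not verified against the repository; the example content is illustrative.

```python
# Sketch: create_dataset with the new labels/metadata arguments.
# Import paths and Example fields are assumptions based on this diff.
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (
    InMemoryDatasetRepository,
)

repository = InMemoryDatasetRepository()
dataset = repository.create_dataset(
    examples=[Example(input="What is 1 + 1?", expected_output="2")],
    dataset_name="ToyDataset",
    labels={"smoke-test"},
    metadata={"source": "handwritten", "version": 1},
)
print(dataset.labels)    # {'smoke-test'}
print(dataset.metadata)  # {'source': 'handwritten', 'version': 1}
```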
16 changes: 14 additions & 2 deletions src/intelligence_layer/evaluation/dataset/domain.py
@@ -4,7 +4,10 @@
from pydantic import BaseModel, Field
from rich.tree import Tree

from intelligence_layer.connectors.base.json_serializable import SerializableDict
from intelligence_layer.connectors.base.json_serializable import (
JsonSerializable,
SerializableDict,
)
from intelligence_layer.core.task import Input
from intelligence_layer.core.tracer.tracer import PydanticSerializable

@@ -60,13 +63,22 @@ class Dataset(BaseModel):
Attributes:
id: Dataset ID.
name: A short name of the dataset.
labels: Labels for filtering datasets. Defaults to an empty set.
metadata: Additional information about the dataset. Defaults to an empty dict.
"""

id: str = Field(default_factory=lambda: str(uuid4()))
name: str
labels: set[str] = set()
metadata: dict[str, JsonSerializable] = dict()

def __repr__(self) -> str:
return self.__str__()

def __str__(self) -> str:
return f"Dataset ID = {self.id}\nName = {self.name}\n"
return (
f"Dataset ID = {self.id}\n"
f"Name = {self.name}\n"
f"Labels = {self.labels}\n"
f"Metadata = {self.metadata}"
)
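A small sketch of what the extended `__str__` prints for a dataset carrying the new fields. The field values are illustrative, the ID is auto-generated, and the import path is taken from this diff.

```python
# Sketch: the extended __str__ output for a Dataset with labels and metadata.
from intelligence_layer.evaluation.dataset.domain import Dataset

dataset = Dataset(
    name="StoryDataset",
    labels={"label1"},
    metadata={"owner": "docs-team"},
)
print(dataset)
# Dataset ID = <generated uuid>
# Name = StoryDataset
# Labels = {'label1'}
# Metadata = {'owner': 'docs-team'}
```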
@@ -5,6 +5,7 @@

from fsspec.implementations.local import LocalFileSystem # type: ignore

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import Input, JsonSerializer, PydanticSerializable
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import (
@@ -31,8 +32,14 @@ def create_dataset(
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] | None = None,
metadata: dict[str, JsonSerializable] | None = None,
) -> Dataset:
dataset = Dataset(name=dataset_name)
if metadata is None:
metadata = dict()
if labels is None:
labels = set()
dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata)
if id is not None:
dataset.id = id

@@ -1,6 +1,7 @@
from collections.abc import Iterable, Sequence
from typing import Optional, cast

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core import Input, PydanticSerializable
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import (
@@ -24,8 +25,14 @@ def create_dataset(
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] | None = None,
metadata: dict[str, JsonSerializable] | None = None,
) -> Dataset:
dataset = Dataset(name=dataset_name)
if metadata is None:
metadata = dict()
if labels is None:
labels = set()
dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata)
if id is not None:
dataset.id = id
if dataset.id in self._datasets_and_examples:
@@ -5,6 +5,7 @@
from datasets import DatasetDict, IterableDataset, IterableDatasetDict
from pydantic import BaseModel

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
from intelligence_layer.core.task import Input
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import (
@@ -33,6 +34,8 @@ def create_dataset(
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] | None = None,
metadata: dict[str, JsonSerializable] | None = None,
) -> Dataset:
raise NotImplementedError
