diff --git a/.gitignore b/.gitignore index 0caf99001..7476d43b5 100644 --- a/.gitignore +++ b/.gitignore @@ -248,3 +248,6 @@ fabric.properties # End of https://www.toptal.com/developers/gitignore/api/intellij+all .python-version + +src/documentation/human-eval-data/datasets* +src/documentation/human-eval-data/runs/* diff --git a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb index 2dadcce47..d64431a19 100644 --- a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb +++ b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb @@ -57,10 +57,14 @@ " \"MyAggregationDescription\",\n", " aggregation_logic,\n", ")\n", - "aggregation_overview = aggregator.aggregate_evaluation(*evaluation_ids)\n", + "aggregation_overview = aggregator.aggregate_evaluation(\n", + " *evaluation_ids, labels=set([\"label_a\"]), metadata=dict({\"key\": \"value\"})\n", + ")\n", "\n", "# Step 3\n", - "print(aggregation_overview.id)" + "print(aggregation_overview.id)\n", + "print(aggregation_overview.labels)\n", + "print(aggregation_overview.metadata)" ] } ], @@ -80,7 +84,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/how_to_create_a_dataset.ipynb b/src/documentation/how_tos/how_to_create_a_dataset.ipynb index 83ccc91f0..f760274e4 100644 --- a/src/documentation/how_tos/how_to_create_a_dataset.ipynb +++ b/src/documentation/how_tos/how_to_create_a_dataset.ipynb @@ -67,10 +67,14 @@ "dataset = dataset_repository.create_dataset(\n", " examples=examples,\n", " dataset_name=\"StoryDataset\",\n", + " labels=set([\"label1\", \"label2\"]),\n", + " metadata=dict({\"key_a\": [\"a\", \"b\"], \"key_b\": \"value\"}),\n", ")\n", "\n", "# Step 4\n", - "print(dataset.id)" + "print(dataset.id)\n", + "print(dataset.labels)\n", + "print(dataset.metadata)" ] } ], @@ -90,7 +94,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/how_to_evaluate_runs.ipynb b/src/documentation/how_tos/how_to_evaluate_runs.ipynb index 91d05c023..ff39f511c 100644 --- a/src/documentation/how_tos/how_to_evaluate_runs.ipynb +++ b/src/documentation/how_tos/how_to_evaluate_runs.ipynb @@ -57,10 +57,14 @@ " evaluation_logic,\n", ")\n", "\n", - "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n", + "evaluation_overview = evaluator.evaluate_runs(\n", + " *run_ids, labels=set({\"label\"}), metadata=dict({\"key\": \"value\"})\n", + ")\n", "\n", "# Step 4\n", - "print(evaluation_overview.id)" + "print(evaluation_overview.id)\n", + "print(evaluation_overview.metadata)\n", + "print(evaluation_overview.labels)" ] } ], diff --git a/src/documentation/parameter_optimization.ipynb b/src/documentation/parameter_optimization.ipynb index 5eb833ae5..b74db13fa 100644 --- a/src/documentation/parameter_optimization.ipynb +++ b/src/documentation/parameter_optimization.ipynb @@ -191,20 +191,25 @@ "for model, prompt in itertools.product(model_list, prompt_list):\n", " dummy_task = DummyTask(model=model, prompt=prompt)\n", "\n", - " # The description and the Experiment will later be used to identify the run parameters. 
Take special note of the delimiter '|'.\n", - " description = f\"|{model}|{prompt}|\"\n", + " # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n", + " metadata = dict({\"model\": model, \"prompt\": prompt})\n", + " description = \"Evaluate dummy task\"\n", " runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n", - " run_overview = runner.run_dataset(dataset.id, description=description)\n", + " run_overview = runner.run_dataset(\n", + " dataset.id, metadata=metadata, description=description\n", + " )\n", "\n", - " eval_overview = evaluator.evaluate_runs(run_overview.id, description=description)\n", + " eval_overview = evaluator.evaluate_runs(\n", + " run_overview.id, metadata=metadata, description=description\n", + " )\n", "\n", " aggregator = Aggregator(\n", " evaluation_repository,\n", " aggregation_repository,\n", - " EXPERIMENT_NAME + \":\" + description,\n", + " EXPERIMENT_NAME,\n", " DummyAggregationLogic(),\n", " )\n", - " aggregator.aggregate_evaluation(eval_overview.id)" + " aggregator.aggregate_evaluation(eval_overview.id, metadata=metadata)" ] }, { @@ -223,17 +228,20 @@ "metadata": {}, "outputs": [], "source": [ - "# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`.\n", + "# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`. Filtering can also be done on labels and/or metadata.\n", "aggregations_of_interest = [\n", " overview\n", " for overview in aggregation_repository.aggregation_overviews(\n", " aggregation_type=DummyAggregatedEvaluation\n", " )\n", - " if overview.description.startswith(EXPERIMENT_NAME)\n", + " if overview.description == EXPERIMENT_NAME\n", "]\n", "\n", "# Convert the desired aggregation into a pandas dataframe\n", - "formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)" + "formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)\n", + "\n", + "# Print all columns to check for columns of interest\n", + "formated_aggregations.columns" ] }, { @@ -252,11 +260,8 @@ "outputs": [], "source": [ "aggregation_fields = list(DummyAggregatedEvaluation.model_fields.keys())\n", - "formated_aggregations = formated_aggregations[[\"description\", *aggregation_fields]]\n", - "formated_aggregations[[\"model\", \"prompt\"]] = formated_aggregations[\n", - " \"description\"\n", - "].str.split(\"|\", expand=True)[[1, 2]]\n", - "formated_aggregations.drop(columns=\"description\", inplace=True)\n", + "# Filter for columns of interest\n", + "formated_aggregations = formated_aggregations[[\"model\", \"prompt\", *aggregation_fields]]\n", "\n", "display(\n", " formated_aggregations.sort_values(\n", @@ -306,7 +311,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/src/intelligence_layer/evaluation/aggregation/aggregator.py b/src/intelligence_layer/evaluation/aggregation/aggregator.py index 29b9053f7..b93da6ba0 100644 --- a/src/intelligence_layer/evaluation/aggregation/aggregator.py +++ b/src/intelligence_layer/evaluation/aggregation/aggregator.py @@ -11,6 +11,9 @@ ) from uuid import uuid4 +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import utc_now from intelligence_layer.evaluation.aggregation.aggregation_repository import ( AggregationRepository, @@ -181,7 +184,10 @@ def 
evaluation_type(self) -> type[Evaluation]: @final def aggregate_evaluation( - self, *eval_ids: str + self, + *eval_ids: str, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> AggregationOverview[AggregatedEvaluation]: """Aggregates all evaluations into an overview that includes high-level statistics. @@ -190,10 +196,16 @@ def aggregate_evaluation( Args: eval_ids: An overview of the evaluation to be aggregated. Does not include actual evaluations as these will be retrieved from the repository. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the aggregation overview. Defaults to an empty dict. Returns: An overview of the aggregated evaluation. """ + if metadata is None: + metadata = dict() + if labels is None: + labels = set() def load_eval_overview(evaluation_id: str) -> EvaluationOverview: evaluation_overview = self._evaluation_repository.evaluation_overview( @@ -237,6 +249,8 @@ def load_eval_overview(evaluation_id: str) -> EvaluationOverview: crashed_during_evaluation_count=successful_evaluations.excluded_count(), description=self.description, statistics=statistics, + labels=labels, + metadata=metadata, ) self._aggregation_repository.store_aggregation_overview(aggregation_overview) return aggregation_overview diff --git a/src/intelligence_layer/evaluation/aggregation/domain.py b/src/intelligence_layer/evaluation/aggregation/domain.py index 46a996dd2..70ffda668 100644 --- a/src/intelligence_layer/evaluation/aggregation/domain.py +++ b/src/intelligence_layer/evaluation/aggregation/domain.py @@ -4,6 +4,9 @@ from pydantic import BaseModel, SerializeAsAny +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.evaluation.evaluation.domain import ( EvaluationFailed, EvaluationOverview, @@ -31,6 +34,9 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True) run_ids: IDs of all :class:`RunOverview`s from all linked :class:`EvaluationOverview`s. description: A short description. statistics: Aggregated statistics of the run. Whatever is returned by :meth:`Evaluator.aggregate` + labels: Labels for filtering aggregation. Defaults to empty list. + metadata: Additional information about the aggregation. Defaults to empty dict. 
+ """ evaluation_overviews: frozenset[EvaluationOverview] @@ -41,6 +47,8 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True) crashed_during_evaluation_count: int description: str statistics: SerializeAsAny[AggregatedEvaluation] + labels: set[str] = set() + metadata: SerializableDict = dict() @property def run_ids(self) -> Sequence[str]: @@ -74,6 +82,8 @@ def __str__(self) -> str: f"Successful example count = {self.successful_evaluation_count}\n" f"Count of examples crashed during evaluation = {self.failed_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" ) res += f"IDs of aggregated Evaluation Overviews = {[evaluation_overview.id for evaluation_overview in self.evaluation_overviews]}\n" @@ -84,3 +94,6 @@ def __str__(self) -> str: res += "}\n" return res + + def __hash__(self) -> int: + return hash(self.id) diff --git a/src/intelligence_layer/evaluation/dataset/dataset_repository.py b/src/intelligence_layer/evaluation/dataset/dataset_repository.py index bbd79db22..999aab760 100644 --- a/src/intelligence_layer/evaluation/dataset/dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/dataset_repository.py @@ -2,6 +2,9 @@ from collections.abc import Iterable from typing import Optional +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input from intelligence_layer.evaluation.dataset.domain import ( Dataset, @@ -22,6 +25,8 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: """Creates a dataset from given :class:`Example`s and returns the ID of that dataset. @@ -29,6 +34,8 @@ def create_dataset( examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset. dataset_name: A name for the dataset. id: The dataset ID. If `None`, an ID will be generated. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the dataset. Defaults to an empty dict. Returns: The created :class:`Dataset`. diff --git a/src/intelligence_layer/evaluation/dataset/domain.py b/src/intelligence_layer/evaluation/dataset/domain.py index c32c36d33..3d6b4cf9d 100644 --- a/src/intelligence_layer/evaluation/dataset/domain.py +++ b/src/intelligence_layer/evaluation/dataset/domain.py @@ -4,7 +4,9 @@ from pydantic import BaseModel, Field from rich.tree import Tree -from intelligence_layer.connectors.base.json_serializable import SerializableDict +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core.task import Input from intelligence_layer.core.tracer.tracer import PydanticSerializable @@ -60,13 +62,22 @@ class Dataset(BaseModel): Attributes: id: Dataset ID. name: A short name of the dataset. + label: Labels for filtering datasets. Defaults to empty list. + metadata: Additional information about the dataset. Defaults to empty dict. 
""" id: str = Field(default_factory=lambda: str(uuid4())) name: str + labels: set[str] = set() + metadata: SerializableDict = dict() def __repr__(self) -> str: return self.__str__() def __str__(self) -> str: - return f"Dataset ID = {self.id}\nName = {self.name}\n" + return ( + f"Dataset ID = {self.id}\n" + f"Name = {self.name}\n" + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}" + ) diff --git a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py index 67e734bb8..85575c8ad 100644 --- a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py @@ -5,6 +5,7 @@ from fsspec.implementations.local import LocalFileSystem # type: ignore +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core import Input, JsonSerializer, PydanticSerializable from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -31,8 +32,14 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: - dataset = Dataset(name=dataset_name) + if metadata is None: + metadata = dict() + if labels is None: + labels = set() + dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata) if id is not None: dataset.id = id diff --git a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py index 94206c2be..2ca418cf9 100644 --- a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py @@ -1,6 +1,9 @@ from collections.abc import Iterable, Sequence from typing import Optional, cast +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, PydanticSerializable from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -24,8 +27,14 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: - dataset = Dataset(name=dataset_name) + if metadata is None: + metadata = dict() + if labels is None: + labels = set() + dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata) if id is not None: dataset.id = id if dataset.id in self._datasets_and_examples: diff --git a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py index 4bc79c9d8..4608a248f 100644 --- a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py @@ -5,6 +5,7 @@ from datasets import DatasetDict, IterableDataset, IterableDatasetDict from pydantic import BaseModel +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core.task import Input from intelligence_layer.evaluation.dataset.dataset_repository import 
DatasetRepository from intelligence_layer.evaluation.dataset.domain import ( @@ -33,6 +34,8 @@ def create_dataset( examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str, id: str | None = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> Dataset: raise NotImplementedError diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py index a7688cac9..34bb56e05 100644 --- a/src/intelligence_layer/evaluation/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/evaluation/domain.py @@ -5,6 +5,7 @@ from pydantic import BaseModel, SerializeAsAny from rich.tree import Tree +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.evaluation.run.domain import RunOverview Evaluation = TypeVar("Evaluation", bound=BaseModel, covariant=True) @@ -81,6 +82,8 @@ class PartialEvaluationOverview(BaseModel, frozen=True): start_date: datetime submitted_evaluation_count: int description: str + labels: set[str] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() @@ -100,6 +103,8 @@ def __str__(self) -> str: f"Start time = {self.start_date}\n" f"Submitted Evaluations = {self.submitted_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" f"{run_overview_str}" ) @@ -116,6 +121,8 @@ class EvaluationOverview(BaseModel, frozen=True): failed_evaluation_count: Number of examples that produced an error during evaluation. Note: failed runs are skipped in the evaluation and therefore not counted as failures description: human-readable for the evaluator that created the evaluation. + labels: Labels for filtering evaluation. Defaults to empty list. + metadata: Additional information about the evaluation. Defaults to empty dict. 
""" run_overviews: frozenset[RunOverview] @@ -125,6 +132,8 @@ class EvaluationOverview(BaseModel, frozen=True): successful_evaluation_count: int failed_evaluation_count: int description: str + labels: set[str] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() @@ -146,9 +155,14 @@ def __str__(self) -> str: f"Successful examples = {self.successful_evaluation_count}\n" f"Failed examples = {self.failed_evaluation_count}\n" f'Description = "{self.description}"\n' + f"Labels = {self.labels}\n" + f"Metadata = {self.metadata}\n" f"{run_overview_str}" ) + def __hash__(self) -> int: + return hash(self.id) + class EvaluationFailed(Exception): def __init__(self, evaluation_id: str, failed_count: int) -> None: diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py index 87e870c33..ee49bfaa8 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py @@ -16,6 +16,9 @@ RatingQuestion, RecordData, ) +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -138,7 +141,13 @@ def submit( dataset_name: Optional[str] = None, abort_on_error: bool = False, skip_example_on_any_failure: bool = True, + labels: Optional[set[str]] = None, + metadata: Optional[SerializableDict] = None, ) -> PartialEvaluationOverview: + if metadata is None: + metadata = dict() + if labels is None: + labels = set() argilla_dataset_id = self._client.create_dataset( self._workspace_id, dataset_name if dataset_name else str(uuid4()), @@ -179,6 +188,8 @@ def submit( start_date=datetime.now(), submitted_evaluation_count=submit_count, description=self.description, + labels=labels, + metadata=metadata, ) self._evaluation_repository.store_partial_evaluation_overview(partial_overview) @@ -227,6 +238,8 @@ def retrieve( successful_evaluation_count=len(evaluations), failed_evaluation_count=num_not_yet_evaluated_evals + num_failed_evaluations, + labels=partial_overview.labels, + metadata=partial_overview.metadata, ) self._evaluation_repository.store_evaluation_overview(overview) return overview diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py index 44bf5eae6..d464da457 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/base_evaluator.py @@ -202,6 +202,7 @@ def _load_run_overviews(self, *run_ids: str) -> set[RunOverview]: run_overview = self._run_repository.run_overview(run_id) if not run_overview: raise ValueError(f"No RunOverview found for run-id: {run_id}") + run_overviews.add(run_overview) return run_overviews diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py index cb399cb5b..a4c61f702 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/evaluator.py @@ -4,6 +4,9 @@ from tqdm import tqdm +from 
intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output, utc_now from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -99,6 +102,8 @@ def evaluate_runs( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, description: Optional[str] = None, + labels: Optional[set[str]] = None, + metadata: Optional[SerializableDict] = None, ) -> EvaluationOverview: """Evaluates all generated outputs in the run. @@ -118,12 +123,18 @@ def evaluate_runs( abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False. skip_example_on_any_failure: Flag to skip evaluation on any example for which at least one run fails. Defaults to True. description: Optional description of the evaluation. Defaults to None. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the evaluation overview. Defaults to an empty dict. Returns: EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be returned but instead stored in the :class:`EvaluationRepository` provided in the __init__. """ + if metadata is None: + metadata = dict() + if labels is None: + labels = set() start = utc_now() run_overviews = self._load_run_overviews(*run_ids) eval_id = self._evaluation_repository.initialize_evaluation() @@ -162,6 +173,8 @@ def evaluate_runs( successful_evaluation_count=successful_evaluation_count, failed_evaluation_count=failed_evaluation_count, description=full_description, + labels=labels, + metadata=metadata, ) self._evaluation_repository.store_evaluation_overview(overview) diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py index c4cb678df..84f1134d7 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py @@ -6,6 +6,9 @@ from pydantic import BaseModel +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput @@ -123,6 +126,8 @@ def evaluate_additional_runs( previous_evaluation_ids: Optional[list[str]] = None, num_examples: Optional[int] = None, abort_on_error: bool = False, + labels: Optional[set[str]] = None, + metadata: Optional[SerializableDict] = None, ) -> EvaluationOverview: """Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_id`. @@ -141,12 +146,18 @@ def evaluate_additional_runs( num_examples: The number of examples which should be evaluated from the given runs. Always the first n runs stored in the evaluation repository. Defaults to None. abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the evaluation overview. Defaults to an empty dict. Returns: EvaluationOverview: An overview of the evaluation. 
Individual :class:`Evaluation`s will not be returned but instead stored in the :class:`EvaluationRepository` provided in the __init__. """ + if metadata is None: + metadata = dict() + if labels is None: + labels = set() previous_run_ids = [] previous_evaluation_ids = previous_evaluation_ids or [] @@ -160,7 +171,11 @@ def evaluate_additional_runs( self._evaluation_logic.set_previous_run_output_ids(previous_run_ids) return super().evaluate_runs( - *run_ids, num_examples=num_examples, abort_on_error=abort_on_error + *run_ids, + num_examples=num_examples, + abort_on_error=abort_on_error, + labels=labels, + metadata=metadata, ) def evaluate_runs( @@ -170,7 +185,13 @@ def evaluate_runs( abort_on_error: bool = False, skip_example_on_any_failure: bool = True, description: Optional[str] = None, + labels: set[str] | None = None, + metadata: SerializableDict | None = None, ) -> EvaluationOverview: + if metadata is None: + metadata = dict() + if labels is None: + labels = set() self._evaluation_logic.set_previous_run_output_ids([]) return super().evaluate_runs( *run_ids, diff --git a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py index 4d3b8650f..93cd4dfce 100644 --- a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py +++ b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py @@ -144,6 +144,7 @@ def aggregation_overviews_to_pandas( aggregation_overviews: Sequence[AggregationOverview[AggregatedEvaluation]], unwrap_statistics: bool = True, strict: bool = True, + unwrap_metadata: bool = True, ) -> pd.DataFrame: """Converts aggregation overviews to a pandas table for easier comparison. @@ -152,6 +153,8 @@ def aggregation_overviews_to_pandas( unwrap_statistics: Unwrap the `statistics` field in the overviews into separate columns. Defaults to True. strict: Allow only overviews with exactly equal `statistics` types. Defaults to True. + unwrap_metadata: Unwrap the `metadata` field in the overviews into separate columns. + Defaults to True. Returns: A pandas :class:`DataFrame` containing an overview per row with fields as columns. @@ -173,6 +176,11 @@ def aggregation_overviews_to_pandas( df = df.join(pd.DataFrame(df["statistics"].to_list())).drop( columns=["statistics"] ) + if unwrap_metadata and "metadata" in df.columns: + df = pd.concat([df, pd.json_normalize(df["metadata"])], axis=1).drop( # type: ignore + columns=["metadata"] + ) + return df diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py index bcf80653d..fe40f1be1 100644 --- a/src/intelligence_layer/evaluation/run/domain.py +++ b/src/intelligence_layer/evaluation/run/domain.py @@ -5,6 +5,7 @@ from pydantic import BaseModel from rich.tree import Tree +from intelligence_layer.connectors.base.json_serializable import SerializableDict from intelligence_layer.core.task import Output @@ -100,6 +101,8 @@ class RunOverview(BaseModel, frozen=True): failed_example_count: The number of examples where an exception was raised when running the task. successful_example_count: The number of examples that where successfully run. description: Human-readable of the runner that run the task. + labels: Labels for filtering runs. Defaults to empty list. + metadata: Additional information about the run. Defaults to empty dict. 
""" dataset_id: str @@ -109,6 +112,8 @@ class RunOverview(BaseModel, frozen=True): failed_example_count: int successful_example_count: int description: str + labels: set[str] + metadata: SerializableDict def __repr__(self) -> str: return self.__str__() @@ -122,4 +127,9 @@ def __str__(self) -> str: f"Failed example count = {self.failed_example_count}\n" f"Successful example count = {self.successful_example_count}\n" f'Description = "{self.description}"\n' + f'Labels = "{self.labels}"\n' + f'Metadata = "{self.metadata}"\n' ) + + def __hash__(self) -> int: + return hash(self.id) diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py index 56cc689c8..20d96d790 100644 --- a/src/intelligence_layer/evaluation/run/runner.py +++ b/src/intelligence_layer/evaluation/run/runner.py @@ -8,6 +8,9 @@ from pydantic import JsonValue from tqdm import tqdm +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import ( CompositeTracer, Input, @@ -81,6 +84,8 @@ def run_dataset( max_workers: int = 10, description: Optional[str] = None, trace_examples_individually: bool = True, + labels: Optional[set[str]] = None, + metadata: Optional[SerializableDict] = None, ) -> RunOverview: """Generates all outputs for the provided dataset. @@ -97,11 +102,17 @@ def run_dataset( max_workers: Number of examples that can be evaluated concurrently. Defaults to 10. description: An optional description of the run. Defaults to None. trace_examples_individually: Flag to create individual tracers for each example. Defaults to True. + labels: A list of labels for filtering. Defaults to an empty list. + metadata: A dict for additional information about the run overview. Defaults to an empty dict. Returns: An overview of the run. Outputs will not be returned but instead stored in the :class:`RunRepository` provided in the __init__. 
""" + if labels is None: + labels = set() + if metadata is None: + metadata = dict() def run( example: Example[Input, ExpectedOutput], @@ -157,6 +168,7 @@ def run( full_description = ( self.description + " : " + description if description else self.description ) + run_overview = RunOverview( dataset_id=dataset_id, id=run_id, @@ -165,6 +177,8 @@ def run( failed_example_count=failed_count, successful_example_count=successful_count, description=full_description, + labels=labels, + metadata=metadata, ) self._run_repository.store_run_overview(run_overview) return run_overview diff --git a/tests/conftest.py b/tests/conftest.py index b3c8669e1..40c95d000 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -146,6 +146,8 @@ def run_overview() -> RunOverview: failed_example_count=0, successful_example_count=3, description="test run overview 1", + labels=set(), + metadata=dict(), ) @@ -166,4 +168,6 @@ def evaluation_overview( failed_evaluation_count=1, run_overviews=frozenset([run_overview]), description="test evaluation overview 1", + labels=set(), + metadata=dict(), ) diff --git a/tests/evaluation/dataset/test_dataset_domain.py b/tests/evaluation/dataset/test_dataset_domain.py new file mode 100644 index 000000000..f8968c403 --- /dev/null +++ b/tests/evaluation/dataset/test_dataset_domain.py @@ -0,0 +1,20 @@ +from intelligence_layer.evaluation import Dataset + + +def test_default_values_are_set() -> None: + dataset = Dataset(name="Test") + + assert dataset.id is not None + assert len(dataset.metadata) == 0 + assert len(dataset.labels) == 0 + + +def test_default_values_are_not_changed() -> None: + modified_dataset = Dataset(name="Modified Dataset") + modified_dataset.labels.add("test_label") + modified_dataset.metadata.update({"key": "value"}) + + default_dataset = Dataset(name="Default Dataset") + + assert modified_dataset.labels != default_dataset.labels + assert modified_dataset.metadata != default_dataset.metadata diff --git a/tests/evaluation/dataset/test_dataset_repository.py b/tests/evaluation/dataset/test_dataset_repository.py index 963782f78..c92fc73cd 100644 --- a/tests/evaluation/dataset/test_dataset_repository.py +++ b/tests/evaluation/dataset/test_dataset_repository.py @@ -2,11 +2,15 @@ from pathlib import Path from typing import Any from unittest.mock import patch +from uuid import uuid4 import pytest from fsspec.implementations.memory import MemoryFileSystem # type: ignore from pytest import FixtureRequest, fixture, mark, raises +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.evaluation import ( DatasetRepository, Example, @@ -71,6 +75,57 @@ def test_dataset_repository_with_custom_id( assert dataset.id == "my-custom-dataset-id" +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_dataset_repository_create_dataset_sets_default_values( + repository_fixture: str, + request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], +) -> None: + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) + + assert dataset.id is not None + assert dataset.name == "test-dataset" + assert dataset.labels == set() + assert dataset.metadata == dict() + + +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_dataset_repository_create_dataset_explicit_values_overwrite_defaults( + repository_fixture: 
str, + request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], +) -> None: + expected_id = str(uuid4()) + expected_name = "test_name" + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test_value"}) + + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], + dataset_name=expected_name, + id=expected_id, + labels=expected_labels, + metadata=expected_metadata, + ) + + assert dataset.id == expected_id + assert dataset.name == expected_name + assert dataset.labels == expected_labels + assert dataset.metadata == expected_metadata + + @mark.parametrize( "repository_fixture", test_repository_fixtures, diff --git a/tests/evaluation/evaluation/test_async_evaluation_repository.py b/tests/evaluation/evaluation/test_async_evaluation_repository.py index 16bfc8697..fc94d3fd0 100644 --- a/tests/evaluation/evaluation/test_async_evaluation_repository.py +++ b/tests/evaluation/evaluation/test_async_evaluation_repository.py @@ -41,6 +41,8 @@ def partial_evaluation_overviews( run_overviews=frozenset([run_overview]), submitted_evaluation_count=10, description="test evaluation overview", + labels=set(), + metadata=dict(), ) ) return evaluation_overviews @@ -56,6 +58,8 @@ def partial_evaluation_overview( run_overviews=frozenset([run_overview]), submitted_evaluation_count=10, description="test evaluation overview", + labels=set(), + metadata=dict(), ) diff --git a/tests/evaluation/evaluation/test_elo_evaluation_logic.py b/tests/evaluation/evaluation/test_elo_evaluation_logic.py index e46ee1e19..0b4e2faca 100644 --- a/tests/evaluation/evaluation/test_elo_evaluation_logic.py +++ b/tests/evaluation/evaluation/test_elo_evaluation_logic.py @@ -150,6 +150,8 @@ def qa_setup( failed_example_count=0, successful_example_count=len(qa_outputs), description="runner", + labels=set(), + metadata=dict(), ) ) return run_ids diff --git a/tests/evaluation/evaluation/test_evaluation_repository.py b/tests/evaluation/evaluation/test_evaluation_repository.py index 7db3777c4..3948b5a9f 100644 --- a/tests/evaluation/evaluation/test_evaluation_repository.py +++ b/tests/evaluation/evaluation/test_evaluation_repository.py @@ -70,6 +70,8 @@ def evaluation_overviews(run_overview: RunOverview) -> Iterable[EvaluationOvervi failed_evaluation_count=1, run_overviews=frozenset([run_overview]), description="test evaluation overview 1", + labels=set(), + metadata={}, ) ) return evaluation_overviews diff --git a/tests/evaluation/evaluation/test_evaluator.py b/tests/evaluation/evaluation/test_evaluator.py index 92bbf783f..ee133cf43 100644 --- a/tests/evaluation/evaluation/test_evaluator.py +++ b/tests/evaluation/evaluation/test_evaluator.py @@ -5,6 +5,9 @@ from pydantic import BaseModel from pytest import fixture +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import Input, Output, Task, Tracer from intelligence_layer.core.tracer.in_memory_tracer import ( InMemoryTaskSpan, @@ -709,3 +712,55 @@ def test_eval_raises_error_if_examples_and_example_outputs_dont_match( num_examples=None, ) ) + + +def test_evaluator_evaluate_runs_sets_default_values( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str +) -> None: + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + assert evaluation_overview.labels == set() + assert evaluation_overview.metadata == dict() + + +def 
test_evaluator_evaluate_runs_specific_values_overwrite_defaults( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], run_id: str +) -> None: + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test-value"}) + evaluation_overview = dummy_evaluator.evaluate_runs( + run_id, labels=expected_labels, metadata=expected_metadata + ) + assert evaluation_overview.labels == expected_labels + assert evaluation_overview.metadata == expected_metadata + + +def test_aggregate_evaluation_set_default_labels_metadata_values( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList + ], + run_id: str, +) -> None: + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + + assert aggregation_overview.labels == set() + assert aggregation_overview.metadata == dict() + + +def test_aggregate_evaluation_specific_values_overwrite_defaults( + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList + ], + run_id: str, +) -> None: + expected_labels = {"test_label"} + expected_metadata: SerializableDict = dict({"test_key": "test-value"}) + evaluation_overview = dummy_evaluator.evaluate_runs(run_id) + aggregation_overview = dummy_aggregator.aggregate_evaluation( + evaluation_overview.id, labels=expected_labels, metadata=expected_metadata + ) + + assert aggregation_overview.labels == expected_labels + assert aggregation_overview.metadata == expected_metadata diff --git a/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py index 014e40b41..d30e42688 100644 --- a/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -14,6 +14,9 @@ Question, RecordData, ) +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import CompleteOutput, InstructInput, utc_now from intelligence_layer.evaluation import ( Aggregator, @@ -162,6 +165,10 @@ def create_dummy_runs( failed_example_count=0, successful_example_count=1, description="runner", + labels={"test-label"}, + metadata=dict( + {"test_key": "test_value"}, + ), ) ) @@ -277,3 +284,30 @@ def test_elo_calculating_works_as_expected() -> None: elo.calculate(comeback_matches) assert elo.ratings[player2] > elo.ratings[player1] + + +def test_retrieve_argilla_evaluation_overview_has_submitted_partial_evaluation_overview_labels_metadata( + evaluator: ArgillaEvaluator[ + InstructInput, CompleteOutput, None, ComparisonEvaluation + ], + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + any_instruct_output: CompleteOutput, +) -> None: + run_count = 10 + run_ids = [f"{i}" for i in range(run_count)] + dataset_id = create_dummy_dataset(in_memory_dataset_repository) + create_dummy_runs( + in_memory_run_repository, any_instruct_output, run_ids, dataset_id + ) + + expected_labels = {"test-label"} + expected_metadata: SerializableDict = dict({"test_key": "test_value"}) + + partial_overview = evaluator.submit( + *run_ids, labels=expected_labels, metadata=expected_metadata + ) + evaluation_overview = evaluator.retrieve(partial_overview.id) + + assert 
partial_overview.labels == evaluation_overview.labels + assert partial_overview.metadata == evaluation_overview.metadata diff --git a/tests/evaluation/infrastructure/test_repository_navigator.py b/tests/evaluation/infrastructure/test_repository_navigator.py index 8272ee5b0..ddfc98ed7 100644 --- a/tests/evaluation/infrastructure/test_repository_navigator.py +++ b/tests/evaluation/infrastructure/test_repository_navigator.py @@ -437,7 +437,9 @@ def test_aggregation_overviews_to_pandas(length: int) -> None: # given overview = create_aggregation_overview(AggregationDummy()) # when - df = aggregation_overviews_to_pandas([overview] * length, unwrap_statistics=False) + df = aggregation_overviews_to_pandas( + [overview] * length, unwrap_statistics=False, unwrap_metadata=False + ) # then assert len(df) == length assert set(AggregationOverview.model_fields.keys()) == set(df.columns) @@ -466,6 +468,48 @@ class AggregationDummy2(BaseModel): assert "statistics" not in df.columns +def test_aggregation_overviews_to_pandas_unwrap_metadata() -> None: + # given + + overview = AggregationOverview( + evaluation_overviews=frozenset([]), + id="aggregation-id", + start=utc_now(), + end=utc_now(), + successful_evaluation_count=5, + crashed_during_evaluation_count=3, + description="dummy-evaluator", + statistics=AggregationDummy(), + labels=set(), + metadata=dict({"model": "model_a", "prompt": "prompt_a"}), + ) + overview2 = AggregationOverview( + evaluation_overviews=frozenset([]), + id="aggregation-id2", + start=utc_now(), + end=utc_now(), + successful_evaluation_count=5, + crashed_during_evaluation_count=3, + description="dummy-evaluator", + statistics=AggregationDummy(), + labels=set(), + metadata=dict( + {"model": "model_a", "prompt": "prompt_a", "different_column": "value"} + ), + ) + + df = aggregation_overviews_to_pandas( + [overview, overview2], unwrap_metadata=True, strict=False + ) + + assert "model" in df.columns + assert "prompt" in df.columns + assert "different_column" in df.columns + assert "metadata" not in df.columns + assert all(df["model"] == "model_a") + assert all(df["prompt"] == "prompt_a") + + def test_aggregation_overviews_to_pandas_works_with_eval_overviews() -> None: # given eval_overview = EvaluationOverview( @@ -476,6 +520,8 @@ def test_aggregation_overviews_to_pandas_works_with_eval_overviews() -> None: successful_evaluation_count=1, failed_evaluation_count=1, description="", + labels=set(), + metadata=dict(), ) overview = AggregationOverview( evaluation_overviews=frozenset([eval_overview]), diff --git a/tests/evaluation/run/test_run_repository.py b/tests/evaluation/run/test_run_repository.py index 50fe9da98..3bdba5ea8 100644 --- a/tests/evaluation/run/test_run_repository.py +++ b/tests/evaluation/run/test_run_repository.py @@ -30,6 +30,8 @@ def run_overviews() -> Sequence[RunOverview]: failed_example_count=0, successful_example_count=1, description="test run overview", + labels=set(), + metadata=dict(), ) run_overviews.append(run_overview) return run_overviews diff --git a/tests/evaluation/run/test_runner.py b/tests/evaluation/run/test_runner.py index 0ebd0ba68..f3e32eb06 100644 --- a/tests/evaluation/run/test_runner.py +++ b/tests/evaluation/run/test_runner.py @@ -2,6 +2,9 @@ import pytest +from intelligence_layer.connectors.base.json_serializable import ( + SerializableDict, +) from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer from intelligence_layer.evaluation import ( Example, @@ -122,3 +125,43 @@ def test_runner_runs_n_examples( entries = tracer.entries assert 
len(entries) == 1 assert all([isinstance(e, InMemoryTaskSpan) for e in entries]) + + +def test_runner_run_overview_has_default_metadata_and_labels( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + sequence_examples: Iterable[Example[str, None]], +) -> None: + examples = list(sequence_examples) + task = DummyTask() + runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo") + + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="" + ).id + + overview = runner.run_dataset(dataset_id) + + assert overview.metadata == dict() + assert overview.labels == set() + + +def test_runner_run_overview_has_specified_metadata_and_labels( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + sequence_examples: Iterable[Example[str, None]], +) -> None: + run_labels = {"test-label"} + run_metadata: SerializableDict = dict({"test_key": "test-value"}) + + examples = list(sequence_examples) + task = DummyTask() + runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo") + + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="" + ).id + overview = runner.run_dataset(dataset_id, labels=run_labels, metadata=run_metadata) + + assert overview.metadata == run_metadata + assert overview.labels == run_labels
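Reviewer note — usage sketch, not part of the patch: the changes above thread optional `labels` (a set of string tags) and `metadata` (a JSON-serializable dict) through datasets, runs, evaluations, and aggregations, and teach `aggregation_overviews_to_pandas` to unwrap metadata into columns. The sketch below exercises the new fields end to end. It assumes that `Example`, `InMemoryDatasetRepository`, `AggregationOverview`, and `aggregation_overviews_to_pandas` are re-exported from `intelligence_layer.evaluation` (as the notebooks and tests in this patch suggest), and it uses a stand-in pydantic statistics model instead of a real aggregation logic.

# Minimal sketch of the labels/metadata fields added in this patch.
# Assumptions: the names below are re-exported from `intelligence_layer.evaluation`;
# `DummyStatistics` is a stand-in for a real aggregated-evaluation model.
from pydantic import BaseModel

from intelligence_layer.core import utc_now
from intelligence_layer.evaluation import (
    AggregationOverview,
    Example,
    InMemoryDatasetRepository,
    aggregation_overviews_to_pandas,
)

# 1. Datasets now carry labels and metadata alongside their examples.
dataset_repository = InMemoryDatasetRepository()
dataset = dataset_repository.create_dataset(
    examples=[Example(input="What is 1+1?", expected_output="2")],
    dataset_name="demo-dataset",
    labels={"smoke-test"},
    metadata={"model": "model_a", "prompt": "prompt_a"},
)
print(dataset.labels, dataset.metadata)


# 2. Aggregation overviews store the same fields; with the new
#    `unwrap_metadata=True` (the default), every metadata key becomes
#    its own dataframe column, alongside the unwrapped statistics.
class DummyStatistics(BaseModel):
    score: float = 0.5


overview = AggregationOverview(
    evaluation_overviews=frozenset(),
    id="aggregation-id",
    start=utc_now(),
    end=utc_now(),
    successful_evaluation_count=1,
    crashed_during_evaluation_count=0,
    description="demo",
    statistics=DummyStatistics(),
    labels={"smoke-test"},
    metadata={"model": "model_a", "prompt": "prompt_a"},
)
df = aggregation_overviews_to_pandas([overview], unwrap_metadata=True)
print(df[["model", "prompt", "score"]])

With metadata unwrapped into columns (and labels available for set-membership checks), experiment parameters can be filtered directly, which is what lets parameter_optimization.ipynb drop the old '|'-delimited description encoding and its string-splitting step.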