feat: aggregation of aggregation (#885)
* feat: add aggregation_overviews_to_pandas function for comparing aggregations, and tutorial notebook
TASK: IL-419

---------

Co-authored-by: Sebastian Niehus <sebastian.niehus@ext.aleph-alpha.com>
NiklasKoehneckeAA and SebastianNiehusAA authored May 31, 2024
1 parent e5a4a77 commit 31df7b3
Showing 8 changed files with 507 additions and 17 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -27,6 +27,8 @@
- We now support python 3.12
- Add `description` parameter to `Evaluator.evaluate_runs` and `Runner.run_dataset` to allow individual descriptions without the need to create a new `Evaluator` or `Runner`.
- All models raise an error during initialization if an incompatible `name` is passed, instead of only when they are used.
- Add `aggregation_overviews_to_pandas` function to allow for easier comparison of multiple aggregation overviews
- Add `parameter_optimization.ipynb` notebook to demonstrate the optimization of tasks by comparing different parameter combinations.

### Fixes
- The document index client now correctly URL-encodes document names in its queries.
27 changes: 14 additions & 13 deletions README.md
@@ -124,19 +124,20 @@ To use an **on-premises setup**, set the `CLIENT_URL` variable to your host URL.

## Tutorial Notebooks

| Order | Topic | Description | Notebook 📓 |
|-------|----------------------|------------------------------------------------------|------------------------------------------------------------------------------------------|
| 1 | Summarization | Summarize a document | [summarization.ipynb](./src/documentation/summarization.ipynb) |
| 2 | Question Answering | Various approaches for QA | [qa.ipynb](./src/documentation/qa.ipynb) |
| 3 | Classification | Learn about two methods of classification | [classification.ipynb](./src/documentation/classification.ipynb) |
| 4 | Evaluation | Evaluate LLM-based methodologies | [evaluation.ipynb](./src/documentation/evaluation.ipynb) |
| 5 | Elo QA Evaluation | Evaluate QA tasks in an Elo ranking | [elo_qa_eval.ipynb](./src/documentation/elo_qa_eval.ipynb) |
| 6 | Quickstart Task | Build a custom `Task` for your use case | [quickstart_task.ipynb](./src/documentation/quickstart_task.ipynb) |
| 7 | Document Index | Connect your proprietary knowledge base | [document_index.ipynb](./src/documentation/document_index.ipynb) |
| 8 | Human Evaluation | Connect to Argilla for manual evaluation | [human_evaluation.ipynb](./src/documentation/human_evaluation.ipynb) |
| 9 | Performance tips | Contains some small tips for performance | [performance_tips.ipynb](./src/documentation/performance_tips.ipynb) |
| 10 | Deployment | Shows how to deploy a Task in a minimal FastAPI app. | [fastapi_tutorial.ipynb](./src/documentation/fastapi_tutorial.ipynb) |
| 11 | Issue Classification | Deploy a Task in Kubernetes to classify Jira issues | [Found in adjacent repository](https://github.com/Aleph-Alpha/IL-Classification-Journey) |
| Order | Topic | Description | Notebook 📓 |
|-------|------------------------|------------------------------------------------------|------------------------------------------------------------------------------------------|
| 1 | Summarization | Summarize a document | [summarization.ipynb](./src/documentation/summarization.ipynb) |
| 2 | Question Answering | Various approaches for QA | [qa.ipynb](./src/documentation/qa.ipynb) |
| 3 | Classification | Learn about two methods of classification | [classification.ipynb](./src/documentation/classification.ipynb) |
| 4 | Evaluation | Evaluate LLM-based methodologies | [evaluation.ipynb](./src/documentation/evaluation.ipynb) |
| 5     | Parameter Optimization | Compare `Task` configurations for optimization       | [parameter_optimization.ipynb](./src/documentation/parameter_optimization.ipynb)          |
| 6     | Elo QA Evaluation      | Evaluate QA tasks in an Elo ranking                  | [elo_qa_eval.ipynb](./src/documentation/elo_qa_eval.ipynb)                                 |
| 7     | Quickstart Task        | Build a custom `Task` for your use case              | [quickstart_task.ipynb](./src/documentation/quickstart_task.ipynb)                         |
| 8     | Document Index         | Connect your proprietary knowledge base              | [document_index.ipynb](./src/documentation/document_index.ipynb)                           |
| 9     | Human Evaluation       | Connect to Argilla for manual evaluation             | [human_evaluation.ipynb](./src/documentation/human_evaluation.ipynb)                       |
| 10    | Performance tips       | Contains some small tips for performance             | [performance_tips.ipynb](./src/documentation/performance_tips.ipynb)                       |
| 11    | Deployment             | Shows how to deploy a Task in a minimal FastAPI app. | [fastapi_tutorial.ipynb](./src/documentation/fastapi_tutorial.ipynb)                       |
| 12    | Issue Classification   | Deploy a Task in Kubernetes to classify Jira issues  | [Found in adjacent repository](https://github.com/Aleph-Alpha/IL-Classification-Journey) |

## How-Tos

314 changes: 314 additions & 0 deletions src/documentation/parameter_optimization.ipynb
@@ -0,0 +1,314 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0",
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import random\n",
"import string\n",
"from typing import Iterable\n",
"\n",
"from pydantic import BaseModel\n",
"\n",
"from intelligence_layer.core import Input, Task, TaskSpan\n",
"from intelligence_layer.evaluation import (\n",
" AggregationLogic,\n",
" Aggregator,\n",
" Evaluation,\n",
" Evaluator,\n",
" Example,\n",
" ExpectedOutput,\n",
" InMemoryAggregationRepository,\n",
" InMemoryDatasetRepository,\n",
" InMemoryEvaluationRepository,\n",
" InMemoryRunRepository,\n",
" Runner,\n",
" SingleOutputEvaluationLogic,\n",
" aggregation_overviews_to_pandas,\n",
")\n",
"\n",
"\n",
"class DummyTask(Task[str, str]):\n",
" def __init__(self, model: str, prompt: str):\n",
" self.model = model\n",
" self.prompt = prompt\n",
"\n",
" def do_run(self, input: str, task_span: TaskSpan) -> str:\n",
" wordlist = [\n",
" \"apple\",\n",
" \"banana\",\n",
" \"car\",\n",
" \"dog\",\n",
" \"elephant\",\n",
" \"fish\",\n",
" \"goat\",\n",
" \"hat\",\n",
" \"igloo\",\n",
" \"jacket\",\n",
" ]\n",
" sentences = [\n",
" \"Once upon a time,\",\n",
" \"In a land far, far away,\",\n",
" \"Suddenly,\",\n",
" \"One day,\",\n",
" \"In the morning,\",\n",
" ]\n",
"\n",
" random.seed(hash(input) + hash(self.model)) # Set the seed based on the prompt\n",
"\n",
" story = self.prompt + \" \"\n",
" for _ in range(10):\n",
" sentence = random.choice(sentences)\n",
" word = random.choice(wordlist)\n",
" story += sentence + \" \" + word + \" \"\n",
" return story\n",
"\n",
"\n",
"class DummyEvaluation(BaseModel):\n",
" text_length: int\n",
" normalized_capital_count: float\n",
"\n",
"\n",
"class DummyEvaluationLogic(\n",
" SingleOutputEvaluationLogic[str, str, None, DummyEvaluation]\n",
"):\n",
" def do_evaluate_single_output(\n",
" self, example: Example[Input, ExpectedOutput], output: str\n",
" ) -> Evaluation:\n",
" return DummyEvaluation(\n",
" text_length=len(output),\n",
" normalized_capital_count=sum(c.isupper() for c in output) / len(output),\n",
" )\n",
"\n",
"\n",
"class DummyAggregatedEvaluation(BaseModel):\n",
" avg_length: float\n",
" avg_normalized_capital_count: float\n",
"\n",
"\n",
"class DummyAggregationLogic(\n",
" AggregationLogic[DummyEvaluation, DummyAggregatedEvaluation]\n",
"):\n",
" def aggregate(\n",
" self, evaluations: Iterable[DummyEvaluation]\n",
" ) -> DummyAggregatedEvaluation:\n",
" eval_list = list(evaluations)\n",
" avg_length = sum([s.text_length for s in eval_list]) / len(eval_list)\n",
" avg_normalized_capital_count = sum(\n",
" [s.normalized_capital_count for s in eval_list]\n",
" ) / len(eval_list)\n",
" return DummyAggregatedEvaluation(\n",
" avg_length=avg_length,\n",
" avg_normalized_capital_count=avg_normalized_capital_count,\n",
" )\n",
"\n",
"\n",
"# Initialize the dataset\n",
"random.seed(42)\n",
"examples = [\n",
" Example(\n",
" input=\"\".join(random.choices(string.ascii_letters, k=random.randint(1, 50))),\n",
" expected_output=None,\n",
" )\n",
" for i in range(10)\n",
"]\n",
"\n",
"dataset_repository = InMemoryDatasetRepository()\n",
"dataset = dataset_repository.create_dataset(\n",
" examples=examples, dataset_name=\"my-dataset\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "1",
"metadata": {},
"source": [
"# Optimizing Tasks by Comparing Aggregations\n",
"\n",
"In this tutorial we demonstrate how to optimize a given `Task` that depends on a `model` and a `prompt` parameter. This is done by evaluating each combination of parameters for the tasks separately and then comparing the resulting aggregations.\n",
"\n",
"In this scenario, our task does not depend on an LLM, for the sake of execution speed. However, the demonstrated principles generalize to other use cases.\n",
"\n",
"## Setup\n",
"\n",
"We assume the dataset, the `Task` and `DatasetRepository` are already given, so we can start by instantiating the remaining repositories and our `Evaluator`. The `EXPERIMENT_NAME` will later be used to identify the aggregations of interest. Therefore, we pass it to the `Evaluator` and later to the `Runner` and `Aggregator`. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {},
"outputs": [],
"source": [
"EXPERIMENT_NAME = \"compare-tasks\"\n",
"\n",
"# The `DatasetRepository` is named `dataset_repository`\n",
"run_repository = InMemoryRunRepository()\n",
"evaluation_repository = InMemoryEvaluationRepository()\n",
"aggregation_repository = InMemoryAggregationRepository()\n",
"\n",
"evaluator = Evaluator(\n",
" dataset_repository,\n",
" run_repository,\n",
" evaluation_repository,\n",
" EXPERIMENT_NAME,\n",
" DummyEvaluationLogic(),\n",
")"
]
},
{
"cell_type": "markdown",
"id": "3",
"metadata": {},
"source": [
"## Running Experiments for Different Configurations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4",
"metadata": {},
"outputs": [],
"source": [
"# Definition of parameters\n",
"model_list = [\"model a\", \"model b\", \"model c\"]\n",
"prompt_list = [\n",
" \"A nice story starts with:\",\n",
" \"Some kind of prompt\",\n",
" \"No prompt at all\",\n",
" \"OPTIMIZING PROMPTS IS HARD TO DO\",\n",
"]\n",
"\n",
"# Loop over all combinations of parameters and run the `Task` for each combination.\n",
"# Note, that this can be **very** expensive for large sets of parameters.\n",
"for model, prompt in itertools.product(model_list, prompt_list):\n",
" dummy_task = DummyTask(model=model, prompt=prompt)\n",
"\n",
" # The description and the Experiment will later be used to identify the run parameters. Take special note of the delimiter '|'.\n",
" description = f\"|{model}|{prompt}|\"\n",
" runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n",
" run_overview = runner.run_dataset(dataset.id, description=description)\n",
"\n",
" eval_overview = evaluator.evaluate_runs(run_overview.id, description=description)\n",
"\n",
" aggregator = Aggregator(\n",
" evaluation_repository,\n",
" aggregation_repository,\n",
" EXPERIMENT_NAME + \":\" + description,\n",
" DummyAggregationLogic(),\n",
" )\n",
" aggregator.aggregate_evaluation(eval_overview.id)"
]
},
{
"cell_type": "markdown",
"id": "5",
"metadata": {},
"source": [
"## Comparison of Different Configurations\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6",
"metadata": {},
"outputs": [],
"source": [
"# Retrieve all aggregations and filter them by desired criteria, i.e., the `EXPERIMENT_NAME`.\n",
"aggregations_of_interest = [\n",
" overview\n",
" for overview in aggregation_repository.aggregation_overviews(\n",
" aggregation_type=DummyAggregatedEvaluation\n",
" )\n",
" if overview.description.startswith(EXPERIMENT_NAME)\n",
"]\n",
"\n",
"# Convert the desired aggregation into a pandas dataframe\n",
"formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)"
]
},
{
"cell_type": "markdown",
"id": "7",
"metadata": {},
"source": [
" The following steps are very specific to the experiment setup, mostly depending on standard pandas knowledge. They are just one example of how one might analyze the configurations."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8",
"metadata": {},
"outputs": [],
"source": [
"aggregation_fields = list(DummyAggregatedEvaluation.model_fields.keys())\n",
"formated_aggregations = formated_aggregations[[\"description\"] + aggregation_fields]\n",
"formated_aggregations[[\"model\", \"prompt\"]] = formated_aggregations[\n",
" \"description\"\n",
"].str.split(\"|\", expand=True)[[1, 2]]\n",
"formated_aggregations.drop(columns=\"description\", inplace=True)\n",
"\n",
"display(\n",
" formated_aggregations.sort_values(\n",
" by=\"avg_normalized_capital_count\", ascending=False\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9",
"metadata": {},
"outputs": [],
"source": [
"formated_aggregations.pivot(\n",
" index=\"model\", columns=\"prompt\", values=\"avg_normalized_capital_count\"\n",
").plot(kind=\"box\", rot=90, title=\"avg_normalized_capital_count\")\n",
"formated_aggregations.pivot(index=\"prompt\", columns=\"model\", values=\"avg_length\").plot(\n",
" kind=\"box\", title=\"avg_length\"\n",
")\n",
"pass"
]
},
{
"cell_type": "markdown",
"id": "10",
"metadata": {},
"source": [
"With these results, it's easy to see which prompt is best to optimize our score! The model on the other hand does not seem to have a big impact on our metrics."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
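The notebook's conclusion (which prompt scores best) can also be read off programmatically. Below is a minimal pandas sketch, assuming the `formatted_aggregations` DataFrame built in the notebook above, with its `model`, `prompt`, and metric columns; it is illustrative only and not part of the commit.

```python
# Sketch under the assumption that `formatted_aggregations` exists as built in the
# notebook above, with "model", "prompt", and the aggregated metric columns.

# Configuration with the highest average normalized capital count.
best_row = formatted_aggregations.loc[
    formatted_aggregations["avg_normalized_capital_count"].idxmax()
]
print(f"Best configuration: model={best_row['model']!r}, prompt={best_row['prompt']!r}")

# Mean metric per prompt, confirming that the prompt drives the score far more than the model.
print(
    formatted_aggregations.groupby("prompt")["avg_normalized_capital_count"]
    .mean()
    .sort_values(ascending=False)
)
```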
3 changes: 3 additions & 0 deletions src/intelligence_layer/evaluation/__init__.py
@@ -108,6 +108,9 @@
from .infrastructure.repository_navigator import (
RepositoryNavigator as RepositoryNavigator,
)
from .infrastructure.repository_navigator import (
aggregation_overviews_to_pandas as aggregation_overviews_to_pandas,
)
from .infrastructure.repository_navigator import (
evaluation_lineages_to_pandas as evaluation_lineages_to_pandas,
)
40 changes: 40 additions & 0 deletions src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
@@ -7,6 +7,10 @@
from rich.tree import Tree

from intelligence_layer.core.task import Input, Output
from intelligence_layer.evaluation.aggregation.domain import (
AggregatedEvaluation,
AggregationOverview,
)
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.evaluation.domain import (
@@ -108,6 +112,42 @@ def evaluation_lineages_to_pandas(
return df


def aggregation_overviews_to_pandas(
aggregation_overviews: Sequence[AggregationOverview[AggregatedEvaluation]],
unwrap_statistics: bool = True,
strict: bool = True,
) -> pd.DataFrame:
"""Converts aggregation overviews to a pandas table for easier comparison.
Args:
aggregation_overviews: Overviews to convert.
unwrap_statistics: Unwrap the `statistics` field in the overviews into separate columns.
Defaults to True.
strict: Allow only overviews with exactly equal `statistics` types. Defaults to True.
Returns:
A pandas :class:`DataFrame` containing an overview per row with fields as columns.
"""
overviews = list(aggregation_overviews)
if strict and len(overviews) > 1:
first_type = overviews[0].statistics.__class__
if any(
overview.statistics.__class__ != first_type for overview in overviews[1:]
):
raise ValueError(
"Aggregation overviews contain different types, which is not allowed with strict=True"
)

df = pd.DataFrame(
[model.model_dump(mode="json") for model in aggregation_overviews]
)
if unwrap_statistics and "statistics" in df.columns:
df = df.join(pd.DataFrame(df["statistics"].to_list())).drop(
columns=["statistics"]
)
return df


class RepositoryNavigator:
"""The `RepositoryNavigator` is used to retrieve coupled data from multiple repositories."""

3 changes: 0 additions & 3 deletions tests/evaluation/test_evaluator.py
@@ -208,11 +208,8 @@ def dummy_pairwise_evaluator(

@fixture
def dummy_aggregator(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
in_memory_evaluation_repository: InMemoryEvaluationRepository,
in_memory_aggregation_repository: InMemoryAggregationRepository,
dummy_eval_logic: DummyEvaluationLogic,
dummy_aggregate_logic: DummyAggregationLogic,
) -> Aggregator[DummyEvaluation, DummyAggregatedEvaluationWithResultList]:
return Aggregator(
132 changes: 131 additions & 1 deletion tests/evaluation/test_repository_navigator.py
@@ -1,10 +1,11 @@
from typing import Sequence
from typing import Sequence, TypeVar

import pytest
from pydantic import BaseModel
from pytest import fixture

from intelligence_layer.core import Task, TaskSpan
from intelligence_layer.core.tracer.tracer import utc_now
from intelligence_layer.evaluation import (
Dataset,
DatasetRepository,
@@ -24,6 +25,10 @@
evaluation_lineages_to_pandas,
run_lineages_to_pandas,
)
from intelligence_layer.evaluation.aggregation.domain import AggregationOverview
from intelligence_layer.evaluation.infrastructure.repository_navigator import (
aggregation_overviews_to_pandas,
)


class DummyExample(Example[str, str]):
@@ -401,3 +406,128 @@ def test_evaluation_lineages_to_pandas(
count += 1

assert count == len(df)


class AggregationDummy(BaseModel):
score: float = 0.5
value: float = 0.3


T = TypeVar("T", bound=BaseModel)


def create_aggregation_overview(
statistics: T,
) -> AggregationOverview[T]:
return AggregationOverview(
evaluation_overviews=frozenset(),
id="aggregation-id",
start=utc_now(),
end=utc_now(),
successful_evaluation_count=5,
crashed_during_evaluation_count=3,
description="dummy-evaluator",
statistics=statistics,
)


@pytest.mark.parametrize("length", [1, 2])
def test_aggregation_overviews_to_pandas(length: int) -> None:
# given
overview = create_aggregation_overview(AggregationDummy())
# when
df = aggregation_overviews_to_pandas([overview] * length, unwrap_statistics=False)
# then
assert len(df) == length
assert set(AggregationOverview.model_fields.keys()) == set(df.columns)


def test_aggregation_overviews_to_pandas_unwrap_statistics() -> None:
overview = create_aggregation_overview(AggregationDummy())

df = aggregation_overviews_to_pandas([overview], unwrap_statistics=True)

assert "score" in df.columns
assert "value" in df.columns
assert "statistics" not in df.columns
assert all(df["score"] == 0.5)
assert all(df["value"] == 0.3)

class AggregationDummy2(BaseModel):
score_2: float = 0.5
value_2: float = 0.3

overview2 = create_aggregation_overview(AggregationDummy2())

df = aggregation_overviews_to_pandas([overview2], unwrap_statistics=True)
assert "score_2" in df.columns
assert "value_2" in df.columns
assert "statistics" not in df.columns


def test_aggregation_overviews_to_pandas_works_with_eval_overviews() -> None:
# given
eval_overview = EvaluationOverview(
run_overviews=frozenset(),
id="id",
start_date=utc_now(),
end_date=utc_now(),
successful_evaluation_count=1,
failed_evaluation_count=1,
description="",
)
overview = AggregationOverview(
evaluation_overviews=frozenset([eval_overview]),
id="aggregation-id",
start=utc_now(),
end=utc_now(),
successful_evaluation_count=5,
crashed_during_evaluation_count=3,
description="dummy-evaluator",
statistics=AggregationDummy(),
)
# when
df = aggregation_overviews_to_pandas([overview], unwrap_statistics=False)
# then
assert len(df) == 1


def test_aggregation_overviews_to_pandas_works_with_empty_input() -> None:
# when
df = aggregation_overviews_to_pandas([])
# then
assert len(df) == 0


def test_aggregation_overviews_does_not_work_with_different_aggregations() -> None:
# given
overview = create_aggregation_overview(AggregationDummy())

class OtherVariableNames(BaseModel):
not_score: float = 0.5
not_value: float = 0.3

other_variable_names = create_aggregation_overview(OtherVariableNames())

class SameNameOtherClassAggregation(BaseModel):
not_score: float = 0.5
not_value: float = 0.3

same_variable_names_other_class = create_aggregation_overview(
SameNameOtherClassAggregation()
)

# when then
with pytest.raises(ValueError):
df = aggregation_overviews_to_pandas([overview, other_variable_names])
df = aggregation_overviews_to_pandas([overview, other_variable_names], strict=False)

with pytest.raises(ValueError):
df = aggregation_overviews_to_pandas(
[overview, same_variable_names_other_class]
)
df = aggregation_overviews_to_pandas(
[overview, same_variable_names_other_class], strict=False
)

assert len(df) == 2
3 changes: 3 additions & 0 deletions tests/use_cases/qa/test_retriever_based_qa.py
@@ -1,3 +1,4 @@
import pytest
from pytest import fixture

from intelligence_layer.connectors.document_index.document_index import DocumentPath
@@ -34,6 +35,7 @@ def retriever_based_qa_with_document_index(
)


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_retriever_based_qa_using_in_memory_retriever(
retriever_based_qa_with_in_memory_retriever: RetrieverBasedQa[int],
no_op_tracer: NoOpTracer,
@@ -46,6 +48,7 @@ def test_retriever_based_qa_using_in_memory_retriever(
assert output.subanswers[0].id == 3


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_retriever_based_qa_with_document_index(
retriever_based_qa_with_document_index: RetrieverBasedQa[DocumentPath],
no_op_tracer: NoOpTracer,
