Skip to content

Commit

Permalink
IL-502 add elo evaluation how-to (#846)
Browse files Browse the repository at this point in the history
feat: Add how-to for elo evaluation
* refactor: Move elo evaluation logic from "elo_evaluator.py" into "incremental_evaluator.py"
TASK: IL-502
  • Loading branch information
MerlinKallenbornAA authored May 21, 2024
1 parent c34d1ed commit 607b5fb
Show file tree
Hide file tree
Showing 10 changed files with 260 additions and 134 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

### Breaking Changes
- Changed the behavior of `IncrementalEvaluator::do_evaluate` such that it now sends all `SuccessfulExampleOutput`s to `do_incremental_evaluate` instead of only the new `SuccessfulExampleOutput`s.
-

### New Features
- Add generic `EloEvaluationLogic` class for implementation of Elo evaluation use cases.
- Add `EloQaEvaluationLogic` for Elo evaluation of QA runs, with optional later addition of more runs to an existing evaluation.
- Add `EloAggregationAdapter` class to simplify using the `ComparisonEvaluationAggregationLogic` for different Elo use cases.
- Add `elo_qa_eval` tutorial notebook describing the use of an (incremental) Elo evaluation use case for QA models.
- Add `how_to_implement_elo_evaluations` how-to as a skeleton for implementing Elo evaluation cases.

### Fixes
- `ExpandChunks`-task is now fast even for very large documents

Expand Down
34 changes: 34 additions & 0 deletions src/documentation/how_tos/example_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
SuccessfulExampleOutput,
)
from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
ComparisonEvaluation,
EloEvaluationLogic,
Matches,
MatchOutcome,
)


class DummyExample(Example[str, str]):
Expand All @@ -42,6 +48,34 @@ def do_evaluate(
)


class DummyEloEvaluationLogic(EloEvaluationLogic[str, str, str]):
    """Placeholder Elo evaluation logic for the how-to documentation.

    Every comparison is graded as a draw, and the incremental evaluation
    always reports one fixed match between two hard-coded outputs.
    """

    def grade(
        self,
        first: SuccessfulExampleOutput[str],
        second: SuccessfulExampleOutput[str],
        example: Example[str, str],
    ) -> MatchOutcome:
        """Return ``MatchOutcome.DRAW`` regardless of the arguments.

        Args:
            first: Output of the first player (not inspected).
            second: Output of the second player (not inspected).
            example: The example both outputs belong to (not inspected).

        Returns:
            Always ``MatchOutcome.DRAW``.
        """
        return MatchOutcome.DRAW

    def do_incremental_evaluate(
        self,
        example: Example[str, str],
        outputs: list[SuccessfulExampleOutput[str]],
        already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
    ) -> Matches:
        """Build a single fixed match between two dummy outputs.

        Args:
            example: Forwarded unchanged to ``grade``.
            outputs: Not used by this dummy implementation.
            already_evaluated_outputs: Not used by this dummy implementation.

        Returns:
            A ``Matches`` holding exactly one ``ComparisonEvaluation`` between
            the players ``"1"`` and ``"2"``, graded via ``grade``.
        """
        # The contestants are hard-coded; the passed-in outputs are ignored.
        contestant_one = SuccessfulExampleOutput(
            run_id="1", example_id="1", output="1"
        )
        contestant_two = SuccessfulExampleOutput(
            run_id="2", example_id="2", output="2"
        )
        result = self.grade(contestant_one, contestant_two, example)
        only_match = ComparisonEvaluation(
            first_player="1",
            second_player="2",
            outcome=result,
        )
        return Matches(comparison_evaluations=[only_match])


class DummyAggregation(BaseModel):
num_evaluations: int

Expand Down
96 changes: 96 additions & 0 deletions src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from documentation.how_tos.example_data import DummyEloEvaluationLogic, example_data\n",
"from intelligence_layer.evaluation import (\n",
" IncrementalEvaluator,\n",
" InMemoryEvaluationRepository,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How to implement elo evaluations\n",
"0. Run your tasks on the datasets you want to evaluate (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n",
" - When evaluating multiple runs, all of them need the same data types \n",
"1. Initialize all necessary repositories for the `IncrementalEvaluator`, and an `EloEvaluationLogic` that is specific to your use case. \n",
"2. Run the evaluator to evaluate all examples and create a single `EvaluationOverview`\n",
"3. (Optional) Save the evaluation id for later use"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Step 0\n",
"\n",
"\n",
"my_example_data = example_data()\n",
"print()\n",
"run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
"\n",
"# Step 1\n",
"dataset_repository = my_example_data.dataset_repository\n",
"run_repository = my_example_data.run_repository\n",
"evaluation_repository = InMemoryEvaluationRepository()\n",
"evaluation_logic = DummyEloEvaluationLogic()\n",
"\n",
"# Step 2\n",
"evaluator = IncrementalEvaluator(\n",
" dataset_repository,\n",
" run_repository,\n",
" evaluation_repository,\n",
" \"My dummy evaluation\",\n",
" evaluation_logic,\n",
")\n",
"\n",
"evaluation_overview = evaluator.evaluate_runs(*run_ids)\n",
"\n",
"# Step 3\n",
"print(evaluation_overview.id)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "intelligence-layer-aL2cXmJM-py3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
17 changes: 11 additions & 6 deletions src/intelligence_layer/evaluation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,23 +63,28 @@
from .evaluation.evaluator.async_evaluator import (
AsyncEvaluationRepository as AsyncEvaluationRepository,
)
from .evaluation.evaluator.elo_evaluator import (
ComparisonEvaluation as ComparisonEvaluation,
)
from .evaluation.evaluator.elo_evaluator import EloEvaluationLogic as EloEvaluationLogic
from .evaluation.evaluator.elo_evaluator import Matches as Matches
from .evaluation.evaluator.elo_evaluator import MatchOutcome as MatchOutcome
from .evaluation.evaluator.evaluator import EvaluationLogic as EvaluationLogic
from .evaluation.evaluator.evaluator import Evaluator as Evaluator
from .evaluation.evaluator.evaluator import (
SingleOutputEvaluationLogic as SingleOutputEvaluationLogic,
)
from .evaluation.evaluator.incremental_evaluator import (
ComparisonEvaluation as ComparisonEvaluation,
)
from .evaluation.evaluator.incremental_evaluator import (
EloEvaluationLogic as EloEvaluationLogic,
)
from .evaluation.evaluator.incremental_evaluator import (
EloGradingInput as EloGradingInput,
)
from .evaluation.evaluator.incremental_evaluator import (
IncrementalEvaluationLogic as IncrementalEvaluationLogic,
)
from .evaluation.evaluator.incremental_evaluator import (
IncrementalEvaluator as IncrementalEvaluator,
)
from .evaluation.evaluator.incremental_evaluator import Matches as Matches
from .evaluation.evaluator.incremental_evaluator import MatchOutcome as MatchOutcome
from .evaluation.file_evaluation_repository import (
AsyncFileEvaluationRepository as AsyncFileEvaluationRepository,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from intelligence_layer.evaluation.aggregation.accumulator import MeanAccumulator
from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
ComparisonEvaluation,
Matches,
MatchOutcome,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from intelligence_layer.evaluation.evaluation.evaluator.base_evaluator import (
EvaluationLogicBase,
)
from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
ComparisonEvaluation,
MatchOutcome,
)
Expand Down

This file was deleted.

Loading

0 comments on commit 607b5fb

Please sign in to comment.