WIP: feat: Add more code to How-to run incremental eval
TASK: IL-313
SebastianNiehusAA committed May 22, 2024
1 parent c168619 commit 77b5b8b
Showing 2 changed files with 100 additions and 14 deletions.
16 changes: 16 additions & 0 deletions src/documentation/how_tos/example_data.py
@@ -20,6 +20,7 @@
from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
ComparisonEvaluation,
EloEvaluationLogic,
IncrementalEvaluationLogic,
Matches,
MatchOutcome,
)
@@ -48,6 +49,21 @@ def do_evaluate(
)


class DummyIncrementalEvaluationLogic(
IncrementalEvaluationLogic[str, str, str, DummyEvaluation]
):
def do_incremental_evaluate(
self,
example: Example[str, str],
outputs: list[SuccessfulExampleOutput[str]],
already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
) -> DummyEvaluation:
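# Dummy behavior: concatenate all new outputs into one string and ignore
# already_evaluated_outputs; a real implementation would compare the new
# outputs against the previously evaluated ones.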
output_str = "(" + (", ".join(o.output for o in outputs)) + ")"
return DummyEvaluation(
eval=f"{example.input}, {example.expected_output}, {output_str} -> evaluation"
)


class DummyEloEvaluationLogic(EloEvaluationLogic[str, str, str]):
def grade(
self,
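For orientation, here is how the new logic behaves when invoked directly. This is a minimal sketch, not part of the commit: it assumes `Example` and `SuccessfulExampleOutput` are exported from `intelligence_layer.evaluation` and take the fields shown (`input`/`expected_output` and `run_id`/`example_id`/`output`); in normal use the `IncrementalEvaluator` makes this call internally.

from documentation.how_tos.example_data import DummyIncrementalEvaluationLogic
from intelligence_layer.evaluation import Example, SuccessfulExampleOutput

example = Example(input="input1", expected_output="expected_output1")
new_output = SuccessfulExampleOutput(
    run_id="run-1", example_id="example-1", output="output1"
)

logic = DummyIncrementalEvaluationLogic()
# Nothing has been evaluated yet, so already_evaluated_outputs is empty
evaluation = logic.do_incremental_evaluate(
    example, outputs=[new_output], already_evaluated_outputs=[]
)
print(evaluation.eval)  # input1, expected_output1, (output1) -> evaluation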
@@ -6,10 +6,23 @@
"metadata": {},
"outputs": [],
"source": [
"from documentation.how_tos.example_data import DummyEloEvaluationLogic, example_data\n",
"from evaluation.conftest import DummyTask\n",
"\n",
"from documentation.how_tos.example_data import (\n",
" DummyAggregationLogic,\n",
" DummyExample,\n",
" DummyIncrementalEvaluationLogic,\n",
")\n",
"from intelligence_layer.evaluation import (\n",
" Aggregator,\n",
" IncrementalEvaluator,\n",
" InMemoryAggregationRepository,\n",
" InMemoryEvaluationRepository,\n",
" InMemoryRunRepository,\n",
" Runner,\n",
")\n",
"from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (\n",
" InMemoryDatasetRepository,\n",
")"
]
},
@@ -23,7 +36,14 @@
" - run multiple tasks and configurations on the same dataset\n",
" - perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation\n",
" - run aggregation on these evaluations\n",
" - "
" - \n",
" \n",
"# How to implement elo evaluations\n",
"1. Run your tasks on the datasets you want to evaluate (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n",
" - When evaluating multiple runs, all of them need the same data types \n",
"2. Initialize `IncrementalEvaluator` with the repositories and an `EloEvaluationLogic` that is specific to your use case. \n",
"3. Run the evaluator to evaluate all examples and create a single `EvaluationOverview`\n",
"4. (Optional) Save the evaluation id for later use"
]
},
{
@@ -32,36 +52,86 @@
"metadata": {},
"outputs": [],
"source": [
"# Step 0 Define ne\n",
"\n",
"\n",
"my_example_data = example_data()\n",
"print()\n",
"run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
"# Preparation\n",
"examples = [\n",
" DummyExample(input=\"input1\", expected_output=\"expected_output1\", data=\"data1\")\n",
"]\n",
"\n",
"# Step 1\n",
"dataset_repository = my_example_data.dataset_repository\n",
"run_repository = my_example_data.run_repository\n",
"dataset_repository = InMemoryDatasetRepository()\n",
"my_dataset = dataset_repository.create_dataset(examples, \"MyDataset\")\n",
"\n",
"run_repository = InMemoryRunRepository()\n",
"evaluation_repository = InMemoryEvaluationRepository()\n",
"evaluation_logic = DummyEloEvaluationLogic()\n",
"evaluation_logic = DummyIncrementalEvaluationLogic()\n",
"\n",
"aggregation_repository = InMemoryAggregationRepository()\n",
"aggregation_logic = DummyAggregationLogic()\n",
"\n",
"first_task = DummyTask()\n",
"second_task = DummyTask()\n",
"\n",
"first_runner = Runner(first_task, dataset_repository, run_repository, \"MyFirstRun\")\n",
"second_runner = Runner(second_task, dataset_repository, run_repository, \"MySecondRun\")\n",
"\n",
"first_run_overview = first_runner.run_dataset(my_dataset.id)\n",
"second_run_overview = second_runner.run_dataset(my_dataset.id)\n",
"\n",
"print(first_run_overview.id)\n",
"print(second_run_overview.id)\n",
"\n",
"\n",
"# Step 2\n",
"run_overview_ids_for_first_evaluation = []\n",
"for run_overview in run_repository.run_overviews():\n",
" if run_overview.description == \"MyFirstRun\":\n",
" run_overview_ids_for_first_evaluation.append(run_overview.id)\n",
"\n",
"evaluator = IncrementalEvaluator(\n",
" dataset_repository,\n",
" run_repository,\n",
" evaluation_repository,\n",
" \"My dummy evaluation\",\n",
" \"My incremental evaluation\",\n",
" evaluation_logic,\n",
")\n",
"evaluation_overview_first_task = evaluator.evaluate_runs(first_run_overview.id)\n",
"\n",
"evaluation_overview = evaluator.evaluate_runs(*run_ids)\n",
"# Step 2.5 Aggregate single\n",
"aggregator = Aggregator(\n",
" evaluation_repository, aggregation_repository, \"MyAggregator\", aggregation_logic\n",
")\n",
"first_aggregation_overview = aggregator.aggregate_evaluation(\n",
" *evaluation_repository.evaluation_overview_ids()\n",
")\n",
"print(first_aggregation_overview)\n",
"\n",
"# Step 3\n",
"print(evaluation_overview.id)"
"previous_evaluation_ids = evaluation_repository.evaluation_overview_ids()\n",
"evaluator.evaluate_additional_runs(\n",
" *run_repository.run_overview_ids(), previous_evaluation_ids=previous_evaluation_ids\n",
")\n",
"\n",
"# Step 4: Aggregate all\n",
"second_aggregation_overview = aggregator.aggregate_evaluation(\n",
" *evaluation_repository.evaluation_overview_ids()\n",
")\n",
"print(second_aggregation_overview)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
}
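To inspect what the incremental evaluation stored, the individual example evaluations can be read back from the repository. A minimal sketch; the `example_evaluations(evaluation_id, evaluation_type)` accessor and the `example_id`/`result` fields are assumptions about the repository API, not shown in this diff:

from documentation.how_tos.example_data import DummyEvaluation

for evaluation_id in evaluation_repository.evaluation_overview_ids():
    # One id per EvaluationOverview created in the notebook above
    for example_evaluation in evaluation_repository.example_evaluations(
        evaluation_id, DummyEvaluation
    ):
        print(example_evaluation.example_id, example_evaluation.result)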
