WIP CONCEPT for new Argilla Evaluation
SebastianNiehusTNG committed Apr 4, 2024
1 parent 9d9b26a commit 7fbfe5d
Showing 4 changed files with 41 additions and 10 deletions.
1 change: 0 additions & 1 deletion src/intelligence_layer/evaluation/aggregation/domain.py
@@ -22,7 +22,6 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True)
id: Aggregation overview ID.
start: Start timestamp of the aggregation.
end: End timestamp of the aggregation.
end: The time when the evaluation run ended
successful_evaluation_count: The number of examples that were successfully evaluated.
crashed_during_evaluation_count: The number of examples that crashed during evaluation.
failed_evaluation_count: The number of examples that failed during evaluation.
32 changes: 28 additions & 4 deletions src/intelligence_layer/evaluation/evaluation/argilla_evaluator.py
@@ -3,6 +3,8 @@
from itertools import combinations
from typing import Mapping, Optional

from pydantic import BaseModel

from intelligence_layer.connectors.argilla.argilla_client import (
ArgillaClient,
ArgillaEvaluation,
@@ -17,6 +19,7 @@
ArgillaEvaluationRepository,
RecordDataSequence,
)
from intelligence_layer.evaluation.evaluation.domain import Evaluation
from intelligence_layer.evaluation.evaluation.evaluation_repository import (
EvaluationRepository,
)
@@ -35,8 +38,19 @@ def do_evaluate(
self,
example: Example[Input, ExpectedOutput],
*output: SuccessfulExampleOutput[Output],
) -> RecordDataSequence:
return self._to_record(example, *output)
) -> Evaluation:
# Here we would put the current aggregation logic to
# fetch the evaluations from the argilla workspace and "finalize" them
# We then return a FULL EvaluationOverview
# The current ArgillaAggregator will then become generic EloAggregator
pass
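A minimal sketch of what this finalization step might grow into, reading the comments above literally. The `_client`, `_dataset_id`, and `_aggregate_feedback` attributes are assumptions for illustration, not part of this commit:

```python
# Sketch only: assumes the evaluator holds an Argilla client and dataset id,
# and that human feedback can be filtered per example by its example_id.
def do_evaluate(
    self,
    example: Example[Input, ExpectedOutput],
    *output: SuccessfulExampleOutput[Output],
) -> Evaluation:
    # Fetch the annotations recorded by humans in the Argilla workspace ...
    feedback = [
        e
        for e in self._client.evaluations(self._dataset_id)  # hypothetical accessor
        if e.example_id == example.id
    ]
    # ... and "finalize" them into a regular Evaluation for this example.
    return self._aggregate_feedback(feedback)  # hypothetical helper
```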


class ArgillaSubmissionLogic(BaseModel):
"""
Class that translates the example into an Argilla-compatible record
and handles the submission to the Argilla workspace.
"""

@abstractmethod
def _to_record(
@@ -46,14 +60,18 @@ def _to_record(
) -> RecordDataSequence:
"""This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData`
Args:
example: The example to be translated.
output: The output of the example that was run.
"""
...
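For illustration, a concrete submission logic in the spirit of the dummy used in the tests below. The `RecordData` fields shown (`content`, `example_id`) follow the existing connector; the import paths and the one-record-per-output layout are assumptions:

```python
# Import paths assumed to match the diff above.
from intelligence_layer.connectors.argilla.argilla_client import (
    RecordData,
    RecordDataSequence,
)

class StringTaskSubmissionLogic(ArgillaSubmissionLogic):
    """Illustrative only: emits one Argilla record per successful output."""

    def _to_record(
        self,
        example: Example[str, str],
        *output: SuccessfulExampleOutput[str],
    ) -> RecordDataSequence:
        return RecordDataSequence(
            records=[
                RecordData(
                    content={"input": example.input, "output": out.output},
                    example_id=example.id,
                )
                for out in output
            ]
        )
```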


class ArgillaSubmission(BaseModel):
# Should contain info to identify a submission to the argilla workspace
pass
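The commit only notes that this class should identify a submission; one plausible shape, with every field an assumption:

```python
class ArgillaSubmission(BaseModel):
    # All fields are guesses at what "identifying a submission" requires.
    dataset_id: str              # the Argilla dataset the records were sent to
    example_ids: frozenset[str]  # examples submitted for human annotation
```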


class ArgillaEvaluator(
Evaluator[Input, Output, ExpectedOutput, ArgillaEvaluation],
ABC,
@@ -83,6 +101,7 @@ def __init__(
run_repository: RunRepository,
evaluation_repository: ArgillaEvaluationRepository,
description: str,
submission_logic: ArgillaSubmissionLogic,
evaluation_logic: ArgillaEvaluationLogic[Input, Output, ExpectedOutput],
) -> None:
super().__init__(
@@ -96,8 +115,13 @@
def evaluation_type(self) -> type[ArgillaEvaluation]: # type: ignore
return ArgillaEvaluation

def submit(self) -> ArgillaSubmission:
# Call the ArgillaSubmissionLogic to send the examples to the argilla workspace
# This takes code from argilla_evaluation_repository
pass
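Putting the comments above into code, `submit()` could look roughly like this. The `_examples_with_outputs` helper and the stored `_client`/`_dataset_id` attributes are invented for the sketch; `add_record` mirrors the existing `ArgillaClient` interface:

```python
def submit(self, *run_ids: str) -> ArgillaSubmission:
    # Sketch: pair examples with their outputs (helper is hypothetical),
    # translate them via the submission logic, and push them to Argilla.
    example_ids = []
    for example, outputs in self._examples_with_outputs(*run_ids):
        for record in self._submission_logic._to_record(example, *outputs).records:
            self._client.add_record(self._dataset_id, record)
        example_ids.append(example.id)
    return ArgillaSubmission(
        dataset_id=self._dataset_id, example_ids=frozenset(example_ids)
    )
```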


class InstructComparisonArgillaEvaluationLogic(
class InstructComparisonArgillaEvaluationLogic( # Will become a submission logic
ArgillaEvaluationLogic[InstructInput, CompleteOutput, None]
):
def __init__(
10 changes: 9 additions & 1 deletion src/intelligence_layer/evaluation/evaluation/domain.py
@@ -72,12 +72,18 @@ class EvaluationOverview(BaseModel, frozen=True):
run_overviews: Overviews of the runs that were evaluated.
id: The unique identifier of this evaluation.
start: The time when the evaluation run was started.
end: End timestamp of the evaluation.
successful_evaluation_count: The number of examples that were successfully evaluated.
crashed_during_evaluation_count: The number of examples that crashed during evaluation.
description: Human-readable description of the evaluator that created the evaluation.
"""

run_overviews: frozenset[RunOverview]
id: str
start: Optional[datetime]
end: datetime
successful_evaluation_count: int
crashed_during_evaluation_count: int
description: str

def __repr__(self) -> str:
@@ -96,11 +102,13 @@ def __str__(self) -> str:
return (
f"Evaluation Overview ID = {self.id}\n"
f"Start time = {self.start}\n"
f"End time = {self.end}\n"
f"Successful evaluation count = {self.successful_evaluation_count}\n"
f"Count of crashed evaluations = {self.failed_evaluation_count}\n"
f'Description = "{self.description}"\n'
f"{run_overview_str}"
)
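With the new fields in place, an overview might be built and printed like this; the values and the pre-existing `run_overview` variable are illustrative:

```python
from datetime import datetime

overview = EvaluationOverview(
    run_overviews=frozenset([run_overview]),  # an existing RunOverview
    id="eval-2024-04-04",
    start=datetime.now(),
    end=datetime.now(),
    successful_evaluation_count=95,
    crashed_during_evaluation_count=5,
    description="Instruct comparison on dataset X",
)
print(overview)  # now also reports the end time and both counts
```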


class EvaluationFailed(Exception):
def __init__(self, evaluation_id: str, failed_count: int) -> None:
super().__init__(
8 changes: 4 additions & 4 deletions tests/evaluation/test_argilla_evaluator.py
@@ -7,7 +7,7 @@
from intelligence_layer.evaluation import (
AggregationLogic,
ArgillaAggregator,
ArgillaEvaluationLogic,
ArgillaSubmissionLogic,
ArgillaEvaluationRepository,
ArgillaEvaluator,
Example,
@@ -43,8 +43,8 @@ def aggregate(
)


class DummyStringTaskArgillaEvaluationLogic(
ArgillaEvaluationLogic[
class DummyStringTaskArgillaSubmissionLogic(
ArgillaSubmissionLogic[
DummyStringInput,
DummyStringOutput,
DummyStringOutput,
@@ -132,7 +132,7 @@ def string_argilla_evaluator(
in_memory_run_repository,
argilla_evaluation_repository,
"dummy-string-task",
DummyStringTaskArgillaEvaluationLogic(),
DummyStringTaskArgillaSubmissionLogic(),
)
return evaluator

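Taken together, the reworked two-phase flow sketched in this commit would be driven roughly like this. The constructor signature follows the diff above, `DummyStringTaskArgillaEvaluationLogic` stands in for the finalization side, and the interplay between `submit` and `evaluate_runs` is still open, so all of this is tentative:

```python
evaluator = ArgillaEvaluator(
    in_memory_dataset_repository,
    in_memory_run_repository,
    argilla_evaluation_repository,
    "dummy-string-task",
    submission_logic=DummyStringTaskArgillaSubmissionLogic(),
    evaluation_logic=DummyStringTaskArgillaEvaluationLogic(),  # finalization side
)

# Phase 1: translate examples to records and send them to Argilla.
submission = evaluator.submit()

# ... human annotators label the records in the Argilla workspace ...

# Phase 2: fetch the feedback and finalize it into an EvaluationOverview.
overview = evaluator.evaluate_runs(run_id)
```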
