WIP CONCEPT for new Argilla Evaluation
SebastianNiehusTNG committed Apr 4, 2024
1 parent 9d9b26a commit 7fbfe5d
Showing 4 changed files with 41 additions and 10 deletions.
1 change: 0 additions & 1 deletion src/intelligence_layer/evaluation/aggregation/domain.py
@@ -22,7 +22,6 @@ class AggregationOverview(BaseModel, Generic[AggregatedEvaluation], frozen=True)
id: Aggregation overview ID.
start: Start timestamp of the aggregation.
end: End timestamp of the aggregation.
end: The time when the evaluation run ended
successful_evaluation_count: The number of examples that were successfully evaluated.
crashed_during_evaluation_count: The number of examples that crashed during evaluation.
failed_evaluation_count: The number of examples that failed during evaluation.
32 changes: 28 additions & 4 deletions src/intelligence_layer/evaluation/evaluation/argilla_evaluator.py
@@ -3,6 +3,8 @@
from itertools import combinations
from typing import Mapping, Optional

from pydantic import BaseModel

from intelligence_layer.connectors.argilla.argilla_client import (
ArgillaClient,
ArgillaEvaluation,
@@ -17,6 +19,7 @@
ArgillaEvaluationRepository,
RecordDataSequence,
)
from intelligence_layer.evaluation.evaluation.domain import Evaluation
from intelligence_layer.evaluation.evaluation.evaluation_repository import (
EvaluationRepository,
)
@@ -35,8 +38,19 @@ def do_evaluate(
self,
example: Example[Input, ExpectedOutput],
*output: SuccessfulExampleOutput[Output],
) -> RecordDataSequence:
return self._to_record(example, *output)
) -> Evaluation:
# Here we would put the current aggregation logic to
# fetch the evaluations from the argilla workspace and "finalize" them
# We then return a FULL EvaluationOverview
# The current ArgillaAggregator will then become generic EloAggregator
pass
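A minimal sketch of what this finalization step might grow into, reading the comments above literally. The `_client`, `_dataset_id`, and `_aggregate_feedback` attributes are assumptions for illustration, not part of this commit:

```python
# Sketch only: assumes the evaluator holds an Argilla client and dataset id,
# and that human feedback can be filtered per example by its example_id.
def do_evaluate(
    self,
    example: Example[Input, ExpectedOutput],
    *output: SuccessfulExampleOutput[Output],
) -> Evaluation:
    # Fetch the annotations recorded by humans in the Argilla workspace ...
    feedback = [
        e
        for e in self._client.evaluations(self._dataset_id)  # hypothetical accessor
        if e.example_id == example.id
    ]
    # ... and "finalize" them into a regular Evaluation for this example.
    return self._aggregate_feedback(feedback)  # hypothetical helper
```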


class ArgillaSubmissionLogic(BaseModel):
"""
Class that translates the example into an Argilla-compatible record
and handles the submission to the Argilla workspace.
"""

@abstractmethod
def _to_record(
@@ -46,14 +60,18 @@ def _to_record(
) -> RecordDataSequence:
"""This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData`
Args:
example: The example to be translated.
output: The output of the example that was run.
"""
...
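For illustration, a concrete submission logic in the spirit of the dummy used in the tests below. The `RecordData` fields shown (`content`, `example_id`) follow the existing connector; the import paths and the one-record-per-output layout are assumptions:

```python
# Import paths assumed to match the diff above.
from intelligence_layer.connectors.argilla.argilla_client import (
    RecordData,
    RecordDataSequence,
)

class StringTaskSubmissionLogic(ArgillaSubmissionLogic):
    """Illustrative only: emits one Argilla record per successful output."""

    def _to_record(
        self,
        example: Example[str, str],
        *output: SuccessfulExampleOutput[str],
    ) -> RecordDataSequence:
        return RecordDataSequence(
            records=[
                RecordData(
                    content={"input": example.input, "output": out.output},
                    example_id=example.id,
                )
                for out in output
            ]
        )
```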


class ArgillaSubmission(BaseModel):
# Should contain info to identify a submission to the argilla workspace
pass
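The commit only notes that this class should identify a submission; one plausible shape, with every field an assumption:

```python
class ArgillaSubmission(BaseModel):
    # All fields are guesses at what "identifying a submission" requires.
    dataset_id: str              # the Argilla dataset the records were sent to
    example_ids: frozenset[str]  # examples submitted for human annotation
```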


class ArgillaEvaluator(
Evaluator[Input, Output, ExpectedOutput, ArgillaEvaluation],
ABC,
@@ -83,6 +101,7 @@ def __init__(
run_repository: RunRepository,
evaluation_repository: ArgillaEvaluationRepository,
description: str,
submission_logic: ArgillaSubmissionLogic,
evaluation_logic: ArgillaEvaluationLogic[Input, Output, ExpectedOutput],
) -> None:
super().__init__(
@@ -96,8 +115,13 @@
def evaluation_type(self) -> type[ArgillaEvaluation]: # type: ignore
return ArgillaEvaluation

def submit(self) -> ArgillaSubmission:
# Call the ArgillaSubmissionLogic to send the examples to the argilla workspace
# This takes code from argilla_evaluation_repository
pass
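Putting the comments above into code, `submit()` could look roughly like this. The `_examples_with_outputs` helper and the stored `_client`/`_dataset_id` attributes are invented for the sketch; `add_record` mirrors the existing `ArgillaClient` interface:

```python
def submit(self, *run_ids: str) -> ArgillaSubmission:
    # Sketch: pair examples with their outputs (helper is hypothetical),
    # translate them via the submission logic, and push them to Argilla.
    example_ids = []
    for example, outputs in self._examples_with_outputs(*run_ids):
        for record in self._submission_logic._to_record(example, *outputs).records:
            self._client.add_record(self._dataset_id, record)
        example_ids.append(example.id)
    return ArgillaSubmission(
        dataset_id=self._dataset_id, example_ids=frozenset(example_ids)
    )
```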


class InstructComparisonArgillaEvaluationLogic(
class InstructComparisonArgillaEvaluationLogic( # Will become a submission logic
ArgillaEvaluationLogic[InstructInput, CompleteOutput, None]
):
def __init__(
10 changes: 9 additions & 1 deletion src/intelligence_layer/evaluation/evaluation/domain.py
@@ -72,12 +72,18 @@ class EvaluationOverview(BaseModel, frozen=True):
run_overviews: Overviews of the runs that were evaluated.
id: The unique identifier of this evaluation.
start: The time when the evaluation run was started.
end: End timestamp of the evaluation.
successful_evaluation_count: The number of examples that were successfully evaluated.
crashed_during_evaluation_count: The number of examples that crashed during evaluation.
description: Human-readable description of the evaluator that created the evaluation.
"""

run_overviews: frozenset[RunOverview]
id: str
start: Optional[datetime]
end: datetime
successful_evaluation_count: int
crashed_during_evaluation_count: int
description: str

def __repr__(self) -> str:
@@ -96,11 +102,13 @@ def __str__(self) -> str:
return (
f"Evaluation Overview ID = {self.id}\n"
f"Start time = {self.start}\n"
f"End time = {self.end}\n"
f"Successful evaluation count = {self.successful_evaluation_count}\n"
f"Count of crashed evaluations = {self.failed_evaluation_count}\n"
f'Description = "{self.description}"\n'
f"{run_overview_str}"
)
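With the new fields in place, an overview might be built and printed like this; the values and the pre-existing `run_overview` variable are illustrative:

```python
from datetime import datetime

overview = EvaluationOverview(
    run_overviews=frozenset([run_overview]),  # an existing RunOverview
    id="eval-2024-04-04",
    start=datetime.now(),
    end=datetime.now(),
    successful_evaluation_count=95,
    crashed_during_evaluation_count=5,
    description="Instruct comparison on dataset X",
)
print(overview)  # now also reports the end time and both counts
```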


class EvaluationFailed(Exception):
def __init__(self, evaluation_id: str, failed_count: int) -> None:
super().__init__(
8 changes: 4 additions & 4 deletions tests/evaluation/test_argilla_evaluator.py
@@ -7,7 +7,7 @@
from intelligence_layer.evaluation import (
AggregationLogic,
ArgillaAggregator,
ArgillaEvaluationLogic,
ArgillaSubmissionLogic,
ArgillaEvaluationRepository,
ArgillaEvaluator,
Example,
@@ -43,8 +43,8 @@ def aggregate(
)


class DummyStringTaskArgillaEvaluationLogic(
ArgillaEvaluationLogic[
class DummyStringTaskArgillaSubmissionLogic(
ArgillaSubmissionLogic[
DummyStringInput,
DummyStringOutput,
DummyStringOutput,
@@ -132,7 +132,7 @@ def string_argilla_evaluator(
in_memory_run_repository,
argilla_evaluation_repository,
"dummy-string-task",
DummyStringTaskArgillaEvaluationLogic(),
DummyStringTaskArgillaSubmissionLogic(),
)
return evaluator

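Taken together, the reworked two-phase flow sketched in this commit would be driven roughly like this. The constructor signature follows the diff above, `DummyStringTaskArgillaEvaluationLogic` stands in for the finalization side, and the interplay between `submit` and `evaluate_runs` is still open, so all of this is tentative:

```python
evaluator = ArgillaEvaluator(
    in_memory_dataset_repository,
    in_memory_run_repository,
    argilla_evaluation_repository,
    "dummy-string-task",
    submission_logic=DummyStringTaskArgillaSubmissionLogic(),
    evaluation_logic=DummyStringTaskArgillaEvaluationLogic(),  # finalization side
)

# Phase 1: translate examples to records and send them to Argilla.
submission = evaluator.submit()

# ... human annotators label the records in the Argilla workspace ...

# Phase 2: fetch the feedback and finalize it into an EvaluationOverview.
overview = evaluator.evaluate_runs(run_id)
```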
