
WIP introduced EvaluationSpan
JohannesWesch committed Apr 8, 2024
1 parent aa42d00 · commit d32227c
Showing 2 changed files with 77 additions and 42 deletions.
15 changes: 15 additions & 0 deletions src/intelligence_layer/core/tracer/tracer.py
@@ -115,6 +115,16 @@ def task_span(
         """
         ...
 
+    @abstractmethod
+    def evaluation_span(
+        self,
+        evaluation_name: str,
+        example: PydanticSerializable,
+        *example_outputs: PydanticSerializable,
+        timestamp: Optional[datetime] = None,
+        trace_id: Optional[str] = None,
+    ) -> "EvaluationSpan": ...
+
     def ensure_id(self, id: Optional[str]) -> str:
         """Returns a valid id for tracing.
@@ -229,6 +239,11 @@ def record_output(self, output: PydanticSerializable) -> None:
         ...
 
 
+class EvaluationSpan(Span):
+    @abstractmethod
+    def record_evaluation(self, evaluation: PydanticSerializable) -> None: ...
+
+
 TracerVar = TypeVar("TracerVar", bound=Tracer)
 
 SpanVar = TypeVar("SpanVar", bound=Span)
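
For orientation, here is a minimal sketch of how the new `evaluation_span` API is meant to be called — it mirrors the call pattern introduced in `evaluator.py` below. The concrete payloads, and the assumption that some `Tracer` implementation already provides `evaluation_span`, are illustrative; this commit only adds the abstract interface.

```python
from intelligence_layer.core.tracer.tracer import Tracer


def trace_one_evaluation(tracer: Tracer) -> None:
    # Assumes `tracer` is a concrete Tracer that implements the new
    # evaluation_span() method; the example and outputs below are made up.
    example = {"input": "2 + 2", "expected_output": "4"}
    outputs = ["4", "5"]

    # The span records the example and every task output under evaluation,
    # and is used as a context manager just like span/task_span.
    with tracer.evaluation_span("ExactMatchEval", example, *outputs) as span:
        evaluation = {"correct": [output == "4" for output in outputs]}
        # record_evaluation attaches the evaluation result to the span.
        span.record_evaluation(evaluation)
```
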
104 changes: 62 additions & 42 deletions src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -19,7 +19,8 @@
 from tqdm import tqdm
 
 from intelligence_layer.core import Input, Output, utc_now
-from intelligence_layer.core.tracer.tracer import Tracer
+from intelligence_layer.core.tracer.composite_tracer import CompositeTracer
+from intelligence_layer.core.tracer.tracer import EvaluationSpan, Tracer
 from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
 from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
 from intelligence_layer.evaluation.evaluation.domain import (
@@ -48,9 +49,9 @@ class EvaluationLogic(ABC, Generic[Input, Output, ExpectedOutput, Evaluation]):
     @abstractmethod
     def do_evaluate(
         self,
+        evaluation_span: EvaluationSpan,
         example: Example[Input, ExpectedOutput],
-        tracer: Optional[Tracer],
-        *output: SuccessfulExampleOutput[Output],
+        *example_outputs: SuccessfulExampleOutput[Output],
     ) -> Evaluation:
         """Executes the evaluation for this specific example.
@@ -59,32 +60,53 @@ def do_evaluate(
         Args:
             example: Input data of :class:`Task` to produce the output.
-            output: Output of the :class:`Task`.
+            example_outputs: Output of the :class:`Task`.
 
         Returns:
             The metrics that come from the evaluated :class:`Task`.
         """
         pass
 
+    @final
+    def evaluate(
+        self,
+        tracer: Tracer,
+        example: Example[Input, ExpectedOutput],
+        *example_outputs: SuccessfulExampleOutput[Output],
+        trace_id: Optional[str] = None,
+    ) -> Evaluation:
+        with tracer.evaluation_span(
+            type(self).__name__,
+            example,
+            *example_outputs,
+            trace_id=trace_id,
+        ) as evaluation_span:
+            evaluation = self.do_evaluate(evaluation_span, example, *example_outputs)
+            evaluation_span.record_evaluation(evaluation)
+        return evaluation
+
+
 class SingleOutputEvaluationLogic(
     EvaluationLogic[Input, Output, ExpectedOutput, Evaluation]
 ):
     @final
     def do_evaluate(
         self,
+        evaluation_span: EvaluationSpan,
         example: Example[Input, ExpectedOutput],
-        tracer: Optional[Tracer],
         *output: SuccessfulExampleOutput[Output],
     ) -> Evaluation:
         assert len(output) == 1
-        return self.do_evaluate_single_output(example, tracer, output[0].output)
+        return self.do_evaluate_single_output(
+            evaluation_span, example, output[0].output
+        )
 
     @abstractmethod
     def do_evaluate_single_output(
         self,
+        evaluation_span: EvaluationSpan,
         example: Example[Input, ExpectedOutput],
-        tracer: Optional[Tracer],
         output: Output,
     ) -> Evaluation:
         pass
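
To illustrate the reworked interface, here is a rough sketch of a single-output evaluation logic written against it. The exact-match comparison, the `MatchResult` model, and the `log` call are illustrative assumptions, not part of this commit.

```python
from pydantic import BaseModel

from intelligence_layer.core.tracer.tracer import EvaluationSpan
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.evaluation.evaluator import (
    SingleOutputEvaluationLogic,
)


class MatchResult(BaseModel):
    matches: bool


class ExactMatchEvaluationLogic(
    SingleOutputEvaluationLogic[str, str, str, MatchResult]
):
    def do_evaluate_single_output(
        self,
        evaluation_span: EvaluationSpan,
        example: Example[str, str],
        output: str,
    ) -> MatchResult:
        # Intermediate observations now go to the evaluation span instead of
        # a separately passed tracer (assuming the usual Span.log API).
        evaluation_span.log(
            "comparing", {"expected": example.expected_output, "actual": output}
        )
        return MatchResult(matches=example.expected_output == output)
```
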
@@ -227,7 +249,6 @@ def evaluation_type(self) -> type[Evaluation]:
         )
         return cast(type[Evaluation], evaluation_type)
 
-    @final
     def evaluate_runs(
         self,
         *run_ids: str,
@@ -299,7 +320,6 @@ def generate_evaluation_inputs() -> (
             Iterable[
                 Tuple[
                     Example[Input, ExpectedOutput],
-                    str,
                     Sequence[SuccessfulExampleOutput[Output]],
                 ]
             ]
@@ -333,16 +353,46 @@ def generate_evaluation_inputs() -> (
 
                 yield (
                     example,
-                    eval_id,
                     successful_example_outputs,
                 )
 
+        def evaluate(
+            example: Example[Input, ExpectedOutput],
+            abort_on_error: bool,
+            *example_outputs: SuccessfulExampleOutput[Output],
+        ) -> None:
+            evaluate_tracer = self._evaluation_repository.example_tracer(
+                eval_id, example.id
+            )
+            if tracer:
+                evaluate_tracer = CompositeTracer([evaluate_tracer, tracer])
+            try:
+                result: Evaluation | FailedExampleEvaluation = (
+                    self._evaluation_logic.evaluate(
+                        evaluate_tracer,
+                        example,
+                        *example_outputs,
+                    )
+                )
+            except Exception as e:
+                if abort_on_error:
+                    raise e
+                print(
+                    f'FAILED EVALUATION: example "{example.id}", {type(e).__qualname__}: "{e}"'
+                )
+                result = FailedExampleEvaluation.from_exception(e)
+            self._evaluation_repository.store_example_evaluation(
+                ExampleEvaluation(
+                    evaluation_id=eval_id, example_id=example.id, result=result
+                )
+            )
+
         with ThreadPoolExecutor(max_workers=10) as executor:
             list(  # the list is needed to consume the iterator returned from the executor.map
                 tqdm(
                     executor.map(
-                        lambda args: self.evaluate(
-                            args[0], args[1], tracer, abort_on_error, *args[2]
+                        lambda args: evaluate(
+                            args[0], abort_on_error, *args[1]
                         ),
                         generate_evaluation_inputs(),
                     )
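
The per-example tracer wiring above fans traces out: the repository's example tracer always records, and an optional caller-supplied tracer is mirrored via `CompositeTracer`. A rough, self-contained sketch of that fan-out pattern, assuming the usual `InMemoryTracer` and `Span.log` behaviour (both sinks are in-memory here purely for illustration):

```python
from intelligence_layer.core import InMemoryTracer
from intelligence_layer.core.tracer.composite_tracer import CompositeTracer

# One sink per destination; in evaluate_runs the first would be the
# repository's per-example tracer and the second the caller-supplied tracer.
repository_tracer = InMemoryTracer()
user_tracer = InMemoryTracer()

tracer = CompositeTracer([repository_tracer, user_tracer])

# Every span and log entry is forwarded to both underlying tracers.
with tracer.span("evaluate-example") as span:
    span.log("note", "this entry ends up in both tracers")
```
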
@@ -359,36 +409,6 @@ def generate_evaluation_inputs() -> (
 
         return partial_overview
 
-    @final
-    def evaluate(
-        self,
-        example: Example[Input, ExpectedOutput],
-        evaluation_id: str,
-        tracer: Optional[Tracer],
-        abort_on_error: bool,
-        *example_outputs: SuccessfulExampleOutput[Output],
-    ) -> None:
-        try:
-            result: Evaluation | FailedExampleEvaluation = (
-                self._evaluation_logic.do_evaluate(
-                    example,
-                    tracer,
-                    *example_outputs,
-                )
-            )
-        except Exception as e:
-            if abort_on_error:
-                raise e
-            print(
-                f'FAILED EVALUATION: example "{example.id}", {type(e).__qualname__}: "{e}"'
-            )
-            result = FailedExampleEvaluation.from_exception(e)
-        self._evaluation_repository.store_example_evaluation(
-            ExampleEvaluation(
-                evaluation_id=evaluation_id, example_id=example.id, result=result
-            )
-        )
-
     def failed_evaluations(
         self, evaluation_id: str
     ) -> Iterable[EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]]:
