Skip to content

Commit

Permalink
refactor: Change internal creation functions to include calculated token and latency values
Browse files Browse the repository at this point in the history
  • Loading branch information
MerlinKallenbornAA committed Dec 18, 2024
1 parent cde364c commit a4c38bd
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/intelligence_layer/connectors/studio/studio.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ class GetDatasetExamplesResponse(BaseModel, Generic[Input, ExpectedOutput]):
items: Sequence[StudioExample[Input, ExpectedOutput]]


class BenchmarkLineage(BaseModel, Generic[Input, Output, ExpectedOutput, Evaluation]):
class BenchmarkLineage(BaseModel, Generic[Input, ExpectedOutput, Output, Evaluation]):
trace_id: str
input: Input
expected_output: ExpectedOutput
Expand Down
32 changes: 23 additions & 9 deletions src/intelligence_layer/evaluation/benchmark/studio_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ def average_or_zero(list: list) -> float:

benchmark_lineages = self._create_benchmark_lineages(
eval_lineages=evaluation_lineages,
traces=run_traces,
trace_ids=trace_ids,
latencies_per_trace=latency_per_trace,
tokens_per_trace=tokens_per_trace,
)

self.client.submit_benchmark_lineages(
Expand Down Expand Up @@ -230,27 +232,39 @@ def _create_benchmark_lineages(
eval_lineages: list[
EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]
],
traces: list[Sequence[ExportedSpan]],
) -> Sequence[BenchmarkLineage[Input, Output, ExpectedOutput, Evaluation]]:
trace_ids: list[str],
latencies_per_trace: list[int],
tokens_per_trace: list[int],
) -> Sequence[BenchmarkLineage[Input, ExpectedOutput, Output, Evaluation]]:
return [
self._create_benchmark_lineage(eval_lineage, trace)
for eval_lineage, trace in zip(eval_lineages, traces, strict=True)
self._create_benchmark_lineage(
eval_lineage, trace_id, run_latency, run_tokens
)
for eval_lineage, trace_id, run_latency, run_tokens in zip(
eval_lineages,
trace_ids,
latencies_per_trace,
tokens_per_trace,
strict=True,
)
]

def _create_benchmark_lineage(
self,
eval_lineage: EvaluationLineage[Input, ExpectedOutput, Output, Evaluation],
trace: Sequence[ExportedSpan],
trace_id: str,
run_latency: int,
run_tokens: int,
) -> BenchmarkLineage:
return BenchmarkLineage(
trace_id=str(trace[0].context.trace_id),
trace_id=trace_id,
input=eval_lineage.example.input,
expected_output=eval_lineage.example.expected_output,
example_metadata=eval_lineage.example.metadata,
output=eval_lineage.outputs[0].output,
evaluation=eval_lineage.evaluation.result,
run_latency=extract_latency_from_trace(trace),
run_tokens=extract_token_count_from_trace(trace),
run_latency=run_latency,
run_tokens=run_tokens,
)


Expand Down
9 changes: 8 additions & 1 deletion tests/evaluation/benchmark/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,11 @@ def test_execute_benchmark_on_empty_examples_uploads_example_and_calculates_corr
assert mock_submit_trace.call_count == 0


@patch(
"intelligence_layer.evaluation.benchmark.studio_benchmark.extract_token_count_from_trace"
)
def test_execute_benchmark_failing_examples_calculates_correctly(
mock_extract_tokens: Mock,
studio_benchmark_repository: StudioBenchmarkRepository,
mock_studio_client: StudioClient,
evaluation_logic: DummyEvaluationLogic,
Expand All @@ -332,6 +336,9 @@ def test_execute_benchmark_failing_examples_calculates_correctly(
benchmark = studio_benchmark_repository.get_benchmark(
"benchmark_id", evaluation_logic, aggregation_logic
)

expected_generated_tokens = 0
mock_extract_tokens.return_value = expected_generated_tokens + 1
assert benchmark

# when
Expand All @@ -349,7 +356,7 @@ def test_execute_benchmark_failing_examples_calculates_correctly(
PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"]
)
assert uploaded_execution.run_success_avg_latency == 0
assert uploaded_execution.run_success_avg_token_count == 0
assert uploaded_execution.run_success_avg_token_count == expected_generated_tokens
assert uploaded_execution.run_successful_count == 0

assert mock_submit_trace.call_count == 0

0 comments on commit a4c38bd

Please sign in to comment.