Annotator exceptions are always fatal.

mlcommons · Nov 1, 2024 · cc01e82
1 parent 29c1c6c
commit cc01e82
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 9 deletions.
diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py
@@ -150,7 +150,7 @@ def _add_test_annotators(self, test: PromptResponseTest):
         self.test_annotators[test.uid] = annotators
 
     def add_finished_item(self, item: "TestRunItem"):
-        if item.completion() and item.annotations and not item.fatal_exceptions:
+        if item.completion() and item.annotations and not item.exceptions:
             self.finished_items[item.sut.key][item.test.uid].append(item)
             self.journal.item_entry("item finished", item)
         else:
@@ -160,7 +160,7 @@ def add_finished_item(self, item: "TestRunItem"):
                 item,
                 completion=bool(item.completion()),
                 annotations=len(item.annotations),
-                fatal_exceptions=len(item.fatal_exceptions),
+                fatal_exceptions=len(item.exceptions),
             )
 
         self.completed_item_count += 1
@@ -312,7 +312,7 @@ def handle_item(self, item: TestRunItem):
             self.test_run.journal.item_entry("translated sut response", item, response=response)
 
         except Exception as e:
-            item.fatal_exceptions.append(e)
+            item.exceptions.append(e)
             self.test_run.journal.item_exception_entry("sut exception", item, e)
             logger.error(f"failure handling sut item {item}:", exc_info=e)
         return item
@@ -333,7 +333,7 @@ def handle_item(self, item: TestRunItem) -> TestRunItem:
                     "measured item quality", item, measurements=item.measurements, run_time=timer
                 )
         except Exception as e:
-            item.fatal_exceptions.append(e)
+            item.exceptions.append(e)
             logger.error(f"failure handling annnotation for {item}", exc_info=e)
             self.test_run.journal.item_exception_entry("annotation exception", item, e)
         return item
@@ -373,6 +373,7 @@ def collect_annotations(self, item):
 
                 item.annotations[annotator.uid] = annotation
             except Exception as e:
+                item.exceptions.append(e)
                 logger.error(f"failure handling annotation for {annotator.uid} and {item}", exc_info=e)
                 self.test_run.journal.item_exception_entry("annotator exception", item, e, annotator=annotator.uid)
 

diff --git a/src/modelbench/benchmark_runner_items.py b/src/modelbench/benchmark_runner_items.py
@@ -104,7 +104,7 @@ class TestRunItem:
     sut_response: SUTResponse = None
     annotations: dict[str, Annotation] = dataclasses.field(default_factory=dict)
     measurements: dict[str, float] = dataclasses.field(default_factory=dict)
-    fatal_exceptions: list = dataclasses.field(default_factory=list)
+    exceptions: list = dataclasses.field(default_factory=list)
 
     def prompt_with_context(self) -> PromptWithContext:
         return self.test_item.prompts[0]

diff --git a/tests/modelbench_tests/test_benchmark_runner.py b/tests/modelbench_tests/test_benchmark_runner.py
@@ -273,7 +273,7 @@ def test_benchmark_sut_worker_throws_exception(
         assert result.test_item == item_from_test
         assert result.sut == exploding_sut
         assert result.sut_response is None
-        assert isinstance(result.fatal_exceptions[0], ValueError)
+        assert isinstance(result.exceptions[0], ValueError)
 
         assert "failure" in caplog.text
 
@@ -300,7 +300,7 @@ def test_test_annotation_worker(self, a_wrapped_test, tmp_path, item_from_test,
     def test_benchmark_annotation_worker_ignores_failed(self, a_wrapped_test, tmp_path, item_from_test, a_sut):
         baw = TestRunAnnotationWorker(self.a_run(tmp_path, suts=[a_sut]), NullCache())
         pipeline_item = TestRunItem(a_wrapped_test, item_from_test, a_sut)
-        pipeline_item.fatal_exceptions.append(ValueError())
+        pipeline_item.exceptions.append(ValueError())
 
         result = baw.handle_item(pipeline_item)
 
@@ -317,7 +317,7 @@ def test_benchmark_annotation_worker_throws_exception(
         result = baw.handle_item(pipeline_item)
 
         assert result.annotations == {}
-        assert len(pipeline_item.fatal_exceptions) == 0  # a single annotator failure is not fatal
+        assert len(pipeline_item.exceptions) == 1
 
         assert "failure" in caplog.text
 
@@ -334,7 +334,7 @@ def test_benchmark_results_collector_handles_failed(self, a_sut, tmp_path, a_wra
         run = self.a_run(tmp_path, suts=[a_sut])
         brc = TestRunResultsCollector(run)
         item = TestRunItem(a_wrapped_test, item_from_test, a_sut)
-        item.fatal_exceptions.append(ValueError("yes, this value error"))
+        item.exceptions.append(ValueError("yes, this value error"))
 
         brc.handle_item(item)