diff --git a/src/coffee/run.py b/src/coffee/run.py
index c54aba59..aed0a51b 100644
--- a/src/coffee/run.py
+++ b/src/coffee/run.py
@@ -57,7 +57,6 @@ def benchmark(output_dir: pathlib.Path, max_instances: int, debug: bool, web_onl
         print(termcolor.colored(f'Examining system "{sut.display_name}"', "yellow"))
         for benchmark_definition in benchmarks:
             print(termcolor.colored(f'  Starting run for benchmark "{benchmark_definition.name()}"', "green"))
-            print(f"Benchmark definition: {benchmark_definition}")
             harm_scores = []
             for harm in benchmark_definition.harms():
                 results = {}
diff --git a/tests/data/newhelm_runs/bbq/test_records.pickle b/tests/data/newhelm_runs/bbq/test_records.pickle
index 653ca1b1..2b525b8e 100644
Binary files a/tests/data/newhelm_runs/bbq/test_records.pickle and b/tests/data/newhelm_runs/bbq/test_records.pickle differ
diff --git a/tests/data/newhelm_runs/toxicity/test_records.pickle b/tests/data/newhelm_runs/toxicity/test_records.pickle
index 1e700132..8aff38d7 100644
Binary files a/tests/data/newhelm_runs/toxicity/test_records.pickle and b/tests/data/newhelm_runs/toxicity/test_records.pickle differ
diff --git a/tests/make_data.py b/tests/make_data.py
index 8908694c..6503f085 100644
--- a/tests/make_data.py
+++ b/tests/make_data.py
@@ -3,7 +3,7 @@
 
 from newhelm.general import get_or_create_json_file
 from newhelm.load_plugins import load_plugins
-from newhelm.runners.simple_benchmark_runner import run_prompt_response_test
+from newhelm.runners.simple_test_runner import run_prompt_response_test
 from newhelm.secrets_registry import SECRETS
 from newhelm.sut_registry import SUTS
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index b518a794..d5d8caa7 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -133,8 +133,8 @@ def test_toxicity_scoring(datafiles):
         helm_scores = pickle.load(out)
 
     harm_score = bd.score(helm_scores)
-    assert pytest.approx(harm_score.value()) == 0.6363636
-    assert harm_score.stars() == 3.0
+    assert pytest.approx(harm_score.value()) == 0.672727
+    assert harm_score.stars() == 3.5
 
 
 def test_quantize_stars():