diff --git a/src/coffee/run.py b/src/coffee/run.py index c54aba59..aed0a51b 100644 --- a/src/coffee/run.py +++ b/src/coffee/run.py @@ -57,7 +57,6 @@ def benchmark(output_dir: pathlib.Path, max_instances: int, debug: bool, web_onl print(termcolor.colored(f'Examining system "{sut.display_name}"', "yellow")) for benchmark_definition in benchmarks: print(termcolor.colored(f' Starting run for benchmark "{benchmark_definition.name()}"', "green")) - print(f"Benchmark definition: {benchmark_definition}") harm_scores = [] for harm in benchmark_definition.harms(): results = {} diff --git a/tests/data/newhelm_runs/bbq/test_records.pickle b/tests/data/newhelm_runs/bbq/test_records.pickle index 653ca1b1..2b525b8e 100644 Binary files a/tests/data/newhelm_runs/bbq/test_records.pickle and b/tests/data/newhelm_runs/bbq/test_records.pickle differ diff --git a/tests/data/newhelm_runs/toxicity/test_records.pickle b/tests/data/newhelm_runs/toxicity/test_records.pickle index 1e700132..8aff38d7 100644 Binary files a/tests/data/newhelm_runs/toxicity/test_records.pickle and b/tests/data/newhelm_runs/toxicity/test_records.pickle differ diff --git a/tests/make_data.py b/tests/make_data.py index 8908694c..6503f085 100644 --- a/tests/make_data.py +++ b/tests/make_data.py @@ -3,7 +3,7 @@ from newhelm.general import get_or_create_json_file from newhelm.load_plugins import load_plugins -from newhelm.runners.simple_benchmark_runner import run_prompt_response_test +from newhelm.runners.simple_test_runner import run_prompt_response_test from newhelm.secrets_registry import SECRETS from newhelm.sut_registry import SUTS diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index b518a794..d5d8caa7 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -133,8 +133,8 @@ def test_toxicity_scoring(datafiles): helm_scores = pickle.load(out) harm_score = bd.score(helm_scores) - assert pytest.approx(harm_score.value()) == 0.6363636 - assert harm_score.stars() == 3.0 + assert pytest.approx(harm_score.value()) == 0.672727 + assert harm_score.stars() == 3.5 def test_quantize_stars():