Skip to content

Commit

Permalink
Code cleanup.
Browse files Browse the repository at this point in the history
  • Loading branch information
wpietri committed Feb 15, 2024
1 parent d058b74 commit 36d0cbd
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 27 deletions.
1 change: 0 additions & 1 deletion src/coffee/new_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ def benchmark(output_dir: pathlib.Path, max_instances: int, debug: bool, web_onl
)
harm_scores.append(score)
benchmark_scores.append(BenchmarkScore(benchmark_definition, sut, harm_scores))
print(benchmark_scores)

print()
print(termcolor.colored(f"Benchmarking complete, rendering reports...", "green"))
Expand Down
Binary file modified tests/data/newhelm_runs/bbq/test_records.pickle
Binary file not shown.
Binary file modified tests/data/newhelm_runs/toxicity/test_records.pickle
Binary file not shown.
42 changes: 17 additions & 25 deletions tests/make_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,34 @@


def create_bbq_test_data():
load_plugins()
import newhelm.tests.bbq

harm = BiasHarmDefinition()
max_instances = 55
max_instances = 5
sut = NewhelmSut.GPT2
counter = 0
results = {}
for test in harm.tests():
items = max_instances
if isinstance(test, newhelm.tests.bbq.BBQ):
# BBQ is currently multiple sub-tests, so roughly split the items among them
items = int(items / len(newhelm.tests.bbq._CATEGORIES))
results[test] = run_prompt_response_test(
f"test-{counter}", test, sut.key, SUTS.make_instance(sut.key), "./run", items
)
counter += 1
# serializable = { k.model_dump_json() : results[k].model_dump_json() for k in results }
with open(SIMPLE_BBQ_DATA / "test_records.pickle", "wb") as out:
pickle.dump(results, out)
path = SIMPLE_BBQ_DATA / "test_records.pickle"
results = run_tests_for_harm_on_sut(harm, sut, max_instances)
with open(path, "wb") as out:
pickle.dump(results, out)


def create_toxicity_test_data():
load_plugins()
SECRETS.set_values(get_or_create_json_file(pathlib.Path(__file__).parent.parent / "secrets/default.json"))

harm = ToxicityHarmDefinition()
max_instances = 55
sut = NewhelmSut.GPT2
counter = 0
results = run_tests_for_harm_on_sut(harm, sut, max_instances)

with open(SIMPLE_TOXICITY_DATA / "test_records.pickle", "wb") as out:
pickle.dump(results, out)


def run_tests_for_harm_on_sut(harm, sut, max_instances):
results = {}
for test in harm.tests():
for counter, test in enumerate(harm.tests()):
items = max_instances
results[test] = run_prompt_response_test(
f"test-{counter}", test, sut.key, SUTS.make_instance(sut.key), "./run", items
)
counter += 1
with open(SIMPLE_TOXICITY_DATA / "test_records.pickle", "wb") as out:
pickle.dump(results, out)
return results


if __name__ == "__main__":
Expand All @@ -62,5 +51,8 @@ def create_toxicity_test_data():
# That makes it impossible to load the data; make sure all the related tests are consistent.
#

load_plugins()
SECRETS.set_values(get_or_create_json_file(pathlib.Path(__file__).parent.parent / "secrets/default.json"))

create_bbq_test_data()
create_toxicity_test_data()
2 changes: 1 addition & 1 deletion tests/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def test_toxicity_scoring(datafiles):
helm_scores = pickle.load(out)

harm_score = bd.score(helm_scores)
assert pytest.approx(harm_score.value()) == 0.690909
assert pytest.approx(harm_score.value()) == 0.6363636
assert harm_score.stars() == 3.0


Expand Down

0 comments on commit 36d0cbd

Please sign in to comment.