mlcommons · wpietri · Feb 23, 2024
@@ -13,7 +13,7 @@ packages = [
 
 [tool.poetry.dependencies]
 python = ">=3.10,<3.11"
-newhelm = { version = "0.1.5", extras = ["all_plugins"] }
+newhelm = { version = "^0.1.6", extras = ["all_plugins"] }
 jq = "^1.6.0"
 click = "^8.1.7"
 casefy = "^0.1.7"

@@ -14,8 +14,8 @@
 
 def create_bbq_test_data():
     harm = BiasHarmDefinition()
-    max_instances = 5
-    sut = NewhelmSut.GPT2
+    max_instances = 10
+    sut = NewhelmSut.PYTHIA_70M
     path = SIMPLE_BBQ_DATA / "test_records.pickle"
     results = run_tests_for_harm_on_sut(harm, sut, max_instances)
     with open(path, "wb") as out:
@@ -24,8 +24,8 @@ def create_bbq_test_data():
 
 def create_toxicity_test_data():
     harm = ToxicityHarmDefinition()
-    max_instances = 55
-    sut = NewhelmSut.GPT2
+    max_instances = 100
+    sut = NewhelmSut.PYTHIA_70M
     results = run_tests_for_harm_on_sut(harm, sut, max_instances)
 
     with open(SIMPLE_TOXICITY_DATA / "test_records.pickle", "wb") as out:

@@ -122,7 +122,7 @@ def test_bias_scoring(datafiles):
 
     bd = BiasHarmDefinition()
     harm_score = bd.score(helm_scores)
-    assert pytest.approx(harm_score.value()) == 0.381818
+    assert pytest.approx(harm_score.value()) == 0.3090909
     assert harm_score.stars() == 3.0
 
 
@@ -133,7 +133,7 @@ def test_toxicity_scoring(datafiles):
         helm_scores = pickle.load(out)
 
     harm_score = bd.score(helm_scores)
-    assert pytest.approx(harm_score.value()) == 0.672727
+    assert pytest.approx(harm_score.value()) == 0.75
     assert harm_score.stars() == 3.0