Skip to content

Commit

Permalink
Code cleanup.
Browse files Browse the repository at this point in the history
  • Loading branch information
wpietri committed Feb 15, 2024
1 parent d058b74 commit 36d0cbd
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 27 deletions.
1 change: 0 additions & 1 deletion src/coffee/new_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ def benchmark(output_dir: pathlib.Path, max_instances: int, debug: bool, web_onl
)
harm_scores.append(score)
benchmark_scores.append(BenchmarkScore(benchmark_definition, sut, harm_scores))
print(benchmark_scores)

print()
print(termcolor.colored(f"Benchmarking complete, rendering reports...", "green"))
Expand Down
Binary file modified tests/data/newhelm_runs/bbq/test_records.pickle
Binary file not shown.
Binary file modified tests/data/newhelm_runs/toxicity/test_records.pickle
Binary file not shown.
42 changes: 17 additions & 25 deletions tests/make_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,34 @@


def create_bbq_test_data():
load_plugins()
import newhelm.tests.bbq

harm = BiasHarmDefinition()
max_instances = 55
max_instances = 5
sut = NewhelmSut.GPT2
counter = 0
results = {}
for test in harm.tests():
items = max_instances
if isinstance(test, newhelm.tests.bbq.BBQ):
# BBQ is currently multiple sub-tests, so roughly split the items among them
items = int(items / len(newhelm.tests.bbq._CATEGORIES))
results[test] = run_prompt_response_test(
f"test-{counter}", test, sut.key, SUTS.make_instance(sut.key), "./run", items
)
counter += 1
# serializable = { k.model_dump_json() : results[k].model_dump_json() for k in results }
with open(SIMPLE_BBQ_DATA / "test_records.pickle", "wb") as out:
pickle.dump(results, out)
path = SIMPLE_BBQ_DATA / "test_records.pickle"
results = run_tests_for_harm_on_sut(harm, sut, max_instances)
with open(path, "wb") as out:
pickle.dump(results, out)


def create_toxicity_test_data():
load_plugins()
SECRETS.set_values(get_or_create_json_file(pathlib.Path(__file__).parent.parent / "secrets/default.json"))

harm = ToxicityHarmDefinition()
max_instances = 55
sut = NewhelmSut.GPT2
counter = 0
results = run_tests_for_harm_on_sut(harm, sut, max_instances)

with open(SIMPLE_TOXICITY_DATA / "test_records.pickle", "wb") as out:
pickle.dump(results, out)


def run_tests_for_harm_on_sut(harm, sut, max_instances):
results = {}
for test in harm.tests():
for counter, test in enumerate(harm.tests()):
items = max_instances
results[test] = run_prompt_response_test(
f"test-{counter}", test, sut.key, SUTS.make_instance(sut.key), "./run", items
)
counter += 1
with open(SIMPLE_TOXICITY_DATA / "test_records.pickle", "wb") as out:
pickle.dump(results, out)
return results


if __name__ == "__main__":
Expand All @@ -62,5 +51,8 @@ def create_toxicity_test_data():
# That makes it impossible to load the data; make sure all the related tests are consistent.
#

load_plugins()
SECRETS.set_values(get_or_create_json_file(pathlib.Path(__file__).parent.parent / "secrets/default.json"))

create_bbq_test_data()
create_toxicity_test_data()
2 changes: 1 addition & 1 deletion tests/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def test_toxicity_scoring(datafiles):
helm_scores = pickle.load(out)

harm_score = bd.score(helm_scores)
assert pytest.approx(harm_score.value()) == 0.690909
assert pytest.approx(harm_score.value()) == 0.6363636
assert harm_score.stars() == 3.0


Expand Down

0 comments on commit 36d0cbd

Please sign in to comment.