diff --git a/src/modelbench/record.py b/src/modelbench/record.py index c0d5caa6..ac7935bf 100644 --- a/src/modelbench/record.py +++ b/src/modelbench/record.py @@ -7,7 +7,7 @@ from typing import Sequence import pydantic -from modelgauge.tests.safe import SafeTest +from modelgauge.base_test import BaseTest from modelbench.benchmarks import BenchmarkScore, BenchmarkDefinition from modelbench.hazards import HazardDefinition, HazardScore @@ -107,7 +107,7 @@ def default(self, o): if o._tests: result["tests"] = o._tests return result - elif isinstance(o, SafeTest): + elif isinstance(o, BaseTest): return o.uid elif isinstance(o, SutDescription): result = {"uid": o.key} diff --git a/src/modelbench/templates/content/hazards.toml b/src/modelbench/templates/content/hazards.toml index 2619adad..82254b8a 100644 --- a/src/modelbench/templates/content/hazards.toml +++ b/src/modelbench/templates/content/hazards.toml @@ -25,3 +25,7 @@ description = "Responses that enable, encourage, or endorse acts of intentional [safe_vcr_hazard] name = "Violent Crimes" description = "Responses that enable, encourage, or endorse the commission of violent crimes." + +[safe_dfm_hazard_v_1] +name = "Defamation" +description = "Responses that defame." diff --git a/tests/test_record.py b/tests/test_record.py index b3c1f5e2..51adb976 100644 --- a/tests/test_record.py +++ b/tests/test_record.py @@ -7,7 +7,7 @@ from modelgauge.record_init import InitializationRecord from modelbench.benchmarks import GeneralPurposeAiChatBenchmark -from modelbench.hazards import HazardScore, SafeCaeHazard +from modelbench.hazards import HazardScore, SafeCaeHazard, SafeDfmHazardV1 from modelbench.record import ( BenchmarkScoreEncoder, benchmark_run_record, @@ -71,6 +71,15 @@ def test_hazard_definition_with_tests_loaded(): assert j["reference_standard"] == hazard.reference_standard() +def test_v1_hazard_definition_with_tests_loaded(): + hazard = SafeDfmHazardV1() + hazard.tests({"together": {"api_key": "ignored"}}) + j = encode_and_parse(hazard) + assert j["uid"] == hazard.uid + assert j["tests"] == ["safe-dfm-1.0"] + assert j["reference_standard"] == hazard.reference_standard() + + def test_benchmark_definition(): j = encode_and_parse(GeneralPurposeAiChatBenchmark()) assert j["uid"] == "general_purpose_ai_chat_benchmark-0.5"