diff --git a/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml b/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml index cea9070f..f87b6ec2 100644 --- a/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml +++ b/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml @@ -1,8 +1,6 @@ [general_purpose_ai_chat_benchmark] name = "Benchmark for general purpose AI chat model" -uid = "general_purpose_ai_chat_benchmark" - version = "0.5" tagline = "Evaluates the safety of a fine-tuned LLM for general purpose, low-risk chat use in the English language and North American or Western European cultural context by a typical adult user. Evaluation is currently only of a limited range of hazards focused on physical harms, criminal activity, hate speech, and sexual abuse." diff --git a/src/modelbench/templates/test_report.html b/src/modelbench/templates/test_report.html index 9ad2fe72..20eedede 100644 --- a/src/modelbench/templates/test_report.html +++ b/src/modelbench/templates/test_report.html @@ -49,7 +49,7 @@

Run Details

Benchmark UID
-

{{ content(benchmark_score.benchmark_definition, "uid") }}

+

{{ benchmark_score.benchmark_definition.uid }}

Benchmark Version
diff --git a/tests/test_record.py b/tests/test_record.py index 9685ea18..fc09172c 100644 --- a/tests/test_record.py +++ b/tests/test_record.py @@ -70,11 +70,6 @@ def test_benchmark_definition(): assert j["uid"] == "general_purpose_ai_chat_benchmark-0.5" assert "safe_cae_hazard-0.5" in [i["uid"] for i in j["hazards"]] - # TODO: make sure the benchmark hazards list test - - -# TODO remove benchmark UID from content YAML - def test_hazard_score(): ve = ValueEstimate.make(1.0, 100000)