Benchmark outcomes record #392

Merged · 41 commits · Jul 24, 2024
Changes from 34 commits

Commits
2e464e8
Add HasUid and apply it to Benchmark and Hazard.
wpietri Jun 27, 2024
cedb78a
Pleasing the formatting gods.
wpietri Jun 27, 2024
c0001c5
Add basic output, plus metadata. More to come.
wpietri Jul 1, 2024
1db7a43
Remove accidental paste.
wpietri Jul 1, 2024
5041a9a
Remove accidental paste.
wpietri Jul 1, 2024
b373bb1
Merge remote-tracking branch 'origin/benchmark_outcomes_record' into …
wpietri Jul 1, 2024
0dabb0a
Add SUT initialization and git-derived metadata on the code.
wpietri Jul 16, 2024
75e7af3
Removing unneeded test.
wpietri Jul 17, 2024
ae61704
Removing unneeded test.
wpietri Jul 17, 2024
f482fa2
Making test work no matter how you check it out.
wpietri Jul 17, 2024
04889bd
Making test work no matter how you check it out.
wpietri Jul 17, 2024
d5189f3
Making test work no matter how you check it out.
wpietri Jul 17, 2024
09be1ec
Make HazardDefinitions cache Tests, making later output easier and mo…
wpietri Jul 17, 2024
b49203a
Add SUT initialization and git-derived metadata on the code.
wpietri Jul 16, 2024
8c566cb
Removing unneeded test.
wpietri Jul 17, 2024
f2c4ce9
Removing unneeded test.
wpietri Jul 17, 2024
a66a2d8
Making test work no matter how you check it out.
wpietri Jul 17, 2024
cc45d2c
Making test work no matter how you check it out.
wpietri Jul 17, 2024
fe766e3
Making test work no matter how you check it out.
wpietri Jul 17, 2024
6359e2f
Make HazardDefinitions cache Tests, making later output easier and mo…
wpietri Jul 17, 2024
3127b6a
Merge remote-tracking branch 'origin/benchmark_outcomes_record' into …
wpietri Jul 18, 2024
182672f
Thanks to Barbara's keen eye, fixing a bug (and adding a test for it).
wpietri Jul 18, 2024
450e2b8
Add SUT initialization and git-derived metadata on the code.
wpietri Jul 16, 2024
e9e635d
Merging from main.
wpietri Jul 17, 2024
4d8f4fc
Making test work no matter how you check it out.
wpietri Jul 17, 2024
10f1001
Making test work no matter how you check it out.
wpietri Jul 17, 2024
f860779
Make HazardDefinitions cache Tests, making later output easier and mo…
wpietri Jul 17, 2024
3bd6b69
Thanks to Barbara's keen eye, fixing a bug (and adding a test for it).
wpietri Jul 18, 2024
a5a9415
Per Barbara, make this more Pydantic-idiomatic.
wpietri Jul 19, 2024
42e639f
Merge remote-tracking branch 'origin/benchmark_outcomes_record' into …
wpietri Jul 19, 2024
4c4d7a3
Merge branch 'main' into benchmark_outcomes_record
wpietri Jul 19, 2024
f10e87a
Fix formatting after merge.
wpietri Jul 19, 2024
94db658
Fix test score key in JSON to be a UID.
wpietri Jul 19, 2024
cbbfe6f
Adding content and reference scores.
wpietri Jul 19, 2024
c0003c2
fix anonymous case for json
wpietri Jul 23, 2024
0673d15
verifying initialization makes it to JSON
wpietri Jul 23, 2024
db26323
Handling case where modelbench is installed not using git.
wpietri Jul 23, 2024
f19c4f7
Adding library info to json
wpietri Jul 23, 2024
033a69c
Removing null when tests aren't loaded for hazard.
wpietri Jul 23, 2024
bca20a1
Removing benchmark uid from content, using one in class instead.
wpietri Jul 23, 2024
4cc3ddc
Fixing anonymous runs.
wpietri Jul 24, 2024
10 changes: 8 additions & 2 deletions src/modelbench/hazards.py
@@ -17,6 +17,10 @@

class HazardDefinition(ABC, HasUid):

def __init__(self):
super().__init__()
self._tests = None

@classmethod
def name(cls):
return casefy.titlecase(cls.__name__.replace(HazardDefinition.__name__, ""))
@@ -60,7 +64,7 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
# per Peter, treat benign and hazardous as one Test
# TODO: Consider if this means NewHELM should model as one test
# or if more generally we don't want to report individual test estimates.
test_scores = {self.name(): score_estimate}
test_scores = {self.uid: score_estimate}
return HazardScore(
hazard_definition=self,
score=score_estimate,
@@ -72,7 +76,9 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
def tests(self, secrets: RawSecrets) -> List[BaseTest]:
if self.hazard_key is None:
raise (NotImplementedError("must set hazard key"))
return [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
if not self._tests:
self._tests = [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
return self._tests


# not actually a hazard!
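A minimal sketch of the caching this hunk adds: repeated calls to tests() on the same hazard return the same list, which is what makes the later JSON output straightforward. SafeCaeHazard and the secrets layout below are assumptions borrowed from this PR's tests, not verified here.

```python
# Hedged sketch: tests() builds the test list once and then reuses self._tests.
from modelbench.hazards import SafeCaeHazard  # concrete hazard; assumed to live in modelbench.hazards

fake_secrets = {"together": {"api_key": "fake-key"}}  # assumed scope/key names for the safe tests

hazard = SafeCaeHazard()
first = hazard.tests(secrets=fake_secrets)
assert hazard.tests(secrets=fake_secrets) is first  # cached after the first call
```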
12 changes: 12 additions & 0 deletions src/modelbench/modelgauge_runner.py
@@ -55,6 +55,18 @@ class ModelGaugeSut(SutDescription, Enum):
WIZARDLM_13B = "wizardlm-13b", "WizardLM v1.2 (13B)", TogetherChatSUT, "WizardLM/WizardLM-13B-V1.2"
# YI_34B_CHAT = "yi-34b", "01-ai Yi Chat (34B)", TogetherChatSUT, "zero-one-ai/Yi-34B-Chat"

def instance(self, secrets):
Review thread on this line:
Contributor: Why do we need these methods?
wpietri (author): Moving instance creation here will let me unify duplicate code, and it gives me a place to cache the instance actually used for the run, which is needed to dump out the outcome JSON.

if not hasattr(self, "_instance"):
if not secrets:
return None
self._instance = SUTS.make_instance(self.key, secrets=secrets)
return self._instance

def instance_initialization(self):
instance = self.instance(None)
if instance:
return instance.initialization_record


for sut in ModelGaugeSut:
required_secrets = {
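Following the review reply above, a short sketch of how the cached instance behaves; loading secrets the way run.py does is assumed, and the SUT choice is illustrative.

```python
# Hedged sketch: the enum member caches its SUT instance, and instance_initialization()
# exposes the record that the JSON output serializes later.
from modelgauge.config import load_secrets_from_config  # helper used by run.py; import path assumed

from modelbench.modelgauge_runner import ModelGaugeSut

sut = ModelGaugeSut.WIZARDLM_13B
assert sut.instance_initialization() is None  # nothing cached yet, and instance(None) returns None

secrets = load_secrets_from_config()
first = sut.instance(secrets)
assert sut.instance(secrets) is first  # same cached object on repeat calls
print(sut.instance_initialization())  # the initialization record picked up by dump_json
```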
104 changes: 104 additions & 0 deletions src/modelbench/record.py
@@ -0,0 +1,104 @@
import json
import os
import pathlib
import platform
import subprocess
from datetime import datetime, timezone
from typing import Sequence

from modelgauge.tests.safe import SafeTest
import pydantic

from modelbench.benchmarks import BenchmarkScore, BenchmarkDefinition
from modelbench.hazards import HazardDefinition, HazardScore
from modelbench.modelgauge_runner import ModelGaugeSut
from modelbench.static_site_generator import StaticContent


def run_command(*args):
result = subprocess.run(args, capture_output=True)
return result.stdout.decode("utf-8").strip()


def benchmark_code_info():
try:
return {
"git_version": run_command("git", "--version"),
"origin": run_command("git", "config", "--get", "remote.origin.url"),
"code_version": run_command(
"git", "describe", "--tags", "--abbrev=8", "--always", "--long", "--match", "v*"
),
"changed_files": [
l.strip() for l in run_command("git", "status", "-s", "--untracked-files=no").splitlines()
],
}
except FileNotFoundError:
return {"error": "git command not found"}


def benchmark_metadata():
return {
"format_version": 1,
"run": {
"user": os.environ.get("USER", os.environ.get("USERNAME")),
"timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"),
"platform": platform.platform(),
"system": f"{platform.system()} {platform.release()} {platform.version()}",
"node": platform.node(),
"python": platform.python_version(),
},
"code": benchmark_code_info(),
}


def benchmark_run_record(score):
return {
"score": score,
"_metadata": benchmark_metadata(),
}


def dump_json(
json_path: pathlib.Path,
start_time: datetime.time,
benchmark: BenchmarkDefinition,
benchmark_scores: Sequence[BenchmarkScore],
):
with open(json_path, "w") as f:
output = {
"benchmark": (benchmark),
"run_uid": f"run-{benchmark.uid}-{start_time.strftime('%Y%m%d-%H%M%S')}",
"scores": (benchmark_scores),
"content": StaticContent(),
}
json.dump(output, f, cls=BenchmarkScoreEncoder, indent=4)


class BenchmarkScoreEncoder(json.JSONEncoder):

def default(self, o):
if isinstance(o, BenchmarkScore) or isinstance(o, HazardScore):
result = {}
result.update(o.__dict__)
result["numeric_grade"] = o.numeric_grade()
result["text_grade"] = o.text_grade()
if "benchmark_definition" in result:
del result["benchmark_definition"] # duplicated up the tree
return result
elif isinstance(o, BenchmarkDefinition):
return {"uid": o.uid, "hazards": o.hazards()}
elif isinstance(o, HazardDefinition):
return {"uid": o.uid, "tests": o._tests, "reference_standard": o.reference_standard()}
elif isinstance(o, SafeTest):
return o.uid
elif isinstance(o, ModelGaugeSut):
result = {"uid": o.key}
if o.instance_initialization():
result["initialization"] = o.instance_initialization()
return result
elif isinstance(o, pydantic.BaseModel):
return o.model_dump()
elif isinstance(o, datetime):
return str(o)
else:
return super().default(o)
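A small usage sketch of the helpers above; the values are machine-dependent, and the "code" block falls back to an error entry when git is not installed.

```python
# Hedged sketch: inspect the metadata that accompanies a benchmark run record.
from modelbench.record import benchmark_metadata, benchmark_run_record

metadata = benchmark_metadata()
print(metadata["run"]["platform"])  # host-dependent, e.g. a platform.platform() string
print(metadata["code"].get("code_version") or metadata["code"].get("error"))

# benchmark_run_record simply wraps a score object with that metadata.
record = benchmark_run_record({"placeholder": "score"})  # any JSON-serializable stand-in for a score
assert record["_metadata"]["format_version"] == 1
```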
35 changes: 17 additions & 18 deletions src/modelbench/run.py
@@ -31,6 +31,7 @@
)
from modelbench.hazards import HazardDefinition, HazardScore, STANDARDS
from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription
from modelbench.record import dump_json
from modelbench.static_site_generator import StaticContent, StaticSiteGenerator

_DEFAULT_SUTS = ModelGaugeSut
@@ -69,11 +70,7 @@ def cli() -> None:
)
@click.option("--max-instances", "-m", type=int, default=100)
@click.option("--debug", default=False, is_flag=True)
@click.option(
"--sut",
"-s",
multiple=True,
)
@click.option("sut_uids", "--sut", "-s", multiple=True, help="SUT uid(s) to run")
@click.option("--view-embed", default=False, is_flag=True, help="Render the HTML to be embedded in another view")
@click.option(
"--custom-branding",
@@ -83,28 +80,32 @@ def cli() -> None:
@click.option("--anonymize", type=int, help="Random number seed for consistent anonymization of SUTs")
@click.option("--parallel", default=False, help="Experimentally run SUTs in parallel")
@click.option(
"benchmark_name",
"--benchmark",
type=click.Choice([c.__name__ for c in BenchmarkDefinition.__subclasses__()]),
default=["GeneralPurposeAiChatBenchmark"],
default="GeneralPurposeAiChatBenchmark",
help="Benchmark to run (Default: GeneralPurposeAiChatBenchmark)",
multiple=True,
multiple=False,
)
@local_plugin_dir_option
def benchmark(
benchmark: str,
benchmark_name: str,
output_dir: pathlib.Path,
max_instances: int,
debug: bool,
sut: List[str],
sut_uids: List[str],
view_embed: bool,
custom_branding: Optional[pathlib.Path] = None,
anonymize=None,
parallel=False,
) -> None:
suts = find_suts_for_sut_argument(sut)
benchmarks = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ in benchmark]
benchmark_scores = score_benchmarks(benchmarks, suts, max_instances, debug, parallel)
start_time = datetime.now(timezone.utc)
suts = find_suts_for_sut_argument(sut_uids)
benchmark = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ == benchmark_name][0]
benchmark_scores = score_benchmarks([benchmark], suts, max_instances, debug, parallel)
generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding)
json_path = output_dir / f"benchmark_record-{benchmark.uid}.json"
dump_json(json_path, start_time, benchmark, benchmark_scores)


def find_suts_for_sut_argument(sut_args: List[str]):
@@ -120,7 +121,7 @@ def find_suts_for_sut_argument(sut_args: List[str]):
else:
all_sut_keys = registered_sut_keys.union(set(default_suts_by_key.keys()))
raise click.BadParameter(
f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x:x.lower())}",
f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x: x.lower())}",
param_hint="sut",
)

@@ -147,7 +148,7 @@ def score_benchmarks(benchmarks, suts, max_instances, debug, parallel=True):
def score_a_sut(benchmarks, max_instances, secrets, debug, sut):
sut_scores = []
echo(termcolor.colored(f'Examining system "{sut.display_name}"', "green"))
sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
sut_instance = sut.instance(secrets)
for benchmark_definition in benchmarks:
echo(termcolor.colored(f' Starting run for benchmark "{benchmark_definition.name()}"', "green"))
hazard_scores = []
@@ -268,7 +269,7 @@ def run_tests(
) -> Mapping[HazardDefinition, HazardScore]:
secrets = load_secrets_from_config()
result = {}
sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
sut_instance = sut.instance(secrets)
for hazard in hazards:
test_scores = {}
for test in hazard.tests(secrets=secrets):
@@ -286,9 +287,7 @@ def test_records_for_sut(sut: ModelGaugeSut, tests: Dict[str, FactoryEntry], dat

@retry(tries=3, delay=1) # to compensate for modelgauge/togetherai flakiness
def process_one():
result = run_prompt_response_test(
test, SUTS.make_instance(sut.key, secrets=secrets), data_dir, max_test_items=max_test_items
)
result = run_prompt_response_test(test, sut.instance(secrets), data_dir, max_test_items=max_test_items)
return test_id, result

try:
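For reference, a hedged sketch of driving the updated command through click's test runner; the SUT uid and instance count are illustrative, and a real run needs modelgauge secrets plus access to the SUT providers.

```python
# Hedged sketch: exercise the new --sut uid option and the single --benchmark choice.
from click.testing import CliRunner

from modelbench.run import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    ["benchmark", "--sut", "wizardlm-13b", "--benchmark", "GeneralPurposeAiChatBenchmark", "--max-instances", "5"],
)
print(result.exit_code)
# On success, the run record is written to <output-dir>/benchmark_record-<benchmark uid>.json.
```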
24 changes: 20 additions & 4 deletions tests/test_benchmark.py
@@ -1,11 +1,13 @@
import pathlib
from datetime import datetime
from typing import List, Mapping
from unittest.mock import MagicMock

import pytest
from modelgauge.base_test import BaseTest
from modelgauge.records import TestRecord
from modelgauge.secret_values import RawSecrets
from modelgauge.tests.safe import SafeTestResult, PersonaType, PersonaResult

from modelbench.benchmarks import (
BenchmarkScore,
@@ -111,13 +113,27 @@ def test_hazard_definition_basics(fake_secrets):


def test_hazard_score_basics():
bd = SafeCaeHazard()
a_score = bd.reference_standard()
hs = HazardScore(hazard_definition=bd, score=ValueEstimate.make(a_score, 50), test_scores={})
assert hs.hazard_definition == bd
hd = SafeCaeHazard()
a_score = hd.reference_standard()
hs = HazardScore(hazard_definition=hd, score=ValueEstimate.make(a_score, 50), test_scores={})
assert hs.hazard_definition == hd
assert hs.score.estimate == a_score


def test_hazard_score_test_scores():
hd = SafeCaeHazard()
mock_test_record = MagicMock()
frac_safe = 0.5
mock_test_record.result.to_instance.return_value = SafeTestResult(
persona_results={PersonaType.TYPICAL: PersonaResult(frac_safe=frac_safe, num_items=100)}
)
result = hd.score({"foo": mock_test_record})
print(result)
score_key = next(iter(result.test_scores))
assert score_key == hd.uid
assert result.test_scores[score_key].estimate == frac_safe


def test_modelgauge_sut_display_name_and_name():
assert ModelGaugeSut.LLAMA_2_7B.display_name == "Meta Llama 2, 7b parameters"
assert ModelGaugeSut.LLAMA_2_7B.name == "LLAMA_2_7B"