Benchmark UID
-{{ content(benchmark_score.benchmark_definition, "uid") }}
+{{ benchmark_score.benchmark_definition.uid }}
diff --git a/src/modelbench/hazards.py b/src/modelbench/hazards.py
index 258accc5..32d81e3a 100644
--- a/src/modelbench/hazards.py
+++ b/src/modelbench/hazards.py
@@ -17,6 +17,10 @@ class HazardDefinition(ABC, HasUid):
+    def __init__(self):
+        super().__init__()
+        self._tests = None
+
     @classmethod
     def name(cls):
         return casefy.titlecase(cls.__name__.replace(HazardDefinition.__name__, ""))
@@ -60,7 +64,7 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
         # per Peter, treat benign and hazardous as one Test
         # TODO: Consider if this means NewHELM should model as one test
         # or if more generally we don't want to report individual test estimates.
-        test_scores = {self.name(): score_estimate}
+        test_scores = {self.uid: score_estimate}
         return HazardScore(
             hazard_definition=self,
             score=score_estimate,
@@ -72,7 +76,9 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
     def tests(self, secrets: RawSecrets) -> List[BaseTest]:
         if self.hazard_key is None:
             raise (NotImplementedError("must set hazard key"))
-        return [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
+        if not self._tests:
+            self._tests = [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
+        return self._tests
 
 
 # not actually a hazard!
diff --git a/src/modelbench/modelgauge_runner.py b/src/modelbench/modelgauge_runner.py
index 97fadca3..47062d44 100644
--- a/src/modelbench/modelgauge_runner.py
+++ b/src/modelbench/modelgauge_runner.py
@@ -55,6 +55,18 @@ class ModelGaugeSut(SutDescription, Enum):
     WIZARDLM_13B = "wizardlm-13b", "WizardLM v1.2 (13B)", TogetherChatSUT, "WizardLM/WizardLM-13B-V1.2"
     # YI_34B_CHAT = "yi-34b", "01-ai Yi Chat (34B)", TogetherChatSUT, "zero-one-ai/Yi-34B-Chat"
 
+    def instance(self, secrets):
+        if not hasattr(self, "_instance"):
+            if not secrets:
+                return None
+            self._instance = SUTS.make_instance(self.key, secrets=secrets)
+        return self._instance
+
+    def instance_initialization(self):
+        instance = self.instance(None)
+        if instance:
+            return instance.initialization_record
+
 
 for sut in ModelGaugeSut:
     required_secrets = {
diff --git a/src/modelbench/record.py b/src/modelbench/record.py
new file mode 100644
index 00000000..48558c00
--- /dev/null
+++ b/src/modelbench/record.py
@@ -0,0 +1,122 @@
+import json
+import os
+import pathlib
+import platform
+import subprocess
+from datetime import datetime, timezone
+from typing import Sequence
+
+import pydantic
+from modelgauge.tests.safe import SafeTest
+
+from modelbench.benchmarks import BenchmarkScore, BenchmarkDefinition
+from modelbench.hazards import HazardDefinition, HazardScore
+from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription
+from modelbench.static_site_generator import StaticContent
+
+
+def run_command(*args):
+    result = subprocess.run(args, capture_output=True)
+    return result.stdout.decode("utf-8").strip()
+
+
+def benchmark_code_info():
+    try:
+        git_dir = run_command("git", "rev-parse", "git-dir")
+        if not git_dir:
+            return {"error": "couldn't find git dir"}
+    except FileNotFoundError:
+        return {"error": "git command not found"}
+
+    return {
+        "git_version": run_command("git", "--version"),
+        "origin": run_command("git", "config", "--get", "remote.origin.url"),
+        "code_version": run_command("git", "describe", "--tags", "--abbrev=8", "--always", "--long", "--match", "v*"),
+        "changed_files": [l.strip() for l in run_command("git", "status", "-s", "--untracked-files=no").splitlines()],
+    }
+
+
+def benchmark_library_info():
+    try:
+        text = run_command("python", "-m", "pip", "list")
+        result = {}
+        for line in text.splitlines()[2:]:
+            package, version = line.split(maxsplit=1)
+            result[package] = version
+        return result
+    except FileNotFoundError:
+        return {"error": "pip not found"}
+
+
+def benchmark_metadata():
+    return {
+        "format_version": 1,
+        "run": {
+            "user": os.environ.get("USER", os.environ.get("USERNAME")),
+            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"),
+            "platform": platform.platform(),
+            "system": f"{platform.system()} {platform.release()} {platform.version()}",
+            "node": platform.node(),
+            "python": platform.python_version(),
+        },
+        "code": {
+            "source": benchmark_code_info(),
+            "libraries": benchmark_library_info(),
+        },
+    }
+
+
+def benchmark_run_record(score):
+    return {
+        "score": score,
+        "_metadata": benchmark_metadata(),
+    }
+
+
+def dump_json(
+    json_path: pathlib.Path,
+    start_time: datetime.time,
+    benchmark: BenchmarkDefinition,
+    benchmark_scores: Sequence[BenchmarkScore],
+):
+    with open(json_path, "w") as f:
+        output = {
+            "benchmark": (benchmark),
+            "run_uid": f"run-{benchmark.uid}-{start_time.strftime('%Y%m%d-%H%M%S')}",
+            "scores": (benchmark_scores),
+            "content": StaticContent(),
+        }
+        json.dump(output, f, cls=BenchmarkScoreEncoder, indent=4)
+
+
+class BenchmarkScoreEncoder(json.JSONEncoder):
+
+    def default(self, o):
+        if isinstance(o, BenchmarkScore) or isinstance(o, HazardScore):
+            result = {}
+            result.update(o.__dict__)
+            result["numeric_grade"] = o.numeric_grade()
+            result["text_grade"] = o.text_grade()
+            if "benchmark_definition" in result:
+                del result["benchmark_definition"]  # duplicated up the tree
+            return result
+        elif isinstance(o, BenchmarkDefinition):
+            return {"uid": o.uid, "hazards": o.hazards()}
+        elif isinstance(o, HazardDefinition):
+            result = {"uid": o.uid, "reference_standard": o.reference_standard()}
+            if o._tests:
+                result["tests"] = o._tests
+            return result
+        elif isinstance(o, SafeTest):
+            return o.uid
+        elif isinstance(o, SutDescription):
+            result = {"uid": o.key}
+            if isinstance(o, ModelGaugeSut) and o.instance_initialization():
+                result["initialization"] = o.instance_initialization()
+            return result
+        elif isinstance(o, pydantic.BaseModel):
+            return o.model_dump()
+        elif isinstance(o, datetime):
+            return str(o)
+        else:
+            return super().default(o)
diff --git a/src/modelbench/run.py b/src/modelbench/run.py
index c45aa998..0533140b 100644
--- a/src/modelbench/run.py
+++ b/src/modelbench/run.py
@@ -31,6 +31,7 @@
 )
 from modelbench.hazards import HazardDefinition, HazardScore, STANDARDS
 from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription
+from modelbench.record import dump_json
 from modelbench.static_site_generator import StaticContent, StaticSiteGenerator
 
 _DEFAULT_SUTS = ModelGaugeSut
@@ -69,11 +70,7 @@ def cli() -> None:
 )
 @click.option("--max-instances", "-m", type=int, default=100)
 @click.option("--debug", default=False, is_flag=True)
-@click.option(
-    "--sut",
-    "-s",
-    multiple=True,
-)
+@click.option("sut_uids", "--sut", "-s", multiple=True, help="SUT uid(s) to run")
 @click.option("--view-embed", default=False, is_flag=True, help="Render the HTML to be embedded in another view")
 @click.option(
     "--custom-branding",
@@ -83,28 +80,32 @@ def cli() -> None:
 @click.option("--anonymize", type=int, help="Random number seed for consistent anonymization of SUTs")
 @click.option("--parallel", default=False, help="Experimentally run SUTs in parallel")
 @click.option(
+    "benchmark_name",
     "--benchmark",
     type=click.Choice([c.__name__ for c in BenchmarkDefinition.__subclasses__()]),
-    default=["GeneralPurposeAiChatBenchmark"],
+    default="GeneralPurposeAiChatBenchmark",
     help="Benchmark to run (Default: GeneralPurposeAiChatBenchmark)",
-    multiple=True,
+    multiple=False,
 )
 @local_plugin_dir_option
 def benchmark(
-    benchmark: str,
+    benchmark_name: str,
     output_dir: pathlib.Path,
     max_instances: int,
     debug: bool,
-    sut: List[str],
+    sut_uids: List[str],
     view_embed: bool,
     custom_branding: Optional[pathlib.Path] = None,
     anonymize=None,
     parallel=False,
 ) -> None:
-    suts = find_suts_for_sut_argument(sut)
-    benchmarks = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ in benchmark]
-    benchmark_scores = score_benchmarks(benchmarks, suts, max_instances, debug, parallel)
+    start_time = datetime.now(timezone.utc)
+    suts = find_suts_for_sut_argument(sut_uids)
+    benchmark = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ == benchmark_name][0]
+    benchmark_scores = score_benchmarks([benchmark], suts, max_instances, debug, parallel)
     generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding)
+    json_path = output_dir / f"benchmark_record-{benchmark.uid}.json"
+    dump_json(json_path, start_time, benchmark, benchmark_scores)
 
 
 def find_suts_for_sut_argument(sut_args: List[str]):
@@ -120,7 +121,7 @@ def find_suts_for_sut_argument(sut_args: List[str]):
             else:
                 all_sut_keys = registered_sut_keys.union(set(default_suts_by_key.keys()))
                 raise click.BadParameter(
-                    f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x:x.lower())}",
+                    f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x: x.lower())}",
                     param_hint="sut",
                 )
 
@@ -147,7 +148,7 @@ def score_benchmarks(benchmarks, suts, max_instances, debug, parallel=True):
 def score_a_sut(benchmarks, max_instances, secrets, debug, sut):
     sut_scores = []
     echo(termcolor.colored(f'Examining system "{sut.display_name}"', "green"))
-    sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
+    sut_instance = sut.instance(secrets)
     for benchmark_definition in benchmarks:
         echo(termcolor.colored(f'  Starting run for benchmark "{benchmark_definition.name()}"', "green"))
         hazard_scores = []
@@ -174,15 +175,16 @@ def score_a_sut(benchmarks, max_instances, secrets, debug, sut):
     return sut_scores
 
 
+class FakeSut(SutDescription):
+    @property
+    def name(self):
+        return self.key.upper()
+
+
 def generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding=None):
     static_site_generator = StaticSiteGenerator(view_embed=view_embed, custom_branding=custom_branding)
     if anonymize:
-        class FakeSut(SutDescription):
-            @property
-            def name(self):
-                return self.key.upper()
-
         rng = random.Random(anonymize)
         rng.shuffle(benchmark_scores)
 
 
@@ -268,7 +270,7 @@ def run_tests(
 ) -> Mapping[HazardDefinition, HazardScore]:
     secrets = load_secrets_from_config()
     result = {}
-    sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
+    sut_instance = sut.instance(secrets)
     for hazard in hazards:
         test_scores = {}
         for test in hazard.tests(secrets=secrets):
@@ -286,9 +288,7 @@ def test_records_for_sut(sut: ModelGaugeSut, tests: Dict[str, FactoryEntry], dat
     @retry(tries=3, delay=1)  # to compensate for modelgauge/togetherai flakiness
     def process_one():
-        result = run_prompt_response_test(
-            test, SUTS.make_instance(sut.key, secrets=secrets), data_dir, max_test_items=max_test_items
-        )
+        result = run_prompt_response_test(test, sut.instance(secrets), data_dir, max_test_items=max_test_items)
         return test_id, result
 
     try:
diff --git a/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml b/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml
index cea9070f..f87b6ec2 100644
--- a/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml
+++ b/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml
@@ -1,8 +1,6 @@
 [general_purpose_ai_chat_benchmark]
 name = "Benchmark for general purpose AI chat model"
-uid = "general_purpose_ai_chat_benchmark"
-
 version = "0.5"
 tagline = "Evaluates the safety of a fine-tuned LLM for general purpose, low-risk chat use in the English language and North American or Western European cultural context by a typical adult user. Evaluation is currently only of a limited range of hazards focused on physical harms, criminal activity, hate speech, and sexual abuse."
diff --git a/src/modelbench/templates/test_report.html b/src/modelbench/templates/test_report.html
index 9ad2fe72..20eedede 100644
--- a/src/modelbench/templates/test_report.html
+++ b/src/modelbench/templates/test_report.html
@@ -49,7 +49,7 @@
-{{ content(benchmark_score.benchmark_definition, "uid") }}
+{{ benchmark_score.benchmark_definition.uid }}
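
For reviewers who want to poke at the new output, here is a minimal sketch (not part of the change) of reading back the record that dump_json writes. It assumes the run used the default "web" output directory (adjust for whatever output directory option was passed) and relies only on the key names visible above: "run_uid", "benchmark", and "scores", plus the numeric_grade/text_grade fields that BenchmarkScoreEncoder attaches to each score.

    import json
    import pathlib

    # Locate the record that run.py now writes next to the static site output.
    record_path = next(pathlib.Path("web").glob("benchmark_record-*.json"))
    with open(record_path) as f:
        record = json.load(f)

    print(record["run_uid"])               # run-<benchmark uid>-<YYYYmmdd-HHMMSS>
    print(record["benchmark"]["uid"])      # benchmark serialized by BenchmarkScoreEncoder
    for score in record["scores"]:
        # numeric_grade/text_grade are added to each score by the encoder
        print(score["numeric_grade"], score["text_grade"])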