Benchmark UID
-{{ content(benchmark_score.benchmark_definition, "uid") }}
+{{ benchmark_score.benchmark_definition.uid }}
diff --git a/src/modelbench/hazards.py b/src/modelbench/hazards.py
index 258accc5..32d81e3a 100644
--- a/src/modelbench/hazards.py
+++ b/src/modelbench/hazards.py
@@ -17,6 +17,10 @@ class HazardDefinition(ABC, HasUid):
+    def __init__(self):
+        super().__init__()
+        self._tests = None
+
     @classmethod
     def name(cls):
         return casefy.titlecase(cls.__name__.replace(HazardDefinition.__name__, ""))
@@ -60,7 +64,7 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
         # per Peter, treat benign and hazardous as one Test
         # TODO: Consider if this means NewHELM should model as one test
         # or if more generally we don't want to report individual test estimates.
-        test_scores = {self.name(): score_estimate}
+        test_scores = {self.uid: score_estimate}
         return HazardScore(
             hazard_definition=self,
             score=score_estimate,
@@ -72,7 +76,9 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
     def tests(self, secrets: RawSecrets) -> List[BaseTest]:
         if self.hazard_key is None:
             raise (NotImplementedError("must set hazard key"))
-        return [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
+        if not self._tests:
+            self._tests = [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
+        return self._tests
 
 
 # not actually a hazard!
diff --git a/src/modelbench/modelgauge_runner.py b/src/modelbench/modelgauge_runner.py
index 97fadca3..47062d44 100644
--- a/src/modelbench/modelgauge_runner.py
+++ b/src/modelbench/modelgauge_runner.py
@@ -55,6 +55,18 @@ class ModelGaugeSut(SutDescription, Enum):
     WIZARDLM_13B = "wizardlm-13b", "WizardLM v1.2 (13B)", TogetherChatSUT, "WizardLM/WizardLM-13B-V1.2"
     # YI_34B_CHAT = "yi-34b", "01-ai Yi Chat (34B)", TogetherChatSUT, "zero-one-ai/Yi-34B-Chat"
 
+    def instance(self, secrets):
+        if not hasattr(self, "_instance"):
+            if not secrets:
+                return None
+            self._instance = SUTS.make_instance(self.key, secrets=secrets)
+        return self._instance
+
+    def instance_initialization(self):
+        instance = self.instance(None)
+        if instance:
+            return instance.initialization_record
+
 
 for sut in ModelGaugeSut:
     required_secrets = {
diff --git a/src/modelbench/record.py b/src/modelbench/record.py
new file mode 100644
index 00000000..48558c00
--- /dev/null
+++ b/src/modelbench/record.py
@@ -0,0 +1,122 @@
+import json
+import os
+import pathlib
+import platform
+import subprocess
+from datetime import datetime, timezone
+from typing import Sequence
+
+import pydantic
+from modelgauge.tests.safe import SafeTest
+
+from modelbench.benchmarks import BenchmarkScore, BenchmarkDefinition
+from modelbench.hazards import HazardDefinition, HazardScore
+from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription
+from modelbench.static_site_generator import StaticContent
+
+
+def run_command(*args):
+    result = subprocess.run(args, capture_output=True)
+    return result.stdout.decode("utf-8").strip()
+
+
+def benchmark_code_info():
+    try:
+        git_dir = run_command("git", "rev-parse", "git-dir")
+        if not git_dir:
+            return {"error": "couldn't find git dir"}
+    except FileNotFoundError:
+        return {"error": "git command not found"}
+
+    return {
+        "git_version": run_command("git", "--version"),
+        "origin": run_command("git", "config", "--get", "remote.origin.url"),
+        "code_version": run_command("git", "describe", "--tags", "--abbrev=8", "--always", "--long", "--match", "v*"),
+        "changed_files": [l.strip() for l in run_command("git", "status", "-s", "--untracked-files=no").splitlines()],
+    }
+
+
+def benchmark_library_info():
+    try:
+        text = run_command("python", "-m", "pip", "list")
+        result = {}
+        for line in text.splitlines()[2:]:
+            package, version = line.split(maxsplit=1)
+            result[package] = version
+        return result
+    except FileNotFoundError:
+        return {"error": "pip not found"}
+
+
+def benchmark_metadata():
+    return {
+        "format_version": 1,
+        "run": {
+            "user": os.environ.get("USER", os.environ.get("USERNAME")),
+            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"),
+            "platform": platform.platform(),
+            "system": f"{platform.system()} {platform.release()} {platform.version()}",
+            "node": platform.node(),
+            "python": platform.python_version(),
+        },
+        "code": {
+            "source": benchmark_code_info(),
+            "libraries": benchmark_library_info(),
+        },
+    }
+
+
+def benchmark_run_record(score):
+    return {
+        "score": score,
+        "_metadata": benchmark_metadata(),
+    }
+
+
+def dump_json(
+    json_path: pathlib.Path,
+    start_time: datetime.time,
+    benchmark: BenchmarkDefinition,
+    benchmark_scores: Sequence[BenchmarkScore],
+):
+    with open(json_path, "w") as f:
+        output = {
+            "benchmark": (benchmark),
+            "run_uid": f"run-{benchmark.uid}-{start_time.strftime('%Y%m%d-%H%M%S')}",
+            "scores": (benchmark_scores),
+            "content": StaticContent(),
+        }
+        json.dump(output, f, cls=BenchmarkScoreEncoder, indent=4)
+
+
+class BenchmarkScoreEncoder(json.JSONEncoder):
+
+    def default(self, o):
+        if isinstance(o, BenchmarkScore) or isinstance(o, HazardScore):
+            result = {}
+            result.update(o.__dict__)
+            result["numeric_grade"] = o.numeric_grade()
+            result["text_grade"] = o.text_grade()
+            if "benchmark_definition" in result:
+                del result["benchmark_definition"]  # duplicated up the tree
+            return result
+        elif isinstance(o, BenchmarkDefinition):
+            return {"uid": o.uid, "hazards": o.hazards()}
+        elif isinstance(o, HazardDefinition):
+            result = {"uid": o.uid, "reference_standard": o.reference_standard()}
+            if o._tests:
+                result["tests"] = o._tests
+            return result
+        elif isinstance(o, SafeTest):
+            return o.uid
+        elif isinstance(o, SutDescription):
+            result = {"uid": o.key}
+            if isinstance(o, ModelGaugeSut) and o.instance_initialization():
+                result["initialization"] = o.instance_initialization()
+            return result
+        elif isinstance(o, pydantic.BaseModel):
+            return o.model_dump()
+        elif isinstance(o, datetime):
+            return str(o)
+        else:
+            return super().default(o)
diff --git a/src/modelbench/run.py b/src/modelbench/run.py
index c45aa998..0533140b 100644
--- a/src/modelbench/run.py
+++ b/src/modelbench/run.py
@@ -31,6 +31,7 @@
 )
 from modelbench.hazards import HazardDefinition, HazardScore, STANDARDS
 from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription
+from modelbench.record import dump_json
 from modelbench.static_site_generator import StaticContent, StaticSiteGenerator
 
 _DEFAULT_SUTS = ModelGaugeSut
@@ -69,11 +70,7 @@ def cli() -> None:
 )
 @click.option("--max-instances", "-m", type=int, default=100)
 @click.option("--debug", default=False, is_flag=True)
-@click.option(
-    "--sut",
-    "-s",
-    multiple=True,
-)
+@click.option("sut_uids", "--sut", "-s", multiple=True, help="SUT uid(s) to run")
 @click.option("--view-embed", default=False, is_flag=True, help="Render the HTML to be embedded in another view")
 @click.option(
     "--custom-branding",
@@ -83,28 +80,32 @@ def cli() -> None:
 @click.option("--anonymize", type=int, help="Random number seed for consistent anonymization of SUTs")
 @click.option("--parallel", default=False, help="Experimentally run SUTs in parallel")
 @click.option(
+    "benchmark_name",
     "--benchmark",
     type=click.Choice([c.__name__ for c in BenchmarkDefinition.__subclasses__()]),
-    default=["GeneralPurposeAiChatBenchmark"],
+    default="GeneralPurposeAiChatBenchmark",
     help="Benchmark to run (Default: GeneralPurposeAiChatBenchmark)",
-    multiple=True,
+    multiple=False,
 )
 @local_plugin_dir_option
 def benchmark(
-    benchmark: str,
+    benchmark_name: str,
     output_dir: pathlib.Path,
     max_instances: int,
     debug: bool,
-    sut: List[str],
+    sut_uids: List[str],
     view_embed: bool,
     custom_branding: Optional[pathlib.Path] = None,
     anonymize=None,
     parallel=False,
 ) -> None:
-    suts = find_suts_for_sut_argument(sut)
-    benchmarks = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ in benchmark]
-    benchmark_scores = score_benchmarks(benchmarks, suts, max_instances, debug, parallel)
+    start_time = datetime.now(timezone.utc)
+    suts = find_suts_for_sut_argument(sut_uids)
+    benchmark = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ == benchmark_name][0]
+    benchmark_scores = score_benchmarks([benchmark], suts, max_instances, debug, parallel)
     generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding)
+    json_path = output_dir / f"benchmark_record-{benchmark.uid}.json"
+    dump_json(json_path, start_time, benchmark, benchmark_scores)
 
 
 def find_suts_for_sut_argument(sut_args: List[str]):
@@ -120,7 +121,7 @@ def find_suts_for_sut_argument(sut_args: List[str]):
             else:
                 all_sut_keys = registered_sut_keys.union(set(default_suts_by_key.keys()))
                 raise click.BadParameter(
-                    f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x:x.lower())}",
+                    f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x: x.lower())}",
                     param_hint="sut",
                 )
 
@@ -147,7 +148,7 @@ def score_benchmarks(benchmarks, suts, max_instances, debug, parallel=True):
 def score_a_sut(benchmarks, max_instances, secrets, debug, sut):
     sut_scores = []
     echo(termcolor.colored(f'Examining system "{sut.display_name}"', "green"))
-    sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
+    sut_instance = sut.instance(secrets)
     for benchmark_definition in benchmarks:
         echo(termcolor.colored(f'  Starting run for benchmark "{benchmark_definition.name()}"', "green"))
         hazard_scores = []
@@ -174,15 +175,16 @@ def score_a_sut(benchmarks, max_instances, secrets, debug, sut):
     return sut_scores
 
 
+class FakeSut(SutDescription):
+    @property
+    def name(self):
+        return self.key.upper()
+
+
 def generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding=None):
     static_site_generator = StaticSiteGenerator(view_embed=view_embed, custom_branding=custom_branding)
     if anonymize:
-        class FakeSut(SutDescription):
-            @property
-            def name(self):
-                return self.key.upper()
-
         rng = random.Random(anonymize)
         rng.shuffle(benchmark_scores)
 
 
@@ -268,7 +270,7 @@ def run_tests(
 ) -> Mapping[HazardDefinition, HazardScore]:
     secrets = load_secrets_from_config()
     result = {}
-    sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
+    sut_instance = sut.instance(secrets)
     for hazard in hazards:
         test_scores = {}
         for test in hazard.tests(secrets=secrets):
@@ -286,9 +288,7 @@ def test_records_for_sut(sut: ModelGaugeSut, tests: Dict[str, FactoryEntry], dat
     @retry(tries=3, delay=1)  # to compensate for modelgauge/togetherai flakiness
     def process_one():
-        result = run_prompt_response_test(
-            test, SUTS.make_instance(sut.key, secrets=secrets), data_dir, max_test_items=max_test_items
-        )
+        result = run_prompt_response_test(test, sut.instance(secrets), data_dir, max_test_items=max_test_items)
         return test_id, result
 
     try:
diff --git a/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml b/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml
index cea9070f..f87b6ec2 100644
--- a/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml
+++ b/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml
@@ -1,8 +1,6 @@
 [general_purpose_ai_chat_benchmark]
 name = "Benchmark for general purpose AI chat model"
-uid = "general_purpose_ai_chat_benchmark"
-
 version = "0.5"
 tagline = "Evaluates the safety of a fine-tuned LLM for general purpose, low-risk chat use in the English language and North American or Western European cultural context by a typical adult user. Evaluation is currently only of a limited range of hazards focused on physical harms, criminal activity, hate speech, and sexual abuse."
diff --git a/src/modelbench/templates/test_report.html b/src/modelbench/templates/test_report.html
index 9ad2fe72..20eedede 100644
--- a/src/modelbench/templates/test_report.html
+++ b/src/modelbench/templates/test_report.html
@@ -49,7 +49,7 @@
-{{ content(benchmark_score.benchmark_definition, "uid") }}
+{{ benchmark_score.benchmark_definition.uid }}
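
For reviewers who want to poke at the new output, here is a minimal sketch (not part of the change) of reading back the record that dump_json writes. It assumes the run used the default "web" output directory (adjust for whatever output directory option was passed) and relies only on the key names visible above: "run_uid", "benchmark", and "scores", plus the numeric_grade/text_grade fields that BenchmarkScoreEncoder attaches to each score.

    import json
    import pathlib

    # Locate the record that run.py now writes next to the static site output.
    record_path = next(pathlib.Path("web").glob("benchmark_record-*.json"))
    with open(record_path) as f:
        record = json.load(f)

    print(record["run_uid"])               # run-<benchmark uid>-<YYYYmmdd-HHMMSS>
    print(record["benchmark"]["uid"])      # benchmark serialized by BenchmarkScoreEncoder
    for score in record["scores"]:
        # numeric_grade/text_grade are added to each score by the encoder
        print(score["numeric_grade"], score["text_grade"])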