diff --git a/src/modelbench/hazards.py b/src/modelbench/hazards.py index 258accc5..32d81e3a 100644 --- a/src/modelbench/hazards.py +++ b/src/modelbench/hazards.py @@ -17,6 +17,10 @@ class HazardDefinition(ABC, HasUid): + def __init__(self): + super().__init__() + self._tests = None + @classmethod def name(cls): return casefy.titlecase(cls.__name__.replace(HazardDefinition.__name__, "")) @@ -60,7 +64,7 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore": # per Peter, treat benign and hazardous as one Test # TODO: Consider if this means NewHELM should model as one test # or if more generally we don't want to report individual test estimates. - test_scores = {self.name(): score_estimate} + test_scores = {self.uid: score_estimate} return HazardScore( hazard_definition=self, score=score_estimate, @@ -72,7 +76,9 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore": def tests(self, secrets: RawSecrets) -> List[BaseTest]: if self.hazard_key is None: raise (NotImplementedError("must set hazard key")) - return [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)] + if not self._tests: + self._tests = [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)] + return self._tests # not actually a hazard! diff --git a/src/modelbench/modelgauge_runner.py b/src/modelbench/modelgauge_runner.py index 97fadca3..47062d44 100644 --- a/src/modelbench/modelgauge_runner.py +++ b/src/modelbench/modelgauge_runner.py @@ -55,6 +55,18 @@ class ModelGaugeSut(SutDescription, Enum): WIZARDLM_13B = "wizardlm-13b", "WizardLM v1.2 (13B)", TogetherChatSUT, "WizardLM/WizardLM-13B-V1.2" # YI_34B_CHAT = "yi-34b", "01-ai Yi Chat (34B)", TogetherChatSUT, "zero-one-ai/Yi-34B-Chat" + def instance(self, secrets): + if not hasattr(self, "_instance"): + if not secrets: + return None + self._instance = SUTS.make_instance(self.key, secrets=secrets) + return self._instance + + def instance_initialization(self): + instance = self.instance(None) + if instance: + return instance.initialization_record + for sut in ModelGaugeSut: required_secrets = { diff --git a/src/modelbench/record.py b/src/modelbench/record.py new file mode 100644 index 00000000..48558c00 --- /dev/null +++ b/src/modelbench/record.py @@ -0,0 +1,122 @@ +import json +import os +import pathlib +import platform +import subprocess +from datetime import datetime, timezone +from typing import Sequence + +import pydantic +from modelgauge.tests.safe import SafeTest + +from modelbench.benchmarks import BenchmarkScore, BenchmarkDefinition +from modelbench.hazards import HazardDefinition, HazardScore +from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription +from modelbench.static_site_generator import StaticContent + + +def run_command(*args): + result = subprocess.run(args, capture_output=True) + return result.stdout.decode("utf-8").strip() + + +def benchmark_code_info(): + try: + git_dir = run_command("git", "rev-parse", "git-dir") + if not git_dir: + return {"error": "couldn't find git dir"} + except FileNotFoundError: + return {"error": "git command not found"} + + return { + "git_version": run_command("git", "--version"), + "origin": run_command("git", "config", "--get", "remote.origin.url"), + "code_version": run_command("git", "describe", "--tags", "--abbrev=8", "--always", "--long", "--match", "v*"), + "changed_files": [l.strip() for l in run_command("git", "status", "-s", "--untracked-files=no").splitlines()], + } + + +def benchmark_library_info(): + try: + text = run_command("python", "-m", "pip", 
"list") + result = {} + for line in text.splitlines()[2:]: + package, version = line.split(maxsplit=1) + result[package] = version + return result + except FileNotFoundError: + return {"error": "pip not found"} + + +def benchmark_metadata(): + return { + "format_version": 1, + "run": { + "user": os.environ.get("USER", os.environ.get("USERNAME")), + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"), + "platform": platform.platform(), + "system": f"{platform.system()} {platform.release()} {platform.version()}", + "node": platform.node(), + "python": platform.python_version(), + }, + "code": { + "source": benchmark_code_info(), + "libraries": benchmark_library_info(), + }, + } + + +def benchmark_run_record(score): + return { + "score": score, + "_metadata": benchmark_metadata(), + } + + +def dump_json( + json_path: pathlib.Path, + start_time: datetime.time, + benchmark: BenchmarkDefinition, + benchmark_scores: Sequence[BenchmarkScore], +): + with open(json_path, "w") as f: + output = { + "benchmark": (benchmark), + "run_uid": f"run-{benchmark.uid}-{start_time.strftime('%Y%m%d-%H%M%S')}", + "scores": (benchmark_scores), + "content": StaticContent(), + } + json.dump(output, f, cls=BenchmarkScoreEncoder, indent=4) + + +class BenchmarkScoreEncoder(json.JSONEncoder): + + def default(self, o): + if isinstance(o, BenchmarkScore) or isinstance(o, HazardScore): + result = {} + result.update(o.__dict__) + result["numeric_grade"] = o.numeric_grade() + result["text_grade"] = o.text_grade() + if "benchmark_definition" in result: + del result["benchmark_definition"] # duplicated up the tree + return result + elif isinstance(o, BenchmarkDefinition): + return {"uid": o.uid, "hazards": o.hazards()} + elif isinstance(o, HazardDefinition): + result = {"uid": o.uid, "reference_standard": o.reference_standard()} + if o._tests: + result["tests"] = o._tests + return result + elif isinstance(o, SafeTest): + return o.uid + elif isinstance(o, SutDescription): + result = {"uid": o.key} + if isinstance(o, ModelGaugeSut) and o.instance_initialization(): + result["initialization"] = o.instance_initialization() + return result + elif isinstance(o, pydantic.BaseModel): + return o.model_dump() + elif isinstance(o, datetime): + return str(o) + else: + return super().default(o) diff --git a/src/modelbench/run.py b/src/modelbench/run.py index c45aa998..0533140b 100644 --- a/src/modelbench/run.py +++ b/src/modelbench/run.py @@ -31,6 +31,7 @@ ) from modelbench.hazards import HazardDefinition, HazardScore, STANDARDS from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription +from modelbench.record import dump_json from modelbench.static_site_generator import StaticContent, StaticSiteGenerator _DEFAULT_SUTS = ModelGaugeSut @@ -69,11 +70,7 @@ def cli() -> None: ) @click.option("--max-instances", "-m", type=int, default=100) @click.option("--debug", default=False, is_flag=True) -@click.option( - "--sut", - "-s", - multiple=True, -) +@click.option("sut_uids", "--sut", "-s", multiple=True, help="SUT uid(s) to run") @click.option("--view-embed", default=False, is_flag=True, help="Render the HTML to be embedded in another view") @click.option( "--custom-branding", @@ -83,28 +80,32 @@ def cli() -> None: @click.option("--anonymize", type=int, help="Random number seed for consistent anonymization of SUTs") @click.option("--parallel", default=False, help="Experimentally run SUTs in parallel") @click.option( + "benchmark_name", "--benchmark", type=click.Choice([c.__name__ for c in 
BenchmarkDefinition.__subclasses__()]), - default=["GeneralPurposeAiChatBenchmark"], + default="GeneralPurposeAiChatBenchmark", help="Benchmark to run (Default: GeneralPurposeAiChatBenchmark)", - multiple=True, + multiple=False, ) @local_plugin_dir_option def benchmark( - benchmark: str, + benchmark_name: str, output_dir: pathlib.Path, max_instances: int, debug: bool, - sut: List[str], + sut_uids: List[str], view_embed: bool, custom_branding: Optional[pathlib.Path] = None, anonymize=None, parallel=False, ) -> None: - suts = find_suts_for_sut_argument(sut) - benchmarks = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ in benchmark] - benchmark_scores = score_benchmarks(benchmarks, suts, max_instances, debug, parallel) + start_time = datetime.now(timezone.utc) + suts = find_suts_for_sut_argument(sut_uids) + benchmark = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ == benchmark_name][0] + benchmark_scores = score_benchmarks([benchmark], suts, max_instances, debug, parallel) generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding) + json_path = output_dir / f"benchmark_record-{benchmark.uid}.json" + dump_json(json_path, start_time, benchmark, benchmark_scores) def find_suts_for_sut_argument(sut_args: List[str]): @@ -120,7 +121,7 @@ def find_suts_for_sut_argument(sut_args: List[str]): else: all_sut_keys = registered_sut_keys.union(set(default_suts_by_key.keys())) raise click.BadParameter( - f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x:x.lower())}", + f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x: x.lower())}", param_hint="sut", ) @@ -147,7 +148,7 @@ def score_benchmarks(benchmarks, suts, max_instances, debug, parallel=True): def score_a_sut(benchmarks, max_instances, secrets, debug, sut): sut_scores = [] echo(termcolor.colored(f'Examining system "{sut.display_name}"', "green")) - sut_instance = SUTS.make_instance(sut.key, secrets=secrets) + sut_instance = sut.instance(secrets) for benchmark_definition in benchmarks: echo(termcolor.colored(f' Starting run for benchmark "{benchmark_definition.name()}"', "green")) hazard_scores = [] @@ -174,15 +175,16 @@ def score_a_sut(benchmarks, max_instances, secrets, debug, sut): return sut_scores +class FakeSut(SutDescription): + @property + def name(self): + return self.key.upper() + + def generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding=None): static_site_generator = StaticSiteGenerator(view_embed=view_embed, custom_branding=custom_branding) if anonymize: - class FakeSut(SutDescription): - @property - def name(self): - return self.key.upper() - rng = random.Random(anonymize) rng.shuffle(benchmark_scores) @@ -268,7 +270,7 @@ def run_tests( ) -> Mapping[HazardDefinition, HazardScore]: secrets = load_secrets_from_config() result = {} - sut_instance = SUTS.make_instance(sut.key, secrets=secrets) + sut_instance = sut.instance(secrets) for hazard in hazards: test_scores = {} for test in hazard.tests(secrets=secrets): @@ -286,9 +288,7 @@ def test_records_for_sut(sut: ModelGaugeSut, tests: Dict[str, FactoryEntry], dat @retry(tries=3, delay=1) # to compensate for modelgauge/togetherai flakiness def process_one(): - result = run_prompt_response_test( - test, SUTS.make_instance(sut.key, secrets=secrets), data_dir, max_test_items=max_test_items - ) + result = run_prompt_response_test(test, sut.instance(secrets), data_dir, max_test_items=max_test_items) return test_id, result try: diff --git 
a/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml b/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml index cea9070f..f87b6ec2 100644 --- a/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml +++ b/src/modelbench/templates/content/general_purpose_ai_chat_benchmark.toml @@ -1,8 +1,6 @@ [general_purpose_ai_chat_benchmark] name = "Benchmark for general purpose AI chat model" -uid = "general_purpose_ai_chat_benchmark" - version = "0.5" tagline = "Evaluates the safety of a fine-tuned LLM for general purpose, low-risk chat use in the English language and North American or Western European cultural context by a typical adult user. Evaluation is currently only of a limited range of hazards focused on physical harms, criminal activity, hate speech, and sexual abuse." diff --git a/src/modelbench/templates/test_report.html b/src/modelbench/templates/test_report.html index 9ad2fe72..20eedede 100644 --- a/src/modelbench/templates/test_report.html +++ b/src/modelbench/templates/test_report.html @@ -49,7 +49,7 @@

 Run Details
 Benchmark UID
-{{ content(benchmark_score.benchmark_definition, "uid") }}
+{{ benchmark_score.benchmark_definition.uid }}
 Benchmark Version
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 85733a3f..9c10eb59 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -1,11 +1,13 @@ import pathlib from datetime import datetime from typing import List, Mapping +from unittest.mock import MagicMock import pytest from modelgauge.base_test import BaseTest from modelgauge.records import TestRecord from modelgauge.secret_values import RawSecrets +from modelgauge.tests.safe import SafeTestResult, PersonaType, PersonaResult from modelbench.benchmarks import ( BenchmarkScore, @@ -111,13 +113,27 @@ def test_hazard_definition_basics(fake_secrets): def test_hazard_score_basics(): - bd = SafeCaeHazard() - a_score = bd.reference_standard() - hs = HazardScore(hazard_definition=bd, score=ValueEstimate.make(a_score, 50), test_scores={}) - assert hs.hazard_definition == bd + hd = SafeCaeHazard() + a_score = hd.reference_standard() + hs = HazardScore(hazard_definition=hd, score=ValueEstimate.make(a_score, 50), test_scores={}) + assert hs.hazard_definition == hd assert hs.score.estimate == a_score +def test_hazard_score_test_scores(): + hd = SafeCaeHazard() + mock_test_record = MagicMock() + frac_safe = 0.5 + mock_test_record.result.to_instance.return_value = SafeTestResult( + persona_results={PersonaType.TYPICAL: PersonaResult(frac_safe=frac_safe, num_items=100)} + ) + result = hd.score({"foo": mock_test_record}) + print(result) + score_key = next(iter(result.test_scores)) + assert score_key == hd.uid + assert result.test_scores[score_key].estimate == frac_safe + + def test_modelgauge_sut_display_name_and_name(): assert ModelGaugeSut.LLAMA_2_7B.display_name == "Meta Llama 2, 7b parameters" assert ModelGaugeSut.LLAMA_2_7B.name == "LLAMA_2_7B" diff --git a/tests/test_record.py b/tests/test_record.py new file mode 100644 index 00000000..f7ca2e3e --- /dev/null +++ b/tests/test_record.py @@ -0,0 +1,177 @@ +import json +import platform +import re +from datetime import datetime, timezone +from unittest.mock import Mock, MagicMock, patch + +from modelgauge.record_init import InitializationRecord + +from modelbench.benchmarks import GeneralPurposeAiChatBenchmark +from modelbench.hazards import HazardScore, SafeCaeHazard +from modelbench.modelgauge_runner import ModelGaugeSut +from modelbench.record import ( + BenchmarkScoreEncoder, + benchmark_run_record, + dump_json, + benchmark_code_info, + benchmark_library_info, +) +from modelbench.run import FakeSut +from modelbench.scoring import ValueEstimate +from test_static_site_generator import benchmark_score + + +def encode(o): + return json.dumps(o, cls=BenchmarkScoreEncoder, indent=4) + + +def encode_and_parse(o): + s = encode(o) + return json.loads(s) + + +def test_sut(): + assert encode_and_parse(ModelGaugeSut.ALPACA_7B) == {"uid": "alpaca-7b"} + + ModelGaugeSut.ALPACA_7B.instance(MagicMock()) + with_initialization = encode_and_parse(ModelGaugeSut.ALPACA_7B) + assert "uid" in with_initialization + assert "initialization" in with_initialization + assert encode_and_parse(ModelGaugeSut.ALPACA_7B) == with_initialization + + +def test_anonymous_sut(): + j = encode_and_parse(FakeSut("a_sut-v1.0", "A SUT of some sort")) + print(j) + + +def test_value_estimate(): + ve = ValueEstimate.make(0.5, 1000) + j = encode_and_parse(ve) + assert j["lower"] == ve.lower + assert j["estimate"] == ve.estimate + assert j["upper"] == ve.upper + assert j["samples"] == ve.samples + + +def test_hazard_definition_without_tests_loaded(): + hazard = SafeCaeHazard() + j = encode_and_parse(hazard) + assert 
j["uid"] == hazard.uid + assert "tests" not in j + assert j["reference_standard"] == hazard.reference_standard() + + +def test_hazard_definition_with_tests_loaded(): + hazard = SafeCaeHazard() + hazard.tests({"together": {"api_key": "ignored"}}) + j = encode_and_parse(hazard) + assert j["uid"] == hazard.uid + assert j["tests"] == ["safe-cae"] + assert j["reference_standard"] == hazard.reference_standard() + + +def test_benchmark_definition(): + j = encode_and_parse(GeneralPurposeAiChatBenchmark()) + assert j["uid"] == "general_purpose_ai_chat_benchmark-0.5" + assert "safe_cae_hazard-0.5" in [i["uid"] for i in j["hazards"]] + + +def test_hazard_score(): + ve = ValueEstimate.make(1.0, 100000) + hs = HazardScore(hazard_definition=SafeCaeHazard(), score=ve, test_scores={"cae": ve}) + j = encode_and_parse(hs) + assert j["hazard_definition"]["uid"] == SafeCaeHazard().uid + assert j["score"]["samples"] == ve.samples + assert j["test_scores"]["cae"]["samples"] == ve.samples + assert j["numeric_grade"] == 5 + assert j["text_grade"] == "L" + + +def test_benchmark_score(benchmark_score): + j = encode_and_parse(benchmark_score) + assert "benchmark_definition" not in j # it's already higher up in the tree; no need to duplicate + assert j["sut"]["uid"] == benchmark_score.sut.key + assert len(j["hazard_scores"]) == len(benchmark_score.hazard_scores) + assert j["end_time"] == str(benchmark_score.end_time) + assert j["numeric_grade"] == benchmark_score.numeric_grade() + assert j["text_grade"] == benchmark_score.text_grade() + + +def test_benchmark_score_initialization_record(benchmark_score): + mock_method = Mock() + mock_method.return_value = InitializationRecord( + module="a_module", class_name="a_class", args=["arg1", "arg2"], kwargs={"kwarg1": "a_value"} + ) + benchmark_score.sut.instance_initialization = mock_method + j = encode_and_parse(benchmark_score) + print(j) + assert j["sut"]["initialization"]["module"] == "a_module" + + +def test_benchmark_run_record(benchmark_score): + r = benchmark_run_record(benchmark_score) + assert r["score"] == benchmark_score + assert r["_metadata"]["format_version"] == 1 + + run_info = r["_metadata"]["run"] + assert re.match(r"\w+", run_info["user"]) + assert re.match(r"20\d\d-.+UTC", run_info["timestamp"]) + assert run_info["platform"] == platform.platform() + assert run_info["system"] + assert run_info["node"] == platform.node() + assert run_info["python"] == platform.python_version() + + +def test_benchmark_code_record(benchmark_score): + r = benchmark_run_record(benchmark_score) + source = r["_metadata"]["code"]["source"] + assert source["git_version"].startswith("git version 2") + assert source["origin"] in ["git@github.com:mlcommons/modelbench.git", "https://github.com/mlcommons/modelbench"] + assert re.match(r"(v[.0-9]+-\d+-)?[a-z0-9]{8}", source["code_version"]) + assert isinstance(source["changed_files"], list) # hard to be more specific here + + +def test_benchmark_code_record_without_git_command(benchmark_score): + with patch("modelbench.record.run_command") as f: + f.side_effect = FileNotFoundError() + j = benchmark_code_info() + print(j) + assert j["error"].startswith("git command not found") + + +def test_benchmark_code_record_without_git_repo(benchmark_score, cwd_tmpdir): + j = benchmark_code_info() + print(j) + assert j["error"].startswith("couldn't find git dir") + + +def test_benchmark_code_record_without_git(benchmark_score): + with patch("modelbench.record.run_command") as f: + f.side_effect = FileNotFoundError() + r = 
benchmark_run_record(benchmark_score) + source = r["_metadata"]["code"]["source"] + assert source["error"] == "git command not found" + + +def test_pip_list(): + i = benchmark_library_info() + print(i) + assert "modelgauge" in i + + +def test_dump_json(benchmark_score, tmp_path): + # just a smoke test; everything substantial should be tested above. + json_path = tmp_path / "foo.json" + dump_json( + json_path, + datetime.fromtimestamp(1700000000, timezone.utc), + benchmark_score.benchmark_definition, + [benchmark_score], + ) + with open(json_path) as f: + j = json.load(f) + assert j["benchmark"]["uid"] == benchmark_score.benchmark_definition.uid + assert j["run_uid"] == "run-" + benchmark_score.benchmark_definition.uid + "-20231114-221320" + assert "grades" in j["content"] + assert len(j["scores"]) == 1 diff --git a/tests/test_run.py b/tests/test_run.py index 1e608d20..1fdf298d 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,16 +1,17 @@ import json import pathlib import unittest.mock +from datetime import datetime from unittest.mock import MagicMock, patch import click import pytest from click.testing import CliRunner -from modelbench.benchmarks import BenchmarkDefinition +from modelbench.benchmarks import BenchmarkDefinition, BenchmarkScore, GeneralPurposeAiChatBenchmark from modelbench.hazards import HazardScore, SafeCbrHazard from modelbench.hazards import SafeHazard -from modelbench.modelgauge_runner import ModelGaugeSut +from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription from modelbench.run import benchmark, cli, find_suts_for_sut_argument, update_standards_to from modelbench.scoring import ValueEstimate @@ -51,14 +52,26 @@ def test_find_suts(): class TestCli: + def mock_score(self): + benchmark = GeneralPurposeAiChatBenchmark() + return BenchmarkScore( + benchmark, + ModelGaugeSut.ALPACA_7B, + [ + HazardScore( + hazard_definition=benchmark.hazards()[0], score=ValueEstimate.make(0.123456, 100), test_scores={} + ), + ], + datetime.now(), + ) + @pytest.fixture(autouse=True) def mock_score_benchmarks(self, monkeypatch): import modelbench - mock_obj = MagicMock() - - monkeypatch.setattr(modelbench.run, "score_benchmarks", mock_obj) - return mock_obj + mock = MagicMock(return_value=[self.mock_score()]) + monkeypatch.setattr(modelbench.run, "score_benchmarks", mock) + return mock @pytest.fixture(autouse=True) def do_not_make_static_site(self, monkeypatch): @@ -70,6 +83,38 @@ def do_not_make_static_site(self, monkeypatch): def runner(self): return CliRunner() + def test_benchmark_basic_run_produces_json(self, runner, tmp_path): + with unittest.mock.patch("modelbench.run.find_suts_for_sut_argument") as mock_find_suts: + mock_find_suts.return_value = [SutDescription("fake", "Fake Sut")] + result = runner.invoke( + cli, + ["benchmark", "-m", "1", "--sut", "fake", "--output-dir", str(tmp_path.absolute())], + catch_exceptions=False, + ) + assert result.exit_code == 0 + assert (tmp_path / f"benchmark_record-{GeneralPurposeAiChatBenchmark().uid}.json").exists + + def test_benchmark_anonymous_run_produces_json(self, runner, tmp_path): + with unittest.mock.patch("modelbench.run.find_suts_for_sut_argument") as mock_find_suts: + mock_find_suts.return_value = [SutDescription("fake", "Fake Sut")] + result = runner.invoke( + cli, + [ + "benchmark", + "--anonymize", + "42", + "-m", + "1", + "--sut", + "fake", + "--output-dir", + str(tmp_path.absolute()), + ], + catch_exceptions=False, + ) + assert result.exit_code == 0, result.stdout + assert (tmp_path / 
f"benchmark_record-{GeneralPurposeAiChatBenchmark().uid}.json").exists + def test_nonexistent_benchmarks_can_not_be_called(self, runner): result = runner.invoke(cli, ["benchmark", "--benchmark", "NotARealBenchmark"]) assert result.exit_code == 2