Benchmark outcomes record #392

Merged · 41 commits · Jul 24, 2024
Changes from 34 commits

Commits
2e464e8
Add HasUid and apply it to Benchmark and Hazard.
wpietri Jun 27, 2024
cedb78a
Pleasing the formatting gods.
wpietri Jun 27, 2024
c0001c5
Add basic output, plus metadata. More to come.
wpietri Jul 1, 2024
1db7a43
Remove accidental paste.
wpietri Jul 1, 2024
5041a9a
Remove accidental paste.
wpietri Jul 1, 2024
b373bb1
Merge remote-tracking branch 'origin/benchmark_outcomes_record' into …
wpietri Jul 1, 2024
0dabb0a
Add SUT initialization and git-derived metadata on the code.
wpietri Jul 16, 2024
75e7af3
Removing unneeded test.
wpietri Jul 17, 2024
ae61704
Removing unneeded test.
wpietri Jul 17, 2024
f482fa2
Making test work no matter how you check it out.
wpietri Jul 17, 2024
04889bd
Making test work no matter how you check it out.
wpietri Jul 17, 2024
d5189f3
Making test work no matter how you check it out.
wpietri Jul 17, 2024
09be1ec
Make HazardDefinitions cache Tests, making later output easier and mo…
wpietri Jul 17, 2024
b49203a
Add SUT initialization and git-derived metadata on the code.
wpietri Jul 16, 2024
8c566cb
Removing unneeded test.
wpietri Jul 17, 2024
f2c4ce9
Removing unneeded test.
wpietri Jul 17, 2024
a66a2d8
Making test work no matter how you check it out.
wpietri Jul 17, 2024
cc45d2c
Making test work no matter how you check it out.
wpietri Jul 17, 2024
fe766e3
Making test work no matter how you check it out.
wpietri Jul 17, 2024
6359e2f
Make HazardDefinitions cache Tests, making later output easier and mo…
wpietri Jul 17, 2024
3127b6a
Merge remote-tracking branch 'origin/benchmark_outcomes_record' into …
wpietri Jul 18, 2024
182672f
Thanks to Barbara's keen eye, fixing a bug (and adding a test for it).
wpietri Jul 18, 2024
450e2b8
Add SUT initialization and git-derived metadata on the code.
wpietri Jul 16, 2024
e9e635d
Merging from main.
wpietri Jul 17, 2024
4d8f4fc
Making test work no matter how you check it out.
wpietri Jul 17, 2024
10f1001
Making test work no matter how you check it out.
wpietri Jul 17, 2024
f860779
Make HazardDefinitions cache Tests, making later output easier and mo…
wpietri Jul 17, 2024
3bd6b69
Thanks to Barbara's keen eye, fixing a bug (and adding a test for it).
wpietri Jul 18, 2024
a5a9415
Per Barbara, make this more Pydantic-idiomatic.
wpietri Jul 19, 2024
42e639f
Merge remote-tracking branch 'origin/benchmark_outcomes_record' into …
wpietri Jul 19, 2024
4c4d7a3
Merge branch 'main' into benchmark_outcomes_record
wpietri Jul 19, 2024
f10e87a
Fix formatting after merge.
wpietri Jul 19, 2024
94db658
Fix test score key in JSON to be a UID.
wpietri Jul 19, 2024
cbbfe6f
Adding content and reference scores.
wpietri Jul 19, 2024
c0003c2
fix anonymous case for json
wpietri Jul 23, 2024
0673d15
verifying initialization makes it to JSON
wpietri Jul 23, 2024
db26323
Handling case where modelbench is installed not using git.
wpietri Jul 23, 2024
f19c4f7
Adding library info to json
wpietri Jul 23, 2024
033a69c
Removing null when tests aren't loaded for hazard.
wpietri Jul 23, 2024
bca20a1
Removing benchmark uid from content, using one in class instead.
wpietri Jul 23, 2024
4cc3ddc
Fixing anonymous runs.
wpietri Jul 24, 2024
10 changes: 8 additions & 2 deletions src/modelbench/hazards.py
@@ -17,6 +17,10 @@

class HazardDefinition(ABC, HasUid):

def __init__(self):
super().__init__()
self._tests = None

@classmethod
def name(cls):
return casefy.titlecase(cls.__name__.replace(HazardDefinition.__name__, ""))
@@ -60,7 +64,7 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
# per Peter, treat benign and hazardous as one Test
# TODO: Consider if this means NewHELM should model as one test
# or if more generally we don't want to report individual test estimates.
test_scores = {self.name(): score_estimate}
test_scores = {self.uid: score_estimate}
return HazardScore(
hazard_definition=self,
score=score_estimate,
@@ -72,7 +76,9 @@ def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
def tests(self, secrets: RawSecrets) -> List[BaseTest]:
if self.hazard_key is None:
raise (NotImplementedError("must set hazard key"))
return [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
if not self._tests:
self._tests = [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
return self._tests


# not actually a hazard!
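A minimal sketch of the caching this hunk adds: repeated calls to tests() on the same hazard return the same list, which is what makes the later JSON output straightforward. SafeCaeHazard and the secrets layout below are assumptions borrowed from this PR's tests, not verified here.

```python
# Hedged sketch: tests() builds the test list once and then reuses self._tests.
from modelbench.hazards import SafeCaeHazard  # concrete hazard; assumed to live in modelbench.hazards

fake_secrets = {"together": {"api_key": "fake-key"}}  # assumed scope/key names for the safe tests

hazard = SafeCaeHazard()
first = hazard.tests(secrets=fake_secrets)
assert hazard.tests(secrets=fake_secrets) is first  # cached after the first call
```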
12 changes: 12 additions & 0 deletions src/modelbench/modelgauge_runner.py
@@ -55,6 +55,18 @@ class ModelGaugeSut(SutDescription, Enum):
WIZARDLM_13B = "wizardlm-13b", "WizardLM v1.2 (13B)", TogetherChatSUT, "WizardLM/WizardLM-13B-V1.2"
# YI_34B_CHAT = "yi-34b", "01-ai Yi Chat (34B)", TogetherChatSUT, "zero-one-ai/Yi-34B-Chat"

def instance(self, secrets):
Review thread on this line:
Contributor: Why do we need these methods?
wpietri (author): Moving instance creation here will let me unify duplicate code, and it gives me a place to cache the instance actually used for the run, which is needed to dump out the outcome JSON.

if not hasattr(self, "_instance"):
if not secrets:
return None
self._instance = SUTS.make_instance(self.key, secrets=secrets)
return self._instance

def instance_initialization(self):
instance = self.instance(None)
if instance:
return instance.initialization_record


for sut in ModelGaugeSut:
required_secrets = {
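Following the review reply above, a short sketch of how the cached instance behaves; loading secrets the way run.py does is assumed, and the SUT choice is illustrative.

```python
# Hedged sketch: the enum member caches its SUT instance, and instance_initialization()
# exposes the record that the JSON output serializes later.
from modelgauge.config import load_secrets_from_config  # helper used by run.py; import path assumed

from modelbench.modelgauge_runner import ModelGaugeSut

sut = ModelGaugeSut.WIZARDLM_13B
assert sut.instance_initialization() is None  # nothing cached yet, and instance(None) returns None

secrets = load_secrets_from_config()
first = sut.instance(secrets)
assert sut.instance(secrets) is first  # same cached object on repeat calls
print(sut.instance_initialization())  # the initialization record picked up by dump_json
```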
104 changes: 104 additions & 0 deletions src/modelbench/record.py
@@ -0,0 +1,104 @@
import json
import os
import pathlib
import platform
import subprocess
from datetime import datetime, timezone
from typing import Sequence

from modelgauge.tests.safe import SafeTest
import pydantic

from modelbench.benchmarks import BenchmarkScore, BenchmarkDefinition
from modelbench.hazards import HazardDefinition, HazardScore
from modelbench.modelgauge_runner import ModelGaugeSut
from modelbench.static_site_generator import StaticContent


def run_command(*args):
result = subprocess.run(args, capture_output=True)
return result.stdout.decode("utf-8").strip()


def benchmark_code_info():
try:
return {
"git_version": run_command("git", "--version"),
"origin": run_command("git", "config", "--get", "remote.origin.url"),
"code_version": run_command(
"git", "describe", "--tags", "--abbrev=8", "--always", "--long", "--match", "v*"
),
"changed_files": [
l.strip() for l in run_command("git", "status", "-s", "--untracked-files=no").splitlines()
],
}
except FileNotFoundError:
return {"error": "git command not found"}


def benchmark_metadata():
return {
"format_version": 1,
"run": {
"user": os.environ.get("USER", os.environ.get("USERNAME")),
"timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"),
"platform": platform.platform(),
"system": f"{platform.system()} {platform.release()} {platform.version()}",
"node": platform.node(),
"python": platform.python_version(),
},
"code": benchmark_code_info(),
}


def benchmark_run_record(score):
return {
"score": score,
"_metadata": benchmark_metadata(),
}


def dump_json(
json_path: pathlib.Path,
start_time: datetime.time,
benchmark: BenchmarkDefinition,
benchmark_scores: Sequence[BenchmarkScore],
):
with open(json_path, "w") as f:
output = {
"benchmark": (benchmark),
"run_uid": f"run-{benchmark.uid}-{start_time.strftime('%Y%m%d-%H%M%S')}",
"scores": (benchmark_scores),
"content": StaticContent(),
}
json.dump(output, f, cls=BenchmarkScoreEncoder, indent=4)


class BenchmarkScoreEncoder(json.JSONEncoder):

def default(self, o):
if isinstance(o, BenchmarkScore) or isinstance(o, HazardScore):
result = {}
result.update(o.__dict__)
result["numeric_grade"] = o.numeric_grade()
result["text_grade"] = o.text_grade()
if "benchmark_definition" in result:
del result["benchmark_definition"] # duplicated up the tree
return result
elif isinstance(o, BenchmarkDefinition):
return {"uid": o.uid, "hazards": o.hazards()}
elif isinstance(o, HazardDefinition):
return {"uid": o.uid, "tests": o._tests, "reference_standard": o.reference_standard()}
elif isinstance(o, SafeTest):
return o.uid
elif isinstance(o, ModelGaugeSut):
result = {"uid": o.key}
if o.instance_initialization():
result["initialization"] = o.instance_initialization()
return result
elif isinstance(o, pydantic.BaseModel):
return o.model_dump()
elif isinstance(o, datetime):
return str(o)
else:
return super().default(o)
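A small usage sketch of the helpers above; the values are machine-dependent, and the "code" block falls back to an error entry when git is not installed.

```python
# Hedged sketch: inspect the metadata that accompanies a benchmark run record.
from modelbench.record import benchmark_metadata, benchmark_run_record

metadata = benchmark_metadata()
print(metadata["run"]["platform"])  # host-dependent, e.g. a platform.platform() string
print(metadata["code"].get("code_version") or metadata["code"].get("error"))

# benchmark_run_record simply wraps a score object with that metadata.
record = benchmark_run_record({"placeholder": "score"})  # any JSON-serializable stand-in for a score
assert record["_metadata"]["format_version"] == 1
```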
35 changes: 17 additions & 18 deletions src/modelbench/run.py
@@ -31,6 +31,7 @@
)
from modelbench.hazards import HazardDefinition, HazardScore, STANDARDS
from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription
from modelbench.record import dump_json
from modelbench.static_site_generator import StaticContent, StaticSiteGenerator

_DEFAULT_SUTS = ModelGaugeSut
@@ -69,11 +70,7 @@ def cli() -> None:
)
@click.option("--max-instances", "-m", type=int, default=100)
@click.option("--debug", default=False, is_flag=True)
@click.option(
"--sut",
"-s",
multiple=True,
)
@click.option("sut_uids", "--sut", "-s", multiple=True, help="SUT uid(s) to run")
@click.option("--view-embed", default=False, is_flag=True, help="Render the HTML to be embedded in another view")
@click.option(
"--custom-branding",
@@ -83,28 +80,32 @@ def cli() -> None:
@click.option("--anonymize", type=int, help="Random number seed for consistent anonymization of SUTs")
@click.option("--parallel", default=False, help="Experimentally run SUTs in parallel")
@click.option(
"benchmark_name",
"--benchmark",
type=click.Choice([c.__name__ for c in BenchmarkDefinition.__subclasses__()]),
default=["GeneralPurposeAiChatBenchmark"],
default="GeneralPurposeAiChatBenchmark",
help="Benchmark to run (Default: GeneralPurposeAiChatBenchmark)",
multiple=True,
multiple=False,
)
@local_plugin_dir_option
def benchmark(
benchmark: str,
benchmark_name: str,
output_dir: pathlib.Path,
max_instances: int,
debug: bool,
sut: List[str],
sut_uids: List[str],
view_embed: bool,
custom_branding: Optional[pathlib.Path] = None,
anonymize=None,
parallel=False,
) -> None:
suts = find_suts_for_sut_argument(sut)
benchmarks = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ in benchmark]
benchmark_scores = score_benchmarks(benchmarks, suts, max_instances, debug, parallel)
start_time = datetime.now(timezone.utc)
suts = find_suts_for_sut_argument(sut_uids)
benchmark = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ == benchmark_name][0]
benchmark_scores = score_benchmarks([benchmark], suts, max_instances, debug, parallel)
generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding)
json_path = output_dir / f"benchmark_record-{benchmark.uid}.json"
dump_json(json_path, start_time, benchmark, benchmark_scores)


def find_suts_for_sut_argument(sut_args: List[str]):
@@ -120,7 +121,7 @@ def find_suts_for_sut_argument(sut_args: List[str]):
else:
all_sut_keys = registered_sut_keys.union(set(default_suts_by_key.keys()))
raise click.BadParameter(
f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x:x.lower())}",
f"Unknown key '{sut_arg}'. Valid options are {sorted(all_sut_keys, key=lambda x: x.lower())}",
param_hint="sut",
)

@@ -147,7 +148,7 @@ def score_benchmarks(benchmarks, suts, max_instances, debug, parallel=True):
def score_a_sut(benchmarks, max_instances, secrets, debug, sut):
sut_scores = []
echo(termcolor.colored(f'Examining system "{sut.display_name}"', "green"))
sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
sut_instance = sut.instance(secrets)
for benchmark_definition in benchmarks:
echo(termcolor.colored(f' Starting run for benchmark "{benchmark_definition.name()}"', "green"))
hazard_scores = []
@@ -268,7 +269,7 @@ def run_tests(
) -> Mapping[HazardDefinition, HazardScore]:
secrets = load_secrets_from_config()
result = {}
sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
sut_instance = sut.instance(secrets)
for hazard in hazards:
test_scores = {}
for test in hazard.tests(secrets=secrets):
@@ -286,9 +287,7 @@ def test_records_for_sut(sut: ModelGaugeSut, tests: Dict[str, FactoryEntry], dat

@retry(tries=3, delay=1) # to compensate for modelgauge/togetherai flakiness
def process_one():
result = run_prompt_response_test(
test, SUTS.make_instance(sut.key, secrets=secrets), data_dir, max_test_items=max_test_items
)
result = run_prompt_response_test(test, sut.instance(secrets), data_dir, max_test_items=max_test_items)
return test_id, result

try:
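For reference, a hedged sketch of driving the updated command through click's test runner; the SUT uid and instance count are illustrative, and a real run needs modelgauge secrets plus access to the SUT providers.

```python
# Hedged sketch: exercise the new --sut uid option and the single --benchmark choice.
from click.testing import CliRunner

from modelbench.run import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    ["benchmark", "--sut", "wizardlm-13b", "--benchmark", "GeneralPurposeAiChatBenchmark", "--max-instances", "5"],
)
print(result.exit_code)
# On success, the run record is written to <output-dir>/benchmark_record-<benchmark uid>.json.
```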
24 changes: 20 additions & 4 deletions tests/test_benchmark.py
@@ -1,11 +1,13 @@
import pathlib
from datetime import datetime
from typing import List, Mapping
from unittest.mock import MagicMock

import pytest
from modelgauge.base_test import BaseTest
from modelgauge.records import TestRecord
from modelgauge.secret_values import RawSecrets
from modelgauge.tests.safe import SafeTestResult, PersonaType, PersonaResult

from modelbench.benchmarks import (
BenchmarkScore,
@@ -111,13 +113,27 @@ def test_hazard_definition_basics(fake_secrets):


def test_hazard_score_basics():
bd = SafeCaeHazard()
a_score = bd.reference_standard()
hs = HazardScore(hazard_definition=bd, score=ValueEstimate.make(a_score, 50), test_scores={})
assert hs.hazard_definition == bd
hd = SafeCaeHazard()
a_score = hd.reference_standard()
hs = HazardScore(hazard_definition=hd, score=ValueEstimate.make(a_score, 50), test_scores={})
assert hs.hazard_definition == hd
assert hs.score.estimate == a_score


def test_hazard_score_test_scores():
hd = SafeCaeHazard()
mock_test_record = MagicMock()
frac_safe = 0.5
mock_test_record.result.to_instance.return_value = SafeTestResult(
persona_results={PersonaType.TYPICAL: PersonaResult(frac_safe=frac_safe, num_items=100)}
)
result = hd.score({"foo": mock_test_record})
print(result)
score_key = next(iter(result.test_scores))
assert score_key == hd.uid
assert result.test_scores[score_key].estimate == frac_safe


def test_modelgauge_sut_display_name_and_name():
assert ModelGaugeSut.LLAMA_2_7B.display_name == "Meta Llama 2, 7b parameters"
assert ModelGaugeSut.LLAMA_2_7B.name == "LLAMA_2_7B"