Use Modelgauge v0.6.1 (#437)
* Prepare to update to modelgauge v0.6.1

* Update to modelgauge v0.6.1 and other new dependencies.
wpietri authored Sep 5, 2024
1 parent 432829f commit 8e1c9fc
Showing 8 changed files with 738 additions and 685 deletions.
1,364 changes: 710 additions & 654 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -3,7 +3,7 @@ requires-python = ">=3.10, <3.13"
 
 [tool.poetry]
 name = "modelbench"
-version = "0.6.0"
+version = "0.6.1"
 description = "Run benchmarks and generate reports measuring the behavior of many AI Systems."
 license = "Apache-2.0"
 authors = ["MLCommons AI Safety <[email protected]>"]
@@ -47,7 +47,7 @@ packages = [
 
 [tool.poetry.dependencies]
 python = ">=3.10,<3.13"
-modelgauge = ">=0.6.0"
+modelgauge = ">=0.6.1"
 jq = "^1.6.0"
 click = "^8.1.7"
 casefy = "^0.1.7"
33 changes: 15 additions & 18 deletions src/modelbench/suts.py
@@ -77,22 +77,19 @@ def _register_required_suts():
 # Our main model runner, TogetherAI, deprecated a number of models due to lack of use circa 2024-08-29.
 # This is the original set of models used for the published v0.5 benchmark.
 #
-ORIGINAL_SUTS_FOR_V_0_5 = [
-    ModelGaugeSut.for_key(k)
-    for k in [
-        "alpaca-7b",
-        "chronos-hermes-13b",
-        "deepseek-67b",
-        "gemma-7b",
-        "llama-2-7b-chat",
-        "llama-2-13b-chat",
-        "llama-2-70b-chat",
-        "mistral-7b",
-        "mixtral-8x-7b",
-        "openchat-3_5",
-        "qwen-72b",
-        "stripedhyena-nous-7b",
-        "vicuna-13b",
-        "wizardlm-13b",
-    ]
+ORIGINAL_SUT_UIDS_FOR_V_0_5 = [
+    "alpaca-7b",
+    "chronos-hermes-13b",
+    "deepseek-67b",
+    "gemma-7b",
+    "llama-2-7b-chat",
+    "llama-2-13b-chat",
+    "llama-2-70b-chat",
+    "mistral-7b",
+    "mixtral-8x-7b",
+    "openchat-3_5",
+    "qwen-72b",
+    "stripedhyena-nous-7b",
+    "vicuna-13b",
+    "wizardlm-13b",
 ]
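
The constant now holds plain SUT UIDs rather than pre-constructed ModelGaugeSut objects. Callers that need the actual SUT objects can rebuild them with the same comprehension the deleted code ran inline; a minimal sketch (the call site is illustrative, not part of this diff):

from modelbench.suts import ModelGaugeSut, ORIGINAL_SUT_UIDS_FOR_V_0_5

# Construct the v0.5 SUT objects on demand from their UIDs.
original_suts = [ModelGaugeSut.for_key(uid) for uid in ORIGINAL_SUT_UIDS_FOR_V_0_5]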
2 changes: 1 addition & 1 deletion tests/templates/conftest.py
@@ -25,7 +25,7 @@ def _benchmark_score(end_time) -> BenchmarkScore:
     cbr_score = HazardScore(hazard_definition=cbr_hazard, score=ValueEstimate.make(0.5, 10), test_scores={})
     bs = BenchmarkScore(
         bd,
-        ModelGaugeSut.for_key("alpaca-7b"),
+        ModelGaugeSut.for_key("mistral-7b"),
         [cae_score, cbr_score],
         end_time,
     )
7 changes: 4 additions & 3 deletions tests/test_benchmark.py
@@ -7,7 +7,8 @@
 from modelgauge.base_test import BaseTest
 from modelgauge.records import TestRecord
 from modelgauge.secret_values import RawSecrets
-from modelgauge.tests.safe import SafeTestResult, SafePersonas, SafePersonasVersion1, PersonaResult
+from modelgauge.tests.safe import SafeTestResult, SafePersonas, PersonaResult
+from modelgauge.tests.safe_v1 import SafePersonasVersion1
 
 from modelbench.benchmarks import BenchmarkScore, GeneralPurposeAiChatBenchmark, GeneralPurposeAiChatBenchmarkV1
 from modelbench.hazards import (
@@ -70,7 +71,7 @@ def func(probability):
     hazard.reference_standard = lambda: 0.8
     bs = BenchmarkScore(
         benchmark,
-        ModelGaugeSut.for_key("alpaca-7b"),
+        ModelGaugeSut.for_key("mistral-7b"),
         [HazardScore(hazard_definition=hazard, score=ve, test_scores={})],
         datetime.fromtimestamp(1700000000),
     )
@@ -135,7 +136,7 @@ def test_hazard_v1_definition_basics(fake_secrets):
     tests = h.tests(secrets=fake_secrets)
     assert len(tests) == 1
     for t in tests:
-        assert t.__class__ == modelgauge.tests.safe.SafeTestVersion1
+        assert t.__class__ == modelgauge.tests.safe_v1.SafeTestVersion1
 
 
 @pytest.mark.parametrize("hazard", [SafeCaeHazard(), SafeDfmHazardV1()])
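
In modelgauge 0.6.1 the v1 safe-test classes (SafeTestVersion1, SafePersonasVersion1) moved from modelgauge.tests.safe into modelgauge.tests.safe_v1, which is what these hunks track. Code that has to run against both versions could guard the import; a sketch, assuming no other API differences between the two modules:

try:
    # modelgauge >= 0.6.1: v1 tests live in their own module.
    from modelgauge.tests.safe_v1 import SafePersonasVersion1, SafeTestVersion1
except ImportError:
    # modelgauge <= 0.6.0: v1 classes were still in modelgauge.tests.safe.
    from modelgauge.tests.safe import SafePersonasVersion1, SafeTestVersion1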
4 changes: 2 additions & 2 deletions tests/test_record.py
@@ -31,8 +31,8 @@ def encode_and_parse(o):
 
 
 def test_sut():
-    sut = ModelGaugeSut.for_key("alpaca-7b")
-    assert encode_and_parse(sut) == {"uid": "alpaca-7b"}
+    sut = ModelGaugeSut.for_key("mistral-7b")
+    assert encode_and_parse(sut) == {"uid": "mistral-7b"}
     sut.instance(MagicMock())
     with_initialization = encode_and_parse(sut)
     assert "uid" in with_initialization
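
The test relies on an uninitialized SUT serializing to nothing but its UID. A self-contained illustration of that round-trip idea, using a JSON-based stand-in (the real encode_and_parse helper and ModelGaugeSut internals are not shown in this diff, so this is an assumption for illustration only):

import json

def encode_and_parse(obj) -> dict:
    # Serialize via each object's __dict__, then parse back to plain data.
    return json.loads(json.dumps(obj, default=lambda o: o.__dict__))

class FakeSut:
    def __init__(self, uid: str):
        self.uid = uid

assert encode_and_parse(FakeSut("mistral-7b")) == {"uid": "mistral-7b"}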
2 changes: 1 addition & 1 deletion tests/test_run.py
@@ -56,7 +56,7 @@ def mock_score(self):
         benchmark = GeneralPurposeAiChatBenchmark()
         return BenchmarkScore(
             benchmark,
-            ModelGaugeSut.for_key("alpaca-7b"),
+            ModelGaugeSut.for_key("mistral-7b"),
             [
                 HazardScore(
                     hazard_definition=benchmark.hazards()[0], score=ValueEstimate.make(0.123456, 100), test_scores={}
7 changes: 3 additions & 4 deletions tests/test_static_site_generator.py
@@ -5,7 +5,6 @@
 from unittest.mock import patch
 
 import pytest
-from modelgauge.tests.safe import SafeTest, SafePersonas
 
 from modelbench.benchmarks import (
     BenchmarkDefinition,
@@ -23,7 +22,7 @@ def benchmark_score(end_time):
     bd = GeneralPurposeAiChatBenchmark()
     bs = BenchmarkScore(
         bd,
-        ModelGaugeSut.for_key("alpaca-7b"),
+        ModelGaugeSut.for_key("mistral-7b"),
         [
             HazardScore(
                 hazard_definition=SafeCaeHazard(),
@@ -66,7 +65,7 @@ def static_site_generator_view_embed():
         "static/style.css",
         "benchmarks.html",
         "general_purpose_ai_chat_benchmark.html",
-        "alpaca-7b_general_purpose_ai_chat_benchmark_report.html",
+        "mistral-7b_general_purpose_ai_chat_benchmark_report.html",
         "index.html",
     ],
 )
@@ -126,7 +125,7 @@ def benchmark_score(self):
         bh = SafeCaeHazard()
         bs = BenchmarkScore(
             bd,
-            ModelGaugeSut.for_key("alpaca-7b"),
+            ModelGaugeSut.for_key("mistral-7b"),
             [
                 HazardScore(
                     hazard_definition=bh, score=ValueEstimate.make(bh.reference_standard(), 50), test_scores={}
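
The expected filename follows the generator's <sut-uid>_<benchmark-name>_report.html convention, so swapping the SUT changes the page name as well. A hypothetical sketch of that naming rule (the helper name and signature are invented for illustration; the real logic lives in the static site generator):

def report_filename(sut_uid: str, benchmark_name: str) -> str:
    # Report pages are keyed by SUT UID plus benchmark name.
    return f"{sut_uid}_{benchmark_name}_report.html"

assert (
    report_filename("mistral-7b", "general_purpose_ai_chat_benchmark")
    == "mistral-7b_general_purpose_ai_chat_benchmark_report.html"
)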
