diff --git a/src/coffee/benchmark.py b/src/coffee/benchmark.py
index 0a68b60c..ac0e317f 100644
--- a/src/coffee/benchmark.py
+++ b/src/coffee/benchmark.py
@@ -3,7 +3,7 @@
 
 import casefy
 
-from coffee.helm import BbqHelmTest, HelmTest
+from coffee.helm_runner import BbqHelmTest, HelmTest
 
 
 class Benchmark(ABC):
diff --git a/src/coffee/helm_interface.py b/src/coffee/helm_interface.py
new file mode 100644
index 00000000..b6283a10
--- /dev/null
+++ b/src/coffee/helm_interface.py
@@ -0,0 +1,62 @@
+from itertools import product
+from typing import Iterable, TYPE_CHECKING
+
+from helm.benchmark.config_registry import (
+    register_builtin_configs_from_helm_package,
+)
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.huggingface_registration import (
+    register_huggingface_hub_model_from_flag_value,
+)
+from helm.benchmark.presentation.run_entry import RunEntry
+from helm.benchmark.run import run_entries_to_run_specs
+from helm.benchmark.runner import Runner
+from helm.common.authentication import Authentication
+
+if TYPE_CHECKING:
+    from helm_runner import HelmSut, HelmTest
+
+from helm.benchmark.runner import RunnerError
+
+
+def run_executions(
+    tests: Iterable["HelmTest"],
+    suts: Iterable["HelmSut"],
+    max_eval_instances: int = 10,
+    suite: str = "v1",
+    num_threads: int = 4,
+    benchmark_output_path: str = "run/benchmark_output",
+    prod_env_path: str = "run/prod_env",
+) -> None:
+    register_builtin_configs_from_helm_package()
+    for sut in suts:
+        if sut.huggingface:
+            register_huggingface_hub_model_from_flag_value(sut.key)
+    run_entries = []
+    for test, sut in product(tests, suts):
+        for runspec in test.runspecs():
+            run_entries.append(
+                RunEntry(
+                    description=f"{runspec},model={sut.key}", priority=1, groups=[]
+                )
+            )
+    run_specs = run_entries_to_run_specs(
+        run_entries, max_eval_instances=max_eval_instances
+    )
+    execution_spec = ExecutionSpec(
+        url=None,
+        auth=Authentication(""),
+        local_path=prod_env_path,
+        parallelism=num_threads,
+    )
+    runner = Runner(
+        execution_spec=execution_spec,
+        output_path=benchmark_output_path,
+        suite=suite,
+        skip_instances=False,
+        cache_instances=False,
+        cache_instances_only=False,
+        skip_completed_runs=False,
+        exit_on_error=False,
+    )
+    runner.run_all(run_specs)
diff --git a/src/coffee/helm.py b/src/coffee/helm_runner.py
similarity index 85%
rename from src/coffee/helm.py
rename to src/coffee/helm_runner.py
index c7c3c60e..265a5588 100644
--- a/src/coffee/helm.py
+++ b/src/coffee/helm_runner.py
@@ -12,6 +12,8 @@
 import jq
 
 import coffee
+from coffee.helm_interface import RunnerError, run_executions
+
 
 # This starts with a bunch of objects that represent things already in HELM code.
 # As we shift HELM to accommodate a library use case, it would be nice to compose
@@ -97,13 +99,11 @@ def __init__(
         tests: List[HelmTest],
         suts: List[HelmSut],
         output_dir: pathlib.Path,
-        execution_result: subprocess.CompletedProcess,
     ):
         super().__init__()
         self.tests = tests
         self.suts = suts
         self.output_dir = output_dir
-        self.execution_result = execution_result
 
     def load_scores(self):
         focus = self.output_dir / "benchmark_output" / "runs" / "v1"
@@ -130,12 +130,6 @@ def load_scores(self):
                 result.add(t, s, test_sut_scores)
         return result
 
-    def helm_stdout(self) -> str:
-        return self._deal_with_bytes(self.execution_result.stdout)
-
-    def helm_stderr(self) -> str:
-        return self._deal_with_bytes(self.execution_result.stderr)
-
     def _deal_with_bytes(self, o):
         if isinstance(o, bytes):
             result = o.decode("utf-8")
@@ -147,15 +141,38 @@ def _filesystem_safe(self, s: str):
         # reproducing some behavior in HELM; would be nice to remove duplication
        return re.sub("/", "_", s)
 
-    def success(self):
-        return self.execution_result and self.execution_result.returncode == 0
-
 
 class HelmRunner(ABC):
     @abstractmethod
-    def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
+    def run(self, tests: list[HelmTest], models: list[HelmSut], max_instances=10):
         pass
 
+    def _make_output_dir(self):
+        o = pathlib.Path.cwd()
+        if o.name in ["src", "test"]:
+            o = o.parent
+        if not o.name == "run":
+            o = o / "run"
+        o.mkdir(exist_ok=True)
+        return o
+
+
+class InProcessHelmRunner(HelmRunner):
+    def run(self, tests: list[HelmTest], suts: list[HelmSut], max_instances=10):
+        run_executions(
+            tests,
+            suts,
+            max_eval_instances=max_instances,
+            suite="v1",
+            num_threads=4,
+            benchmark_output_path="run/benchmark_output",
+            prod_env_path="run/prod_env",
+        )
+
+        output_dir = self._make_output_dir()
+
+        return HelmResult(tests, suts, output_dir)
+
 
 class CliHelmRunner(HelmRunner):
     def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
@@ -172,18 +189,18 @@ def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
         logging.debug(f"helm run command: {command}")
 
         output_dir = self._make_output_dir()
-        execute_result = self._execute(command, output_dir)
-        return HelmResult(tests, suts, output_dir, execute_result)
+        self._execute(command, output_dir)
+        return HelmResult(tests, suts, output_dir)
 
-    def _execute(
-        self, command: List[str], output_dir: pathlib.Path
-    ) -> subprocess.CompletedProcess:
+    def _execute(self, command: List[str], output_dir: pathlib.Path) -> None:
         if coffee.app_config.debug:
-            return self._run_with_debug_settings(command, output_dir)
+            result = self._run_with_debug_settings(command, output_dir)
         else:
-            return subprocess.run(
+            result = subprocess.run(
                 " ".join(command), shell=True, capture_output=True, cwd=output_dir
             )
+        if not result.returncode == 0:
+            raise RunnerError(result.stderr)
 
     def _run_with_debug_settings(self, command, output_dir):
         with subprocess.Popen(
@@ -195,16 +212,9 @@ def _run_with_debug_settings(self, command, output_dir):
         ) as sp:
             for line in sp.stdout:
                 logging.debug(line.decode().rstrip())
-        return subprocess.CompletedProcess(sp.args, sp.returncode, sp.stdout, sp.stderr)
-
-    def _make_output_dir(self):
-        o = pathlib.Path.cwd()
-        if o.name in ["src", "test"]:
-            o = o.parent
-        if not o.name == "run":
-            o = o / "run"
-        o.mkdir(exist_ok=True)
-        return o
+        if not sp.returncode == 0:
+            raise RunnerError(sp.stderr)
+        return sp
 
     def _helm_command_for_runspecs(
         self, bbq_runspecs, max_instances, huggingface_models=None
diff --git a/src/coffee/run.py b/src/coffee/run.py
index 12ae1942..10d2f152 100644
--- a/src/coffee/run.py
+++ b/src/coffee/run.py
@@ -5,7 +5,7 @@
 
 import coffee
 from coffee.benchmark import Benchmark, RidiculousBenchmark
-from coffee.helm import CliHelmRunner, HelmSut
+from coffee.helm_runner import HelmSut, InProcessHelmRunner
 from coffee.static_site_generator import StaticSiteGenerator
 
 
@@ -30,17 +30,9 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool) -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    runner = CliHelmRunner()
+    runner = InProcessHelmRunner()
     suts = [HelmSut.GPT2, HelmSut.PYTHIA_70M, HelmSut.FB_OPT_125M]
     result = runner.run(RidiculousBenchmark.tests(), suts, max_instances=max_instances)
-    if not result.success():
-        print(
-            f"HELM execution failed with return code {result.execution_result.returncode}:"
-        )
-        print("stdout:")
-        print(result.helm_stdout())
-        print("stderr:")
-        print(result.helm_stderr())
     scores = result.load_scores()
     benchmarks: list[Benchmark] = []
     for sut in suts:
diff --git a/src/coffee/static_site_generator.py b/src/coffee/static_site_generator.py
index aeb4e51f..20cf4ebe 100644
--- a/src/coffee/static_site_generator.py
+++ b/src/coffee/static_site_generator.py
@@ -2,7 +2,7 @@
 import pathlib
 import shutil
 from itertools import groupby
-from typing import Tuple
+from typing import Iterable, Iterator, Tuple
 
 from jinja2 import Environment, PackageLoader, select_autoescape
 
@@ -85,10 +85,10 @@ def _generate_index_page(
 
     def _grouped_benchmarks(self, benchmarks: list[Benchmark]) -> dict:
         benchmarks_dict = {}
-        for benchmark_name, grouped_benchmarks in groupby(
+        for benchmark_name, _grouped_benchmarks in groupby(
             benchmarks, lambda x: x.__class__.__name__
         ):
-            grouped_benchmarks = list(grouped_benchmarks)
+            grouped_benchmarks = list(_grouped_benchmarks)
             benchmarks_dict[grouped_benchmarks[0]] = grouped_benchmarks
         return benchmarks_dict
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 2c0d3c83..9e38a3bf 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -1,4 +1,4 @@
-from coffee.helm import HelmSut
+from coffee.helm_runner import HelmSut
 from coffee.benchmark import RidiculousBenchmark
 
 
diff --git a/tests/test_helm_interface.py b/tests/test_helm_interface.py
new file mode 100644
index 00000000..e79f9d09
--- /dev/null
+++ b/tests/test_helm_interface.py
@@ -0,0 +1,55 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from coffee import helm_interface
+from coffee.helm_runner import BbqHelmTest, HelmSut
+
+
+@pytest.fixture(autouse=True)
+def monkeypatch_runner(monkeypatch):
+    mock_obj = MagicMock()
+    monkeypatch.setattr(helm_interface, "Runner", mock_obj)
+    return mock_obj
+
+
+@pytest.fixture
+def monkeypatch_register_huggingface(monkeypatch):
+    mock_obj = MagicMock()
+    monkeypatch.setattr(
+        helm_interface,
+        "register_huggingface_hub_model_from_flag_value",
+        mock_obj,
+    )
+    return mock_obj
+
+
+@pytest.fixture
+def monkeypatch_run_entries_to_run_specs(monkeypatch):
+    mock_obj = MagicMock()
+    monkeypatch.setattr(helm_interface, "run_entries_to_run_specs", mock_obj)
+    return mock_obj
+
+
+def test_run_executions_registers_huggingface(
+    monkeypatch, monkeypatch_register_huggingface, monkeypatch_run_entries_to_run_specs
+):
+    # have to monkeypatch run_entries_to_runspecs since we can't register due to monkeypatching
+    # register_huggingface_hub_model_from_flag_value
+
+    helm_interface.run_executions([BbqHelmTest()], [HelmSut.FB_OPT_125M, HelmSut.GPT2])
+    monkeypatch_register_huggingface.assert_called_once_with("facebook/opt-125m")
+
+
+@pytest.mark.parametrize(
+    "tests, suts, expected",
+    [
+        ([BbqHelmTest()], [HelmSut.FB_OPT_125M, HelmSut.GPT2], 20),
+        ([BbqHelmTest()], [HelmSut.GPT2], 10),
+    ],
+)
+def test_generates_correct_number_runspecs(
+    monkeypatch, monkeypatch_run_entries_to_run_specs, tests, suts, expected
+):
+    helm_interface.run_executions(tests, suts)
+    assert len(monkeypatch_run_entries_to_run_specs.call_args[0][0]) == expected
diff --git a/tests/test_helm_runner.py b/tests/test_helm_runner.py
index 46a29c42..742ae6b0 100644
--- a/tests/test_helm_runner.py
+++ b/tests/test_helm_runner.py
@@ -6,7 +6,13 @@
 import pytest
 
 from coffee.run import quantize_stars
-from coffee.helm import HelmSut, BbqHelmTest, HelmResult, CliHelmRunner
+from coffee.helm_runner import (
+    HelmSut,
+    BbqHelmTest,
+    HelmResult,
+    CliHelmRunner,
+    InProcessHelmRunner,
+)
 from coffee.benchmark import RidiculousBenchmark
 
 
@@ -35,9 +41,13 @@ def test_cli_helm_runner_command_handles_huggingface_models(cwd_tmpdir):
     assert shell_arguments[enables[0] + 2] == HelmSut.PYTHIA_70M.key
 
 
+def test_inprocess_helm_runner(cwd_tmpdir):
+    pass
+
+
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
 def test_read_scores(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     sut_scores = scores.for_sut(HelmSut.GPT2)
     assert "BbqHelmTest" in sut_scores
@@ -47,7 +57,7 @@ def test_read_scores(datafiles):
 
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
 def test_ridiculous_benchmark(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     b = RidiculousBenchmark(HelmSut.GPT2, scores.for_sut(HelmSut.GPT2))
     assert 2.25 == pytest.approx(b.overall_score())
diff --git a/tests/test_static_site_generator.py b/tests/test_static_site_generator.py
index 9d36c4a0..e1d7b390 100644
--- a/tests/test_static_site_generator.py
+++ b/tests/test_static_site_generator.py
@@ -4,14 +4,14 @@
 
 import pytest
 
-from coffee.helm import HelmResult, BbqHelmTest, HelmSut
+from coffee.helm_runner import HelmResult, BbqHelmTest, HelmSut
 from coffee.benchmark import RidiculousBenchmark
 from coffee.static_site_generator import StaticSiteGenerator
 
 
 @pytest.fixture()
 def benchmark(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     b = RidiculousBenchmark(HelmSut.GPT2, scores.for_sut(HelmSut.GPT2))
     return b
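
Net effect of the diff: run.py now builds an InProcessHelmRunner, which drives HELM through helm_interface.run_executions; failures surface as a raised RunnerError instead of a CompletedProcess and success()/helm_stdout()/helm_stderr() on HelmResult. A minimal sketch of the new calling pattern, mirroring the updated src/coffee/run.py (the SUT list and max_instances value here are illustrative, not prescribed by the change):

    from coffee.benchmark import RidiculousBenchmark
    from coffee.helm_runner import HelmSut, InProcessHelmRunner

    # Runs HELM in-process; errors now arrive as exceptions (RunnerError)
    # rather than as an execution_result stored on HelmResult.
    runner = InProcessHelmRunner()
    suts = [HelmSut.GPT2, HelmSut.FB_OPT_125M]
    result = runner.run(RidiculousBenchmark.tests(), suts, max_instances=10)
    scores = result.load_scores()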