Commit 0619e4b

initial helm interface and in-process runner; remove assumptions that a runner will use subprocess; stop clobbering helm package namespace; tests

dhosterman committed Jan 17, 2024
1 parent 327e01a commit 0619e4b
Showing 9 changed files with 178 additions and 49 deletions.
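
For orientation, here is a minimal usage sketch of the new in-process path, using only names that appear in this commit (the CLI wiring in src/coffee/run.py below does essentially the same thing). This is a sketch, not part of the diff:

    # Minimal usage sketch (assumes the classes defined in this commit).
    # HELM now executes inside this Python process rather than in a
    # helm-run subprocess.
    from coffee.benchmark import RidiculousBenchmark
    from coffee.helm_runner import HelmSut, InProcessHelmRunner

    runner = InProcessHelmRunner()
    suts = [HelmSut.GPT2, HelmSut.FB_OPT_125M]
    result = runner.run(RidiculousBenchmark.tests(), suts, max_instances=10)
    scores = result.load_scores()  # reads run/benchmark_output/runs/v1
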
2 changes: 1 addition & 1 deletion src/coffee/benchmark.py

@@ -3,7 +3,7 @@
 
 import casefy
 
-from coffee.helm import BbqHelmTest, HelmTest
+from coffee.helm_runner import BbqHelmTest, HelmTest
 
 
 class Benchmark(ABC):
62 changes: 62 additions & 0 deletions src/coffee/helm_interface.py

@@ -0,0 +1,62 @@
from itertools import product
from typing import Iterable, TYPE_CHECKING

from helm.benchmark.config_registry import (
    register_builtin_configs_from_helm_package,
)
from helm.benchmark.executor import ExecutionSpec
from helm.benchmark.huggingface_registration import (
    register_huggingface_hub_model_from_flag_value,
)
from helm.benchmark.presentation.run_entry import RunEntry
from helm.benchmark.run import run_entries_to_run_specs
from helm.benchmark.runner import Runner
from helm.common.authentication import Authentication

if TYPE_CHECKING:
    from coffee.helm_runner import HelmSut, HelmTest

from helm.benchmark.runner import RunnerError  # re-exported for coffee.helm_runner


def run_executions(
    tests: Iterable["HelmTest"],
    suts: Iterable["HelmSut"],
    max_eval_instances: int = 10,
    suite: str = "v1",
    num_threads: int = 4,
    benchmark_output_path: str = "run/benchmark_output",
    prod_env_path: str = "run/prod_env",
) -> None:
    register_builtin_configs_from_helm_package()
    for sut in suts:
        if sut.huggingface:
            register_huggingface_hub_model_from_flag_value(sut.key)
    run_entries = []
    for test, sut in product(tests, suts):
        for runspec in test.runspecs():
            run_entries.append(
                RunEntry(
                    description=f"{runspec},model={sut.key}", priority=1, groups=[]
                )
            )
    run_specs = run_entries_to_run_specs(
        run_entries, max_eval_instances=max_eval_instances
    )
    execution_spec = ExecutionSpec(
        url=None,
        auth=Authentication(""),
        local_path=prod_env_path,
        parallelism=num_threads,
    )
    runner = Runner(
        execution_spec=execution_spec,
        output_path=benchmark_output_path,
        suite=suite,
        skip_instances=False,
        cache_instances=False,
        cache_instances_only=False,
        skip_completed_runs=False,
        exit_on_error=False,
    )
    runner.run_all(run_specs)
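
A note on the fan-out above: run_executions pairs every runspec of every test with every SUT, so the number of RunEntry objects is len(runspecs) * len(suts), which is exactly what tests/test_helm_interface.py below asserts. A small illustration follows; the runspec strings and the second key are hypothetical samples, since only "facebook/opt-125m" appears in this commit:

    # Illustration of the RunEntry description fan-out (not part of the commit).
    from itertools import product

    runspecs = ["bbq:subject=Age", "bbq:subject=Gender_identity"]  # hypothetical
    sut_keys = ["facebook/opt-125m", "some-org/some-model"]  # example HelmSut.key values
    descriptions = [f"{r},model={k}" for r, k in product(runspecs, sut_keys)]
    # 2 runspecs x 2 SUTs -> 4 entries, e.g. "bbq:subject=Age,model=facebook/opt-125m"
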
68 changes: 39 additions & 29 deletions src/coffee/helm.py → src/coffee/helm_runner.py

@@ -12,6 +12,8 @@
 import jq
 
 import coffee
+from coffee.helm_interface import RunnerError, run_executions
+
 
 # This starts with a bunch of objects that represent things already in HELM code.
 # As we shift HELM to accommodate a library use case, it would be nice to compose
@@ -97,13 +99,11 @@ def __init__(
         tests: List[HelmTest],
         suts: List[HelmSut],
         output_dir: pathlib.Path,
-        execution_result: subprocess.CompletedProcess,
     ):
         super().__init__()
         self.tests = tests
         self.suts = suts
         self.output_dir = output_dir
-        self.execution_result = execution_result
 
     def load_scores(self):
         focus = self.output_dir / "benchmark_output" / "runs" / "v1"
@@ -130,12 +130,6 @@ def load_scores(self):
                 result.add(t, s, test_sut_scores)
         return result
 
-    def helm_stdout(self) -> str:
-        return self._deal_with_bytes(self.execution_result.stdout)
-
-    def helm_stderr(self) -> str:
-        return self._deal_with_bytes(self.execution_result.stderr)
-
     def _deal_with_bytes(self, o):
         if isinstance(o, bytes):
             result = o.decode("utf-8")
@@ -147,15 +141,38 @@ def _filesystem_safe(self, s: str):
         # reproducing some behavior in HELM; would be nice to remove duplication
         return re.sub("/", "_", s)
 
-    def success(self):
-        return self.execution_result and self.execution_result.returncode == 0
-
 
 class HelmRunner(ABC):
     @abstractmethod
-    def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
+    def run(self, tests: list[HelmTest], models: list[HelmSut], max_instances=10):
         pass
 
+    def _make_output_dir(self):
+        o = pathlib.Path.cwd()
+        if o.name in ["src", "test"]:
+            o = o.parent
+        if not o.name == "run":
+            o = o / "run"
+        o.mkdir(exist_ok=True)
+        return o
+
+
+class InProcessHelmRunner(HelmRunner):
+    def run(self, tests: list[HelmTest], suts: list[HelmSut], max_instances=10):
+        run_executions(
+            tests,
+            suts,
+            max_eval_instances=max_instances,
+            suite="v1",
+            num_threads=4,
+            benchmark_output_path="run/benchmark_output",
+            prod_env_path="run/prod_env",
+        )
+
+        output_dir = self._make_output_dir()
+
+        return HelmResult(tests, suts, output_dir)
 
 
 class CliHelmRunner(HelmRunner):
     def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
@@ -172,18 +189,18 @@ def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
         logging.debug(f"helm run command: {command}")
 
         output_dir = self._make_output_dir()
-        execute_result = self._execute(command, output_dir)
-        return HelmResult(tests, suts, output_dir, execute_result)
+        self._execute(command, output_dir)
+        return HelmResult(tests, suts, output_dir)
 
-    def _execute(
-        self, command: List[str], output_dir: pathlib.Path
-    ) -> subprocess.CompletedProcess:
+    def _execute(self, command: List[str], output_dir: pathlib.Path) -> None:
         if coffee.app_config.debug:
-            return self._run_with_debug_settings(command, output_dir)
+            result = self._run_with_debug_settings(command, output_dir)
         else:
-            return subprocess.run(
+            result = subprocess.run(
                 " ".join(command), shell=True, capture_output=True, cwd=output_dir
             )
+        if not result.returncode == 0:
+            raise RunnerError(result.stderr)
 
     def _run_with_debug_settings(self, command, output_dir):
         with subprocess.Popen(
@@ -195,16 +212,9 @@ def _run_with_debug_settings(self, command, output_dir):
         ) as sp:
             for line in sp.stdout:
                 logging.debug(line.decode().rstrip())
-        return subprocess.CompletedProcess(sp.args, sp.returncode, sp.stdout, sp.stderr)
-
-    def _make_output_dir(self):
-        o = pathlib.Path.cwd()
-        if o.name in ["src", "test"]:
-            o = o.parent
-        if not o.name == "run":
-            o = o / "run"
-        o.mkdir(exist_ok=True)
-        return o
+        if not sp.returncode == 0:
+            raise RunnerError(sp.stderr)
+        return sp
 
     def _helm_command_for_runspecs(
         self, bbq_runspecs, max_instances, huggingface_models=None
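
With these changes, a failed HELM invocation surfaces as a raised RunnerError instead of a CompletedProcess the caller must remember to inspect; HelmResult.success(), helm_stdout(), and helm_stderr() go away accordingly. A hedged sketch of caller-side handling under the new contract (hypothetical caller, not part of this commit):

    # Hypothetical caller: failures now arrive as exceptions.
    import logging

    from helm.benchmark.runner import RunnerError

    from coffee.benchmark import RidiculousBenchmark
    from coffee.helm_runner import HelmSut, InProcessHelmRunner

    try:
        result = InProcessHelmRunner().run(
            RidiculousBenchmark.tests(), [HelmSut.GPT2], max_instances=10
        )
    except RunnerError as err:
        logging.error(f"HELM execution failed: {err}")
        raise
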
12 changes: 2 additions & 10 deletions src/coffee/run.py

@@ -5,7 +5,7 @@
 
 import coffee
 from coffee.benchmark import Benchmark, RidiculousBenchmark
-from coffee.helm import CliHelmRunner, HelmSut
+from coffee.helm_runner import HelmSut, InProcessHelmRunner
 from coffee.static_site_generator import StaticSiteGenerator
 
 
@@ -30,17 +30,9 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool) -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    runner = CliHelmRunner()
+    runner = InProcessHelmRunner()
     suts = [HelmSut.GPT2, HelmSut.PYTHIA_70M, HelmSut.FB_OPT_125M]
     result = runner.run(RidiculousBenchmark.tests(), suts, max_instances=max_instances)
-    if not result.success():
-        print(
-            f"HELM execution failed with return code {result.execution_result.returncode}:"
-        )
-        print("stdout:")
-        print(result.helm_stdout())
-        print("stderr:")
-        print(result.helm_stderr())
     scores = result.load_scores()
     benchmarks: list[Benchmark] = []
     for sut in suts:
6 changes: 3 additions & 3 deletions src/coffee/static_site_generator.py

@@ -2,7 +2,7 @@
 import pathlib
 import shutil
 from itertools import groupby
-from typing import Tuple
+from typing import Iterable, Iterator, Tuple
 
 from jinja2 import Environment, PackageLoader, select_autoescape
 
@@ -85,10 +85,10 @@ def _generate_index_page(
 
     def _grouped_benchmarks(self, benchmarks: list[Benchmark]) -> dict:
         benchmarks_dict = {}
-        for benchmark_name, grouped_benchmarks in groupby(
+        for benchmark_name, _grouped_benchmarks in groupby(
             benchmarks, lambda x: x.__class__.__name__
         ):
-            grouped_benchmarks = list(grouped_benchmarks)
+            grouped_benchmarks = list(_grouped_benchmarks)
             benchmarks_dict[grouped_benchmarks[0]] = grouped_benchmarks
         return benchmarks_dict
2 changes: 1 addition & 1 deletion tests/test_benchmark.py

@@ -1,4 +1,4 @@
-from coffee.helm import HelmSut
+from coffee.helm_runner import HelmSut
 from coffee.benchmark import RidiculousBenchmark
 
 
55 changes: 55 additions & 0 deletions tests/test_helm_interface.py

@@ -0,0 +1,55 @@
from unittest.mock import MagicMock

import pytest

from coffee import helm_interface
from coffee.helm_runner import BbqHelmTest, HelmSut


@pytest.fixture(autouse=True)
def monkeypatch_runner(monkeypatch):
    mock_obj = MagicMock()
    monkeypatch.setattr(helm_interface, "Runner", mock_obj)
    return mock_obj


@pytest.fixture
def monkeypatch_register_huggingface(monkeypatch):
    mock_obj = MagicMock()
    monkeypatch.setattr(
        helm_interface,
        "register_huggingface_hub_model_from_flag_value",
        mock_obj,
    )
    return mock_obj


@pytest.fixture
def monkeypatch_run_entries_to_run_specs(monkeypatch):
    mock_obj = MagicMock()
    monkeypatch.setattr(helm_interface, "run_entries_to_run_specs", mock_obj)
    return mock_obj


def test_run_executions_registers_huggingface(
    monkeypatch, monkeypatch_register_huggingface, monkeypatch_run_entries_to_run_specs
):
    # run_entries_to_run_specs must also be monkeypatched: with
    # register_huggingface_hub_model_from_flag_value mocked, the models are
    # never actually registered, so building real run specs would fail.
    helm_interface.run_executions([BbqHelmTest()], [HelmSut.FB_OPT_125M, HelmSut.GPT2])
    monkeypatch_register_huggingface.assert_called_once_with("facebook/opt-125m")


@pytest.mark.parametrize(
    "tests, suts, expected",
    [
        ([BbqHelmTest()], [HelmSut.FB_OPT_125M, HelmSut.GPT2], 20),
        ([BbqHelmTest()], [HelmSut.GPT2], 10),
    ],
)
def test_generates_correct_number_runspecs(
    monkeypatch, monkeypatch_run_entries_to_run_specs, tests, suts, expected
):
    helm_interface.run_executions(tests, suts)
    assert len(monkeypatch_run_entries_to_run_specs.call_args[0][0]) == expected
16 changes: 13 additions & 3 deletions tests/test_helm_runner.py

@@ -6,7 +6,13 @@
 import pytest
 
 from coffee.run import quantize_stars
-from coffee.helm import HelmSut, BbqHelmTest, HelmResult, CliHelmRunner
+from coffee.helm_runner import (
+    HelmSut,
+    BbqHelmTest,
+    HelmResult,
+    CliHelmRunner,
+    InProcessHelmRunner,
+)
 from coffee.benchmark import RidiculousBenchmark
 
 
@@ -35,9 +41,13 @@ def test_cli_helm_runner_command_handles_huggingface_models(cwd_tmpdir):
     assert shell_arguments[enables[0] + 2] == HelmSut.PYTHIA_70M.key
 
 
+def test_inprocess_helm_runner(cwd_tmpdir):
+    pass
+
+
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
 def test_read_scores(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     sut_scores = scores.for_sut(HelmSut.GPT2)
     assert "BbqHelmTest" in sut_scores
@@ -47,7 +57,7 @@ def test_read_scores(datafiles):
 
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
 def test_ridiculous_benchmark(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     b = RidiculousBenchmark(HelmSut.GPT2, scores.for_sut(HelmSut.GPT2))
     assert 2.25 == pytest.approx(b.overall_score())
4 changes: 2 additions & 2 deletions tests/test_static_site_generator.py

@@ -4,14 +4,14 @@
 
 import pytest
 
-from coffee.helm import HelmResult, BbqHelmTest, HelmSut
+from coffee.helm_runner import HelmResult, BbqHelmTest, HelmSut
 from coffee.benchmark import RidiculousBenchmark
 from coffee.static_site_generator import StaticSiteGenerator
 
 
 @pytest.fixture()
 def benchmark(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     b = RidiculousBenchmark(HelmSut.GPT2, scores.for_sut(HelmSut.GPT2))
     return b
