Skip to content

Commit

Permalink
In process helm runner 2 (#58)
Browse files Browse the repository at this point in the history
* rename helm module to helm_runner to prevent clobbering the helm library in the namespace; add helm_interface which includes an InProcessHelmRunner

* run black

* sorting/optimizing imports

* import from helm_runner
  • Loading branch information
dhosterman authored Feb 2, 2024
1 parent d3deae2 commit 2c4e01d
Show file tree
Hide file tree
Showing 11 changed files with 185 additions and 33 deletions.
2 changes: 1 addition & 1 deletion src/coffee/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import casefy

from coffee.helm import BbqHelmTest, HelmTest, RealToxicityPromptsHelmTest, HelmSut
from coffee.helm_runner import BbqHelmTest, HelmSut, HelmTest, RealToxicityPromptsHelmTest


def float_to_stars(value: float) -> float:
Expand Down
86 changes: 86 additions & 0 deletions src/coffee/helm_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import subprocess
from typing import Iterable

import helm.benchmark.run_specs
from helm.benchmark.config_registry import (
register_builtin_configs_from_helm_package,
)
from helm.benchmark.executor import ExecutionSpec
from helm.benchmark.huggingface_registration import (
register_huggingface_hub_model_from_flag_value,
)
from helm.benchmark.model_deployment_registry import (
ClientSpec,
ModelDeployment,
register_model_deployment,
)
from helm.benchmark.presentation.run_entry import RunEntry
from helm.benchmark.run import run_entries_to_run_specs
from helm.benchmark.runner import Runner
from helm.common.authentication import Authentication

from coffee.helm_runner import HelmResult, HelmRunner, HelmSut, HelmTest

# Import-time side effect: sets the INCLUDE_GENERATIVE_HARMS_METRICS flag on HELM's
# run_specs module to True as soon as this module is imported.
# NOTE(review): presumably this makes HELM add its generative-harms metrics to run specs — confirm against HELM.
helm.benchmark.run_specs.INCLUDE_GENERATIVE_HARMS_METRICS = True


class InProcessHelmRunner(HelmRunner):
    """Run HELM inside the current Python process instead of through a CLI subprocess."""

    def run(self, tests: list[HelmTest], suts: list[HelmSut], max_instances=10):
        """Run every test against every SUT and wrap the outcome in a ``HelmResult``.

        Args:
            tests: HELM tests to execute.
            suts: Systems under test to execute them against.
            max_instances: Forwarded to HELM as ``max_eval_instances``.

        Returns:
            A ``HelmResult`` built from the tests, the SUTs, the output directory
            and a placeholder ``subprocess.CompletedProcess``.
        """
        self._execute(
            tests,
            suts,
            max_eval_instances=max_instances,
            suite="v1",
            num_threads=4,
            benchmark_output_path="run/benchmark_output",
            prod_env_path="run/prod_env",
        )

        output_dir = self._make_output_dir()

        # HACK: HelmResult is still handed a subprocess.CompletedProcess (as the CLI
        # runner does). Build an empty, successful one directly rather than spawning
        # a shell on an empty command just to obtain the same object.
        execution_result = subprocess.CompletedProcess(args="", returncode=0, stdout=b"", stderr=b"")

        return HelmResult(tests, suts, output_dir, execution_result)

    def _execute(
        self,
        tests: Iterable["HelmTest"],
        suts: Iterable["HelmSut"],
        max_eval_instances: int = 10,
        suite: str = "v1",
        num_threads: int = 1,
        benchmark_output_path: str = "run/benchmark_output",
        prod_env_path: str = "run/prod_env",
    ) -> None:
        """Register the SUTs with HELM, build the run specs and run them all.

        Args:
            tests: Tests whose runspecs are combined with each SUT.
            suts: Systems under test; Hugging Face ones are registered with HELM first.
            max_eval_instances: Passed to ``run_entries_to_run_specs``.
            suite: Suite name handed to the HELM ``Runner``.
            num_threads: Used as the ``parallelism`` of the ``ExecutionSpec``.
            benchmark_output_path: ``output_path`` handed to the HELM ``Runner``.
            prod_env_path: ``local_path`` of the ``ExecutionSpec``.
        """
        # Both collections are walked more than once (the loop below, then the nested
        # loops in _build_runspecs), so snapshot them in case a one-shot iterator
        # was passed; otherwise later passes would silently see nothing.
        tests = list(tests)
        suts = list(suts)

        register_builtin_configs_from_helm_package()
        for sut in suts:
            if sut.huggingface:
                register_huggingface_hub_model_from_flag_value(sut.key)
                model_deployment = ModelDeployment(
                    name=sut.key,
                    tokenizer_name=sut.tokenizer_name,
                    max_sequence_length=sut.tokenizer_max_length,
                    client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
                )
                register_model_deployment(model_deployment)

        run_entries = [RunEntry(r, 1, list()) for r in self._build_runspecs(suts, tests)]
        run_specs = run_entries_to_run_specs(run_entries, max_eval_instances=max_eval_instances)
        execution_spec = ExecutionSpec(
            url=None,
            auth=Authentication(""),
            local_path=prod_env_path,
            parallelism=num_threads,
        )
        runner = Runner(
            execution_spec=execution_spec,
            output_path=benchmark_output_path,
            suite=suite,
            skip_instances=False,
            cache_instances=False,
            cache_instances_only=False,
            skip_completed_runs=False,
            exit_on_error=False,
        )
        runner.run_all(run_specs)
42 changes: 21 additions & 21 deletions src/coffee/helm.py → src/coffee/helm_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,27 @@ class HelmRunner(ABC):
def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
pass

def _build_runspecs(self, suts, tests):
runspecs = []
for s in suts:
for t in tests:
for r in t.runspecs():
if ":" in r:
separator = ","
else:
separator = ":"
runspecs.append(r + separator + "model=" + s.key)
return runspecs

def _make_output_dir(self):
o = pathlib.Path.cwd()
if o.name in ["src", "test"]:
o = o.parent
if not o.name == "run":
o = o / "run"
o.mkdir(exist_ok=True)
return o


class CliHelmRunner(HelmRunner):
def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
Expand All @@ -209,18 +230,6 @@ def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
execute_result = self._execute(command, output_dir)
return HelmResult(tests, suts, output_dir, execute_result)

def _build_runspecs(self, suts, tests):
runspecs = []
for s in suts:
for t in tests:
for r in t.runspecs():
if ":" in r:
separator = ","
else:
separator = ":"
runspecs.append(r + separator + "model=" + s.key)
return runspecs

def _execute(self, command: List[str], output_dir: pathlib.Path) -> subprocess.CompletedProcess:
if coffee.app_config.debug:
return self._run_with_debug_settings(command, output_dir)
Expand All @@ -239,15 +248,6 @@ def _run_with_debug_settings(self, command, output_dir):
logging.debug(line.decode().rstrip())
return subprocess.CompletedProcess(sp.args, sp.returncode, sp.stdout, sp.stderr)

def _make_output_dir(self):
o = pathlib.Path.cwd()
if o.name in ["src", "test"]:
o = o.parent
if not o.name == "run":
o = o / "run"
o.mkdir(exist_ok=True)
return o

def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
command = ["python " + str(pathlib.Path(__file__).parent.parent / "dubious_helm_cli_wrapper.py")]
command.extend(["--suite", "v1"]) # this is a fixed string for now, which is probably wrong
Expand Down
7 changes: 2 additions & 5 deletions src/coffee/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@
import termcolor

import coffee
from coffee.benchmark import (
GeneralChatBotBenchmarkDefinition,
BenchmarkScore,
)
from coffee.helm import HelmSut, CliHelmRunner, HelmResult
from coffee.benchmark import BenchmarkScore, GeneralChatBotBenchmarkDefinition
from coffee.helm_runner import CliHelmRunner, HelmResult, HelmSut
from coffee.static_site_generator import StaticSiteGenerator


Expand Down
1 change: 0 additions & 1 deletion src/coffee/static_site_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import pathlib
import shutil
from itertools import groupby
from typing import Iterator

from jinja2 import Environment, PackageLoader, select_autoescape
from markupsafe import Markup
Expand Down
2 changes: 1 addition & 1 deletion src/dubious_helm_cli_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from helm.benchmark.run import main
import helm.benchmark.run_specs
from helm.benchmark.run import main

helm.benchmark.run_specs.INCLUDE_GENERATIVE_HARMS_METRICS = True
if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion tests/templates/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from jinja2 import Environment, FileSystemLoader

from coffee.helm import HelmSut
from coffee.helm_runner import HelmSut
from coffee.benchmark import (
GeneralChatBotBenchmarkDefinition,
BiasHarmDefinition,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
ToxicityHarmDefinition,
quantize_stars,
)
from coffee.helm import HelmSut, BbqHelmTest, HelmResult
from coffee.helm_runner import HelmSut, BbqHelmTest, HelmResult

SIMPLE_BBQ_DATA = pathlib.Path(__file__).parent / "data/full_runs/simple_bbq"
SIMPLE_TOXICITY_DATA = pathlib.Path(__file__).parent / "data/full_runs/toxicity"
Expand Down
70 changes: 70 additions & 0 deletions tests/test_helm_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from unittest.mock import MagicMock

import pytest

from coffee import helm_interface
from coffee.helm_runner import BbqHelmTest, HelmSut


@pytest.fixture(autouse=True)
def monkeypatch_run_all(monkeypatch):
    """Swap ``Runner.run_all`` for a mock in every test and hand the mock back."""
    run_all_mock = MagicMock()
    monkeypatch.setattr(helm_interface.Runner, "run_all", run_all_mock)
    return run_all_mock


@pytest.fixture(autouse=True)
def monkeypatch_run_one(monkeypatch):
    """Swap ``Runner.run_one`` for a mock in every test and hand the mock back."""
    run_one_mock = MagicMock()
    monkeypatch.setattr(helm_interface.Runner, "run_one", run_one_mock)
    return run_one_mock


@pytest.fixture
def monkeypatch_register_huggingface(monkeypatch):
    """Replace the Hugging Face registration helper in ``helm_interface`` with a mock."""
    register_mock = MagicMock()
    target_name = "register_huggingface_hub_model_from_flag_value"
    monkeypatch.setattr(helm_interface, target_name, register_mock)
    return register_mock


@pytest.fixture
def monkeypatch_run_entries_to_run_specs(monkeypatch):
    """Replace ``helm_interface.run_entries_to_run_specs`` with a mock and return it."""
    to_run_specs_mock = MagicMock()
    monkeypatch.setattr(helm_interface, "run_entries_to_run_specs", to_run_specs_mock)
    return to_run_specs_mock


def test_run_executions_registers_huggingface(
    monkeypatch, monkeypatch_register_huggingface, monkeypatch_run_entries_to_run_specs
):
    """The registration helper is called exactly once, with the facebook/opt-125m key."""
    # run_entries_to_run_specs has to be mocked too: with
    # register_huggingface_hub_model_from_flag_value mocked out, nothing is really registered.
    suts = [HelmSut.FB_OPT_125M, HelmSut.GPT2]
    in_process_runner = helm_interface.InProcessHelmRunner()

    in_process_runner.run([BbqHelmTest()], suts)

    monkeypatch_register_huggingface.assert_called_once_with("facebook/opt-125m")


@pytest.mark.parametrize(
    "tests, suts, expected",
    [
        ([BbqHelmTest()], [HelmSut.FB_OPT_125M, HelmSut.GPT2], 20),
        ([BbqHelmTest()], [HelmSut.GPT2], 10),
    ],
)
def test_generates_correct_number_runspecs(monkeypatch, monkeypatch_run_entries_to_run_specs, tests, suts, expected):
    """The run-entry list handed to ``run_entries_to_run_specs`` has the expected length."""
    helm_interface.InProcessHelmRunner().run(tests, suts)

    # First positional argument of the recorded call is the list of run entries.
    positional_args = monkeypatch_run_entries_to_run_specs.call_args[0]
    run_entries = positional_args[0]
    assert len(run_entries) == expected


def test_runs_run_all(monkeypatch, monkeypatch_run_all):
    """A single ``run`` call invokes the mocked ``Runner.run_all`` exactly once."""
    in_process_runner = helm_interface.InProcessHelmRunner()
    tests, suts = [BbqHelmTest()], [HelmSut.GPT2]

    in_process_runner.run(tests, suts)

    monkeypatch_run_all.assert_called_once()
2 changes: 1 addition & 1 deletion tests/test_helm_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import pytest

from coffee.helm import (
from coffee.helm_runner import (
HelmSut,
BbqHelmTest,
HelmResult,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_static_site_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pytest

from coffee.helm import HelmSut
from coffee.helm_runner import HelmSut
from coffee.benchmark import (
GeneralChatBotBenchmarkDefinition,
BiasHarmDefinition,
Expand Down

0 comments on commit 2c4e01d

Please sign in to comment.