In process helm runner 2 (#58)

* rename helm module to helm_runner to prevent clobbering the helm library in the namespace; add helm_interface which includes an InProcessHelmRunner * run black * sorting/optimizing imports * import from helm_runner
mlcommons · Feb 2, 2024 · 2c4e01d · 2c4e01d
1 parent d3deae2
commit 2c4e01d
Show file tree

Hide file tree

Showing 11 changed files with 185 additions and 33 deletions.
diff --git a/src/coffee/benchmark.py b/src/coffee/benchmark.py
@@ -4,7 +4,7 @@
 
 import casefy
 
-from coffee.helm import BbqHelmTest, HelmTest, RealToxicityPromptsHelmTest, HelmSut
+from coffee.helm_runner import BbqHelmTest, HelmSut, HelmTest, RealToxicityPromptsHelmTest
 
 
 def float_to_stars(value: float) -> float:

diff --git a/src/coffee/helm_interface.py b/src/coffee/helm_interface.py
@@ -0,0 +1,86 @@
+import subprocess
+from typing import Iterable
+
+import helm.benchmark.run_specs
+from helm.benchmark.config_registry import (
+    register_builtin_configs_from_helm_package,
+)
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.huggingface_registration import (
+    register_huggingface_hub_model_from_flag_value,
+)
+from helm.benchmark.model_deployment_registry import (
+    ClientSpec,
+    ModelDeployment,
+    register_model_deployment,
+)
+from helm.benchmark.presentation.run_entry import RunEntry
+from helm.benchmark.run import run_entries_to_run_specs
+from helm.benchmark.runner import Runner
+from helm.common.authentication import Authentication
+
+from coffee.helm_runner import HelmResult, HelmRunner, HelmSut, HelmTest
+
+# Import-time side effect: flips a module-level flag on helm.benchmark.run_specs
+# as soon as this module is imported (dubious_helm_cli_wrapper.py sets the same flag).
+# NOTE(review): presumably this turns on HELM's generative-harms metrics — confirm against HELM.
+helm.benchmark.run_specs.INCLUDE_GENERATIVE_HARMS_METRICS = True
+
+
+class InProcessHelmRunner(HelmRunner):
+    """HelmRunner that drives HELM's Runner directly in the current process."""
+
+    def run(self, tests: list[HelmTest], suts: list[HelmSut], max_instances=10):
+        """Run every test against every SUT and return a HelmResult.
+
+        Args:
+            tests: the HelmTest objects whose runspecs should be executed.
+            suts: the systems under test to run them against.
+            max_instances: forwarded to HELM as ``max_eval_instances``.
+
+        Returns:
+            A HelmResult built from the tests, the suts, the output directory
+            and a placeholder ``subprocess.CompletedProcess``.
+        """
+        # Resolve the output directory first and derive HELM's paths from it, so
+        # HELM writes beneath the same directory that is handed to HelmResult
+        # regardless of the current working directory.
+        output_dir = self._make_output_dir()
+
+        self._execute(
+            tests,
+            suts,
+            max_eval_instances=max_instances,
+            suite="v1",
+            num_threads=4,
+            benchmark_output_path=str(output_dir / "benchmark_output"),
+            prod_env_path=str(output_dir / "prod_env"),
+        )
+
+        # HACK: HelmResult still takes a subprocess.CompletedProcess. Nothing was
+        # actually spawned here, so build an empty, successful one directly
+        # instead of launching a shell just to obtain it.
+        execution_result = subprocess.CompletedProcess(args="", returncode=0, stdout=b"", stderr=b"")
+
+        return HelmResult(tests, suts, output_dir, execution_result)
+
+    def _execute(
+        self,
+        tests: Iterable["HelmTest"],
+        suts: Iterable["HelmSut"],
+        max_eval_instances: int = 10,
+        suite: str = "v1",
+        num_threads: int = 1,
+        benchmark_output_path: str = "run/benchmark_output",
+        prod_env_path: str = "run/prod_env",
+    ) -> None:
+        """Register models, build run specs and hand them to HELM's Runner.
+
+        Args:
+            tests: tests whose runspecs are combined with each SUT.
+            suts: systems under test; those flagged ``huggingface`` are registered
+                with HELM before the run specs are built.
+            max_eval_instances: passed to ``run_entries_to_run_specs``.
+            suite: passed to ``Runner`` as ``suite``.
+            num_threads: passed to ``ExecutionSpec`` as ``parallelism``.
+            benchmark_output_path: passed to ``Runner`` as ``output_path``.
+            prod_env_path: passed to ``ExecutionSpec`` as ``local_path``.
+        """
+        register_builtin_configs_from_helm_package()
+        for sut in suts:
+            # Only SUTs flagged as Hugging Face models need explicit registration.
+            if sut.huggingface:
+                register_huggingface_hub_model_from_flag_value(sut.key)
+                model_deployment = ModelDeployment(
+                    name=sut.key,
+                    tokenizer_name=sut.tokenizer_name,
+                    max_sequence_length=sut.tokenizer_max_length,
+                    client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
+                )
+                register_model_deployment(model_deployment)
+        run_entries = [RunEntry(r, 1, list()) for r in self._build_runspecs(suts, tests)]
+        run_specs = run_entries_to_run_specs(run_entries, max_eval_instances=max_eval_instances)
+        execution_spec = ExecutionSpec(
+            url=None,
+            auth=Authentication(""),
+            local_path=prod_env_path,
+            parallelism=num_threads,
+        )
+        runner = Runner(
+            execution_spec=execution_spec,
+            output_path=benchmark_output_path,
+            suite=suite,
+            skip_instances=False,
+            cache_instances=False,
+            cache_instances_only=False,
+            skip_completed_runs=False,
+            exit_on_error=False,
+        )
+        runner.run_all(run_specs)
diff --git a/src/coffee/helm.py → src/coffee/helm_runner.py b/src/coffee/helm.py → src/coffee/helm_runner.py
@@ -189,6 +189,27 @@ class HelmRunner(ABC):
     def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
         pass
 
+    def _build_runspecs(self, suts, tests):
+        """Return one run-spec string per (sut, test, runspec) combination.
+
+        ``model=<sut.key>`` is appended to each runspec, joined with "," when the
+        runspec already contains a ":" and with ":" otherwise.
+        """
+        return [
+            spec + ("," if ":" in spec else ":") + "model=" + sut.key
+            for sut in suts
+            for test in tests
+            for spec in test.runspecs()
+        ]
+
+    def _make_output_dir(self):
+        """Locate (creating it if absent) the ``run`` directory and return its path.
+
+        Starts from the current working directory, steps up one level when that
+        directory is named ``src`` or ``test``, and then uses the result itself if
+        it is already named ``run`` or its ``run`` child otherwise.
+        """
+        cwd = pathlib.Path.cwd()
+        base = cwd.parent if cwd.name in ["src", "test"] else cwd
+        run_dir = base if base.name == "run" else base / "run"
+        run_dir.mkdir(exist_ok=True)
+        return run_dir
+
 
 class CliHelmRunner(HelmRunner):
     def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
@@ -209,18 +230,6 @@ def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
         execute_result = self._execute(command, output_dir)
         return HelmResult(tests, suts, output_dir, execute_result)
 
-    def _build_runspecs(self, suts, tests):
-        runspecs = []
-        for s in suts:
-            for t in tests:
-                for r in t.runspecs():
-                    if ":" in r:
-                        separator = ","
-                    else:
-                        separator = ":"
-                    runspecs.append(r + separator + "model=" + s.key)
-        return runspecs
-
     def _execute(self, command: List[str], output_dir: pathlib.Path) -> subprocess.CompletedProcess:
         if coffee.app_config.debug:
             return self._run_with_debug_settings(command, output_dir)
@@ -239,15 +248,6 @@ def _run_with_debug_settings(self, command, output_dir):
                 logging.debug(line.decode().rstrip())
         return subprocess.CompletedProcess(sp.args, sp.returncode, sp.stdout, sp.stderr)
 
-    def _make_output_dir(self):
-        o = pathlib.Path.cwd()
-        if o.name in ["src", "test"]:
-            o = o.parent
-        if not o.name == "run":
-            o = o / "run"
-        o.mkdir(exist_ok=True)
-        return o
-
     def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
         command = ["python " + str(pathlib.Path(__file__).parent.parent / "dubious_helm_cli_wrapper.py")]
         command.extend(["--suite", "v1"])  # this is a fixed string for now, which is probably wrong

diff --git a/src/coffee/run.py b/src/coffee/run.py
@@ -7,11 +7,8 @@
 import termcolor
 
 import coffee
-from coffee.benchmark import (
-    GeneralChatBotBenchmarkDefinition,
-    BenchmarkScore,
-)
-from coffee.helm import HelmSut, CliHelmRunner, HelmResult
+from coffee.benchmark import BenchmarkScore, GeneralChatBotBenchmarkDefinition
+from coffee.helm_runner import CliHelmRunner, HelmResult, HelmSut
 from coffee.static_site_generator import StaticSiteGenerator
 
 

diff --git a/src/coffee/static_site_generator.py b/src/coffee/static_site_generator.py
@@ -2,7 +2,6 @@
 import pathlib
 import shutil
 from itertools import groupby
-from typing import Iterator
 
 from jinja2 import Environment, PackageLoader, select_autoescape
 from markupsafe import Markup

diff --git a/src/dubious_helm_cli_wrapper.py b/src/dubious_helm_cli_wrapper.py
@@ -1,5 +1,5 @@
-from helm.benchmark.run import main
 import helm.benchmark.run_specs
+from helm.benchmark.run import main
 
 helm.benchmark.run_specs.INCLUDE_GENERATIVE_HARMS_METRICS = True
 if __name__ == "__main__":

diff --git a/tests/templates/conftest.py b/tests/templates/conftest.py
@@ -4,7 +4,7 @@
 
 from jinja2 import Environment, FileSystemLoader
 
-from coffee.helm import HelmSut
+from coffee.helm_runner import HelmSut
 from coffee.benchmark import (
     GeneralChatBotBenchmarkDefinition,
     BiasHarmDefinition,

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
@@ -9,7 +9,7 @@
     ToxicityHarmDefinition,
     quantize_stars,
 )
-from coffee.helm import HelmSut, BbqHelmTest, HelmResult
+from coffee.helm_runner import HelmSut, BbqHelmTest, HelmResult
 
 SIMPLE_BBQ_DATA = pathlib.Path(__file__).parent / "data/full_runs/simple_bbq"
 SIMPLE_TOXICITY_DATA = pathlib.Path(__file__).parent / "data/full_runs/toxicity"

diff --git a/tests/test_helm_interface.py b/tests/test_helm_interface.py
@@ -0,0 +1,70 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from coffee import helm_interface
+from coffee.helm_runner import BbqHelmTest, HelmSut
+
+
+@pytest.fixture(autouse=True)
+def monkeypatch_run_all(monkeypatch):
+    """Swap Runner.run_all for a MagicMock in every test and return that mock."""
+    fake_run_all = MagicMock()
+    monkeypatch.setattr(helm_interface.Runner, "run_all", fake_run_all)
+    return fake_run_all
+
+
+@pytest.fixture(autouse=True)
+def monkeypatch_run_one(monkeypatch):
+    """Swap Runner.run_one for a MagicMock in every test and return that mock."""
+    fake_run_one = MagicMock()
+    monkeypatch.setattr(helm_interface.Runner, "run_one", fake_run_one)
+    return fake_run_one
+
+
+@pytest.fixture
+def monkeypatch_register_huggingface(monkeypatch):
+    """Replace helm_interface's Hugging Face registration function with a MagicMock."""
+    fake_register = MagicMock()
+    target_name = "register_huggingface_hub_model_from_flag_value"
+    monkeypatch.setattr(helm_interface, target_name, fake_register)
+    return fake_register
+
+
+@pytest.fixture
+def monkeypatch_run_entries_to_run_specs(monkeypatch):
+    """Replace helm_interface.run_entries_to_run_specs with a MagicMock and return it."""
+    fake_to_run_specs = MagicMock()
+    monkeypatch.setattr(helm_interface, "run_entries_to_run_specs", fake_to_run_specs)
+    return fake_to_run_specs
+
+
+def test_run_executions_registers_huggingface(
+    monkeypatch, monkeypatch_register_huggingface, monkeypatch_run_entries_to_run_specs
+):
+    """Running with FB_OPT_125M and GPT2 registers exactly one Hugging Face model.
+
+    run_entries_to_run_specs has to be mocked as well: with
+    register_huggingface_hub_model_from_flag_value mocked out, nothing is
+    really registered.
+    """
+    suts = [HelmSut.FB_OPT_125M, HelmSut.GPT2]
+
+    helm_interface.InProcessHelmRunner().run([BbqHelmTest()], suts)
+
+    monkeypatch_register_huggingface.assert_called_once_with("facebook/opt-125m")
+
+
+@pytest.mark.parametrize(
+    "tests, suts, expected",
+    [
+        ([BbqHelmTest()], [HelmSut.FB_OPT_125M, HelmSut.GPT2], 20),
+        ([BbqHelmTest()], [HelmSut.GPT2], 10),
+    ],
+)
+def test_generates_correct_number_runspecs(monkeypatch, monkeypatch_run_entries_to_run_specs, tests, suts, expected):
+    """The number of run entries handed to run_entries_to_run_specs matches `expected`."""
+    helm_interface.InProcessHelmRunner().run(tests, suts)
+
+    # First positional argument of the mocked call is the list of run entries.
+    run_entries = monkeypatch_run_entries_to_run_specs.call_args.args[0]
+    assert len(run_entries) == expected
+
+
+def test_runs_run_all(monkeypatch, monkeypatch_run_all):
+    """A single run() call invokes the (mocked) Runner.run_all exactly once."""
+    in_process_runner = helm_interface.InProcessHelmRunner()
+    in_process_runner.run([BbqHelmTest()], [HelmSut.GPT2])
+
+    monkeypatch_run_all.assert_called_once()
diff --git a/tests/test_helm_runner.py b/tests/test_helm_runner.py
@@ -8,7 +8,7 @@
 
 import pytest
 
-from coffee.helm import (
+from coffee.helm_runner import (
     HelmSut,
     BbqHelmTest,
     HelmResult,

diff --git a/tests/test_static_site_generator.py b/tests/test_static_site_generator.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from coffee.helm import HelmSut
+from coffee.helm_runner import HelmSut
 from coffee.benchmark import (
     GeneralChatBotBenchmarkDefinition,
     BiasHarmDefinition,