address comments, merge and ensure additional benchmark works properly w/ InProcessHelmRunner, some refactoring

Note the extremely hacky interim bit in helm_interface.py.
dhosterman committed Jan 24, 2024
1 parent b61b919 commit 9cfaa75
Showing 7 changed files with 216 additions and 89 deletions.
103 changes: 98 additions & 5 deletions poetry.lock

Generated lockfile; diff not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -22,6 +22,7 @@ pytest-datafiles = "^3.0.0"
 pytest = "^7.4.3"
 mypy = "^1.7.1"
 black = "^23.11.0"
+types-pyyaml = "^6.0.12.12"
 
 [tool.pytest.ini_options]
 addopts = [
4 changes: 3 additions & 1 deletion src/coffee/benchmark.py
@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, TypeVar
 
 import casefy
 
 from coffee.helm_runner import BbqHelmTest, HelmTest, RealToxicityPromptsHelmTest
 
+from __future__ import annotations
+
 
 class Benchmark(ABC):
     def __init__(self, sut, scores):
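One caveat on this hunk: Python requires "from __future__ import annotations" to appear before any other import, so the placement shown here would raise a SyntaxError at import time. The presumable intent, sketched here by reordering only lines already present in the diff:

# Presumed intended ordering: __future__ imports must come first.
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import List, TypeVar

import casefy

from coffee.helm_runner import BbqHelmTest, HelmTest, RealToxicityPromptsHelmTest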
126 changes: 79 additions & 47 deletions src/coffee/helm_interface.py
@@ -1,62 +1,94 @@
-from itertools import product
 import subprocess
 from typing import Iterable, TYPE_CHECKING
 
+import helm.benchmark.run_specs
 from helm.benchmark.config_registry import (
     register_builtin_configs_from_helm_package,
 )
 from helm.benchmark.executor import ExecutionSpec
 from helm.benchmark.huggingface_registration import (
     register_huggingface_hub_model_from_flag_value,
 )
+from helm.benchmark.model_deployment_registry import (
+    ClientSpec,
+    ModelDeployment,
+    register_model_deployment,
+)
 from helm.benchmark.presentation.run_entry import RunEntry
 from helm.benchmark.run import run_entries_to_run_specs
 from helm.benchmark.runner import Runner
 from helm.common.authentication import Authentication
 
-if TYPE_CHECKING:
-    from helm_runner import HelmSut, HelmTest
-
-from helm.benchmark.runner import RunnerError
-
-
-def run_executions(
-    tests: Iterable["HelmTest"],
-    suts: Iterable["HelmSut"],
-    max_eval_instances: int = 10,
-    suite: str = "v1",
-    num_threads: int = 4,
-    benchmark_output_path: str = "run/benchmark_output",
-    prod_env_path: str = "run/prod_env",
-) -> None:
-    register_builtin_configs_from_helm_package()
-    for sut in suts:
-        if sut.huggingface:
-            register_huggingface_hub_model_from_flag_value(sut.key)
-    run_entries = []
-    for test, sut in product(tests, suts):
-        for runspec in test.runspecs():
-            run_entries.append(
-                RunEntry(
-                    description=f"{runspec},model={sut.key}", priority=1, groups=[]
+from coffee.helm_runner import HelmResult, HelmRunner, HelmSut, HelmTest
+
+helm.benchmark.run_specs.INCLUDE_GENERATIVE_HARMS_METRICS = True
+
+
+class InProcessHelmRunner(HelmRunner):
+    def run(self, tests: list[HelmTest], suts: list[HelmSut], max_instances=10):
+        self._execute(
+            tests,
+            suts,
+            max_eval_instances=max_instances,
+            suite="v1",
+            num_threads=4,
+            benchmark_output_path="run/benchmark_output",
+            prod_env_path="run/prod_env",
+        )
+
+        output_dir = self._make_output_dir()
+
+        # THIS IS A BIG, DUMB HACK until we unwind subprocess.CompletedProcess from the run mix.
+        execution_result = subprocess.run(
+            "", shell=True, capture_output=True, cwd=output_dir
+        )
+        # END BIG DUMB HACK
+
+        return HelmResult(tests, suts, output_dir, execution_result)
+
+    def _execute(
+        self,
+        tests: Iterable["HelmTest"],
+        suts: Iterable["HelmSut"],
+        max_eval_instances: int = 10,
+        suite: str = "v1",
+        num_threads: int = 1,
+        benchmark_output_path: str = "run/benchmark_output",
+        prod_env_path: str = "run/prod_env",
+    ) -> None:
+        register_builtin_configs_from_helm_package()
+        for sut in suts:
+            if sut.huggingface:
+                register_huggingface_hub_model_from_flag_value(sut.key)
+                model_deployment = ModelDeployment(
+                    name=sut.key,
+                    tokenizer_name=sut.tokenizer_name,
+                    max_sequence_length=sut.tokenizer_max_length,
+                    client_spec=ClientSpec(
+                        class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"
+                    ),
                 )
-            )
-    run_specs = run_entries_to_run_specs(
-        run_entries, max_eval_instances=max_eval_instances
-    )
-    execution_spec = ExecutionSpec(
-        url=None,
-        auth=Authentication(""),
-        local_path=prod_env_path,
-        parallelism=num_threads,
-    )
-    runner = Runner(
-        execution_spec=execution_spec,
-        output_path=benchmark_output_path,
-        suite=suite,
-        skip_instances=False,
-        cache_instances=False,
-        cache_instances_only=False,
-        skip_completed_runs=False,
-        exit_on_error=False,
-    )
-    runner.run_all(run_specs)
+                register_model_deployment(model_deployment)
+        run_entries = [
+            RunEntry(r, 1, list()) for r in self._build_runspecs(suts, tests)
+        ]
+        run_specs = run_entries_to_run_specs(
+            run_entries, max_eval_instances=max_eval_instances
+        )
+        execution_spec = ExecutionSpec(
+            url=None,
+            auth=Authentication(""),
+            local_path=prod_env_path,
+            parallelism=num_threads,
+        )
+        runner = Runner(
+            execution_spec=execution_spec,
+            output_path=benchmark_output_path,
+            suite=suite,
+            skip_instances=False,
+            cache_instances=False,
+            cache_instances_only=False,
+            skip_completed_runs=False,
+            exit_on_error=False,
+        )
+        runner.run_all(run_specs)
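For context, a rough usage sketch of the new in-process runner (not part of the commit). BbqHelmTest and HelmSut are names imported elsewhere in this changeset, but the no-argument constructor call and the HelmSut.GPT2 value are assumptions for illustration:

# Hypothetical usage of InProcessHelmRunner (illustrative only;
# BbqHelmTest() and HelmSut.GPT2 are assumed, not confirmed by the diff).
from coffee.helm_interface import InProcessHelmRunner
from coffee.helm_runner import BbqHelmTest, HelmSut

runner = InProcessHelmRunner()
# run() drives HELM's Runner in-process rather than shelling out to the
# helm-run CLI; the returned HelmResult still carries a CompletedProcess,
# currently the no-op placeholder produced by the hack flagged above.
result = runner.run([BbqHelmTest()], [HelmSut.GPT2], max_instances=10)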
