address comments, merge and ensure additional benchmark works properly w/ InProcessHelmRunner, some refactoring

Note the extremely hacky interim bit in helm_interface.py.
dhosterman committed Jan 24, 2024
1 parent b61b919 commit 9cfaa75
Showing 7 changed files with 216 additions and 89 deletions.
103 changes: 98 additions & 5 deletions poetry.lock

Generated lockfile; diff not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -22,6 +22,7 @@ pytest-datafiles = "^3.0.0"
 pytest = "^7.4.3"
 mypy = "^1.7.1"
 black = "^23.11.0"
+types-pyyaml = "^6.0.12.12"
 
 [tool.pytest.ini_options]
 addopts = [
4 changes: 3 additions & 1 deletion src/coffee/benchmark.py
@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, TypeVar
 
 import casefy
 
 from coffee.helm_runner import BbqHelmTest, HelmTest, RealToxicityPromptsHelmTest
 
+from __future__ import annotations
+
 
 class Benchmark(ABC):
     def __init__(self, sut, scores):
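One caveat on this hunk: Python requires "from __future__ import annotations" to appear before any other import, so the placement shown here would raise a SyntaxError at import time. The presumable intent, sketched here by reordering only lines already present in the diff:

# Presumed intended ordering: __future__ imports must come first.
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import List, TypeVar

import casefy

from coffee.helm_runner import BbqHelmTest, HelmTest, RealToxicityPromptsHelmTest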
126 changes: 79 additions & 47 deletions src/coffee/helm_interface.py
@@ -1,62 +1,94 @@
-from itertools import product
 import subprocess
 from typing import Iterable, TYPE_CHECKING
 
+import helm.benchmark.run_specs
 from helm.benchmark.config_registry import (
     register_builtin_configs_from_helm_package,
 )
 from helm.benchmark.executor import ExecutionSpec
 from helm.benchmark.huggingface_registration import (
     register_huggingface_hub_model_from_flag_value,
 )
+from helm.benchmark.model_deployment_registry import (
+    ClientSpec,
+    ModelDeployment,
+    register_model_deployment,
+)
 from helm.benchmark.presentation.run_entry import RunEntry
 from helm.benchmark.run import run_entries_to_run_specs
 from helm.benchmark.runner import Runner
 from helm.common.authentication import Authentication
 
-if TYPE_CHECKING:
-    from helm_runner import HelmSut, HelmTest
-
-from helm.benchmark.runner import RunnerError
-
-
-def run_executions(
-    tests: Iterable["HelmTest"],
-    suts: Iterable["HelmSut"],
-    max_eval_instances: int = 10,
-    suite: str = "v1",
-    num_threads: int = 4,
-    benchmark_output_path: str = "run/benchmark_output",
-    prod_env_path: str = "run/prod_env",
-) -> None:
-    register_builtin_configs_from_helm_package()
-    for sut in suts:
-        if sut.huggingface:
-            register_huggingface_hub_model_from_flag_value(sut.key)
-    run_entries = []
-    for test, sut in product(tests, suts):
-        for runspec in test.runspecs():
-            run_entries.append(
-                RunEntry(
-                    description=f"{runspec},model={sut.key}", priority=1, groups=[]
+from coffee.helm_runner import HelmResult, HelmRunner, HelmSut, HelmTest
+
+helm.benchmark.run_specs.INCLUDE_GENERATIVE_HARMS_METRICS = True
+
+
+class InProcessHelmRunner(HelmRunner):
+    def run(self, tests: list[HelmTest], suts: list[HelmSut], max_instances=10):
+        self._execute(
+            tests,
+            suts,
+            max_eval_instances=max_instances,
+            suite="v1",
+            num_threads=4,
+            benchmark_output_path="run/benchmark_output",
+            prod_env_path="run/prod_env",
+        )
+
+        output_dir = self._make_output_dir()
+
+        # THIS IS A BIG, DUMB HACK until we unwind subprocess.CompletedProcess from the run mix.
+        execution_result = subprocess.run(
+            "", shell=True, capture_output=True, cwd=output_dir
+        )
+        # END BIG DUMB HACK
+
+        return HelmResult(tests, suts, output_dir, execution_result)
+
+    def _execute(
+        self,
+        tests: Iterable["HelmTest"],
+        suts: Iterable["HelmSut"],
+        max_eval_instances: int = 10,
+        suite: str = "v1",
+        num_threads: int = 1,
+        benchmark_output_path: str = "run/benchmark_output",
+        prod_env_path: str = "run/prod_env",
+    ) -> None:
+        register_builtin_configs_from_helm_package()
+        for sut in suts:
+            if sut.huggingface:
+                register_huggingface_hub_model_from_flag_value(sut.key)
+                model_deployment = ModelDeployment(
+                    name=sut.key,
+                    tokenizer_name=sut.tokenizer_name,
+                    max_sequence_length=sut.tokenizer_max_length,
+                    client_spec=ClientSpec(
+                        class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"
+                    ),
                 )
-            )
-    run_specs = run_entries_to_run_specs(
-        run_entries, max_eval_instances=max_eval_instances
-    )
-    execution_spec = ExecutionSpec(
-        url=None,
-        auth=Authentication(""),
-        local_path=prod_env_path,
-        parallelism=num_threads,
-    )
-    runner = Runner(
-        execution_spec=execution_spec,
-        output_path=benchmark_output_path,
-        suite=suite,
-        skip_instances=False,
-        cache_instances=False,
-        cache_instances_only=False,
-        skip_completed_runs=False,
-        exit_on_error=False,
-    )
-    runner.run_all(run_specs)
+                register_model_deployment(model_deployment)
+        run_entries = [
+            RunEntry(r, 1, list()) for r in self._build_runspecs(suts, tests)
+        ]
+        run_specs = run_entries_to_run_specs(
+            run_entries, max_eval_instances=max_eval_instances
+        )
+        execution_spec = ExecutionSpec(
+            url=None,
+            auth=Authentication(""),
+            local_path=prod_env_path,
+            parallelism=num_threads,
+        )
+        runner = Runner(
+            execution_spec=execution_spec,
+            output_path=benchmark_output_path,
+            suite=suite,
+            skip_instances=False,
+            cache_instances=False,
+            cache_instances_only=False,
+            skip_completed_runs=False,
+            exit_on_error=False,
+        )
+        runner.run_all(run_specs)
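For context, a rough usage sketch of the new in-process runner (not part of the commit). BbqHelmTest and HelmSut are names imported elsewhere in this changeset, but the no-argument constructor call and the HelmSut.GPT2 value are assumptions for illustration:

# Hypothetical usage of InProcessHelmRunner (illustrative only;
# BbqHelmTest() and HelmSut.GPT2 are assumed, not confirmed by the diff).
from coffee.helm_interface import InProcessHelmRunner
from coffee.helm_runner import BbqHelmTest, HelmSut

runner = InProcessHelmRunner()
# run() drives HELM's Runner in-process rather than shelling out to the
# helm-run CLI; the returned HelmResult still carries a CompletedProcess,
# currently the no-op placeholder produced by the hack flagged above.
result = runner.run([BbqHelmTest()], [HelmSut.GPT2], max_instances=10)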
