diff --git a/src/coffee/benchmark.py b/src/coffee/benchmark.py
index 0a68b60c..ac0e317f 100644
--- a/src/coffee/benchmark.py
+++ b/src/coffee/benchmark.py
@@ -3,7 +3,7 @@
 
 import casefy
 
-from coffee.helm import BbqHelmTest, HelmTest
+from coffee.helm_runner import BbqHelmTest, HelmTest
 
 
 class Benchmark(ABC):
diff --git a/src/coffee/helm_interface.py b/src/coffee/helm_interface.py
new file mode 100644
index 00000000..b6283a10
--- /dev/null
+++ b/src/coffee/helm_interface.py
@@ -0,0 +1,62 @@
+from itertools import product
+from typing import Iterable, TYPE_CHECKING
+
+from helm.benchmark.config_registry import (
+    register_builtin_configs_from_helm_package,
+)
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.huggingface_registration import (
+    register_huggingface_hub_model_from_flag_value,
+)
+from helm.benchmark.presentation.run_entry import RunEntry
+from helm.benchmark.run import run_entries_to_run_specs
+from helm.benchmark.runner import Runner
+from helm.common.authentication import Authentication
+
+if TYPE_CHECKING:
+    from helm_runner import HelmSut, HelmTest
+
+from helm.benchmark.runner import RunnerError
+
+
+def run_executions(
+    tests: Iterable["HelmTest"],
+    suts: Iterable["HelmSut"],
+    max_eval_instances: int = 10,
+    suite: str = "v1",
+    num_threads: int = 4,
+    benchmark_output_path: str = "run/benchmark_output",
+    prod_env_path: str = "run/prod_env",
+) -> None:
+    register_builtin_configs_from_helm_package()
+    for sut in suts:
+        if sut.huggingface:
+            register_huggingface_hub_model_from_flag_value(sut.key)
+    run_entries = []
+    for test, sut in product(tests, suts):
+        for runspec in test.runspecs():
+            run_entries.append(
+                RunEntry(
+                    description=f"{runspec},model={sut.key}", priority=1, groups=[]
+                )
+            )
+    run_specs = run_entries_to_run_specs(
+        run_entries, max_eval_instances=max_eval_instances
+    )
+    execution_spec = ExecutionSpec(
+        url=None,
+        auth=Authentication(""),
+        local_path=prod_env_path,
+        parallelism=num_threads,
+    )
+    runner = Runner(
+        execution_spec=execution_spec,
+        output_path=benchmark_output_path,
+        suite=suite,
+        skip_instances=False,
+        cache_instances=False,
+        cache_instances_only=False,
+        skip_completed_runs=False,
+        exit_on_error=False,
+    )
+    runner.run_all(run_specs)
diff --git a/src/coffee/helm.py b/src/coffee/helm_runner.py
similarity index 85%
rename from src/coffee/helm.py
rename to src/coffee/helm_runner.py
index c7c3c60e..265a5588 100644
--- a/src/coffee/helm.py
+++ b/src/coffee/helm_runner.py
@@ -12,6 +12,8 @@
 import jq
 
 import coffee
+from coffee.helm_interface import RunnerError, run_executions
+
 
 # This starts with a bunch of objects that represent things already in HELM code.
 # As we shift HELM to accommodate a library use case, it would be nice to compose
@@ -97,13 +99,11 @@ def __init__(
         tests: List[HelmTest],
         suts: List[HelmSut],
         output_dir: pathlib.Path,
-        execution_result: subprocess.CompletedProcess,
     ):
         super().__init__()
         self.tests = tests
         self.suts = suts
         self.output_dir = output_dir
-        self.execution_result = execution_result
 
     def load_scores(self):
         focus = self.output_dir / "benchmark_output" / "runs" / "v1"
@@ -130,12 +130,6 @@ def load_scores(self):
                 result.add(t, s, test_sut_scores)
         return result
 
-    def helm_stdout(self) -> str:
-        return self._deal_with_bytes(self.execution_result.stdout)
-
-    def helm_stderr(self) -> str:
-        return self._deal_with_bytes(self.execution_result.stderr)
-
     def _deal_with_bytes(self, o):
         if isinstance(o, bytes):
             result = o.decode("utf-8")
@@ -147,15 +141,38 @@ def _filesystem_safe(self, s: str):
         # reproducing some behavior in HELM; would be nice to remove duplication
        return re.sub("/", "_", s)
 
-    def success(self):
-        return self.execution_result and self.execution_result.returncode == 0
-
 
 class HelmRunner(ABC):
     @abstractmethod
-    def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
+    def run(self, tests: list[HelmTest], models: list[HelmSut], max_instances=10):
         pass
 
+    def _make_output_dir(self):
+        o = pathlib.Path.cwd()
+        if o.name in ["src", "test"]:
+            o = o.parent
+        if not o.name == "run":
+            o = o / "run"
+        o.mkdir(exist_ok=True)
+        return o
+
+
+class InProcessHelmRunner(HelmRunner):
+    def run(self, tests: list[HelmTest], suts: list[HelmSut], max_instances=10):
+        run_executions(
+            tests,
+            suts,
+            max_eval_instances=max_instances,
+            suite="v1",
+            num_threads=4,
+            benchmark_output_path="run/benchmark_output",
+            prod_env_path="run/prod_env",
+        )
+
+        output_dir = self._make_output_dir()
+
+        return HelmResult(tests, suts, output_dir)
+
 
 class CliHelmRunner(HelmRunner):
     def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
@@ -172,18 +189,18 @@ def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
         logging.debug(f"helm run command: {command}")
 
         output_dir = self._make_output_dir()
-        execute_result = self._execute(command, output_dir)
-        return HelmResult(tests, suts, output_dir, execute_result)
+        self._execute(command, output_dir)
+        return HelmResult(tests, suts, output_dir)
 
-    def _execute(
-        self, command: List[str], output_dir: pathlib.Path
-    ) -> subprocess.CompletedProcess:
+    def _execute(self, command: List[str], output_dir: pathlib.Path) -> None:
         if coffee.app_config.debug:
-            return self._run_with_debug_settings(command, output_dir)
+            result = self._run_with_debug_settings(command, output_dir)
         else:
-            return subprocess.run(
+            result = subprocess.run(
                 " ".join(command), shell=True, capture_output=True, cwd=output_dir
             )
+        if not result.returncode == 0:
+            raise RunnerError(result.stderr)
 
     def _run_with_debug_settings(self, command, output_dir):
         with subprocess.Popen(
@@ -195,16 +212,9 @@ def _run_with_debug_settings(self, command, output_dir):
         ) as sp:
             for line in sp.stdout:
                 logging.debug(line.decode().rstrip())
-        return subprocess.CompletedProcess(sp.args, sp.returncode, sp.stdout, sp.stderr)
-
-    def _make_output_dir(self):
-        o = pathlib.Path.cwd()
-        if o.name in ["src", "test"]:
-            o = o.parent
-        if not o.name == "run":
-            o = o / "run"
-        o.mkdir(exist_ok=True)
-        return o
+        if not sp.returncode == 0:
+            raise RunnerError(sp.stderr)
+        return sp
 
     def _helm_command_for_runspecs(
         self, bbq_runspecs, max_instances, huggingface_models=None
diff --git a/src/coffee/run.py b/src/coffee/run.py
index 12ae1942..10d2f152 100644
--- a/src/coffee/run.py
+++ b/src/coffee/run.py
@@ -5,7 +5,7 @@
 
 import coffee
 from coffee.benchmark import Benchmark, RidiculousBenchmark
-from coffee.helm import CliHelmRunner, HelmSut
+from coffee.helm_runner import HelmSut, InProcessHelmRunner
 from coffee.static_site_generator import StaticSiteGenerator
 
 
@@ -30,17 +30,9 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool) -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    runner = CliHelmRunner()
+    runner = InProcessHelmRunner()
     suts = [HelmSut.GPT2, HelmSut.PYTHIA_70M, HelmSut.FB_OPT_125M]
     result = runner.run(RidiculousBenchmark.tests(), suts, max_instances=max_instances)
-    if not result.success():
-        print(
-            f"HELM execution failed with return code {result.execution_result.returncode}:"
-        )
-        print("stdout:")
-        print(result.helm_stdout())
-        print("stderr:")
-        print(result.helm_stderr())
     scores = result.load_scores()
     benchmarks: list[Benchmark] = []
     for sut in suts:
diff --git a/src/coffee/static_site_generator.py b/src/coffee/static_site_generator.py
index aeb4e51f..20cf4ebe 100644
--- a/src/coffee/static_site_generator.py
+++ b/src/coffee/static_site_generator.py
@@ -2,7 +2,7 @@
 import pathlib
 import shutil
 from itertools import groupby
-from typing import Tuple
+from typing import Iterable, Iterator, Tuple
 
 from jinja2 import Environment, PackageLoader, select_autoescape
 
@@ -85,10 +85,10 @@ def _generate_index_page(
 
     def _grouped_benchmarks(self, benchmarks: list[Benchmark]) -> dict:
         benchmarks_dict = {}
-        for benchmark_name, grouped_benchmarks in groupby(
+        for benchmark_name, _grouped_benchmarks in groupby(
             benchmarks, lambda x: x.__class__.__name__
         ):
-            grouped_benchmarks = list(grouped_benchmarks)
+            grouped_benchmarks = list(_grouped_benchmarks)
             benchmarks_dict[grouped_benchmarks[0]] = grouped_benchmarks
         return benchmarks_dict
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 2c0d3c83..9e38a3bf 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -1,4 +1,4 @@
-from coffee.helm import HelmSut
+from coffee.helm_runner import HelmSut
 from coffee.benchmark import RidiculousBenchmark
 
 
diff --git a/tests/test_helm_interface.py b/tests/test_helm_interface.py
new file mode 100644
index 00000000..e79f9d09
--- /dev/null
+++ b/tests/test_helm_interface.py
@@ -0,0 +1,55 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from coffee import helm_interface
+from coffee.helm_runner import BbqHelmTest, HelmSut
+
+
+@pytest.fixture(autouse=True)
+def monkeypatch_runner(monkeypatch):
+    mock_obj = MagicMock()
+    monkeypatch.setattr(helm_interface, "Runner", mock_obj)
+    return mock_obj
+
+
+@pytest.fixture
+def monkeypatch_register_huggingface(monkeypatch):
+    mock_obj = MagicMock()
+    monkeypatch.setattr(
+        helm_interface,
+        "register_huggingface_hub_model_from_flag_value",
+        mock_obj,
+    )
+    return mock_obj
+
+
+@pytest.fixture
+def monkeypatch_run_entries_to_run_specs(monkeypatch):
+    mock_obj = MagicMock()
+    monkeypatch.setattr(helm_interface, "run_entries_to_run_specs", mock_obj)
+    return mock_obj
+
+
+def test_run_executions_registers_huggingface(
+    monkeypatch, monkeypatch_register_huggingface, monkeypatch_run_entries_to_run_specs
+):
+    # have to monkeypatch run_entries_to_runspecs since we can't register due to monkeypatching
+    # register_huggingface_hub_model_from_flag_value
+
+    helm_interface.run_executions([BbqHelmTest()], [HelmSut.FB_OPT_125M, HelmSut.GPT2])
+    monkeypatch_register_huggingface.assert_called_once_with("facebook/opt-125m")
+
+
+@pytest.mark.parametrize(
+    "tests, suts, expected",
+    [
+        ([BbqHelmTest()], [HelmSut.FB_OPT_125M, HelmSut.GPT2], 20),
+        ([BbqHelmTest()], [HelmSut.GPT2], 10),
+    ],
+)
+def test_generates_correct_number_runspecs(
+    monkeypatch, monkeypatch_run_entries_to_run_specs, tests, suts, expected
+):
+    helm_interface.run_executions(tests, suts)
+    assert len(monkeypatch_run_entries_to_run_specs.call_args[0][0]) == expected
diff --git a/tests/test_helm_runner.py b/tests/test_helm_runner.py
index 46a29c42..742ae6b0 100644
--- a/tests/test_helm_runner.py
+++ b/tests/test_helm_runner.py
@@ -6,7 +6,13 @@
 import pytest
 
 from coffee.run import quantize_stars
-from coffee.helm import HelmSut, BbqHelmTest, HelmResult, CliHelmRunner
+from coffee.helm_runner import (
+    HelmSut,
+    BbqHelmTest,
+    HelmResult,
+    CliHelmRunner,
+    InProcessHelmRunner,
+)
 from coffee.benchmark import RidiculousBenchmark
 
 
@@ -35,9 +41,13 @@ def test_cli_helm_runner_command_handles_huggingface_models(cwd_tmpdir):
     assert shell_arguments[enables[0] + 2] == HelmSut.PYTHIA_70M.key
 
 
+def test_inprocess_helm_runner(cwd_tmpdir):
+    pass
+
+
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
 def test_read_scores(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     sut_scores = scores.for_sut(HelmSut.GPT2)
     assert "BbqHelmTest" in sut_scores
@@ -47,7 +57,7 @@ def test_read_scores(datafiles):
 
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
 def test_ridiculous_benchmark(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     b = RidiculousBenchmark(HelmSut.GPT2, scores.for_sut(HelmSut.GPT2))
     assert 2.25 == pytest.approx(b.overall_score())
diff --git a/tests/test_static_site_generator.py b/tests/test_static_site_generator.py
index 9d36c4a0..e1d7b390 100644
--- a/tests/test_static_site_generator.py
+++ b/tests/test_static_site_generator.py
@@ -4,14 +4,14 @@
 
 import pytest
 
-from coffee.helm import HelmResult, BbqHelmTest, HelmSut
+from coffee.helm_runner import HelmResult, BbqHelmTest, HelmSut
 from coffee.benchmark import RidiculousBenchmark
 from coffee.static_site_generator import StaticSiteGenerator
 
 
 @pytest.fixture()
 def benchmark(datafiles):
-    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles)
     scores = hr.load_scores()
     b = RidiculousBenchmark(HelmSut.GPT2, scores.for_sut(HelmSut.GPT2))
     return b
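
Net effect of the diff: run.py now builds an InProcessHelmRunner, which drives HELM through helm_interface.run_executions; failures surface as a raised RunnerError instead of a CompletedProcess and success()/helm_stdout()/helm_stderr() on HelmResult. A minimal sketch of the new calling pattern, mirroring the updated src/coffee/run.py (the SUT list and max_instances value here are illustrative, not prescribed by the change):

    from coffee.benchmark import RidiculousBenchmark
    from coffee.helm_runner import HelmSut, InProcessHelmRunner

    # Runs HELM in-process; errors now arrive as exceptions (RunnerError)
    # rather than as an execution_result stored on HelmResult.
    runner = InProcessHelmRunner()
    suts = [HelmSut.GPT2, HelmSut.FB_OPT_125M]
    result = runner.run(RidiculousBenchmark.tests(), suts, max_instances=10)
    scores = result.load_scores()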