Add two SUTs (#28)

* Interim commit. Run is correct, output needs work. * Interim commit. Run is correct, output needs work. * Benchmark knows its name. Also, move helm classes to their own package. * Pull more path out of the templates. Make Benchmark know about its tests. Sort out 3 working SUTs. * Pull more path out of the templates. Make Benchmark know about its tests. Sort out 3 working SUTs. * Updating poetry.lock
mlcommons · Dec 28, 2023 · 8335e66 · 8335e66
1 parent 8d28f7d
commit 8335e66
Show file tree

Hide file tree

Showing 22 changed files with 375 additions and 291 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 .idea/
-run/
+run/
+__pycache__/
+web/
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,9 +12,10 @@ packages = [
 [tool.poetry.dependencies]
 python = ">=3.10,<3.11"
 pyext = {url = "https://files.pythonhosted.org/packages/b0/be/9b6005ac644aaef022527ce49617263379e49dbdbd433d1d3dd66d71f570/pyext-0.7.tar.gz"}
-crfm-helm = "0.3"
+crfm-helm = "^0.4.0"
 jq = "^1.6.0"
 click = "^8.1.7"
+casefy = "^0.1.7"
 
 [tool.poetry.group.dev.dependencies]
 pytest-datafiles = "^3.0.0"

diff --git a/src/coffee/__init__.py b/src/coffee/__init__.py
@@ -0,0 +1,6 @@
+class CoffeeConfig:
+    """Mutable, process-wide settings for the coffee package."""
+
+    def __init__(self):
+        # Off by default. coffee.helm checks this flag to decide whether the
+        # HELM subprocess output is streamed to the debug log while it runs.
+        self.debug = False
+
+
+# Single shared instance; other modules read it as coffee.app_config.
+app_config = CoffeeConfig()
diff --git a/src/coffee/benchmark.py b/src/coffee/benchmark.py
@@ -1,4 +1,9 @@
 from abc import ABC, abstractmethod
+from typing import List
+
+import casefy
+
+from coffee.helm import BbqHelmTest, HelmTest
 
 
 class Benchmark(ABC):
@@ -11,6 +16,18 @@ def __init__(self, sut, scores):
     def overall_score(self) -> float:
         pass
 
+    @classmethod
+    @abstractmethod
+    def tests(cls) -> List[HelmTest]:
+        """Return the HELM tests this benchmark is made of.
+
+        Declared as an abstract classmethod so it agrees with name() and
+        path_name() and with concrete benchmarks that override it as a
+        classmethod; it can therefore be called on the class itself as well
+        as on an instance.
+        """
+        pass
+
+    @classmethod
+    def name(cls):
+        """Return the class name run through casefy.titlecase, for display."""
+        class_name = cls.__name__
+        return casefy.titlecase(class_name)
+
+    @classmethod
+    def path_name(cls):
+        """Return the class name run through casefy.snakecase, for use in paths."""
+        class_name = cls.__name__
+        return casefy.snakecase(class_name)
+
 
 class RidiculousBenchmark(Benchmark):
     def overall_score(self) -> float:
@@ -21,3 +38,7 @@ def overall_score(self) -> float:
             count += 1
             total += bbq[subject]["bbq_accuracy"]
         return total / count * 5
+
+    @classmethod
+    def tests(cls) -> List[HelmTest]:
+        """Return the tests behind this benchmark: a single BBQ test."""
+        bbq = BbqHelmTest()
+        return [bbq]
diff --git a/src/coffee/helm.py b/src/coffee/helm.py
@@ -0,0 +1,225 @@
+import dataclasses
+import json
+import logging
+import pathlib
+import re
+import subprocess
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from enum import Enum
+from typing import List
+
+import jq
+
+import coffee
+
+# This starts with a bunch of objects that represent things already in HELM code.
+# As we shift HELM to accommodate a library use case, it would be nice to compose
+# a run directly out of objects/enums/constants, or at least compose RunSpecs from
+# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".
+
+
+@dataclasses.dataclass
+class SutDescription:
+    """Describes one SUT: the model key plus whether it is a HuggingFace model."""
+
+    # Model identifier, e.g. "openai/gpt2".
+    key: str
+    # Left out of repr(); False unless explicitly given.
+    huggingface: bool = dataclasses.field(default=False, repr=False)
+
+    def __hash__(self):
+        # The dataclass decorator generates __eq__, which would otherwise leave
+        # the class unhashable; defer to the next class in the MRO instead.
+        return super().__hash__()
+
+
+class HelmSut(SutDescription, Enum):
+    """The SUTs that can be handed to HELM.
+
+    Each member's value supplies the SutDescription fields positionally: the
+    model key, then optionally the huggingface flag (which defaults to False).
+    """
+
+    GPT2 = "openai/gpt2"
+    # The trailing True sets huggingface, which CliHelmRunner uses to list the
+    # model after --enable-huggingface-models.
+    PYTHIA_70M = "EleutherAI/pythia-70m", True
+    FB_OPT_125M = "facebook/opt-125m", True
+
+
+class HelmTest(ABC):
+    """Base class for a test that HELM can run.
+
+    A test carries a prefix string and knows how to expand itself into the
+    list of HELM runspec strings it needs.
+    """
+
+    # I would like this to be another enum, but BBQ's structural chaos means
+    # for now we need custom behavior
+    def __init__(self, prefix):
+        super().__init__()
+        self.prefix = prefix
+
+    @abstractmethod
+    def runspecs(self) -> List[str]:
+        """Return the runspec strings for this test (without any model part)."""
+
+    def __str__(self):
+        # Rendered as "<ClassName>:<prefix>".
+        return ":".join((self.__class__.__name__, self.prefix))
+
+
+class BbqHelmTest(HelmTest):
+    """The BBQ test: one runspec per subject category, under the "bbq" prefix."""
+
+    # Copied from HELM because it's not exposed outside the method
+    CATEGORIES = [
+        "Age",
+        "Disability_status",
+        "Gender_identity",
+        "Nationality",
+        "Physical_appearance",
+        "Race_ethnicity",
+        "Race_x_SES",
+        "Race_x_gender",
+        "Religion",
+        "SES",
+        # "Sexual_orientation", TODO: temporarily disabled while Yifan looks into a transformer-related bug
+    ]
+
+    def __init__(self):
+        super().__init__("bbq")
+
+    def runspecs(self) -> List[str]:
+        """Return "<prefix>:subject=<category>" for every entry in CATEGORIES."""
+        prefix = self.prefix
+        return [
+            f"{prefix}:subject={category}" for category in BbqHelmTest.CATEGORIES
+        ]
+
+
+class HelmScores:
+    """Accumulates score dicts keyed by (test class name, SUT)."""
+
+    # a kinda hacky container; we won't know the right shape of this for a while, so just use wild dicts
+    def __init__(self):
+        super().__init__()
+        self.data = defaultdict(list)
+
+    def add(self, test, sut, test_sut_scores):
+        """Record one scores dict for the given test and SUT.
+
+        Only the test's class name is kept as part of the key.
+        """
+        key = (test.__class__.__name__, sut)
+        self.data[key].append(test_sut_scores)
+
+    def for_sut(self, desired_sut) -> dict:
+        """Return {test class name: merged scores} for a single SUT.
+
+        Every scores dict recorded for the same test is merged with
+        dict.update, so later entries win on duplicate keys.
+        """
+        merged: defaultdict = defaultdict(dict)
+        for (test_name, sut), entries in self.data.items():
+            if sut != desired_sut:
+                continue
+            for entry in entries:
+                merged[test_name].update(entry)
+        return merged
+
+
+class HelmResult:
+    """The outcome of one HELM invocation, plus the means to read its scores.
+
+    Holds the tests and SUTs that were requested, the directory HELM was run
+    in, and the finished subprocess.
+    """
+
+    def __init__(
+        self,
+        tests: List[HelmTest],
+        suts: List[HelmSut],
+        output_dir: pathlib.Path,
+        execution_result: subprocess.CompletedProcess,
+    ):
+        super().__init__()
+        self.tests = tests
+        self.suts = suts
+        self.output_dir = output_dir
+        self.execution_result = execution_result
+
+    def load_scores(self):
+        """Read the bbq_* stats HELM wrote for every (test, SUT) pair.
+
+        Returns:
+            A HelmScores holding, for each (test, SUT) pair, a dict that maps
+            each subject found on disk to its {stat name: sum} dict. A pair
+            with no matching run directory contributes an empty dict.
+        """
+        runs_dir = self.output_dir / "benchmark_output" / "runs" / "v1"
+        result = HelmScores()
+        for t in self.tests:
+            for s in self.suts:
+                # long term we'll need a lot more data; this is just enough to compute simple scores
+                test_sut_scores = {}
+                # NOTE(review): the trailing "*" also matches run directories of
+                # any model whose name merely starts with this key - confirm
+                # that cannot happen with the SUTs in use.
+                glob_path = f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.key)}*"
+                logging.debug(
+                    f"looking for scores for {t} {s} in {runs_dir}/{glob_path}"
+                )
+                for d in runs_dir.glob(glob_path):
+                    run_spec = self._read_json(d / "run_spec.json")
+                    subject = (
+                        jq.compile(".scenario_spec.args.subject")
+                        .input_value(run_spec)
+                        .first()
+                    )
+                    stats = self._read_json(d / "stats.json")
+                    test_sut_scores[subject] = {
+                        stat["name"]["name"]: stat["sum"]
+                        for stat in stats
+                        if stat["name"]["name"].startswith("bbq_")
+                    }
+                result.add(t, s, test_sut_scores)
+        return result
+
+    def helm_stdout(self) -> str:
+        """Return the subprocess's captured stdout as text."""
+        return self._deal_with_bytes(self.execution_result.stdout)
+
+    def helm_stderr(self) -> str:
+        """Return the subprocess's captured stderr as text."""
+        return self._deal_with_bytes(self.execution_result.stderr)
+
+    @staticmethod
+    def _read_json(path: pathlib.Path):
+        """Parse one JSON file, always reading it as UTF-8."""
+        with open(path, encoding="utf-8") as f:
+            return json.load(f)
+
+    def _deal_with_bytes(self, o):
+        """Coerce captured process output to str.
+
+        None (a stream that was not captured) becomes "" rather than the
+        literal text "None"; bytes are decoded as UTF-8 with undecodable
+        bytes replaced instead of raising.
+        """
+        if o is None:
+            return ""
+        if isinstance(o, bytes):
+            return o.decode("utf-8", errors="replace")
+        return str(o)
+
+    def _filesystem_safe(self, s: str):
+        # reproducing some behavior in HELM; would be nice to remove duplication
+        return s.replace("/", "_")
+
+    def success(self):
+        """Return True only if a process result exists and it exited with 0."""
+        return (
+            self.execution_result is not None
+            and self.execution_result.returncode == 0
+        )
+
+
+class HelmRunner(ABC):
+    """Interface for anything that can run a set of tests against a set of SUTs."""
+
+    @abstractmethod
+    def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
+        """Run every test against every model; max_instances defaults to 10."""
+
+
+class CliHelmRunner(HelmRunner):
+    """Runs HELM by invoking the helm-run command-line tool in a subprocess."""
+
+    def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
+        """Run every test against every SUT in a single helm-run invocation.
+
+        Args:
+            tests: the tests whose runspecs should be executed.
+            suts: the SUTs to run them against.
+            max_instances: passed to helm-run as --max-eval-instances.
+
+        Returns:
+            A HelmResult wrapping the finished process and the output directory.
+        """
+        runspecs = [
+            r + ",model=" + s.key for s in suts for t in tests for r in t.runspecs()
+        ]
+        huggingface_models = [s.key for s in suts if s.huggingface]
+
+        command = self._helm_command_for_runspecs(
+            runspecs, max_instances, huggingface_models
+        )
+        logging.debug(f"helm run command: {command}")
+
+        output_dir = self._make_output_dir()
+        execute_result = self._execute(command, output_dir)
+        return HelmResult(tests, suts, output_dir, execute_result)
+
+    def _execute(
+        self, command: List[str], output_dir: pathlib.Path
+    ) -> subprocess.CompletedProcess:
+        """Run the command in output_dir, streaming its output when debugging."""
+        if coffee.app_config.debug:
+            return self._run_with_debug_settings(command, output_dir)
+        # The argv list is passed straight through: no shell is involved, so
+        # no argument needs quoting or escaping.
+        return subprocess.run(command, capture_output=True, cwd=output_dir)
+
+    def _run_with_debug_settings(self, command, output_dir):
+        """Run the command, logging each output line at debug level as it arrives.
+
+        stderr is merged into stdout. Everything read from the pipe is kept
+        and returned as the result's stdout, so the output remains available
+        after the pipe has been closed.
+        """
+        captured = []
+        with subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            cwd=output_dir,
+        ) as sp:
+            for line in sp.stdout:
+                captured.append(line)
+                logging.debug(line.decode("utf-8", errors="replace").rstrip())
+        # Leaving the with-block waits for the process, so returncode is set.
+        # stderr went into stdout, hence there is no separate stderr content.
+        return subprocess.CompletedProcess(
+            sp.args, sp.returncode, b"".join(captured), b""
+        )
+
+    def _make_output_dir(self):
+        """Return the "run" directory to execute HELM in, creating it if needed.
+
+        Starts from the current working directory, steps up one level when
+        that is named "src" or "test", and appends "run" unless already there.
+        """
+        o = pathlib.Path.cwd()
+        if o.name in ["src", "test"]:
+            o = o.parent
+        if o.name != "run":
+            o = o / "run"
+        o.mkdir(exist_ok=True)
+        return o
+
+    def _helm_command_for_runspecs(
+        self, bbq_runspecs, max_instances, huggingface_models=None
+    ):
+        """Build the helm-run argv list for the given runspecs.
+
+        Args:
+            bbq_runspecs: runspec strings, appended after -r.
+            max_instances: value for --max-eval-instances.
+            huggingface_models: model keys to list after
+                --enable-huggingface-models; the flag is omitted when this is
+                empty or None.
+        """
+        command = ["helm-run"]
+        # this is a fixed string for now, which is probably wrong
+        # (HelmResult.load_scores reads from runs/v1, so the two must agree)
+        command += ["--suite", "v1"]
+        command += ["-n", "1"]  # working around a bug
+        if huggingface_models:
+            command.append("--enable-huggingface-models")
+            command.extend(huggingface_models)
+        command += ["--max-eval-instances", str(max_instances)]
+
+        command.append("-r")
+        command.extend(bbq_runspecs)
+        return command