Skip to content

Commit

Permalink
Add two SUTs (#28)
Browse files Browse the repository at this point in the history
* Interim commit. Run is correct, output needs work.

* Interim commit. Run is correct, output needs work.

* Benchmark knows its name. Also, move helm classes to their own package.

* Pull more paths out of the templates.
Make Benchmark know about its tests.
Sort out 3 working SUTs.

* Pull more paths out of the templates.
Make Benchmark know about its tests.
Sort out 3 working SUTs.

* Updating poetry.lock
  • Loading branch information
wpietri authored Dec 28, 2023
1 parent 8d28f7d commit 8335e66
Show file tree
Hide file tree
Showing 22 changed files with 375 additions and 291 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
.idea/
run/
run/
__pycache__/
web/
124 changes: 30 additions & 94 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ packages = [
[tool.poetry.dependencies]
python = ">=3.10,<3.11"
pyext = {url = "https://files.pythonhosted.org/packages/b0/be/9b6005ac644aaef022527ce49617263379e49dbdbd433d1d3dd66d71f570/pyext-0.7.tar.gz"}
crfm-helm = "0.3"
crfm-helm = "^0.4.0"
jq = "^1.6.0"
click = "^8.1.7"
casefy = "^0.1.7"

[tool.poetry.group.dev.dependencies]
pytest-datafiles = "^3.0.0"
Expand Down
6 changes: 6 additions & 0 deletions src/coffee/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class CoffeeConfig:
    """Process-wide configuration for the coffee package."""

    def __init__(self) -> None:
        # Debug mode stays off unless a caller flips it on explicitly.
        self.debug: bool = False


# The single shared configuration instance read throughout the package.
app_config = CoffeeConfig()
21 changes: 21 additions & 0 deletions src/coffee/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from abc import ABC, abstractmethod
from typing import List

import casefy

from coffee.helm import BbqHelmTest, HelmTest


class Benchmark(ABC):
Expand All @@ -11,6 +16,18 @@ def __init__(self, sut, scores):
def overall_score(self) -> float:
pass

    @abstractmethod
    def tests(self) -> List[HelmTest]:
        """Return the HELM tests that make up this benchmark."""
        pass

    @classmethod
    def name(cls):
        # Display name derived from the class name via casefy.titlecase.
        return casefy.titlecase(cls.__name__)

    @classmethod
    def path_name(cls):
        # Filesystem-friendly identifier: the class name in snake_case.
        return casefy.snakecase(cls.__name__)


class RidiculousBenchmark(Benchmark):
def overall_score(self) -> float:
Expand All @@ -21,3 +38,7 @@ def overall_score(self) -> float:
count += 1
total += bbq[subject]["bbq_accuracy"]
return total / count * 5

    @classmethod
    def tests(cls) -> List[HelmTest]:
        """This benchmark is scored from the BBQ test alone."""
        # NOTE(review): the Benchmark base declares tests() as an instance
        # method, but this override is a classmethod — confirm which is intended.
        return [BbqHelmTest()]
225 changes: 225 additions & 0 deletions src/coffee/helm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
import dataclasses
import json
import logging
import pathlib
import re
import subprocess
from abc import ABC, abstractmethod
from collections import defaultdict
from enum import Enum
from typing import List

import jq

import coffee

# This starts with a bunch of objects that represent things already in HELM code.
# As we shift HELM to accommodate a library use case, it would be nice to compose
# a run directly out of objects/enums/constants, or at least compose RunSpecs from
# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".


@dataclasses.dataclass
class SutDescription:
    """Value carried by each HelmSut member: the HELM model key, plus whether
    the model must be enabled via --enable-huggingface-models."""

    key: str
    # Excluded from repr; defaults to False for natively-supported models.
    huggingface: bool = dataclasses.field(repr=False, default=False)

    def __hash__(self):
        # @dataclass with eq=True would otherwise set __hash__ to None, making
        # instances unhashable. Deferring to super() keeps Enum members (which
        # mix this class in) hashable via the next class in the MRO.
        return super().__hash__()


class HelmSut(SutDescription, Enum):
    """The systems under test we can run; values feed SutDescription's fields
    as (key,) or (key, huggingface)."""

    GPT2 = "openai/gpt2"
    PYTHIA_70M = "EleutherAI/pythia-70m", True
    FB_OPT_125M = "facebook/opt-125m", True


class HelmTest(ABC):
    """Base class for tests we hand to HELM, identified by a runspec prefix.

    I would like this to be another enum, but BBQ's structural chaos means
    for now we need custom behavior.
    """

    def __init__(self, prefix):
        super().__init__()
        # The leading portion of every runspec this test produces, e.g. "bbq".
        self.prefix = prefix

    @abstractmethod
    def runspecs(self) -> List[str]:
        """Return the complete HELM runspec strings for this test."""
        pass

    def __str__(self):
        return f"{type(self).__name__}:{self.prefix}"


class BbqHelmTest(HelmTest):
    """The BBQ bias test, expanded into one runspec per demographic category."""

    # Copied from HELM because it's not exposed outside the method
    CATEGORIES = [
        "Age",
        "Disability_status",
        "Gender_identity",
        "Nationality",
        "Physical_appearance",
        "Race_ethnicity",
        "Race_x_SES",
        "Race_x_gender",
        "Religion",
        "SES",
        # "Sexual_orientation", TODO: temporarily disabled while Yifan looks into a transformer-related bug
    ]

    def __init__(self):
        super().__init__("bbq")

    def runspecs(self) -> List[str]:
        return [f"{self.prefix}:subject={category}" for category in self.CATEGORIES]


class HelmScores:
    """A kinda hacky score container; we won't know the right shape of this for
    a while, so it's just nested dicts keyed by (test class name, sut)."""

    def __init__(self):
        super().__init__()
        # Maps (test class name, sut) -> list of per-run score dicts.
        self.data = defaultdict(list)

    def add(self, test, sut, test_sut_scores):
        key = (type(test).__name__, sut)
        self.data[key].append(test_sut_scores)

    def for_sut(self, desired_sut) -> dict:
        """Merge all recorded score dicts for one SUT, keyed by test name."""
        collected: defaultdict = defaultdict(dict)
        for (test_name, sut), score_dicts in self.data.items():
            if sut != desired_sut:
                continue
            for scores in score_dicts:
                collected[test_name].update(scores)
        return collected


class HelmResult:
    """What came out of one HELM run: the tests and SUTs it covered, the
    directory HELM wrote into, and the finished subprocess for inspection."""

    def __init__(
        self,
        tests: List[HelmTest],
        suts: List[HelmSut],
        output_dir: pathlib.Path,
        execution_result: subprocess.CompletedProcess,
    ):
        super().__init__()
        self.tests = tests
        self.suts = suts
        self.output_dir = output_dir
        self.execution_result = execution_result

    def load_scores(self):
        """Scrape HELM's on-disk run output into a HelmScores container.

        For every (test, sut) pair this globs the run directories whose names
        match the test prefix and the SUT's model key, reads the run's subject
        from run_spec.json, and collects every stat in stats.json whose name
        starts with "bbq_". NOTE(review): the "bbq_" filter means only
        BBQ-shaped tests yield scores here, even though all tests are iterated.
        """
        # "v1" matches the fixed --suite value used when launching helm-run.
        focus = self.output_dir / "benchmark_output" / "runs" / "v1"
        result = HelmScores()
        for t in self.tests:
            for s in self.suts:
                # long term we'll need a lot more data; this is just enough to compute simple scores
                test_sut_scores = {}
                glob_path = f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.key)}*"
                logging.debug(f"looking for scores for {t} {s} in {focus}/{glob_path}")
                for d in focus.glob(glob_path):
                    subject_result = {}
                    with open(d / "run_spec.json") as f:
                        j = json.load(f)
                        # e.g. "Age" for the runspec "bbq:subject=Age"
                        subject = (
                            jq.compile(".scenario_spec.args.subject").input_value(j).first()
                        )
                    with open(d / "stats.json") as f:
                        j = json.load(f)
                        for stat in j:
                            if stat["name"]["name"].startswith("bbq_"):
                                # "sum" is the aggregate value HELM reports per stat
                                subject_result[stat["name"]["name"]] = stat["sum"]
                    test_sut_scores[subject] = subject_result
                result.add(t, s, test_sut_scores)
        return result

    def helm_stdout(self) -> str:
        """HELM's captured standard output, normalized to str."""
        return self._deal_with_bytes(self.execution_result.stdout)

    def helm_stderr(self) -> str:
        """HELM's captured standard error, normalized to str."""
        return self._deal_with_bytes(self.execution_result.stderr)

    def _deal_with_bytes(self, o):
        # Captured subprocess output is normally bytes, but may be something
        # else (e.g. None); normalize to str either way.
        if isinstance(o, bytes):
            result = o.decode("utf-8")
        else:
            result = str(o)
        return result

    def _filesystem_safe(self, s: str):
        # reproducing some behavior in HELM; would be nice to remove duplication
        return re.sub("/", "_", s)

    def success(self):
        # Only a run that actually executed and exited 0 counts as a success.
        return self.execution_result and self.execution_result.returncode == 0


class HelmRunner(ABC):
    """Interface for executing a set of HELM tests against a set of SUTs."""

    @abstractmethod
    def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
        # Parameter renamed from "models" to "suts" to match the concrete
        # CliHelmRunner.run signature, so keyword callers work against either.
        pass


class CliHelmRunner(HelmRunner):
    """Runs HELM tests by shelling out to the helm-run CLI."""

    def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
        """Run every runspec of every test against every SUT.

        Returns a HelmResult wrapping the output directory and the completed
        helm-run process.
        """
        runspecs = []
        for s in suts:
            for t in tests:
                for r in t.runspecs():
                    runspecs.append(r + ",model=" + s.key)
        # SUTs hosted on Hugging Face must be enabled explicitly on the CLI.
        huggingface_models = [s.key for s in suts if s.huggingface]

        command = self._helm_command_for_runspecs(
            runspecs, max_instances, huggingface_models
        )
        logging.debug(f"helm run command: {command}")

        output_dir = self._make_output_dir()
        execute_result = self._execute(command, output_dir)
        return HelmResult(tests, suts, output_dir, execute_result)

    def _execute(
        self, command: List[str], output_dir: pathlib.Path
    ) -> subprocess.CompletedProcess:
        if coffee.app_config.debug:
            return self._run_with_debug_settings(command, output_dir)
        # Pass the argv list directly (shell=False) instead of a shell-joined
        # string: the arguments reach helm-run verbatim with no shell quoting
        # or injection concerns.
        return subprocess.run(command, capture_output=True, cwd=output_dir)

    def _run_with_debug_settings(self, command, output_dir):
        # Stream helm-run's combined stdout/stderr to the debug log as it is
        # produced, and also keep the decoded text so HelmResult.helm_stdout()
        # has real output in debug mode (previously the CompletedProcess was
        # handed the already-exhausted pipe object).
        captured = []
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=output_dir,
        ) as sp:
            for line in sp.stdout:
                text = line.decode().rstrip()
                captured.append(text)
                logging.debug(text)
        # Popen's context manager has waited for the process, so returncode is set.
        return subprocess.CompletedProcess(
            sp.args, sp.returncode, "\n".join(captured), ""
        )

    def _make_output_dir(self):
        """Create (if needed) and return the run/ directory at the project root."""
        o = pathlib.Path.cwd()
        # If launched from src/ or test/, step up to the project root first.
        if o.name in ["src", "test"]:
            o = o.parent
        if not o.name == "run":
            o = o / "run"
        o.mkdir(exist_ok=True)
        return o

    def _helm_command_for_runspecs(
        self, bbq_runspecs, max_instances, huggingface_models=None
    ):
        """Build the helm-run argv list for the given runspecs."""
        command = ["helm-run"]
        command.extend(
            ["--suite", "v1"]
        )  # this is a fixed string for now, which is probably wrong
        command.extend(["-n", "1"])  # working around a bug
        if huggingface_models:
            command.append("--enable-huggingface-models")
            for m in huggingface_models:
                command.append(m)
        command.extend(["--max-eval-instances", str(max_instances)])

        command.append("-r")
        command.extend(bbq_runspecs)
        return command
Loading

0 comments on commit 8335e66

Please sign in to comment.