Skip to content

Commit

Permalink
A rough initial pass at a benchmark. Just enough to make a number com…
Browse files Browse the repository at this point in the history
…e out for one sut, and to sketch an architectural direction.
  • Loading branch information
wpietri committed Dec 7, 2023
1 parent 45b0cab commit d62c852
Show file tree
Hide file tree
Showing 15 changed files with 16,331 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.idea/
run/
3,447 changes: 3,447 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[tool.poetry]
name = "coffee"
version = "0.1.0"
description = ""
authors = ["Your Name <you@example.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.10,<3.11"
pyext = {url = "https://files.pythonhosted.org/packages/b0/be/9b6005ac644aaef022527ce49617263379e49dbdbd433d1d3dd66d71f570/pyext-0.7.tar.gz"}
crfm-helm = "0.3"

[tool.poetry.group.dev.dependencies]
pytest-datafiles = "^3.0.0"

[tool.pytest.ini_options]
addopts = [
"--import-mode=importlib",
]

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
187 changes: 187 additions & 0 deletions src/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
import json
import pathlib
import re
import subprocess
from collections import defaultdict
from enum import Enum
from typing import List

import jq


# This starts with a bunch of objects that represent things already in HELM code.
# As we shift HELM to accommodate a library use case, it would be nice to compose
# a run directly out of objects/enums/constants, or at least compose RunSpecs from
# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".

class HelmSut(Enum):
    """Systems under test (SUTs); each value is the model name passed to HELM."""

    GPT2 = "huggingface/gpt2"


class HelmTest:
    """Base class for a test that is executed through HELM.

    Ideally this would be another enum, but BBQ's structural chaos means
    that, for now, each test needs custom behavior.
    """

    def __init__(self, prefix):
        super().__init__()
        # Leading component of every run spec this test produces.
        self.prefix = prefix

    def runspecs(self) -> List[str]:
        """Return the HELM run specs for this test; subclasses must override."""
        raise NotImplementedError()


class BbqHelmTest(HelmTest):
    """The BBQ test, expanded into one run spec per subject category."""

    # Copied from HELM because it's not exposed outside the method
    CATEGORIES = [
        "Age",
        "Disability_status",
        "Gender_identity",
        "Nationality",
        "Physical_appearance",
        "Race_ethnicity",
        "Race_x_SES",
        "Race_x_gender",
        "Religion",
        "SES",
        "Sexual_orientation",
    ]

    def __init__(self):
        super().__init__('bbq')

    def runspecs(self) -> List[str]:
        """Return one ``<prefix>:subject=<category>`` run spec per category."""
        specs = []
        for category in BbqHelmTest.CATEGORIES:
            specs.append(f"{self.prefix}:subject={category}")
        return specs


class HelmScores:
    """Collects score dicts keyed by (test class name, SUT).

    A kinda hacky container; we won't know the right shape of this for a
    while, so it just uses loosely structured dicts.
    """

    def __init__(self):
        super().__init__()
        # (test class name, sut) -> list of score dicts, in the order added
        self.data = defaultdict(list)

    def add(self, test, sut, test_sut_scores):
        """Record one score dict for the given test instance and SUT."""
        key = (test.__class__.__name__, sut)
        self.data[key].append(test_sut_scores)

    def for_sut(self, desired_sut) -> dict:
        """Return ``{test class name: merged scores}`` for a single SUT.

        Entries recorded for the same test are merged with ``dict.update``
        in insertion order, so later entries win on key collisions.
        """
        merged = defaultdict(dict)
        for (test_name, sut), entries in self.data.items():
            if sut != desired_sut:
                continue
            for entry in entries:
                merged[test_name].update(entry)
        return merged


class HelmResult:
    """The outcome of one HELM invocation: what was run, where, and its scores."""

    def __init__(self, tests: List[HelmTest], suts: List[HelmSut], output_dir: pathlib.Path,
                 execution_result: subprocess.CompletedProcess):
        super().__init__()
        self.tests = tests
        self.suts = suts
        self.output_dir = output_dir
        # Retained so callers can inspect the exit status and captured output.
        self.execution_result = execution_result
        # TODO: make sure the execution succeeded (see success())

    def success(self) -> bool:
        """Return True if an execution result is present and it exited with status 0."""
        return self.execution_result is not None and self.execution_result.returncode == 0

    def load_scores(self):
        """Read HELM's output files and return a HelmScores for every test/SUT pair.

        For each run directory matching the test prefix and SUT model name,
        the subject is read from ``run_spec.json`` and every stat whose name
        starts with ``bbq_`` is read from ``stats.json`` (its ``sum`` field).
        """
        focus = self.output_dir / 'benchmark_output' / 'runs' / 'v1'
        # Compile the jq program once instead of once per run directory.
        subject_query = jq.compile('.scenario_spec.args.subject')
        result = HelmScores()
        for t in self.tests:
            for s in self.suts:
                # long term we'll need a lot more data; this is just enough to compute simple scores
                test_sut_scores = {}
                pattern = f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"
                for d in focus.glob(pattern):
                    # JSON is UTF-8; don't depend on the locale's default encoding.
                    with open(d / 'run_spec.json', encoding='utf-8') as f:
                        run_spec = json.load(f)
                    subject = subject_query.input_value(run_spec).first()
                    with open(d / 'stats.json', encoding='utf-8') as f:
                        stats = json.load(f)
                    # NOTE(review): the 'bbq_' filter is applied to every test,
                    # not just BBQ; confirm before adding other tests.
                    test_sut_scores[subject] = {
                        stat['name']['name']: stat['sum']
                        for stat in stats
                        if stat['name']['name'].startswith('bbq_')
                    }
                result.add(t, s, test_sut_scores)
        return result

    def _filesystem_safe(self, s: str):
        """Replace ``/`` with ``_`` so the string can be used in a directory name."""
        # reproducing some behavior in HELM; would be nice to remove duplication
        return s.replace('/', '_')


class HelmRunner:
    """Interface for objects that can run HELM tests against SUTs."""

    def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
        """Run ``tests`` against ``models``; concrete runners must override this."""
        raise NotImplementedError()


class CliHelmRunner(HelmRunner):
    """Runs HELM by invoking the ``helm-run`` command-line tool in a subprocess."""

    def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
        """Run every test against every SUT in a single ``helm-run`` invocation.

        Returns a HelmResult wrapping the tests, SUTs, output directory and
        the completed process. Raises FileNotFoundError if ``helm-run`` is
        not on the PATH.
        """
        runspecs = []
        for s in suts:
            for t in tests:
                for r in t.runspecs():
                    runspecs.append(f"{r},model={s.value}")

        command = self._helm_command_for_runspecs(runspecs, max_instances)

        output_dir = self._make_output_dir()
        execute_result = self._execute(command, output_dir)
        return HelmResult(tests, suts, output_dir, execute_result)

    def _execute(self, command, output_dir):
        """Run ``command`` (an argv list) with ``output_dir`` as the working directory."""
        # Pass the argv list directly instead of joining it into a shell string:
        # no shell features are needed, and this avoids any quoting or
        # injection problems if a run spec ever contains shell metacharacters.
        return subprocess.run(command, capture_output=True, cwd=output_dir)

    def _make_output_dir(self):
        """Return the ``run`` directory for the project, creating it if needed.

        Starts from the current working directory, steps up out of ``src`` or
        ``test``, and appends ``run`` unless already inside a ``run`` directory.
        """
        o = pathlib.Path.cwd()
        if o.name in ('src', 'test'):
            o = o.parent
        if o.name != 'run':
            o = o / 'run'
        o.mkdir(exist_ok=True)
        return o

    def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
        """Build the ``helm-run`` argv list for the given run specs."""
        return [
            'helm-run',
            '--suite', 'v1',  # this is fixed for now, which is probably wrong
            '-n', '1',  # working around a bug
            '--max-eval-instances', str(max_instances),
            '-r', *bbq_runspecs,
        ]


class Benchmark:
    """Base class for a benchmark that turns one SUT's scores into a single number."""

    def __init__(self, sut, scores):
        super().__init__()
        self.sut, self.scores = sut, scores

    def overall_score(self) -> float:
        """Return the overall score for the SUT; subclasses must override."""
        raise NotImplementedError()


class RidiculousBenchmark(Benchmark):
    """Placeholder benchmark: mean ``bbq_accuracy`` across BBQ subjects, times 5."""

    def overall_score(self) -> float:
        """Return the mean BBQ accuracy over all subjects, multiplied by 5.

        Raises:
            KeyError: if a subject has no ``bbq_accuracy`` entry (or, for a
                plain-dict ``scores``, if ``'BbqHelmTest'`` is absent).
            ZeroDivisionError: if there are no BBQ subject scores to average,
                e.g. because the HELM run produced no output.
        """
        bbq = self.scores['BbqHelmTest']
        if not bbq:
            # Same exception type as the bare division would raise, but with
            # a message that points at the actual problem.
            raise ZeroDivisionError("no BBQ subject scores available to average")
        accuracies = [subject_scores['bbq_accuracy'] for subject_scores in bbq.values()]
        return sum(accuracies) / len(accuracies) * 5


def quantize_stars(raw_score):
    """Round a raw star score to a multiple of 0.5 (ties follow ``round()``)."""
    half_stars = round(2 * raw_score)
    return half_stars / 2.0


if __name__ == '__main__':
    # Run BBQ against each SUT through the HELM CLI and print a star rating per SUT.
    suts = [HelmSut.GPT2]
    runner = CliHelmRunner()
    result = runner.run([BbqHelmTest()], suts, max_instances=100)
    scores = result.load_scores()
    for sut in suts:
        benchmark = RidiculousBenchmark(sut, scores.for_sut(sut))
        stars = quantize_stars(benchmark.overall_score())
        print(f"{benchmark.sut.name} scored {stars} stars")
Loading

0 comments on commit d62c852

Please sign in to comment.