Ridiculous benchmark #15

Merged: 5 commits, Dec 7, 2023
Changes from all commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.idea/
run/
3,540 changes: 3,540 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions pyproject.toml
@@ -0,0 +1,29 @@
[tool.poetry]
name = "coffee"
version = "0.1.0"
description = ""
authors = ["Your Name <[email protected]>"]
readme = "README.md"
packages = [
    { include = "src" }
]


[tool.poetry.dependencies]
python = ">=3.10,<3.11"
pyext = {url = "https://files.pythonhosted.org/packages/b0/be/9b6005ac644aaef022527ce49617263379e49dbdbd433d1d3dd66d71f570/pyext-0.7.tar.gz"}
crfm-helm = "0.3"
jq = "^1.6.0"

[tool.poetry.group.dev.dependencies]
pytest-datafiles = "^3.0.0"
pytest = "^7.4.3"

[tool.pytest.ini_options]
addopts = [
"--import-mode=importlib",
]

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
191 changes: 191 additions & 0 deletions src/run.py
@@ -0,0 +1,191 @@
import json
import pathlib
import re
import subprocess
from abc import abstractmethod, ABC
from collections import defaultdict
from enum import Enum
from typing import List

import jq


# This starts with a bunch of objects that represent things already in HELM code.
# As we shift HELM to accommodate a library use case, it would be nice to compose
# a run directly out of objects/enums/constants, or at least compose RunSpecs from
# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".
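# (Added illustration, not part of this PR: RunSpec and BbqScenario below are
# hypothetical stand-ins for pieces HELM might expose directly.) The hoped-for
# composition might look roughly like:
#
#     spec = RunSpec(scenario=BbqScenario(subject="Age"), model=HelmSut.GPT2.value)
#     runner.run([spec])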

class HelmSut(Enum):
    GPT2 = 'huggingface/gpt2'


class HelmTest(ABC):
    # I would like this to be another enum, but BBQ's structural chaos means
    # for now we need custom behavior
    def __init__(self, prefix):
        super().__init__()
        self.prefix = prefix

    @abstractmethod
    def runspecs(self) -> List[str]:
        pass


class BbqHelmTest(HelmTest):

    def __init__(self):
        super().__init__('bbq')

    # Copied from HELM because it's not exposed outside the method
    CATEGORIES = [
        "Age",
        "Disability_status",
        "Gender_identity",
        "Nationality",
        "Physical_appearance",
        "Race_ethnicity",
        "Race_x_SES",
        "Race_x_gender",
        "Religion",
        "SES",
        "Sexual_orientation",
    ]

    def runspecs(self) -> List[str]:
        return [f"{self.prefix}:subject={c}" for c in BbqHelmTest.CATEGORIES]
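    # Added comment for illustration (not in the original diff): runspecs() yields one
    # entry per category above, e.g. "bbq:subject=Age", "bbq:subject=Disability_status", ...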


class HelmScores:
    # a kinda hacky container; we won't know the right shape of this for a while, so just use wild dicts
    def __init__(self):
        super().__init__()
        self.data = defaultdict(list)

    def add(self, test, sut, test_sut_scores):
        self.data[(test.__class__.__name__, sut)].append(test_sut_scores)

    def for_sut(self, desired_sut) -> dict:
        result = defaultdict(dict)
        for test, sut in self.data:
            if sut == desired_sut:
                for entry in self.data[(test, sut)]:
                    result[test].update(entry)

        return result


class HelmResult:

    def __init__(self, tests: List[HelmTest], suts: List[HelmSut], output_dir: pathlib.Path,
                 execution_result: subprocess.CompletedProcess):
        super().__init__()
        self.tests = tests
        self.suts = suts
        self.output_dir = output_dir
        # TODO: make sure the execution succeeded

    def load_scores(self):
        focus = self.output_dir / 'benchmark_output' / 'runs' / 'v1'
        result = HelmScores()
        for t in self.tests:
            for s in self.suts:
                # long term we'll need a lot more data; this is just enough to compute simple scores
                test_sut_scores = {}
                for d in focus.glob(f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"):
                    subject_result = {}
                    with open(d / 'run_spec.json') as f:
                        j = json.load(f)
                    subject = jq.compile('.scenario_spec.args.subject').input_value(j).first()
                    with open(d / 'stats.json') as f:
                        j = json.load(f)
                    for stat in j:
                        if stat['name']['name'].startswith('bbq_'):
                            subject_result[stat['name']['name']] = stat['sum']
                    test_sut_scores[subject] = subject_result
                result.add(t, s, test_sut_scores)
        return result

    def _filesystem_safe(self, s: str):
        # reproducing some behavior in HELM; would be nice to remove duplication
        return re.sub('/', '_', s)


class HelmRunner(ABC):
    @abstractmethod
    def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
        pass


class CliHelmRunner(HelmRunner):

    def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
        runspecs = []
        for s in suts:
            for t in tests:
                for r in t.runspecs():
                    runspecs.append(r + ",model=" + s.value)

        command = self._helm_command_for_runspecs(runspecs, max_instances)

        output_dir = self._make_output_dir()
        execute_result = self._execute(command, output_dir)
        return HelmResult(tests, suts, output_dir, execute_result)

    def _execute(self, command, output_dir):
        return subprocess.run(' '.join(command), shell=True, capture_output=True, cwd=output_dir)

    def _make_output_dir(self):
        o = pathlib.Path.cwd()
        if o.name in ['src', 'test']:
[Inline review comments on the line above]

Contributor: This feels very magic and potentially error prone. Can we pass output_dir into run?

Contributor (author): The magic needs to happen somewhere, as some ways of invoking this (e.g., from my IDE) are inclined to run it in the src or test directories, which then puts a bunch of output where it shouldn't be.

We could move the magic out and pass the directory in, but the theory here is that the CliHelmRunner is just going to do some stuff and give you a run result from which you can get scores, so requiring a working directory to be passed in violates encapsulation and increases the burden on the object's user, something we should generally seek to reduce.

Is there some plausible near-term use case you have in mind where this would cause a problem?

Contributor: An advantage of passing it in is that any pytests we write can use a temp directory, ensuring multiple runs don't conflict and that data generated during the test gets cleaned up.

> Is there some plausible near-term use case you have in mind where this would cause a problem?

If there is a subdirectory of src and you run this from there, you are back to getting output in unexpected places. If we move src into a coffee directory, this logic also breaks.

> requiring a working directory be passed in violates encapsulation

HelmResult has to know about output_dir, so IMO it isn't encapsulated. I think there is also a reason for a user to want to be able to find all these results after the fact, so asking them where to put it feels reasonable.

            o = o.parent
        if not o.name == 'run':
            o = o / 'run'
        o.mkdir(exist_ok=True)
        return o
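    # Added note (not in the original diff): the net effect is that runs started from the
    # repo root, from src/, or from test/ all write under <repo>/run/, creating that
    # directory if needed.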

    def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
        command = ['helm-run']
        command.extend(['--suite', 'v1'])  # this is fixed for now, which is probably wrong
        command.extend(['-n', '1'])  # working around a bug
        command.extend(['--max-eval-instances', str(max_instances)])

        command.append('-r')
        command.extend(bbq_runspecs)
        return command
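    # Illustration (added comment, not part of the original diff): with max_instances=10
    # and a single BBQ runspec for GPT2, _execute() joins this into the shell command
    #   helm-run --suite v1 -n 1 --max-eval-instances 10 -r bbq:subject=Age,model=huggingface/gpt2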


class Benchmark(ABC):
    def __init__(self, sut, scores):
        super().__init__()
        self.sut = sut
        self.scores = scores

    @abstractmethod
    def overall_score(self) -> float:
        pass


class RidiculousBenchmark(Benchmark):

    def overall_score(self) -> float:
        bbq = self.scores['BbqHelmTest']
        count = 0
        total = 0
        for subject in bbq:
            count += 1
            total += bbq[subject]['bbq_accuracy']
        return total / count * 5


def quantize_stars(raw_score):
    return round(2 * raw_score) / 2.0
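# Worked example (added comment, not in the original diff): if the per-subject
# bbq_accuracy values average 0.62, overall_score() returns 0.62 * 5 = 3.1, and
# quantize_stars(3.1) = round(6.2) / 2.0 = 3.0, i.e. three stars.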


if __name__ == '__main__':
    runner = CliHelmRunner()
    suts = [HelmSut.GPT2]
    result = runner.run([BbqHelmTest()], suts, max_instances=100)
    scores = result.load_scores()
    for sut in suts:
        benchmark = RidiculousBenchmark(sut, scores.for_sut(sut))
        print(f"{benchmark.sut.name} scored {quantize_stars(benchmark.overall_score())} stars")
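
Following up on the inline review thread about where output goes: below is a minimal sketch of what the reviewer's suggestion might look like if run() accepted an optional output_dir (a hypothetical parameter, not part of this change), using pytest's built-in tmp_path fixture so each test writes into its own temporary directory.

import pathlib

# Import path assumes the packages = [{ include = "src" }] layout from pyproject.toml above.
from src.run import BbqHelmTest, CliHelmRunner, HelmSut


def test_run_writes_into_given_dir(tmp_path: pathlib.Path):
    runner = CliHelmRunner()
    # output_dir is an assumed keyword argument; in this PR the runner derives the directory itself.
    result = runner.run([BbqHelmTest()], [HelmSut.GPT2], max_instances=1, output_dir=tmp_path)
    assert result.output_dir == tmp_path  # nothing lands under src/ or test/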