diff --git a/.gitignore b/.gitignore
index 00f73fc9..f37c9eb4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .idea/
 run/
+web/
diff --git a/src/coffee/run.py b/src/coffee/run.py
index 83e1e9a9..a4fd3051 100644
--- a/src/coffee/run.py
+++ b/src/coffee/run.py
@@ -1,13 +1,16 @@
 import json
+import math
 import pathlib
 import re
+import shutil
 import subprocess
 from abc import abstractmethod, ABC
 from collections import defaultdict
 from enum import Enum
-from typing import List
+from typing import List, Tuple
 
 import jq
+from jinja2 import Environment, PackageLoader, select_autoescape
 
 
 # This starts with a bunch of objects that represent things already in HELM code.
@@ -15,8 +18,9 @@
 # a run directly out of objects/enums/constants, or at least compose RunSpecs from
 # exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".
 
+
 class HelmSut(Enum):
-    GPT2 = 'huggingface/gpt2'
+    GPT2 = "huggingface/gpt2"
 
 
 class HelmTest(ABC):
@@ -32,9 +36,8 @@ def runspecs(self) -> List[str]:
 
 
 class BbqHelmTest(HelmTest):
-
     def __init__(self):
-        super().__init__('bbq')
+        super().__init__("bbq")
 
     # Copied from HELM because it's not exposed outside the method
     CATEGORIES = [
@@ -75,9 +78,13 @@ def for_sut(self, desired_sut) -> dict:
 
 
 class HelmResult:
-
-    def __init__(self, tests: List[HelmTest], suts: List[HelmSut], output_dir: pathlib.Path,
-                 execution_result: subprocess.CompletedProcess):
+    def __init__(
+        self,
+        tests: List[HelmTest],
+        suts: List[HelmSut],
+        output_dir: pathlib.Path,
+        execution_result: subprocess.CompletedProcess,
+    ):
         super().__init__()
         self.tests = tests
         self.suts = suts
@@ -85,29 +92,33 @@ def __init__(self, tests: List[HelmTest], suts: List[HelmSut], output_dir: pathl
         # TODO: make sure the execution succeeded
 
     def load_scores(self):
-        focus = self.output_dir / 'benchmark_output' / 'runs' / 'v1'
+        focus = self.output_dir / "benchmark_output" / "runs" / "v1"
         result = HelmScores()
         for t in self.tests:
             for s in self.suts:
                 # long term we'll need a lot more data; this is just enough to compute simple scores
                 test_sut_scores = {}
-                for d in focus.glob(f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"):
+                for d in focus.glob(
+                    f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"
+                ):
                     subject_result = {}
-                    with open(d / 'run_spec.json') as f:
+                    with open(d / "run_spec.json") as f:
                         j = json.load(f)
-                        subject = jq.compile('.scenario_spec.args.subject').input_value(j).first()
-                    with open(d / 'stats.json') as f:
+                        subject = (
+                            jq.compile(".scenario_spec.args.subject").input_value(j).first()
+                        )
+                    with open(d / "stats.json") as f:
                         j = json.load(f)
                         for stat in j:
-                            if stat['name']['name'].startswith('bbq_'):
-                                subject_result[stat['name']['name']] = stat['sum']
+                            if stat["name"]["name"].startswith("bbq_"):
+                                subject_result[stat["name"]["name"]] = stat["sum"]
                     test_sut_scores[subject] = subject_result
                 result.add(t, s, test_sut_scores)
         return result
 
     def _filesystem_safe(self, s: str):
         # reproducing some behavior in HELM; would be nice to remove duplication
-        return re.sub('/', '_', s)
+        return re.sub("/", "_", s)
 
 
 class HelmRunner(ABC):
@@ -117,7 +128,6 @@ def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
 
 
 class CliHelmRunner(HelmRunner):
-
     def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
         runspecs = []
         for s in suts:
@@ -132,24 +142,28 @@ def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
         return HelmResult(tests, suts, output_dir, execute_result)
 
     def _execute(self, command, output_dir):
-        return subprocess.run(' '.join(command), shell=True, capture_output=True, cwd=output_dir)
+        return subprocess.run(
+            " ".join(command), shell=True, capture_output=True, cwd=output_dir
+        )
 
     def _make_output_dir(self):
         o = pathlib.Path.cwd()
-        if o.name in ['src', 'test']:
+        if o.name in ["src", "test"]:
             o = o.parent
-        if not o.name == 'run':
-            o = o / 'run'
+        if not o.name == "run":
+            o = o / "run"
         o.mkdir(exist_ok=True)
         return o
 
     def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
-        command = ['helm-run']
-        command.extend(['--suite', 'v1'])  # this is fixed for now, which is probably wrong
-        command.extend(['-n', '1'])  # working around a bug
-        command.extend(['--max-eval-instances', str(max_instances)])
-
-        command.append('-r')
+        command = ["helm-run"]
+        command.extend(
+            ["--suite", "v1"]
+        )  # this is fixed for now, which is probably wrong
+        command.extend(["-n", "1"])  # working around a bug
+        command.extend(["--max-eval-instances", str(max_instances)])
+
+        command.append("-r")
         command.extend(bbq_runspecs)
         return command
 
@@ -166,14 +180,13 @@ def overall_score(self) -> float:
 
 
 class RidiculousBenchmark(Benchmark):
-
     def overall_score(self) -> float:
-        bbq = self.scores['BbqHelmTest']
+        bbq = self.scores["BbqHelmTest"]
         count = 0
         total = 0
         for subject in bbq:
             count += 1
-            total += bbq[subject]['bbq_accuracy']
+            total += bbq[subject]["bbq_accuracy"]
         return total / count * 5
 
 
@@ -181,11 +194,62 @@ def quantize_stars(raw_score):
     return round(2 * raw_score) / 2.0
 
 
-if __name__ == '__main__':
+class StaticSiteGenerator:
+    def __init__(self) -> None:
+        self.env = Environment(
+            loader=PackageLoader("src.coffee"), autoescape=select_autoescape()
+        )
+
+    # todo: Dedupe this, I mostly just stole it from CliHelmRunner.
+    def _make_output_dir(self) -> pathlib.Path:
+        o = pathlib.Path.cwd()
+        if o.name in ["src", "test"]:
+            o = o.parent
+        if not o.name == "web":
+            o = o / "web"
+        if o.exists():
+            shutil.rmtree(o, ignore_errors=True)
+        o.mkdir(exist_ok=True)
+        return o
+
+    def calculate_stars(self, benchmark: Benchmark) -> Tuple[int, bool, int]:
+        d, i = math.modf(benchmark.overall_score())
+        stars = int(i)
+        half_star = d >= 0.5
+        empty_stars = 5 - (stars + int(half_star))
+        return stars, half_star, empty_stars
+
+    def generate(self, benchmarks: list[Benchmark]) -> None:
+        output_dir = self._make_output_dir()
+        template = self.env.get_template("benchmark.html")
+
+        for benchmark in benchmarks:
+            stars, half_star, empty_stars = self.calculate_stars(benchmark)
+            with open(
+                pathlib.Path(output_dir, f"{benchmark.sut.name.lower()}.html"), "w+"
+            ) as f:
+                f.write(
+                    template.render(
+                        stars=stars,
+                        half_star=half_star,
+                        empty_stars=empty_stars,
+                        benchmark=benchmark,
+                    )
+                )
+
+
+if __name__ == "__main__":
     runner = CliHelmRunner()
     suts = [HelmSut.GPT2]
     result = runner.run([BbqHelmTest()], suts, max_instances=100)
     scores = result.load_scores()
+    benchmarks = []
     for sut in suts:
         benchmark = RidiculousBenchmark(sut, scores.for_sut(sut))
-        print(f"{benchmark.sut.name} scored {quantize_stars(benchmark.overall_score())} stars")
+        benchmarks.append(benchmark)
+        print(
+            f"{benchmark.sut.name} scored {quantize_stars(benchmark.overall_score())} stars"
+        )
+
+    static_site_generator = StaticSiteGenerator()
+    static_site_generator.generate(benchmarks)
diff --git a/src/coffee/templates/benchmark.html b/src/coffee/templates/benchmark.html
new file mode 100644
index 00000000..5c9e27b7
--- /dev/null
+++ b/src/coffee/templates/benchmark.html
@@ -0,0 +1,48 @@
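The new src/coffee/templates/benchmark.html (48 lines) is added above without its body shown. As a purely hypothetical sketch, not the actual template from this change, a minimal Jinja2 page that consumes the variables StaticSiteGenerator.generate() passes to template.render() (stars, half_star, empty_stars, and benchmark) might look like:

<!-- Hypothetical illustration only; the real benchmark.html in this change is not reproduced here. -->
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>{{ benchmark.sut.name }} benchmark</title>
  </head>
  <body>
    <h1>{{ benchmark.sut.name }}</h1>
    <p>
      {# full, half, and empty stars as computed by StaticSiteGenerator.calculate_stars #}
      {% for _ in range(stars) %}&#9733;{% endfor %}
      {% if half_star %}&#189;{% endif %}
      {% for _ in range(empty_stars) %}&#9734;{% endfor %}
    </p>
  </body>
</html>

With HelmSut.GPT2 as the only SUT, generate() would render a page like this to web/gpt2.html.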