diff --git a/.gitignore b/.gitignore
index 00f73fc9..f37c9eb4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .idea/
 run/
+web/
diff --git a/src/coffee/run.py b/src/coffee/run.py
index 83e1e9a9..a4fd3051 100644
--- a/src/coffee/run.py
+++ b/src/coffee/run.py
@@ -1,13 +1,16 @@
 import json
+import math
 import pathlib
 import re
+import shutil
 import subprocess
 from abc import abstractmethod, ABC
 from collections import defaultdict
 from enum import Enum
-from typing import List
+from typing import List, Tuple
 
 import jq
+from jinja2 import Environment, PackageLoader, select_autoescape
 
 
 # This starts with a bunch of objects that represent things already in HELM code.
@@ -15,8 +18,9 @@
 # a run directly out of objects/enums/constants, or at least compose RunSpecs from
 # exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".
 
+
 class HelmSut(Enum):
-    GPT2 = 'huggingface/gpt2'
+    GPT2 = "huggingface/gpt2"
 
 
 class HelmTest(ABC):
@@ -32,9 +36,8 @@ def runspecs(self) -> List[str]:
 
 
 class BbqHelmTest(HelmTest):
-
     def __init__(self):
-        super().__init__('bbq')
+        super().__init__("bbq")
 
     # Copied from HELM because it's not exposed outside the method
     CATEGORIES = [
@@ -75,9 +78,13 @@ def for_sut(self, desired_sut) -> dict:
 
 
 class HelmResult:
-
-    def __init__(self, tests: List[HelmTest], suts: List[HelmSut], output_dir: pathlib.Path,
-                 execution_result: subprocess.CompletedProcess):
+    def __init__(
+        self,
+        tests: List[HelmTest],
+        suts: List[HelmSut],
+        output_dir: pathlib.Path,
+        execution_result: subprocess.CompletedProcess,
+    ):
         super().__init__()
         self.tests = tests
         self.suts = suts
@@ -85,29 +92,33 @@ def __init__(self, tests: List[HelmTest], suts: List[HelmSut], output_dir: pathl
         # TODO: make sure the execution succeeded
 
     def load_scores(self):
-        focus = self.output_dir / 'benchmark_output' / 'runs' / 'v1'
+        focus = self.output_dir / "benchmark_output" / "runs" / "v1"
         result = HelmScores()
         for t in self.tests:
            for s in self.suts:
                 # long term we'll need a lot more data; this is just enough to compute simple scores
                 test_sut_scores = {}
-                for d in focus.glob(f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"):
+                for d in focus.glob(
+                    f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"
+                ):
                     subject_result = {}
-                    with open(d / 'run_spec.json') as f:
+                    with open(d / "run_spec.json") as f:
                         j = json.load(f)
-                        subject = jq.compile('.scenario_spec.args.subject').input_value(j).first()
-                    with open(d / 'stats.json') as f:
+                        subject = (
+                            jq.compile(".scenario_spec.args.subject").input_value(j).first()
+                        )
+                    with open(d / "stats.json") as f:
                         j = json.load(f)
                         for stat in j:
-                            if stat['name']['name'].startswith('bbq_'):
-                                subject_result[stat['name']['name']] = stat['sum']
+                            if stat["name"]["name"].startswith("bbq_"):
+                                subject_result[stat["name"]["name"]] = stat["sum"]
                     test_sut_scores[subject] = subject_result
                 result.add(t, s, test_sut_scores)
         return result
 
     def _filesystem_safe(self, s: str):
         # reproducing some behavior in HELM; would be nice to remove duplication
-        return re.sub('/', '_', s)
+        return re.sub("/", "_", s)
 
 
 class HelmRunner(ABC):
@@ -117,7 +128,6 @@ def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
 
 
 class CliHelmRunner(HelmRunner):
-
     def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
         runspecs = []
         for s in suts:
@@ -132,24 +142,28 @@ def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
         return HelmResult(tests, suts, output_dir, execute_result)
 
     def _execute(self, command, output_dir):
-        return subprocess.run(' '.join(command), shell=True, capture_output=True, cwd=output_dir)
+        return subprocess.run(
+            " ".join(command), shell=True, capture_output=True, cwd=output_dir
+        )
 
     def _make_output_dir(self):
         o = pathlib.Path.cwd()
-        if o.name in ['src', 'test']:
+        if o.name in ["src", "test"]:
             o = o.parent
-        if not o.name == 'run':
-            o = o / 'run'
+        if not o.name == "run":
+            o = o / "run"
         o.mkdir(exist_ok=True)
         return o
 
     def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
-        command = ['helm-run']
-        command.extend(['--suite', 'v1'])  # this is fixed for now, which is probably wrong
-        command.extend(['-n', '1'])  # working around a bug
-        command.extend(['--max-eval-instances', str(max_instances)])
-
-        command.append('-r')
+        command = ["helm-run"]
+        command.extend(
+            ["--suite", "v1"]
+        )  # this is fixed for now, which is probably wrong
+        command.extend(["-n", "1"])  # working around a bug
+        command.extend(["--max-eval-instances", str(max_instances)])
+
+        command.append("-r")
         command.extend(bbq_runspecs)
         return command
 
@@ -166,14 +180,13 @@ def overall_score(self) -> float:
 
 
 class RidiculousBenchmark(Benchmark):
-
     def overall_score(self) -> float:
-        bbq = self.scores['BbqHelmTest']
+        bbq = self.scores["BbqHelmTest"]
         count = 0
         total = 0
         for subject in bbq:
             count += 1
-            total += bbq[subject]['bbq_accuracy']
+            total += bbq[subject]["bbq_accuracy"]
         return total / count * 5
 
 
@@ -181,11 +194,62 @@ def quantize_stars(raw_score):
     return round(2 * raw_score) / 2.0
 
 
-if __name__ == '__main__':
+class StaticSiteGenerator:
+    def __init__(self) -> None:
+        self.env = Environment(
+            loader=PackageLoader("src.coffee"), autoescape=select_autoescape()
+        )
+
+    # todo: Dedupe this, I mostly just stole it from CliHelmRunner.
+    def _make_output_dir(self) -> pathlib.Path:
+        o = pathlib.Path.cwd()
+        if o.name in ["src", "test"]:
+            o = o.parent
+        if not o.name == "web":
+            o = o / "web"
+        if o.exists():
+            shutil.rmtree(o, ignore_errors=True)
+        o.mkdir(exist_ok=True)
+        return o
+
+    def calculate_stars(self, benchmark: Benchmark) -> Tuple[int, bool, int]:
+        d, i = math.modf(benchmark.overall_score())
+        stars = int(i)
+        half_star = d >= 0.5
+        empty_stars = 5 - (stars + int(half_star))
+        return stars, half_star, empty_stars
+
+    def generate(self, benchmarks: list[Benchmark]) -> None:
+        output_dir = self._make_output_dir()
+        template = self.env.get_template("benchmark.html")
+
+        for benchmark in benchmarks:
+            stars, half_star, empty_stars = self.calculate_stars(benchmark)
+            with open(
+                pathlib.Path(output_dir, f"{benchmark.sut.name.lower()}.html"), "w+"
+            ) as f:
+                f.write(
+                    template.render(
+                        stars=stars,
+                        half_star=half_star,
+                        empty_stars=empty_stars,
+                        benchmark=benchmark,
+                    )
+                )
+
+
+if __name__ == "__main__":
     runner = CliHelmRunner()
     suts = [HelmSut.GPT2]
     result = runner.run([BbqHelmTest()], suts, max_instances=100)
     scores = result.load_scores()
+    benchmarks = []
     for sut in suts:
         benchmark = RidiculousBenchmark(sut, scores.for_sut(sut))
-        print(f"{benchmark.sut.name} scored {quantize_stars(benchmark.overall_score())} stars")
+        benchmarks.append(benchmark)
+        print(
+            f"{benchmark.sut.name} scored {quantize_stars(benchmark.overall_score())} stars"
+        )
+
+    static_site_generator = StaticSiteGenerator()
+    static_site_generator.generate(benchmarks)
diff --git a/src/coffee/templates/benchmark.html b/src/coffee/templates/benchmark.html
new file mode 100644
index 00000000..5c9e27b7
--- /dev/null
+++ b/src/coffee/templates/benchmark.html
@@ -0,0 +1,48 @@
+    {{ benchmark.sut.name }} Rating
+    {{ benchmark.sut.name }} Bias Rating
+    {% for _ in range(0, stars) %}
+    {% endfor %}
+    {% if half_star %}
+    {% endif %}
+    {% for _ in range(0, empty_stars) %}
+    {% endfor %}
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..54bbaa01
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+
+
+@pytest.fixture()
+def cwd_tmpdir(monkeypatch, tmp_path):
+    monkeypatch.chdir(tmp_path)
+    return tmp_path
diff --git a/tests/test_helm_runner.py b/tests/test_helm_runner.py
index a0ed525f..fc272146 100644
--- a/tests/test_helm_runner.py
+++ b/tests/test_helm_runner.py
@@ -1,21 +1,28 @@
 import pathlib
-SIMPLE_BBQ_DATA = pathlib.Path(__file__).parent / 'data/full_runs/simple_bbq'
+SIMPLE_BBQ_DATA = pathlib.Path(__file__).parent / "data/full_runs/simple_bbq"
 
 from unittest.mock import Mock
 
 import pytest
 
-from coffee.run import CliHelmRunner, BbqHelmTest, HelmSut, HelmResult, RidiculousBenchmark, quantize_stars
+from coffee.run import (
+    CliHelmRunner,
+    BbqHelmTest,
+    HelmSut,
+    HelmResult,
+    RidiculousBenchmark,
+    quantize_stars,
+)
 
 
-def test_cli_helm_runner_command():
+def test_cli_helm_runner_command(cwd_tmpdir):
     runner = CliHelmRunner()
     runner._execute = Mock()
     runner.run([BbqHelmTest()], [HelmSut.GPT2])
     shell_arguments = runner._execute.call_args.args[0]
-    assert 'helm-run' == shell_arguments[0]
-    runspecs = shell_arguments[shell_arguments.index('-r') + 1:]
-    assert 'bbq:subject=Age,model=huggingface/gpt2' == runspecs[0]
+    assert "helm-run" == shell_arguments[0]
+    runspecs = shell_arguments[shell_arguments.index("-r") + 1 :]
+    assert "bbq:subject=Age,model=huggingface/gpt2" == runspecs[0]
     assert len(BbqHelmTest.CATEGORIES) == len(runspecs)
 
 
@@ -24,9 +31,9 @@ def test_read_scores(datafiles):
     hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
     scores = hr.load_scores()
     sut_scores = scores.for_sut(HelmSut.GPT2)
-    assert 'BbqHelmTest' in sut_scores
-    assert 2 == len(sut_scores['BbqHelmTest'])
-    assert 0.7 == sut_scores['BbqHelmTest']['Age']['bbq_accuracy']
+    assert "BbqHelmTest" in sut_scores
+    assert 2 == len(sut_scores["BbqHelmTest"])
+    assert 0.7 == sut_scores["BbqHelmTest"]["Age"]["bbq_accuracy"]
 
 
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
diff --git a/tests/test_static_site_generator.py b/tests/test_static_site_generator.py
new file mode 100644
index 00000000..a5168c5f
--- /dev/null
+++ b/tests/test_static_site_generator.py
@@ -0,0 +1,46 @@
+import pathlib
+
+SIMPLE_BBQ_DATA = pathlib.Path(__file__).parent / "data/full_runs/simple_bbq"
+
+import pytest
+
+from coffee.run import (
+    BbqHelmTest,
+    HelmSut,
+    HelmResult,
+    RidiculousBenchmark,
+    StaticSiteGenerator,
+)
+
+
+@pytest.fixture()
+def benchmark(datafiles):
+    hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
+    scores = hr.load_scores()
+    b = RidiculousBenchmark(HelmSut.GPT2, scores.for_sut(HelmSut.GPT2))
+    return b
+
+
+@pytest.mark.datafiles(SIMPLE_BBQ_DATA)
+def test_creates_files(benchmark, cwd_tmpdir):
+    generator = StaticSiteGenerator()
+    generator.generate([benchmark])
+    assert (cwd_tmpdir / "web" / "gpt2.html").exists()
+
+
+@pytest.mark.datafiles(SIMPLE_BBQ_DATA)
+@pytest.mark.parametrize(
+    "score,expected",
+    [
+        (2.0, (2, False, 3)),
+        (2.49, (2, False, 3)),
+        (2.50, (2, True, 2)),
+        (2.51, (2, True, 2)),
+        (4.0, (4, False, 1)),
+    ],
+)
+def test_displays_correct_stars(benchmark, cwd_tmpdir, monkeypatch, score, expected):
+    monkeypatch.setattr(benchmark, "overall_score", lambda: score)
+    generator = StaticSiteGenerator()
+    foo = generator.calculate_stars(benchmark)
+    assert foo == expected