add primitive static site generator
dhosterman committed Dec 8, 2023
1 parent 64d7390 commit 8444d68
Showing 6 changed files with 213 additions and 40 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
.idea/
run/
web/
126 changes: 95 additions & 31 deletions src/coffee/run.py
@@ -1,22 +1,26 @@
import json
import math
import pathlib
import re
import shutil
import subprocess
from abc import abstractmethod, ABC
from collections import defaultdict
from enum import Enum
from typing import List
from typing import List, Tuple

import jq
from jinja2 import Environment, PackageLoader, select_autoescape


# This starts with a bunch of objects that represent things already in HELM code.
# As we shift HELM to accommodate a library use case, it would be nice to compose
# a run directly out of objects/enums/constants, or at least compose RunSpecs from
# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".


class HelmSut(Enum):
GPT2 = 'huggingface/gpt2'
GPT2 = "huggingface/gpt2"


class HelmTest(ABC):
@@ -32,9 +36,8 @@ def runspecs(self) -> List[str]:


class BbqHelmTest(HelmTest):

def __init__(self):
super().__init__('bbq')
super().__init__("bbq")

# Copied from HELM because it's not exposed outside the method
CATEGORIES = [
@@ -75,39 +78,47 @@ def for_sut(self, desired_sut) -> dict:


class HelmResult:

def __init__(self, tests: List[HelmTest], suts: List[HelmSut], output_dir: pathlib.Path,
execution_result: subprocess.CompletedProcess):
def __init__(
self,
tests: List[HelmTest],
suts: List[HelmSut],
output_dir: pathlib.Path,
execution_result: subprocess.CompletedProcess,
):
super().__init__()
self.tests = tests
self.suts = suts
self.output_dir = output_dir
# TODO: make sure the execution succeeded

def load_scores(self):
focus = self.output_dir / 'benchmark_output' / 'runs' / 'v1'
focus = self.output_dir / "benchmark_output" / "runs" / "v1"
result = HelmScores()
for t in self.tests:
for s in self.suts:
# long term we'll need a lot more data; this is just enough to compute simple scores
test_sut_scores = {}
for d in focus.glob(f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"):
for d in focus.glob(
f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"
):
subject_result = {}
with open(d / 'run_spec.json') as f:
with open(d / "run_spec.json") as f:
j = json.load(f)
subject = jq.compile('.scenario_spec.args.subject').input_value(j).first()
with open(d / 'stats.json') as f:
subject = (
jq.compile(".scenario_spec.args.subject").input_value(j).first()
)
with open(d / "stats.json") as f:
j = json.load(f)
for stat in j:
if stat['name']['name'].startswith('bbq_'):
subject_result[stat['name']['name']] = stat['sum']
if stat["name"]["name"].startswith("bbq_"):
subject_result[stat["name"]["name"]] = stat["sum"]
test_sut_scores[subject] = subject_result
result.add(t, s, test_sut_scores)
return result

def _filesystem_safe(self, s: str):
# reproducing some behavior in HELM; would be nice to remove duplication
return re.sub('/', '_', s)
return re.sub("/", "_", s)


class HelmRunner(ABC):
@@ -117,7 +128,6 @@ def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):


class CliHelmRunner(HelmRunner):

def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
runspecs = []
for s in suts:
@@ -132,24 +142,28 @@ def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
return HelmResult(tests, suts, output_dir, execute_result)

def _execute(self, command, output_dir):
return subprocess.run(' '.join(command), shell=True, capture_output=True, cwd=output_dir)
return subprocess.run(
" ".join(command), shell=True, capture_output=True, cwd=output_dir
)

def _make_output_dir(self):
o = pathlib.Path.cwd()
if o.name in ['src', 'test']:
if o.name in ["src", "test"]:
o = o.parent
if not o.name == 'run':
o = o / 'run'
if not o.name == "run":
o = o / "run"
o.mkdir(exist_ok=True)
return o

def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
command = ['helm-run']
command.extend(['--suite', 'v1']) # this is fixed for now, which is probably wrong
command.extend(['-n', '1']) # working around a bug
command.extend(['--max-eval-instances', str(max_instances)])

command.append('-r')
command = ["helm-run"]
command.extend(
["--suite", "v1"]
) # this is fixed for now, which is probably wrong
command.extend(["-n", "1"]) # working around a bug
command.extend(["--max-eval-instances", str(max_instances)])

command.append("-r")
command.extend(bbq_runspecs)
return command

@@ -166,26 +180,76 @@ def overall_score(self) -> float:


class RidiculousBenchmark(Benchmark):

def overall_score(self) -> float:
bbq = self.scores['BbqHelmTest']
bbq = self.scores["BbqHelmTest"]
count = 0
total = 0
for subject in bbq:
count += 1
total += bbq[subject]['bbq_accuracy']
total += bbq[subject]["bbq_accuracy"]
return total / count * 5


def quantize_stars(raw_score):
return round(2 * raw_score) / 2.0


if __name__ == '__main__':
class StaticSiteGenerator:
def __init__(self) -> None:
self.env = Environment(
loader=PackageLoader("src.coffee"), autoescape=select_autoescape()
)

# TODO: dedupe this; it's mostly copied from CliHelmRunner.
def _make_output_dir(self) -> pathlib.Path:
o = pathlib.Path.cwd()
if o.name in ["src", "test"]:
o = o.parent
if not o.name == "web":
o = o / "web"
if o.exists():
shutil.rmtree(o, ignore_errors=True)
o.mkdir(exist_ok=True)
return o

def calculate_stars(self, benchmark: Benchmark) -> Tuple[int, bool, int]:
d, i = math.modf(benchmark.overall_score())
stars = int(i)
half_star = d >= 0.5
empty_stars = 5 - (stars + int(half_star))
return stars, half_star, empty_stars

def generate(self, benchmarks: list[Benchmark]) -> None:
output_dir = self._make_output_dir()
template = self.env.get_template("benchmark.html")

for benchmark in benchmarks:
stars, half_star, empty_stars = self.calculate_stars(benchmark)
with open(
pathlib.Path(output_dir, f"{benchmark.sut.name.lower()}.html"), "w+"
) as f:
f.write(
template.render(
stars=stars,
half_star=half_star,
empty_stars=empty_stars,
benchmark=benchmark,
)
)


if __name__ == "__main__":
runner = CliHelmRunner()
suts = [HelmSut.GPT2]
result = runner.run([BbqHelmTest()], suts, max_instances=100)
scores = result.load_scores()
benchmarks = []
for sut in suts:
benchmark = RidiculousBenchmark(sut, scores.for_sut(sut))
print(f"{benchmark.sut.name} scored {quantize_stars(benchmark.overall_score())} stars")
benchmarks.append(benchmark)
print(
f"{benchmark.sut.name} scored {quantize_stars(benchmark.overall_score())} stars"
)

static_site_generator = StaticSiteGenerator()
static_site_generator.generate(benchmarks)
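
The star arithmetic above is easy to misread: calculate_stars uses math.modf to split the raw 0-5 score into a fractional and a whole part, draws that many full stars, adds a half star when the fraction reaches 0.5, and pads with empty stars to five; quantize_stars, used for the console output, instead rounds to the nearest half star. A minimal standalone sketch of the same arithmetic (the stars_breakdown name is illustrative, not part of this commit):

import math

def stars_breakdown(raw_score: float) -> tuple[int, bool, int]:
    # math.modf returns (fractional, whole), e.g. math.modf(2.51) == (0.51, 2.0)
    fractional, whole = math.modf(raw_score)
    full = int(whole)                 # whole stars to draw
    half = fractional >= 0.5          # add a half star once the fraction reaches .5
    empty = 5 - (full + int(half))    # pad out to five icons total
    return full, half, empty

# 2.49 -> (2, False, 3); 2.51 -> (2, True, 2); 4.0 -> (4, False, 1)
print(stars_breakdown(2.51))
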
48 changes: 48 additions & 0 deletions src/coffee/templates/benchmark.html
@@ -0,0 +1,48 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{{ benchmark.sut.name }} Rating</title>
<link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/font/bootstrap-icons.min.css">
</head>
<body>

<div class="px-4 py-5 my-5 text-center">
<h1 class="display-5 fw-bold text-body-emphasis">{{ benchmark.sut.name }} Bias Rating</h1>
<div class="col-lg-6 mx-auto">
<div class="d-grid gap-2 d-sm-flex justify-content-sm-center">

{% for _ in range(0, stars) %}
<svg xmlns="http://www.w3.org/2000/svg" width="85" height="85" fill="gold" class="bi bi-star-fill"
viewBox="0 0 16 16">
<path d="M3.612 15.443c-.386.198-.824-.149-.746-.592l.83-4.73L.173 6.765c-.329-.314-.158-.888.283-.95l4.898-.696L7.538.792c.197-.39.73-.39.927 0l2.184 4.327 4.898.696c.441.062.612.636.282.95l-3.522 3.356.83 4.73c.078.443-.36.79-.746.592L8 13.187l-4.389 2.256z"/>
</svg>
{% endfor %}

{% if half_star %}
<svg xmlns="http://www.w3.org/2000/svg" width="85" height="85" fill="gold" class="bi bi-star-half"
viewBox="0 0 16 16">
<path d="M5.354 5.119 7.538.792A.516.516 0 0 1 8 .5c.183 0 .366.097.465.292l2.184 4.327 4.898.696A.537.537 0 0 1 16 6.32a.548.548 0 0 1-.17.445l-3.523 3.356.83 4.73c.078.443-.36.79-.746.592L8 13.187l-4.389 2.256a.52.52 0 0 1-.146.05c-.342.06-.668-.254-.6-.642l.83-4.73L.173 6.765a.55.55 0 0 1-.172-.403.58.58 0 0 1 .085-.302.513.513 0 0 1 .37-.245l4.898-.696zM8 12.027a.5.5 0 0 1 .232.056l3.686 1.894-.694-3.957a.565.565 0 0 1 .162-.505l2.907-2.77-4.052-.576a.525.525 0 0 1-.393-.288L8.001 2.223 8 2.226v9.8z"/>
</svg>
{% endif %}

{% for _ in range(0, empty_stars) %}
<svg xmlns="http://www.w3.org/2000/svg" width="85" height="85" fill="gold" class="bi bi-star"
viewBox="0 0 16 16">
<path d="M2.866 14.85c-.078.444.36.791.746.593l4.39-2.256 4.389 2.256c.386.198.824-.149.746-.592l-.83-4.73 3.522-3.356c.33-.314.16-.888-.282-.95l-4.898-.696L8.465.792a.513.513 0 0 0-.927 0L5.354 5.12l-4.898.696c-.441.062-.612.636-.283.95l3.523 3.356-.83 4.73zm4.905-2.767-3.686 1.894.694-3.957a.565.565 0 0 0-.163-.505L1.71 6.745l4.052-.576a.525.525 0 0 0 .393-.288L8 2.223l1.847 3.658a.525.525 0 0 0 .393.288l4.052.575-2.906 2.77a.565.565 0 0 0-.163.506l.694 3.957-3.686-1.894a.503.503 0 0 0-.461 0z"/>
</svg>
{% endfor %}

</div>
</div>
</div>


<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"
integrity="sha384-C6RzsynM9kWDrMNeT87bh95OGNyZPhcTNXj1NW7RuBCsyN/o0jlpcV8Qyq46cDfL"
crossorigin="anonymous"></script>
</body>
</html>
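
PackageLoader("src.coffee") resolves templates from the package's templates/ directory by default, which is why this file lives at src/coffee/templates/benchmark.html. To preview the page without a full HELM run, the template can be rendered with stand-in values; the SimpleNamespace fake below is illustrative only, since the template reads nothing from the benchmark beyond benchmark.sut.name:

from types import SimpleNamespace
from jinja2 import Environment, PackageLoader, select_autoescape

env = Environment(loader=PackageLoader("src.coffee"), autoescape=select_autoescape())
template = env.get_template("benchmark.html")

# Stand-in for a RidiculousBenchmark; only benchmark.sut.name is used by the template.
fake_benchmark = SimpleNamespace(sut=SimpleNamespace(name="GPT2"))

html = template.render(stars=3, half_star=True, empty_stars=1, benchmark=fake_benchmark)
print(html[:200])
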
7 changes: 7 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,7 @@
import pytest


@pytest.fixture()
def cwd_tmpdir(monkeypatch, tmp_path):
monkeypatch.chdir(tmp_path)
return tmp_path
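
The cwd_tmpdir fixture chdirs into pytest's per-test tmp_path and returns it, so code that writes relative to the current working directory (like _make_output_dir above) lands in a disposable folder the test can inspect. A minimal usage sketch, assuming this conftest.py is on the test path:

import pathlib

def test_writes_under_tmpdir(cwd_tmpdir):
    # The process CWD is now the temporary directory...
    assert pathlib.Path.cwd() == cwd_tmpdir
    # ...so relative writes are sandboxed per test.
    pathlib.Path("web").mkdir()
    assert (cwd_tmpdir / "web").exists()
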
25 changes: 16 additions & 9 deletions tests/test_helm_runner.py
@@ -1,21 +1,28 @@
import pathlib

SIMPLE_BBQ_DATA = pathlib.Path(__file__).parent / 'data/full_runs/simple_bbq'
SIMPLE_BBQ_DATA = pathlib.Path(__file__).parent / "data/full_runs/simple_bbq"
from unittest.mock import Mock

import pytest

from coffee.run import CliHelmRunner, BbqHelmTest, HelmSut, HelmResult, RidiculousBenchmark, quantize_stars
from coffee.run import (
CliHelmRunner,
BbqHelmTest,
HelmSut,
HelmResult,
RidiculousBenchmark,
quantize_stars,
)


def test_cli_helm_runner_command():
def test_cli_helm_runner_command(cwd_tmpdir):
runner = CliHelmRunner()
runner._execute = Mock()
runner.run([BbqHelmTest()], [HelmSut.GPT2])
shell_arguments = runner._execute.call_args.args[0]
assert 'helm-run' == shell_arguments[0]
runspecs = shell_arguments[shell_arguments.index('-r') + 1:]
assert 'bbq:subject=Age,model=huggingface/gpt2' == runspecs[0]
assert "helm-run" == shell_arguments[0]
runspecs = shell_arguments[shell_arguments.index("-r") + 1 :]
assert "bbq:subject=Age,model=huggingface/gpt2" == runspecs[0]
assert len(BbqHelmTest.CATEGORIES) == len(runspecs)


@@ -24,9 +31,9 @@ def test_read_scores(datafiles):
hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
scores = hr.load_scores()
sut_scores = scores.for_sut(HelmSut.GPT2)
assert 'BbqHelmTest' in sut_scores
assert 2 == len(sut_scores['BbqHelmTest'])
assert 0.7 == sut_scores['BbqHelmTest']['Age']['bbq_accuracy']
assert "BbqHelmTest" in sut_scores
assert 2 == len(sut_scores["BbqHelmTest"])
assert 0.7 == sut_scores["BbqHelmTest"]["Age"]["bbq_accuracy"]


@pytest.mark.datafiles(SIMPLE_BBQ_DATA)
46 changes: 46 additions & 0 deletions tests/test_static_site_generator.py
@@ -0,0 +1,46 @@
import pathlib

SIMPLE_BBQ_DATA = pathlib.Path(__file__).parent / "data/full_runs/simple_bbq"

import pytest

from coffee.run import (
BbqHelmTest,
HelmSut,
HelmResult,
RidiculousBenchmark,
StaticSiteGenerator,
)


@pytest.fixture()
def benchmark(datafiles):
hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
scores = hr.load_scores()
b = RidiculousBenchmark(HelmSut.GPT2, scores.for_sut(HelmSut.GPT2))
return b


@pytest.mark.datafiles(SIMPLE_BBQ_DATA)
def test_creates_files(benchmark, cwd_tmpdir):
generator = StaticSiteGenerator()
generator.generate([benchmark])
assert (cwd_tmpdir / "web" / "gpt2.html").exists()


@pytest.mark.datafiles(SIMPLE_BBQ_DATA)
@pytest.mark.parametrize(
"score,expected",
[
(2.0, (2, False, 3)),
(2.49, (2, False, 3)),
(2.50, (2, True, 2)),
(2.51, (2, True, 2)),
(4.0, (4, False, 1)),
],
)
def test_displays_correct_stars(benchmark, cwd_tmpdir, monkeypatch, score, expected):
monkeypatch.setattr(benchmark, "overall_score", lambda: score)
generator = StaticSiteGenerator()
foo = generator.calculate_stars(benchmark)
assert foo == expected
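
calculate_stars is exercised directly here by monkeypatching overall_score at the half-star boundaries. A natural extension, written against the tests above and sketched under the assumption that the template's bi-star-fill class only appears on full-star icons, is to assert on the rendered HTML as well (not part of this commit):

@pytest.mark.datafiles(SIMPLE_BBQ_DATA)
def test_renders_filled_stars(benchmark, cwd_tmpdir, monkeypatch):
    monkeypatch.setattr(benchmark, "overall_score", lambda: 4.0)
    StaticSiteGenerator().generate([benchmark])
    html = (cwd_tmpdir / "web" / "gpt2.html").read_text()
    # The template emits one bi-star-fill SVG per whole star.
    assert html.count("bi-star-fill") == 4
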
