
Merge pull request #174 from P403n1x87/ci/benchmarks-ttest
ci(benchmarks): improve benchmark reporting
P403n1x87 authored Feb 28, 2023
2 parents 01a247a + b9d38aa commit 4c3e26b
Showing 4 changed files with 208 additions and 32 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/benchmarks.yml
@@ -37,12 +37,9 @@ jobs:
ulimit -c unlimited
source .venv/bin/activate
python scripts/benchmark.py | tee benchmarks.txt
python scripts/benchmark.py --format markdown | tee comment.txt
deactivate
# Make it a code comment
sed -e $'1i\\\n~~~' -e $'$a\\\n~~~' benchmarks.txt > comment.txt
- name: Post results on PR
uses: marocchino/sticky-pull-request-comment@v2
with:
232 changes: 205 additions & 27 deletions scripts/benchmark.py
@@ -1,14 +1,18 @@
# Run as python3 scripts/benchmark.py from the repository root directory.
# Ensure dependencies from requirements-bm.txt are installed.

import abc
import re
import sys
from textwrap import wrap
import typing as t
from argparse import ArgumentParser
from itertools import product
from math import floor, log
from pathlib import Path

from scipy.stats import ttest_ind

sys.path.insert(0, str(Path(__file__).parent.parent))

import tarfile
@@ -17,7 +21,7 @@
from urllib.error import HTTPError
from urllib.request import urlopen

VERSIONS = ("3.2.0", "3.3.0", "3.4.1", "dev")
VERSIONS = ("3.4.1", "3.5.0", "dev")
SCENARIOS = [
*[
(
@@ -62,6 +66,15 @@
]


# The metrics we evaluate and whether they are to be maximised or minimised.
METRICS = [
("Sample Rate", +1),
("Saturation", -1),
("Error Rate", -1),
("Sampling Speed", -1),
]


def get_stats(output: str) -> t.Optional[dict]:
try:
meta = metadata(output)
@@ -120,6 +133,7 @@ def download_release(version: str, dest: Path, variant_name: str = "austin") ->

class Outcome:
def __init__(self, data: list[float]) -> None:
self.data = data
self.mean = sum(data) / len(data)
self.stdev = (
sum(((v - self.mean) ** 2 for v in data)) / (len(data) - 1)
@@ -140,29 +154,173 @@ def __repr__(self):
def __len__(self):
return len(repr(self))

def __eq__(self, other: "Outcome") -> bool:
t, p = ttest_ind(self.data, other.data, equal_var=False)
return p < 0.05

def render(table):
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
Results = t.Tuple[str, t.Dict[str, Outcome]]

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
class Renderer(abc.ABC):
BETTER = "better"
WORSE = "worse"
SAME = "same"

@abc.abstractmethod
def render_header(self, title: str, level: int = 1) -> str:
...

@abc.abstractmethod
def render_paragraph(self, text: str) -> str:
...

@abc.abstractmethod
def render_table(self, table) -> str:
...

@abc.abstractmethod
def render_scenario(
self, title, results: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
...

@abc.abstractmethod
def render_summary(
self, summary: t.List[t.Tuple[str, t.List[t.Tuple[str, bool, int]]]]
) -> str:
...

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
self.render_header(title, level=2)
self.render_table(table)
print()

def render_summary(self, summary):
self.render_header("Benchmark Summary", level=2)
self.render_paragraph(f"Comparison of {VERSIONS[-1]} against {VERSIONS[-2]}.")

if not summary:
self.render_paragraph(
"No significant difference in performance between versions."
)
return

self.render_paragraph(
"The following scenarios show a statistically significant difference "
"in performance between the two versions."
)

self.render_table(
[
(
title,
{
m: {1: self.BETTER, -1: self.WORSE}[s] if c else self.SAME
for m, c, s in tests
},
)
for title, tests in summary
]
)


class TerminalRenderer(Renderer):
def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
print()

print("=" * div_len)

def render_header(self, title: str, level: int = 1) -> str:
print(title)
print({1: "=", 2: "-", 3: "~"}.get(level, "-") * len(title))
print()

def render_paragraph(self, text: str) -> str:
for _ in wrap(text):
print(_)
print()


class MarkdownRenderer(Renderer):
BETTER = ":green_circle:"
WORSE = ":red_circle:"
SAME = ":yellow_circle:"

def render_header(self, title: str, level: int = 1) -> str:
print(f"{'#' * level} {title}")
print()

def render_paragraph(self, text: str) -> str:
print(text)
print()

def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("| |" + "|".join(f" {col} " for col in cols) + "|")
print("| --- |" + "|".join(f":{'-' * len(col)}:" for col in cols) + "|")

for v, row in table:
print(
f"| {v} |"
+ "|".join(
f" {str(row[col]):^{cw}} " for col, cw in zip(cols, col_widths)
)
+ "|"
)

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
print("<details>")
print(f"<summary><strong>{title}</strong></summary>")
print()
super().render_scenario(title, table)
print("</details>")
print()

print("=" * div_len)

def summarize(results: t.List[t.Tuple[str, t.List[Results]]]):
summary = []
for title, table in results:
(_, a), (_, b) = table[-2:]
tests = [
(
m,
a[m] == b[m],
int((b[m].mean - a[m].mean) * s / (abs(b[m].mean - a[m].mean) or 1)),
)
for m, s in METRICS
]
if any(c for _, c, _ in tests):
summary.append((title, tests))
return summary


def main():
@@ -181,26 +339,41 @@ def main():
help="Number of times to run each scenario",
)

argp.add_argument(
"-f",
"--format",
type=str,
choices=["terminal", "markdown"],
default="terminal",
help="The output format",
)

opts = argp.parse_args()

print(
renderer = {"terminal": TerminalRenderer, "markdown": MarkdownRenderer}[
opts.format
]()

renderer.render_header("Austin Benchmarks")
renderer.render_paragraph(
f"Running Austin benchmarks with Python {'.'.join(str(_) for _ in sys.version_info[:3])}",
end="\n\n",
)

results: t.List[t.Tuple[str, t.List[Results]]] = []

for variant, title, args in SCENARIOS:
if opts.k is not None and not opts.k.match(title):
continue

print(title)

table = []
table: t.List[Results] = []
for version in VERSIONS:
print(f"> Running with Austin {version} ... ", end="\r")
print(f"> Running with Austin {version} ... ", end="\r", file=sys.stderr)
try:
austin = download_release(version, Path("/tmp"), variant_name=variant)
except RuntimeError:
print(f"WARNING: Could not download {variant} {version}")
print(
f"WARNING: Could not download {variant} {version}", file=sys.stderr
)
continue

stats = [
@@ -218,8 +391,13 @@ def main():
)
)

render(table)
print()
renderer.render_scenario(title, table)

results.append((title, table))

summary = summarize(results)

renderer.render_summary(summary)


if __name__ == "__main__":
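For context, here is a minimal sketch (not part of the commit) of how the new significance check and the summary direction interact: Outcome.__eq__ runs Welch's t-test on the two versions' samples for a metric, and summarize() turns the sign of the mean difference, multiplied by the metric's orientation from METRICS, into "better" or "worse". The sample-rate numbers below are invented for illustration.

# Hedged sketch, not from the diff: t-test significance plus direction sign.
from scipy.stats import ttest_ind

old = [98.1, 97.6, 98.4, 97.9, 98.0]   # e.g. Sample Rate for the previous release
new = [99.2, 99.0, 99.4, 98.9, 99.1]   # e.g. Sample Rate for the dev build

t, p = ttest_ind(old, new, equal_var=False)  # Welch's t-test, as in Outcome.__eq__
significant = p < 0.05

mean_old = sum(old) / len(old)
mean_new = sum(new) / len(new)
orientation = +1  # Sample Rate is to be maximised (see METRICS)

# Same idea as summarize(): +1 means the newer version is better, -1 worse.
delta = mean_new - mean_old
sign = int(delta * orientation / (abs(delta) or 1))

print(f"significant={significant}, direction={'better' if sign > 0 else 'worse'}")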
1 change: 1 addition & 0 deletions scripts/requirements-bm.txt
@@ -1 +1,2 @@
austin-python~=1.4.1
scipy~=1.10.1
2 changes: 1 addition & 1 deletion test/utils.py
@@ -170,7 +170,7 @@ def __call__(
# or using the "where" option.
result.stdout = demojo(result.stdout)
else:
result.stdout = result.stdout.decode()
result.stdout = result.stdout.decode(errors="ignore")
result.stderr = result.stderr.decode()

return result
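As an aside, a tiny illustration (not from the diff) of what errors="ignore" changes when decoding captured process output that is not valid UTF-8; the byte string is made up.

# Hypothetical output containing bytes that are not valid UTF-8.
raw = b"Sampling \xb5s elapsed\xff"

try:
    raw.decode()
except UnicodeDecodeError as exc:
    print(f"strict decode fails: {exc}")

print(raw.decode(errors="ignore"))  # undecodable bytes are dropped: 'Sampling s elapsed'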
