From b9d38aa5161fb9877cb92eb02fa99e4839bc96cf Mon Sep 17 00:00:00 2001
From: "Gabriele N. Tornetta"
Date: Sun, 26 Feb 2023 16:10:57 +0000
Subject: [PATCH] ci(benchmarks): improve benchmark reporting

This change improves the report from benchmark runs by adding t-tests
for each scenario. This allows the benchmarks to provide a summary at
the end listing the scenarios that are likely to show a performance
difference between the latest released version and the version built
from the PR the benchmarks run for.

This also updates the versions that are being benchmarked to include
the latest releases.
---
 .github/workflows/benchmarks.yml |   5 +-
 scripts/benchmark.py             | 232 +++++++++++++++++++++++++++----
 scripts/requirements-bm.txt      |   1 +
 test/utils.py                    |   2 +-
 4 files changed, 208 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 3e1c383e..5fdcc65d 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -37,12 +37,9 @@ jobs:
         ulimit -c unlimited
 
         source .venv/bin/activate
-        python scripts/benchmark.py | tee benchmarks.txt
+        python scripts/benchmark.py --format markdown | tee comment.txt
         deactivate
 
-        # Make it a code comment
-        sed -e $'1i\\\n~~~' -e $'$a\\\n~~~' benchmarks.txt > comment.txt
-
       - name: Post results on PR
         uses: marocchino/sticky-pull-request-comment@v2
         with:
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 196b91b4..5f7095a6 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -1,14 +1,18 @@
 # Run as python3 scripts/benchmark.py from the repository root directory.
 # Ensure dependencies from requirements-bm.txt are installed.
 
+import abc
 import re
 import sys
+from textwrap import wrap
 import typing as t
 from argparse import ArgumentParser
 from itertools import product
 from math import floor, log
 from pathlib import Path
 
+from scipy.stats import ttest_ind
+
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
 import tarfile
@@ -17,7 +21,7 @@ from urllib.error import HTTPError
 from urllib.request import urlopen
 
-VERSIONS = ("3.2.0", "3.3.0", "3.4.1", "dev")
+VERSIONS = ("3.4.1", "3.5.0", "dev")
 
 SCENARIOS = [
     *[
         (
@@ -62,6 +66,15 @@
 ]
 
 
+# The metrics we evaluate and whether they are to be maximised or minimised.
+METRICS = [
+    ("Sample Rate", +1),
+    ("Saturation", -1),
+    ("Error Rate", -1),
+    ("Sampling Speed", -1),
+]
+
+
 def get_stats(output: str) -> t.Optional[dict]:
     try:
         meta = metadata(output)
@@ -120,6 +133,7 @@ def download_release(version: str, dest: Path, variant_name: str = "austin") ->
 
 class Outcome:
     def __init__(self, data: list[float]) -> None:
+        self.data = data
         self.mean = sum(data) / len(data)
         self.stdev = (
             sum(((v - self.mean) ** 2 for v in data)) / (len(data) - 1)
@@ -140,29 +154,173 @@ def __repr__(self):
     def __len__(self):
         return len(repr(self))
 
+    def __eq__(self, other: "Outcome") -> bool:
+        t, p = ttest_ind(self.data, other.data, equal_var=False)
+        return p < 0.05
 
-def render(table):
-    _, row = table[0]
-    cols = list(row.keys())
-    max_vh = max(len(e[0]) for e in table)
 
-    col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
-    div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
+Results = t.Tuple[str, t.Dict[str, Outcome]]
 
-    print("=" * div_len)
-    print(
-        (" " * (max_vh + 2))
-        + "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
-    )
-    print("-" * div_len)
 
-    for v, row in table:
-        print(f"{v:^{max_vh+2}}", end="")
-        for col, cw in zip(cols, col_widths):
-            print(f"{str(row[col]):^{cw+2}}", end="")
+class Renderer(abc.ABC):
+    BETTER = "better"
+    WORSE = "worse"
+    SAME = "same"
+
+    @abc.abstractmethod
+    def render_header(self, title: str, level: int = 1) -> str:
+        ...
+
+    @abc.abstractmethod
+    def render_paragraph(self, text: str) -> str:
+        ...
+
+    @abc.abstractmethod
+    def render_table(self, table) -> str:
+        ...
+
+    @abc.abstractmethod
+    def render_scenario(
+        self, title, results: t.List[t.Tuple[str, t.List[Results]]]
+    ) -> str:
+        ...
+
+    @abc.abstractmethod
+    def render_summary(
+        self, summary: t.List[t.Tuple[str, t.List[t.Tuple[str, bool, int]]]]
+    ) -> str:
+        ...
+
+    def render_scenario(
+        self, title, table: t.List[t.Tuple[str, t.List[Results]]]
+    ) -> str:
+        self.render_header(title, level=2)
+        self.render_table(table)
+        print()
+
+    def render_summary(self, summary):
+        self.render_header("Benchmark Summary", level=2)
+        self.render_paragraph(f"Comparison of {VERSIONS[-1]} against {VERSIONS[-2]}.")
+
+        if not summary:
+            self.render_paragraph(
+                "No significant difference in performance between versions."
+            )
+            return
+
+        self.render_paragraph(
+            "The following scenarios show a statistically significant difference "
+            "in performance between the two versions."
+        )
+
+        self.render_table(
+            [
+                (
+                    title,
+                    {
+                        m: {1: self.BETTER, -1: self.WORSE}[s] if c else self.SAME
+                        for m, c, s in tests
+                    },
+                )
+                for title, tests in summary
+            ]
+        )
+
+
+class TerminalRenderer(Renderer):
+    def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
+        _, row = table[0]
+        cols = list(row.keys())
+        max_vh = max(len(e[0]) for e in table)
+
+        col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
+        div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
+
+        print("=" * div_len)
+        print(
+            (" " * (max_vh + 2))
+            + "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
+        )
+        print("-" * div_len)
+
+        for v, row in table:
+            print(f"{v:^{max_vh+2}}", end="")
+            for col, cw in zip(cols, col_widths):
+                print(f"{str(row[col]):^{cw+2}}", end="")
+            print()
+
+        print("=" * div_len)
+
+    def render_header(self, title: str, level: int = 1) -> str:
+        print(title)
+        print({1: "=", 2: "-", 3: "~"}.get(level, "-") * len(title))
+        print()
+
+    def render_paragraph(self, text: str) -> str:
+        for _ in wrap(text):
+            print(_)
+        print()
+
+
+class MarkdownRenderer(Renderer):
+    BETTER = ":green_circle:"
+    WORSE = ":red_circle:"
+    SAME = ":yellow_circle:"
+
+    def render_header(self, title: str, level: int = 1) -> str:
+        print(f"{'#' * level} {title}")
+        print()
+
+    def render_paragraph(self, text: str) -> str:
+        print(text)
+        print()
+
+    def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
+        _, row = table[0]
+        cols = list(row.keys())
+        max_vh = max(len(e[0]) for e in table)
+
+        col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
+        div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
+
+        print("| |" + "|".join(f" {col} " for col in cols) + "|")
+        print("| --- |" + "|".join(f":{'-' * len(col)}:" for col in cols) + "|")
+
+        for v, row in table:
+            print(
+                f"| {v} |"
+                + "|".join(
+                    f" {str(row[col]):^{cw}} " for col, cw in zip(cols, col_widths)
+                )
+                + "|"
+            )
+
+    def render_scenario(
+        self, title, table: t.List[t.Tuple[str, t.List[Results]]]
+    ) -> str:
+        print("<details>")
+        print(f"<summary>{title}</summary>")
+        print()
+        super().render_scenario(title, table)
+        print("</details>")
         print()
 
-    print("=" * div_len)
+
+def summarize(results: t.List[t.Tuple[str, t.List[Results]]]):
+    summary = []
+    for title, table in results:
+        (_, a), (_, b) = table[-2:]
+        tests = [
+            (
+                m,
+                a[m] == b[m],
+                int((b[m].mean - a[m].mean) * s / (abs(b[m].mean - a[m].mean) or 1)),
+            )
+            for m, s in METRICS
+        ]
+        if any(c for _, c, _ in tests):
+            summary.append((title, tests))
+    return summary
 
 
 def main():
@@ -181,26 +339,41 @@ def main():
         help="Number of times to run each scenario",
     )
 
+    argp.add_argument(
+        "-f",
+        "--format",
+        type=str,
+        choices=["terminal", "markdown"],
+        default="terminal",
+        help="The output format",
+    )
+
     opts = argp.parse_args()
 
-    print(
+    renderer = {"terminal": TerminalRenderer, "markdown": MarkdownRenderer}[
+        opts.format
+    ]()
+
+    renderer.render_header("Austin Benchmarks")
+    renderer.render_paragraph(
         f"Running Austin benchmarks with Python {'.'.join(str(_) for _ in sys.version_info[:3])}",
-        end="\n\n",
     )
 
+    results: t.List[t.Tuple[str, t.List[Results]]] = []
+
     for variant, title, args in SCENARIOS:
         if opts.k is not None and not opts.k.match(title):
             continue
 
-        print(title)
-
-        table = []
+        table: t.List[Results] = []
         for version in VERSIONS:
-            print(f"> Running with Austin {version} ... ", end="\r")
+            print(f"> Running with Austin {version} ... ", end="\r", file=sys.stderr)
             try:
                 austin = download_release(version, Path("/tmp"), variant_name=variant)
             except RuntimeError:
-                print(f"WARNING: Could not download {variant} {version}")
+                print(
+                    f"WARNING: Could not download {variant} {version}", file=sys.stderr
+                )
                 continue
 
             stats = [
@@ -218,8 +391,13 @@ def main():
                 )
             )
 
-        render(table)
-        print()
+        renderer.render_scenario(title, table)
+
+        results.append((title, table))
+
+    summary = summarize(results)
+
+    renderer.render_summary(summary)
 
 
 if __name__ == "__main__":
diff --git a/scripts/requirements-bm.txt b/scripts/requirements-bm.txt
index edc3cd39..064d02d2 100644
--- a/scripts/requirements-bm.txt
+++ b/scripts/requirements-bm.txt
@@ -1 +1,2 @@
 austin-python~=1.4.1
+scipy~=1.10.1
diff --git a/test/utils.py b/test/utils.py
index b1901bda..7ef9b697 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -170,7 +170,7 @@ def __call__(
             # or using the "where" option.
             result.stdout = demojo(result.stdout)
         else:
-            result.stdout = result.stdout.decode()
+            result.stdout = result.stdout.decode(errors="ignore")
 
         result.stderr = result.stderr.decode()
         return result