
Merge pull request #174 from P403n1x87/ci/benchmarks-ttest
ci(benchmarks): improve benchmark reporting
P403n1x87 authored Feb 28, 2023
2 parents 01a247a + b9d38aa commit 4c3e26b
Showing 4 changed files with 208 additions and 32 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/benchmarks.yml
@@ -37,12 +37,9 @@ jobs:
ulimit -c unlimited
source .venv/bin/activate
python scripts/benchmark.py | tee benchmarks.txt
python scripts/benchmark.py --format markdown | tee comment.txt
deactivate
# Make it a code comment
sed -e $'1i\\\n~~~' -e $'$a\\\n~~~' benchmarks.txt > comment.txt
- name: Post results on PR
uses: marocchino/sticky-pull-request-comment@v2
with:
232 changes: 205 additions & 27 deletions scripts/benchmark.py
@@ -1,14 +1,18 @@
# Run as python3 scripts/benchmark.py from the repository root directory.
# Ensure dependencies from requirements-bm.txt are installed.

import abc
import re
import sys
from textwrap import wrap
import typing as t
from argparse import ArgumentParser
from itertools import product
from math import floor, log
from pathlib import Path

from scipy.stats import ttest_ind

sys.path.insert(0, str(Path(__file__).parent.parent))

import tarfile
@@ -17,7 +21,7 @@
from urllib.error import HTTPError
from urllib.request import urlopen

VERSIONS = ("3.2.0", "3.3.0", "3.4.1", "dev")
VERSIONS = ("3.4.1", "3.5.0", "dev")
SCENARIOS = [
*[
(
@@ -62,6 +66,15 @@
]


# The metrics we evaluate and whether they are to be maximised or minimised.
METRICS = [
("Sample Rate", +1),
("Saturation", -1),
("Error Rate", -1),
("Sampling Speed", -1),
]


def get_stats(output: str) -> t.Optional[dict]:
try:
meta = metadata(output)
@@ -120,6 +133,7 @@ def download_release(version: str, dest: Path, variant_name: str = "austin") ->

class Outcome:
def __init__(self, data: list[float]) -> None:
self.data = data
self.mean = sum(data) / len(data)
self.stdev = (
sum(((v - self.mean) ** 2 for v in data)) / (len(data) - 1)
@@ -140,29 +154,173 @@ def __repr__(self):
def __len__(self):
return len(repr(self))

def __eq__(self, other: "Outcome") -> bool:
t, p = ttest_ind(self.data, other.data, equal_var=False)
return p < 0.05

def render(table):
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
Results = t.Tuple[str, t.Dict[str, Outcome]]

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
class Renderer(abc.ABC):
BETTER = "better"
WORSE = "worse"
SAME = "same"

@abc.abstractmethod
def render_header(self, title: str, level: int = 1) -> str:
...

@abc.abstractmethod
def render_paragraph(self, text: str) -> str:
...

@abc.abstractmethod
def render_table(self, table) -> str:
...

@abc.abstractmethod
def render_scenario(
self, title, results: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
...

@abc.abstractmethod
def render_summary(
self, summary: t.List[t.Tuple[str, t.List[t.Tuple[str, bool, int]]]]
) -> str:
...

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
self.render_header(title, level=2)
self.render_table(table)
print()

def render_summary(self, summary):
self.render_header("Benchmark Summary", level=2)
self.render_paragraph(f"Comparison of {VERSIONS[-1]} against {VERSIONS[-2]}.")

if not summary:
self.render_paragraph(
"No significant difference in performance between versions."
)
return

self.render_paragraph(
"The following scenarios show a statistically significant difference "
"in performance between the two versions."
)

self.render_table(
[
(
title,
{
m: {1: self.BETTER, -1: self.WORSE}[s] if c else self.SAME
for m, c, s in tests
},
)
for title, tests in summary
]
)


class TerminalRenderer(Renderer):
def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
print()

print("=" * div_len)

def render_header(self, title: str, level: int = 1) -> str:
print(title)
print({1: "=", 2: "-", 3: "~"}.get(level, "-") * len(title))
print()

def render_paragraph(self, text: str) -> str:
for _ in wrap(text):
print(_)
print()


class MarkdownRenderer(Renderer):
BETTER = ":green_circle:"
WORSE = ":red_circle:"
SAME = ":yellow_circle:"

def render_header(self, title: str, level: int = 1) -> str:
print(f"{'#' * level} {title}")
print()

def render_paragraph(self, text: str) -> str:
print(text)
print()

def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("| |" + "|".join(f" {col} " for col in cols) + "|")
print("| --- |" + "|".join(f":{'-' * len(col)}:" for col in cols) + "|")

for v, row in table:
print(
f"| {v} |"
+ "|".join(
f" {str(row[col]):^{cw}} " for col, cw in zip(cols, col_widths)
)
+ "|"
)

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
print("<details>")
print(f"<summary><strong>{title}</strong></summary>")
print()
super().render_scenario(title, table)
print("</details>")
print()

print("=" * div_len)

def summarize(results: t.List[t.Tuple[str, t.List[Results]]]):
summary = []
for title, table in results:
(_, a), (_, b) = table[-2:]
tests = [
(
m,
a[m] == b[m],
int((b[m].mean - a[m].mean) * s / (abs(b[m].mean - a[m].mean) or 1)),
)
for m, s in METRICS
]
if any(c for _, c, _ in tests):
summary.append((title, tests))
return summary


def main():
@@ -181,26 +339,41 @@ def main():
help="Number of times to run each scenario",
)

argp.add_argument(
"-f",
"--format",
type=str,
choices=["terminal", "markdown"],
default="terminal",
help="The output format",
)

opts = argp.parse_args()

print(
renderer = {"terminal": TerminalRenderer, "markdown": MarkdownRenderer}[
opts.format
]()

renderer.render_header("Austin Benchmarks")
renderer.render_paragraph(
f"Running Austin benchmarks with Python {'.'.join(str(_) for _ in sys.version_info[:3])}",
end="\n\n",
)

results: t.List[t.Tuple[str, t.List[Results]]] = []

for variant, title, args in SCENARIOS:
if opts.k is not None and not opts.k.match(title):
continue

print(title)

table = []
table: t.List[Results] = []
for version in VERSIONS:
print(f"> Running with Austin {version} ... ", end="\r")
print(f"> Running with Austin {version} ... ", end="\r", file=sys.stderr)
try:
austin = download_release(version, Path("/tmp"), variant_name=variant)
except RuntimeError:
print(f"WARNING: Could not download {variant} {version}")
print(
f"WARNING: Could not download {variant} {version}", file=sys.stderr
)
continue

stats = [
@@ -218,8 +391,13 @@ def main():
)
)

render(table)
print()
renderer.render_scenario(title, table)

results.append((title, table))

summary = summarize(results)

renderer.render_summary(summary)


if __name__ == "__main__":
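For context, here is a minimal sketch (not part of the commit) of how the new significance check and the summary direction interact: Outcome.__eq__ runs Welch's t-test on the two versions' samples for a metric, and summarize() turns the sign of the mean difference, multiplied by the metric's orientation from METRICS, into "better" or "worse". The sample-rate numbers below are invented for illustration.

# Hedged sketch, not from the diff: t-test significance plus direction sign.
from scipy.stats import ttest_ind

old = [98.1, 97.6, 98.4, 97.9, 98.0]   # e.g. Sample Rate for the previous release
new = [99.2, 99.0, 99.4, 98.9, 99.1]   # e.g. Sample Rate for the dev build

t, p = ttest_ind(old, new, equal_var=False)  # Welch's t-test, as in Outcome.__eq__
significant = p < 0.05

mean_old = sum(old) / len(old)
mean_new = sum(new) / len(new)
orientation = +1  # Sample Rate is to be maximised (see METRICS)

# Same idea as summarize(): +1 means the newer version is better, -1 worse.
delta = mean_new - mean_old
sign = int(delta * orientation / (abs(delta) or 1))

print(f"significant={significant}, direction={'better' if sign > 0 else 'worse'}")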
1 change: 1 addition & 0 deletions scripts/requirements-bm.txt
@@ -1 +1,2 @@
austin-python~=1.4.1
scipy~=1.10.1
2 changes: 1 addition & 1 deletion test/utils.py
@@ -170,7 +170,7 @@ def __call__(
# or using the "where" option.
result.stdout = demojo(result.stdout)
else:
result.stdout = result.stdout.decode()
result.stdout = result.stdout.decode(errors="ignore")
result.stderr = result.stderr.decode()

return result
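As an aside, a tiny illustration (not from the diff) of what errors="ignore" changes when decoding captured process output that is not valid UTF-8; the byte string is made up.

# Hypothetical output containing bytes that are not valid UTF-8.
raw = b"Sampling \xb5s elapsed\xff"

try:
    raw.decode()
except UnicodeDecodeError as exc:
    print(f"strict decode fails: {exc}")

print(raw.decode(errors="ignore"))  # undecodable bytes are dropped: 'Sampling s elapsed'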
