AbanteAI · jakethekoenig · Dec 19, 2023 · Dec 13, 2023 · Dec 13, 2023 · Dec 13, 2023
diff --git a/mentat/resources/templates/benchmark.jinja b/mentat/resources/templates/benchmark.jinja
@@ -0,0 +1,83 @@
+{% from 'transcript.jinja' import transcript_container %}
+<!DOCTYPE html>
+<html>
+    <head>
+        <title>Benchmark Results</title>
+        <style>
+            {% include 'css/benchmark.css' %}
+            {% include 'css/transcript.css' %}
+        </style>
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css">
+        <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
+        <script>/* highlightAll() supersedes the deprecated initHighlightingOnLoad() */ hljs.highlightAll();</script>
+        <script>
+            // Show only the result section named by the URL hash; an empty hash shows all.
+            document.addEventListener("DOMContentLoaded", function() {
+                function updateVisibility() {
+                    const hash = window.location.hash.substring(1);  // drop the leading '#'
+                    document.querySelectorAll('.result-section').forEach(section => {
+                        const visible = hash === '' || section.id === hash;
+                        section.style.display = visible ? 'block' : 'none';
+                    });
+                }
+
+                // Re-filter whenever the hash changes (the selector buttons set it).
+                window.addEventListener('hashchange', updateVisibility);
+                // Apply once on load so a page opened with a hash is filtered immediately.
+                updateVisibility();
+            });
+        </script>
+        <script>
+            // Expand or collapse a family by toggling 'hidden' on its child buttons.
+            function toggleVisibility(family) {
+                for (const child of document.querySelectorAll(`.${family}-child`)) {
+                    child.classList.toggle('hidden');
+                }
+            }
+        </script>
+        <script>
+            {% include 'js/transcript.js' %}
+        </script>
+    </head>
+    <body>
+        <div id="header">
+            {% for field, data in summary.formatted_summary().items() %}{# one "Field: value" paragraph per summary entry #}
+                <p>{{ field|capitalize }}: {{ data }}</p>
+            {% endfor %}
+        </div>
+        <div id="container">
+            <div id="selector">
+                {% for family, results in summary.result_groups.items() %}{# one selector entry per result family #}
+                    {% if results|length == 1 %}{# single result: the button sets the URL hash to that result's name #}
+                        <button class="{{ results[0].display_color() }}" onclick="window.location.hash='{{ results[0].name }}'">{{ results[0].name }}</button>
+                    {% else %}{# several results: a family button (coloured by the first result) toggles the hidden child buttons #}
+                        <button class="{{ results[0].display_color() }}" onclick="toggleVisibility('{{family}}')">{{family}}</button>
+                        {% for result in results %}{# children start hidden; clicking one sets the URL hash to its name #}
+                            <button class="hidden {{ family }}-child family-child {{ result.display_color() }}" onclick="window.location.hash='{{ result.name }}'">{{ result.name }}</button>
+                        {% endfor %}
+                    {% endif %}
+                {% endfor %}
+            </div>
+            <div id="viewer">
+                {% for test, formatted_result in summary.formatted_results().items() %}{# one .result-section per test; its id is what updateVisibility() compares with the URL hash #}
+                    <div id="{{ test }}" class="result-section">
+                        {% for display_name, section in formatted_result.items() %}{# section.type picks the renderer: text, code, json or transcript; any other type leaves the content div empty #}
+                        <h1>{{ display_name|capitalize }}</h1>
+                            <div class="content">
+                                {% if section.type == "text" %}
+                                    {{ section.content }}
+                                {% elif section.type == "code" %}
+                                    <pre><code>{{ section.content }}</code></pre>
+                                {% elif section.type == "json" %}
+                                    <pre>{{ section.content |tojson(indent=4)|safe}}</pre>
+                                {% elif section.type == "transcript" %}
+                                    {{ transcript_container(section.content) }}
+                                {% endif %}
+                            </div>
+                        {% endfor %}
+                    </div>
+                {% endfor %}
+            </div>
+        </div>
+    </body>
+</html>
diff --git a/mentat/resources/templates/css/benchmark.css b/mentat/resources/templates/css/benchmark.css
@@ -60,18 +60,34 @@ button:hover {
     background-color: #e9e9e9;
 }
 
-button.success {
+button.green {
     background-color: #d4edda;
     color: #155724;
     border-color: #c3e6cb;
 }
 
-button.failure {
+button.yellow {  /* selector buttons whose display_color() is "yellow" */
+    background-color: #fff3cd;
+    color: #856404;
+    border-color: #ffeeba;
+}
+
+button.grey {  /* selector buttons whose display_color() is "grey" */
+    background-color: #f8f9fa;
+    color: #6c757d;
+    border-color: #f8f9fa;
+}
+
+button.red {
     background-color: #f8d7da;
     color: #721c24;
     border-color: #f5c6cb;
 }
 
+.hidden {  /* toggled on a family's child buttons by toggleVisibility() */
+    display: none;
+}
+
 .content {
     white-space: pre-wrap;
     word-break: break-word;
@@ -83,6 +99,10 @@ button.failure {
     max-height: 70vh;
 }
 
+.family-child {  /* indents the per-result buttons listed under a family button */
+    margin-left: 20px;
+}
+
 .content .container {
     display: flex;
 }

diff --git a/mentat/resources/templates/exercism_benchmark.jinja b/mentat/resources/templates/exercism_benchmark.jinja
diff --git a/tests/benchmarks/benchmark_result.py b/tests/benchmarks/benchmark_result.py
@@ -8,18 +8,73 @@
 
 @attr.define
 class BenchmarkResult:
-    iterations: int
-    passed: bool
-    name: str
-    cost: float
-    tokens: int
-    transcript: Optional[Transcript] = attr.ib(default=None)
-    instructions: Optional[str] = attr.ib(default=None)
-    code: Optional[str] = attr.ib(default=None)
-    test_output: Optional[str] = attr.ib(default=None)
-    response: Optional[str] = attr.ib(default=None)
-    reason: Optional[str] = attr.ib(default=None)
-    success: Optional[bool] = attr.ib(default=None)
+    name: str = attr.ib()
+    family: Optional[str] = attr.ib(default=None)
+    cost: Optional[float] = attr.ib(default=None, metadata={"aggregation": "sum"})
+    tokens: Optional[int] = attr.ib(default=None, metadata={"aggregation": "average"})
+    iterations: Optional[int] = attr.ib(
+        default=None, metadata={"aggregation": "histogram"}
+    )
+    transcript: Optional[Transcript] = attr.ib(
+        default=None, metadata={"display": "transcript"}
+    )
+    instructions: Optional[str] = attr.ib(default=None, metadata={"display": "text"})
+    code: Optional[str] = attr.ib(default=None, metadata={"display": "code"})
+    test_output: Optional[str] = attr.ib(
+        default=None, metadata={"formatted_name": "Test output", "display": "code"}
+    )
+    response: Optional[str] = attr.ib(
+        default=None, metadata={"formatted_name": "Analysis", "display": "text"}
+    )
+    reason: Optional[str] = attr.ib(default=None, metadata={"aggregation": "histogram"})
+    # For exercism benchmarks
+    passed: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
+    # New optional fields for benchmark results
+    diff_grade: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
+    response_grade: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
+    comparison_grade: Optional[dict] = attr.ib(
+        default=None, metadata={"display": "json"}
+    )
+    verify: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
+    off_by_one: Optional[bool] = attr.ib(
+        default=None, metadata={"aggregation": "percent"}
+    )
+    indentation_error: Optional[bool] = attr.ib(
+        default=None, metadata={"aggregation": "percent"}
+    )
+    syntax_error: Optional[bool] = attr.ib(
+        default=None, metadata={"aggregation": "percent"}
+    )
+    missing_functionality: Optional[bool] = attr.ib(
+        default=None, metadata={"aggregation": "percent"}
+    )
+    extra_functionality: Optional[bool] = attr.ib(
+        default=None, metadata={"aggregation": "percent"}
+    )
+    referenced_format: Optional[bool] = attr.ib(
+        default=None, metadata={"aggregation": "percent"}
+    )
+
+    def display_color(self) -> str:
+        """Return this result's color: "green", "yellow", "grey" or "red"."""
+        # An explicit pass/fail verdict takes precedence over the graded flags.
+        if self.passed is not None:
+            return "green" if self.passed else "red"
+        # Grey: indentation, off-by-one or syntax error flags.
+        if any((self.indentation_error, self.off_by_one, self.syntax_error)):
+            return "grey"
+        # Yellow: missing/extra functionality or referenced-format flags.
+        scope_flags = (
+            self.missing_functionality,
+            self.extra_functionality,
+            self.referenced_format,
+        )
+        if any(scope_flags):
+            return "yellow"
+        # Only an explicit falsy verify turns the result red; unset counts as green.
+        if self.verify is not None and not self.verify:
+            return "red"
+        return "green"
 
     def to_json(self):
         return json.dumps(attr.asdict(self))