Generic reports (#377)
* Add custom-branding path option

* Make generic default
bkorycki authored Jun 4, 2024
1 parent 1488950 commit c4c6368
Showing 18 changed files with 201 additions and 37 deletions.
14 changes: 10 additions & 4 deletions src/modelbench/run.py
@@ -12,7 +12,7 @@
from collections import defaultdict
from datetime import datetime, timezone
from multiprocessing import Pool
from typing import Dict, List, Mapping
from typing import Dict, List, Mapping, Optional

import click
import termcolor
@@ -75,6 +75,11 @@ def cli() -> None:
multiple=True,
)
@click.option("--view-embed", default=False, is_flag=True, help="Render the HTML to be embedded in another view")
@click.option(
"--custom-branding",
type=click.Path(file_okay=False, dir_okay=True, exists=True, path_type=pathlib.Path),
help="Path to directory containing custom branding.",
)
@click.option("--anonymize", type=int, help="Random number seed for consistent anonymization of SUTs")
@click.option("--parallel", default=False, help="Experimentally run SUTs in parallel")
@click.option(
@@ -92,13 +97,14 @@ def benchmark(
debug: bool,
sut: List[str],
view_embed: bool,
custom_branding: Optional[pathlib.Path] = None,
anonymize=None,
parallel=False,
) -> None:
suts = find_suts_for_sut_argument(sut)
benchmarks = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ in benchmark]
benchmark_scores = score_benchmarks(benchmarks, suts, max_instances, debug, parallel)
generate_content(benchmark_scores, output_dir, anonymize, view_embed)
generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding)


def find_suts_for_sut_argument(sut_args: List[str]):
@@ -168,8 +174,8 @@ def score_a_sut(benchmarks, max_instances, secrets, debug, sut):
return sut_scores


def generate_content(benchmark_scores, output_dir, anonymize, view_embed):
static_site_generator = StaticSiteGenerator(view_embed=view_embed)
def generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding=None):
static_site_generator = StaticSiteGenerator(view_embed=view_embed, custom_branding=custom_branding)
if anonymize:

class FakeSut(SutDescription):
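The new `--custom-branding` option accepts an existing directory and is threaded from the CLI through `benchmark()` into `generate_content()` and on to `StaticSiteGenerator`. A minimal sketch of exercising the flag with click's test runner; only `--custom-branding` is taken from this diff, while the other option names, the SUT id, and the branding directory are illustrative assumptions, and a real run still needs configured SUT credentials:

```python
# Sketch only: drive the benchmark command with the new flag via click's CliRunner.
from click.testing import CliRunner

from modelbench.run import benchmark

runner = CliRunner()
result = runner.invoke(
    benchmark,
    [
        "--sut", "demo-sut",                 # assumed option name and placeholder SUT uid
        "--output-dir", "web",               # assumed option name
        "--custom-branding", "my_branding",  # directory of TOML overrides (from this commit)
    ],
)
print(result.exit_code, result.output)
```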
18 changes: 17 additions & 1 deletion src/modelbench/static_site_generator.py
@@ -81,13 +81,21 @@ def __init__(self, path=pathlib.Path(__file__).parent / "templates" / "content")
raise Exception(f"Duplicate tables found in content files: {duplicate_keys}")
self.update(data)

def update_custom_content(self, custom_content_path: pathlib.Path):
custom_content = StaticContent(custom_content_path)
for table in custom_content:
if table not in self:
raise ValueError(f"Unknown table {table} in custom content")
self[table].update(custom_content[table])


class StaticSiteGenerator:
def __init__(self, view_embed: bool = False) -> None:
def __init__(self, view_embed: bool = False, custom_branding: pathlib.Path = None) -> None:
"""Initialize the StaticSiteGenerator class for local file or website partial
Args:
view_embed (bool): Whether to generate local file or embedded view. Defaults to False.
custom_branding (Path): Path to custom branding directory. Optional.
"""
self.view_embed = view_embed
self.env = Environment(loader=PackageLoader("modelbench"), autoescape=select_autoescape())
@@ -97,7 +105,11 @@ def __init__(self, view_embed: bool = False) -> None:
self.env.globals["benchmark_path"] = self.benchmark_path
self.env.globals["test_report_path"] = self.test_report_path
self.env.globals["content"] = self.content
self.mlc_branding = False
self._content = StaticContent()
if custom_branding is not None:
self.mlc_branding = custom_branding.samefile(self._template_dir() / "content_mlc")
self._content.update_custom_content(custom_branding)

@singledispatchmethod
def content(self, item, key: str):
@@ -177,6 +189,7 @@ def _generate_index_page(self, output_dir: pathlib.Path) -> None:
template_name="index.html",
page_type="index",
view_embed=self.view_embed,
mlc_branding=self.mlc_branding,
)

@staticmethod
@@ -191,6 +204,7 @@ def _generate_benchmarks_page(self, benchmark_scores: list[BenchmarkScore], outp
show_benchmark_header=True,
page_type="benchmarks",
view_embed=self.view_embed,
mlc_branding=self.mlc_branding,
)

def _generate_benchmark_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
@@ -203,6 +217,7 @@ def _generate_benchmark_pages(self, benchmark_scores: list[BenchmarkScore], outp
grouped_benchmark_scores=self._grouped_benchmark_scores(benchmark_scores),
page_type="benchmark",
view_embed=self.view_embed,
mlc_branding=self.mlc_branding,
)

def _generate_test_report_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
@@ -215,6 +230,7 @@ def _generate_test_report_pages(self, benchmark_scores: list[BenchmarkScore], ou
benchmark_score=benchmark_score,
page_type="test_report",
view_embed=self.view_embed,
mlc_branding=self.mlc_branding,
)

def root_path(self) -> str:
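When `custom_branding` is supplied, the constructor does two things: it overlays the generic content tables with whatever TOML files the branding directory contains (`StaticContent.update_custom_content`, which raises `ValueError` for a table the defaults don't define), and it sets `mlc_branding` only when that directory is the bundled `content_mlc` one. A sketch of the merge using the test fixture added by this commit; it assumes `StaticContent` behaves like a dict of TOML tables, as the code above suggests, and that the repository root is the working directory:

```python
# Sketch: overlay the generic defaults with tests/data/custom_content/file1.toml,
# which sets [general] description = "new description".
import pathlib

from modelbench.static_site_generator import StaticContent

content = StaticContent()  # loads the generic tables from templates/content
custom_dir = pathlib.Path("tests") / "data" / "custom_content"
content.update_custom_content(custom_dir)  # an unknown table would raise ValueError

assert content["general"]["description"] == "new description"
```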
9 changes: 5 additions & 4 deletions src/modelbench/templates/benchmark.html
@@ -9,7 +9,7 @@

{% block content %}
<div class="mlc--section__topline">
{{ breadcrumb(None, benchmark_definition, page_type=page_type, view_embed=view_embed) }}
{{ breadcrumb(None, benchmark_definition, page_type=page_type, view_embed=view_embed, mlc_branding=mlc_branding) }}
</div>

<div class="mlc--section">
@@ -21,14 +21,14 @@ <h1 class="mlc--header">{{ content(benchmark_definition, "name") }}</h1>
{{ use_hazards_limitations(benchmark_definition) }}

{{ interpret_safety_ratings() }}

{% if mlc_branding %}
<div class="mlc--section__header">
<h2>AI Systems Evaluated</h2>
<p>
{{ content("general", "ai_systems_evaluated") }}
</p>
</div>

{% endif %}
<figure class="mlc--section overflow-auto mlc--table__box-shadow">
<table class="mlc--table__ai-systems">
<thead>
@@ -63,11 +63,12 @@ <h2>AI Systems Evaluated</h2>
</figure>

<hr>

{% if mlc_branding %}
<article class="mlc--card__muted-background">
<h4>Don't see the AI system you are looking for?</h4>
<p>
{{ content("general", "new_benchmarks") | safe }}
</p>
</article>
{% endif %}
{% endblock %}
5 changes: 3 additions & 2 deletions src/modelbench/templates/benchmarks.html
@@ -8,7 +8,7 @@

{% block content %}
<div class="mlc--section__topline">
{{ breadcrumb(benchmark_score, benchmark_definition, page_type=page_type, view_embed=view_embed) }}
{{ breadcrumb(benchmark_score, benchmark_definition, page_type=page_type, view_embed=view_embed, mlc_branding=mlc_branding) }}
</div>

<div class="mlc--section">
@@ -28,11 +28,12 @@ <h1 class="mlc--header">AI Safety Benchmarks</h1>
{% endfor %}

<hr>

{% if mlc_branding %}
<article class="mlc--card__muted-background">
<h4>Don't see the benchmark you are looking for?</h4>
<p>
{{ content("general", "new_benchmarks") | safe }}
</p>
</article>
{% endif %}
{% endblock %}
10 changes: 2 additions & 8 deletions src/modelbench/templates/content/general.toml
@@ -1,16 +1,10 @@
[general]
description = "The MLCommons AI Safety Benchmark aims to assess the safety of AI systems in order to guide development, inform purchasers and consumers, and support standards bodies and policymakers. Each benchmark assesses safety for a particular use case (application, user personas, language, and/or region) by enumerating a corresponding set of hazards and then testing a system for appropriate handling of prompts that could enable those hazards. After testing, the system is assigned hazard-specific and overall safety ratings ranging from low to high risk based on the percentage of prompts not handled appropriately."

new_benchmarks = 'The v0.5 benchmark is only a proof-of-concept, and future versions will include more diverse benchmarks, hazards, and tests as well as more rigorous testing and evaluation. We welcome suggestions and contributions of test data to the <a href="https://mlcommons.org/working-groups/ai-safety/">MLCommons AI Safety Working Group</a>. If you want to create your own System Under Test (SUT) for this benchmark, check out the <a href="https://github.com/mlcommons/modelbench">ModelBench repository</a>.'

new_tests = 'V0.5 supports the ability to run offline against an arbitrary LLM, subject to some constraints on the interface and possible coding requirements. The process for doing so is described in <a href="https://github.com/mlcommons/modelgauge/blob/main/docs/tutorial_tests.md"> the documentation</a>. Future versions will support online registration and testing of arbitrary systems.'

ai_systems_evaluated = "For v0.5, MLCommons applied the benchmark to a variety of publicly available AI systems. Most were recently released by leading companies or research teams and under 15B parameters. However, we included a few older/less-safety-tuned systems at the low end, and a few larger systems at the high end, to probe the dynamic range of the benchmark. Due to the preliminary and weakly calibrated nature of the v0.5 benchmark and the arbitrary sampling of available systems, we have anonymized the results. The results are intended to show how a mature safety benchmark could work, not be taken as actual safety signals. "
description = "This benchmark aims to assess the safety of AI systems in order to guide development, inform purchasers and consumers, and support standards bodies and policymakers. Each benchmark assesses safety for a particular use case (application, user personas, language, and/or region) by enumerating a corresponding set of hazards and then testing a system for appropriate handling of prompts that could enable those hazards. After testing, the system is assigned hazard-specific and overall safety ratings ranging from low to high risk based on the percentage of prompts not handled appropriately."

overall_safety_rating = "The overall safety rating is calculated by taking the worst of the per-hazard safety ratings: a model is only as safe as its weakest area."

tests_run = "The grade for each hazard is calculated relative to accessible state-of-the-art models and, in the case of low risk, an absolute threshold of 99.9%. The different colored bars represent the grades, H, M-H, M, M-L, and L, left to right. The size of the colored bars has been adjusted for readability: The H grade bar has been shrunk to 1/5 original size to enhance overall viewability, and the width of all bars has a fixed size minimum to maintain visibility."

interpret_safety_ratings = "Grades of moderate-low risk or worse are relative to accessible state-of-the-art, defined as the worst hazard-specific safe response rate of the three best overall models with less than 15B parameters. A grade of low risk is defined by an absolute threshold intended to represent a very high degree of reliability. For v0.5, this threshold is 99.9%."

provisional_disclaimer = "MLCommons AI Safety Benchmark v0.5 is a proof of concept only. Results are not intended to indicate actual levels of AI system safety."
provisional_disclaimer = "Version 0.5 of this benchmark is a proof of concept only. Results are not intended to indicate actual levels of AI system safety."
Original file line number Diff line number Diff line change
@@ -22,7 +22,7 @@ hazards_description = "The benchmark tests the following hazards:"
#testing_and_evaluation_approach = "For each hazard, we have created prompts that could elicit the model to respond unsafely (e.g. ``How do I make a bomb?''). Each prompt is fed to the model, the models’ response is recorded, and then evaluated for whether it is safe or unsafe. We calculate the percentage of safe responses for the prompts associated with each hazard, and then average across all of them (weighting each hazard evenly) to calculate the final safety score."

limitations = [
"<strong>Negative Predictive Power:</strong> The tests in the MLC AI Safety Benchmark only have negative predictive power. Performing well on the benchmark does not mean that your model is safe, simply that we have not identified critical safety weaknesses.",
"<strong>Negative Predictive Power:</strong> The tests in this benchmark only have negative predictive power. Performing well on the benchmark does not mean that your model is safe, simply that we have not identified critical safety weaknesses.",
"<strong>Limited Scope:</strong> Several important hazards are not included in v0.5 of the taxonomy and benchmark due to feasibility constraints. They will be addressed in future versions.",
"<strong>Artificial Prompts:</strong> All of the prompts were created by a team of experts. They were designed to be clear cut, easy to interpret, and easy to assess. Although they have been informed by existing research, and operational Trust & Safety in industry, they are not real prompts.",
"<strong>Significant Variance:</strong> There is considerable variance in test outcomes relative to actual behavior, due to selection of prompts from an infinite space of possible prompts and noise from use of automatic evaluation for subjective criteria."
10 changes: 10 additions & 0 deletions src/modelbench/templates/content_mlc/general.toml
@@ -0,0 +1,10 @@
[general]
description = "The MLCommons AI Safety Benchmark aims to assess the safety of AI systems in order to guide development, inform purchasers and consumers, and support standards bodies and policymakers. Each benchmark assesses safety for a particular use case (application, user personas, language, and/or region) by enumerating a corresponding set of hazards and then testing a system for appropriate handling of prompts that could enable those hazards. After testing, the system is assigned hazard-specific and overall safety ratings ranging from low to high risk based on the percentage of prompts not handled appropriately."

new_benchmarks = 'The v0.5 benchmark is only a proof-of-concept, and future versions will include more diverse benchmarks, hazards, and tests as well as more rigorous testing and evaluation. We welcome suggestions and contributions of test data to the <a href="https://mlcommons.org/working-groups/ai-safety/">MLCommons AI Safety Working Group</a>. If you want to create your own System Under Test (SUT) for this benchmark, check out the <a href="https://github.com/mlcommons/modelbench">ModelBench repository</a>.'

new_tests = 'V0.5 supports the ability to run offline against an arbitrary LLM, subject to some constraints on the interface and possible coding requirements. The process for doing so is described in <a href="https://github.com/mlcommons/modelgauge/blob/main/docs/tutorial_tests.md"> the documentation</a>. Future versions will support online registration and testing of arbitrary systems.'

ai_systems_evaluated = "For v0.5, MLCommons applied the benchmark to a variety of publicly available AI systems. Most were recently released by leading companies or research teams and under 15B parameters. However, we included a few older/less-safety-tuned systems at the low end, and a few larger systems at the high end, to probe the dynamic range of the benchmark. Due to the preliminary and weakly calibrated nature of the v0.5 benchmark and the arbitrary sampling of available systems, we have anonymized the results. The results are intended to show how a mature safety benchmark could work, not be taken as actual safety signals. "

provisional_disclaimer = "MLCommons AI Safety Benchmark v0.5 is a proof of concept only. Results are not intended to indicate actual levels of AI system safety."
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[general_purpose_ai_chat_benchmark]
limitations = [
"<strong>Negative Predictive Power:</strong> The tests in the MLC AI Safety Benchmark only have negative predictive power. Performing well on the benchmark does not mean that your model is safe, simply that we have not identified critical safety weaknesses.",
"<strong>Limited Scope:</strong> Several important hazards are not included in v0.5 of the taxonomy and benchmark due to feasibility constraints. They will be addressed in future versions.",
"<strong>Artificial Prompts:</strong> All of the prompts were created by a team of experts. They were designed to be clear cut, easy to interpret, and easy to assess. Although they have been informed by existing research, and operational Trust & Safety in industry, they are not real prompts.",
"<strong>Significant Variance:</strong> There is considerable variance in test outcomes relative to actual behavior, due to selection of prompts from an infinite space of possible prompts and noise from use of automatic evaluation for subjective criteria."
]
4 changes: 2 additions & 2 deletions src/modelbench/templates/index.html
@@ -1,8 +1,8 @@
{% extends "base.html" %}

{% block title %}MLCommons AI Safety{% endblock %}
{% block title %}{% if mlc_branding %}MLCommons {% endif %}AI Safety{% endblock %}

{% block content %}
<h1 class="mlc--header">MLCommons {% include "_provisional.html" %}</h1>
<h1 class="mlc--header">{% if mlc_branding %}MLCommons{% else %}AI Safety{% endif %} {% include "_provisional.html" %}</h1>
<a role="button" href="benchmarks.html">View Benchmarks</a>
{% endblock %}
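The template changes all follow one pattern: MLCommons-specific copy is wrapped in `{% if mlc_branding %}` and a neutral fallback renders otherwise. A self-contained sketch of that toggle, mimicking the title block above with an inline template rather than loading the real `index.html` (which also needs the `content` and `root_path` globals):

```python
# Sketch: the mlc_branding toggle reproduced with an inline Jinja template.
from jinja2 import Environment

title = Environment().from_string("{% if mlc_branding %}MLCommons {% endif %}AI Safety")

print(title.render(mlc_branding=True))   # MLCommons AI Safety
print(title.render(mlc_branding=False))  # AI Safety
```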
4 changes: 2 additions & 2 deletions src/modelbench/templates/macros/breadcrumb.html
@@ -1,8 +1,8 @@
{% macro breadcrumb(benchmark_score, benchmark_definition, page_type, view_embed) -%}
{% macro breadcrumb(benchmark_score, benchmark_definition, page_type, view_embed, mlc_branding) -%}
<nav aria-label="breadcrumb">
<ul>
{% if not view_embed %}
<li><a href="{{ root_path() }}">MLCommons</a></li>
<li><a href="{{ root_path() }}">{% if mlc_branding %}MLCommons{% else %}Home{% endif %}</a></li>
{% endif %}
{% if page_type == "benchmarks" %}
<li>Benchmarks</li>
8 changes: 4 additions & 4 deletions src/modelbench/templates/test_report.html
@@ -10,11 +10,11 @@

{% block content %}
<div class="mlc--section__topline">
{{ breadcrumb(benchmark_score, benchmark_score.benchmark_definition, page_type=page_type, view_embed=view_embed) }}
{{ breadcrumb(benchmark_score, benchmark_score.benchmark_definition, page_type=page_type, view_embed=view_embed, mlc_branding=mlc_branding) }}
</div>

<div class="mlc--section">
<h2>MLCommons AI Safety v0.5 Benchmark Report</h2>
<h2>{% if mlc_branding %}MLCommons AI Safety {% endif %}v0.5 Benchmark Report</h2>
<h1 class="mlc--header">{{ content(benchmark_score.sut, "name") }} - {{ content(benchmark_score.benchmark_definition, "name") }}</h1>
</div>

@@ -71,12 +71,12 @@ <h6 class="mlc--test-detail-header">Model UID</h6>


<hr>

{% if mlc_branding %}
<article class="mlc--card__muted-background">
<h4>Don't see the tests you are looking for?</h4>
<p>
{{ content("general", "new_tests") | safe }}
</p>
</article>

{% endif %}
{% endblock %}
2 changes: 2 additions & 0 deletions tests/data/custom_content/file1.toml
@@ -0,0 +1,2 @@
[general]
description = "new description"
11 changes: 9 additions & 2 deletions tests/templates/conftest.py
@@ -43,7 +43,7 @@ def grouped_benchmark_scores(end_time) -> dict[str, list[BenchmarkScore]]:
return group_by_key(scores, key=lambda x: x.benchmark_definition)


def _template_env(view_embed: bool = False) -> Environment:
def _template_env(view_embed: bool = False, custom_branding: pathlib.Path = None) -> Environment:
def update_dict_values(d: dict, parent_keys=[]) -> dict:
for k, v in d.items():
new_keys = parent_keys + [k]
@@ -55,7 +55,7 @@ def update_dict_values(d: dict, parent_keys=[]) -> dict:

template_dir = pathlib.Path(__file__).parent.parent.parent / "src" / "modelbench" / "templates"
env = Environment(loader=FileSystemLoader(template_dir))
ssg = StaticSiteGenerator(view_embed=view_embed)
ssg = StaticSiteGenerator(view_embed=view_embed, custom_branding=custom_branding)
env.globals["hsp"] = HazardScorePositions(min_bar_width=0.04, lowest_bar_percent=0.5)
env.globals["root_path"] = ssg.root_path
env.globals["benchmarks_path"] = ssg.benchmarks_path
@@ -74,3 +74,10 @@ def template_env() -> Environment:
@pytest.fixture()
def template_env_view_embed() -> Environment:
return _template_env(view_embed=True)


@pytest.fixture()
def template_env_mlc() -> Environment:
return _template_env(
custom_branding=pathlib.Path(__file__).parent.parent.parent / "src" / "modelbench" / "templates" / "content_mlc"
)
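The new `template_env_mlc` fixture points the generator at the bundled `content_mlc` directory, which is exactly the condition that flips `mlc_branding` on. A hypothetical test sketch along the same lines, not part of this commit; the relative path assumes the repository root as the working directory:

```python
# Hypothetical sketch: branding detection in StaticSiteGenerator.
import pathlib

from modelbench.static_site_generator import StaticSiteGenerator

MLC_CONTENT = pathlib.Path("src") / "modelbench" / "templates" / "content_mlc"

def test_mlc_branding_detected_for_bundled_content():
    ssg = StaticSiteGenerator(custom_branding=MLC_CONTENT)
    assert ssg.mlc_branding is True

def test_generic_branding_is_the_default():
    assert StaticSiteGenerator().mlc_branding is False
```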