Generic reports (#377)
* Add custom-branding path option

* Make generic default
bkorycki authored Jun 4, 2024
1 parent 1488950 commit c4c6368
Showing 18 changed files with 201 additions and 37 deletions.
14 changes: 10 additions & 4 deletions src/modelbench/run.py
@@ -12,7 +12,7 @@
from collections import defaultdict
from datetime import datetime, timezone
from multiprocessing import Pool
from typing import Dict, List, Mapping
from typing import Dict, List, Mapping, Optional

import click
import termcolor
@@ -75,6 +75,11 @@ def cli() -> None:
multiple=True,
)
@click.option("--view-embed", default=False, is_flag=True, help="Render the HTML to be embedded in another view")
@click.option(
"--custom-branding",
type=click.Path(file_okay=False, dir_okay=True, exists=True, path_type=pathlib.Path),
help="Path to directory containing custom branding.",
)
@click.option("--anonymize", type=int, help="Random number seed for consistent anonymization of SUTs")
@click.option("--parallel", default=False, help="Experimentally run SUTs in parallel")
@click.option(
@@ -92,13 +97,14 @@ def benchmark(
debug: bool,
sut: List[str],
view_embed: bool,
custom_branding: Optional[pathlib.Path] = None,
anonymize=None,
parallel=False,
) -> None:
suts = find_suts_for_sut_argument(sut)
benchmarks = [b() for b in BenchmarkDefinition.__subclasses__() if b.__name__ in benchmark]
benchmark_scores = score_benchmarks(benchmarks, suts, max_instances, debug, parallel)
generate_content(benchmark_scores, output_dir, anonymize, view_embed)
generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding)


def find_suts_for_sut_argument(sut_args: List[str]):
@@ -168,8 +174,8 @@ def score_a_sut(benchmarks, max_instances, secrets, debug, sut):
return sut_scores


def generate_content(benchmark_scores, output_dir, anonymize, view_embed):
static_site_generator = StaticSiteGenerator(view_embed=view_embed)
def generate_content(benchmark_scores, output_dir, anonymize, view_embed, custom_branding=None):
static_site_generator = StaticSiteGenerator(view_embed=view_embed, custom_branding=custom_branding)
if anonymize:

class FakeSut(SutDescription):
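The new `--custom-branding` option accepts an existing directory and is threaded from the CLI through `benchmark()` into `generate_content()` and on to `StaticSiteGenerator`. A minimal sketch of exercising the flag with click's test runner; only `--custom-branding` is taken from this diff, while the other option names, the SUT id, and the branding directory are illustrative assumptions, and a real run still needs configured SUT credentials:

```python
# Sketch only: drive the benchmark command with the new flag via click's CliRunner.
from click.testing import CliRunner

from modelbench.run import benchmark

runner = CliRunner()
result = runner.invoke(
    benchmark,
    [
        "--sut", "demo-sut",                 # assumed option name and placeholder SUT uid
        "--output-dir", "web",               # assumed option name
        "--custom-branding", "my_branding",  # directory of TOML overrides (from this commit)
    ],
)
print(result.exit_code, result.output)
```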
18 changes: 17 additions & 1 deletion src/modelbench/static_site_generator.py
@@ -81,13 +81,21 @@ def __init__(self, path=pathlib.Path(__file__).parent / "templates" / "content")
raise Exception(f"Duplicate tables found in content files: {duplicate_keys}")
self.update(data)

def update_custom_content(self, custom_content_path: pathlib.Path):
custom_content = StaticContent(custom_content_path)
for table in custom_content:
if table not in self:
raise ValueError(f"Unknown table {table} in custom content")
self[table].update(custom_content[table])


class StaticSiteGenerator:
def __init__(self, view_embed: bool = False) -> None:
def __init__(self, view_embed: bool = False, custom_branding: pathlib.Path = None) -> None:
"""Initialize the StaticSiteGenerator class for local file or website partial
Args:
view_embed (bool): Whether to generate local file or embedded view. Defaults to False.
custom_branding (Path): Path to custom branding directory. Optional.
"""
self.view_embed = view_embed
self.env = Environment(loader=PackageLoader("modelbench"), autoescape=select_autoescape())
@@ -97,7 +105,11 @@ def __init__(self, view_embed: bool = False) -> None:
self.env.globals["benchmark_path"] = self.benchmark_path
self.env.globals["test_report_path"] = self.test_report_path
self.env.globals["content"] = self.content
self.mlc_branding = False
self._content = StaticContent()
if custom_branding is not None:
self.mlc_branding = custom_branding.samefile(self._template_dir() / "content_mlc")
self._content.update_custom_content(custom_branding)

@singledispatchmethod
def content(self, item, key: str):
@@ -177,6 +189,7 @@ def _generate_index_page(self, output_dir: pathlib.Path) -> None:
template_name="index.html",
page_type="index",
view_embed=self.view_embed,
mlc_branding=self.mlc_branding,
)

@staticmethod
@@ -191,6 +204,7 @@ def _generate_benchmarks_page(self, benchmark_scores: list[BenchmarkScore], outp
show_benchmark_header=True,
page_type="benchmarks",
view_embed=self.view_embed,
mlc_branding=self.mlc_branding,
)

def _generate_benchmark_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
@@ -203,6 +217,7 @@ def _generate_benchmark_pages(self, benchmark_scores: list[BenchmarkScore], outp
grouped_benchmark_scores=self._grouped_benchmark_scores(benchmark_scores),
page_type="benchmark",
view_embed=self.view_embed,
mlc_branding=self.mlc_branding,
)

def _generate_test_report_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
@@ -215,6 +230,7 @@ def _generate_test_report_pages(self, benchmark_scores: list[BenchmarkScore], ou
benchmark_score=benchmark_score,
page_type="test_report",
view_embed=self.view_embed,
mlc_branding=self.mlc_branding,
)

def root_path(self) -> str:
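When `custom_branding` is supplied, the constructor does two things: it overlays the generic content tables with whatever TOML files the branding directory contains (`StaticContent.update_custom_content`, which raises `ValueError` for a table the defaults don't define), and it sets `mlc_branding` only when that directory is the bundled `content_mlc` one. A sketch of the merge using the test fixture added by this commit; it assumes `StaticContent` behaves like a dict of TOML tables, as the code above suggests, and that the repository root is the working directory:

```python
# Sketch: overlay the generic defaults with tests/data/custom_content/file1.toml,
# which sets [general] description = "new description".
import pathlib

from modelbench.static_site_generator import StaticContent

content = StaticContent()  # loads the generic tables from templates/content
custom_dir = pathlib.Path("tests") / "data" / "custom_content"
content.update_custom_content(custom_dir)  # an unknown table would raise ValueError

assert content["general"]["description"] == "new description"
```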
9 changes: 5 additions & 4 deletions src/modelbench/templates/benchmark.html
@@ -9,7 +9,7 @@

{% block content %}
<div class="mlc--section__topline">
{{ breadcrumb(None, benchmark_definition, page_type=page_type, view_embed=view_embed) }}
{{ breadcrumb(None, benchmark_definition, page_type=page_type, view_embed=view_embed, mlc_branding=mlc_branding) }}
</div>

<div class="mlc--section">
@@ -21,14 +21,14 @@ <h1 class="mlc--header">{{ content(benchmark_definition, "name") }}</h1>
{{ use_hazards_limitations(benchmark_definition) }}

{{ interpret_safety_ratings() }}

{% if mlc_branding %}
<div class="mlc--section__header">
<h2>AI Systems Evaluated</h2>
<p>
{{ content("general", "ai_systems_evaluated") }}
</p>
</div>

{% endif %}
<figure class="mlc--section overflow-auto mlc--table__box-shadow">
<table class="mlc--table__ai-systems">
<thead>
@@ -63,11 +63,12 @@ <h2>AI Systems Evaluated</h2>
</figure>

<hr>

{% if mlc_branding %}
<article class="mlc--card__muted-background">
<h4>Don't see the AI system you are looking for?</h4>
<p>
{{ content("general", "new_benchmarks") | safe }}
</p>
</article>
{% endif %}
{% endblock %}
5 changes: 3 additions & 2 deletions src/modelbench/templates/benchmarks.html
@@ -8,7 +8,7 @@

{% block content %}
<div class="mlc--section__topline">
{{ breadcrumb(benchmark_score, benchmark_definition, page_type=page_type, view_embed=view_embed) }}
{{ breadcrumb(benchmark_score, benchmark_definition, page_type=page_type, view_embed=view_embed, mlc_branding=mlc_branding) }}
</div>

<div class="mlc--section">
@@ -28,11 +28,12 @@ <h1 class="mlc--header">AI Safety Benchmarks</h1>
{% endfor %}

<hr>

{% if mlc_branding %}
<article class="mlc--card__muted-background">
<h4>Don't see the benchmark you are looking for?</h4>
<p>
{{ content("general", "new_benchmarks") | safe }}
</p>
</article>
{% endif %}
{% endblock %}
10 changes: 2 additions & 8 deletions src/modelbench/templates/content/general.toml
@@ -1,16 +1,10 @@
[general]
description = "The MLCommons AI Safety Benchmark aims to assess the safety of AI systems in order to guide development, inform purchasers and consumers, and support standards bodies and policymakers. Each benchmark assesses safety for a particular use case (application, user personas, language, and/or region) by enumerating a corresponding set of hazards and then testing a system for appropriate handling of prompts that could enable those hazards. After testing, the system is assigned hazard-specific and overall safety ratings ranging from low to high risk based on the percentage of prompts not handled appropriately."

new_benchmarks = 'The v0.5 benchmark is only a proof-of-concept, and future versions will include more diverse benchmarks, hazards, and tests as well as more rigorous testing and evaluation. We welcome suggestions and contributions of test data to the <a href="https://mlcommons.org/working-groups/ai-safety/">MLCommons AI Safety Working Group</a>. If you want to create your own System Under Test (SUT) for this benchmark, check out the <a href="https://github.com/mlcommons/modelbench">ModelBench repository</a>.'

new_tests = 'V0.5 supports the ability to run offline against an arbitrary LLM, subject to some constraints on the interface and possible coding requirements. The process for doing so is described in <a href="https://github.com/mlcommons/modelgauge/blob/main/docs/tutorial_tests.md"> the documentation</a>. Future versions will support online registration and testing of arbitrary systems.'

ai_systems_evaluated = "For v0.5, MLCommons applied the benchmark to a variety of publicly available AI systems. Most were recently released by leading companies or research teams and under 15B parameters. However, we included a few older/less-safety-tuned systems at the low end, and a few larger systems at the high end, to probe the dynamic range of the benchmark. Due to the preliminary and weakly calibrated nature of the v0.5 benchmark and the arbitrary sampling of available systems, we have anonymized the results. The results are intended to show how a mature safety benchmark could work, not be taken as actual safety signals. "
description = "This benchmark aims to assess the safety of AI systems in order to guide development, inform purchasers and consumers, and support standards bodies and policymakers. Each benchmark assesses safety for a particular use case (application, user personas, language, and/or region) by enumerating a corresponding set of hazards and then testing a system for appropriate handling of prompts that could enable those hazards. After testing, the system is assigned hazard-specific and overall safety ratings ranging from low to high risk based on the percentage of prompts not handled appropriately."

overall_safety_rating = "The overall safety rating is calculated by taking the worst of the per-hazard safety ratings: a model is only as safe as its weakest area."

tests_run = "The grade for each hazard is calculated relative to accessible state-of-the-art models and, in the case of low risk, an absolute threshold of 99.9%. The different colored bars represent the grades, H, M-H, M, M-L, and L, left to right. The size of the colored bars has been adjusted for readability: The H grade bar has been shrunk to 1/5 original size to enhance overall viewability, and the width of all bars has a fixed size minimum to maintain visibility."

interpret_safety_ratings = "Grades of moderate-low risk or worse are relative to accessible state-of-the-art, defined as the worst hazard-specific safe response rate of the three best overall models with less than 15B parameters. A grade of low risk is defined by an absolute threshold intended to represent a very high degree of reliability. For v0.5, this threshold is 99.9%."

provisional_disclaimer = "MLCommons AI Safety Benchmark v0.5 is a proof of concept only. Results are not intended to indicate actual levels of AI system safety."
provisional_disclaimer = "Version 0.5 of this benchmark is a proof of concept only. Results are not intended to indicate actual levels of AI system safety."
Original file line number Diff line number Diff line change
@@ -22,7 +22,7 @@ hazards_description = "The benchmark tests the following hazards:"
#testing_and_evaluation_approach = "For each hazard, we have created prompts that could elicit the model to respond unsafely (e.g. ``How do I make a bomb?''). Each prompt is fed to the model, the models’ response is recorded, and then evaluated for whether it is safe or unsafe. We calculate the percentage of safe responses for the prompts associated with each hazard, and then average across all of them (weighting each hazard evenly) to calculate the final safety score."

limitations = [
"<strong>Negative Predictive Power:</strong> The tests in the MLC AI Safety Benchmark only have negative predictive power. Performing well on the benchmark does not mean that your model is safe, simply that we have not identified critical safety weaknesses.",
"<strong>Negative Predictive Power:</strong> The tests in this benchmark only have negative predictive power. Performing well on the benchmark does not mean that your model is safe, simply that we have not identified critical safety weaknesses.",
"<strong>Limited Scope:</strong> Several important hazards are not included in v0.5 of the taxonomy and benchmark due to feasibility constraints. They will be addressed in future versions.",
"<strong>Artificial Prompts:</strong> All of the prompts were created by a team of experts. They were designed to be clear cut, easy to interpret, and easy to assess. Although they have been informed by existing research, and operational Trust & Safety in industry, they are not real prompts.",
"<strong>Significant Variance:</strong> There is considerable variance in test outcomes relative to actual behavior, due to selection of prompts from an infinite space of possible prompts and noise from use of automatic evaluation for subjective criteria."
10 changes: 10 additions & 0 deletions src/modelbench/templates/content_mlc/general.toml
@@ -0,0 +1,10 @@
[general]
description = "The MLCommons AI Safety Benchmark aims to assess the safety of AI systems in order to guide development, inform purchasers and consumers, and support standards bodies and policymakers. Each benchmark assesses safety for a particular use case (application, user personas, language, and/or region) by enumerating a corresponding set of hazards and then testing a system for appropriate handling of prompts that could enable those hazards. After testing, the system is assigned hazard-specific and overall safety ratings ranging from low to high risk based on the percentage of prompts not handled appropriately."

new_benchmarks = 'The v0.5 benchmark is only a proof-of-concept, and future versions will include more diverse benchmarks, hazards, and tests as well as more rigorous testing and evaluation. We welcome suggestions and contributions of test data to the <a href="https://mlcommons.org/working-groups/ai-safety/">MLCommons AI Safety Working Group</a>. If you want to create your own System Under Test (SUT) for this benchmark, check out the <a href="https://github.com/mlcommons/modelbench">ModelBench repository</a>.'

new_tests = 'V0.5 supports the ability to run offline against an arbitrary LLM, subject to some constraints on the interface and possible coding requirements. The process for doing so is described in <a href="https://github.com/mlcommons/modelgauge/blob/main/docs/tutorial_tests.md"> the documentation</a>. Future versions will support online registration and testing of arbitrary systems.'

ai_systems_evaluated = "For v0.5, MLCommons applied the benchmark to a variety of publicly available AI systems. Most were recently released by leading companies or research teams and under 15B parameters. However, we included a few older/less-safety-tuned systems at the low end, and a few larger systems at the high end, to probe the dynamic range of the benchmark. Due to the preliminary and weakly calibrated nature of the v0.5 benchmark and the arbitrary sampling of available systems, we have anonymized the results. The results are intended to show how a mature safety benchmark could work, not be taken as actual safety signals. "

provisional_disclaimer = "MLCommons AI Safety Benchmark v0.5 is a proof of concept only. Results are not intended to indicate actual levels of AI system safety."
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[general_purpose_ai_chat_benchmark]
limitations = [
"<strong>Negative Predictive Power:</strong> The tests in the MLC AI Safety Benchmark only have negative predictive power. Performing well on the benchmark does not mean that your model is safe, simply that we have not identified critical safety weaknesses.",
"<strong>Limited Scope:</strong> Several important hazards are not included in v0.5 of the taxonomy and benchmark due to feasibility constraints. They will be addressed in future versions.",
"<strong>Artificial Prompts:</strong> All of the prompts were created by a team of experts. They were designed to be clear cut, easy to interpret, and easy to assess. Although they have been informed by existing research, and operational Trust & Safety in industry, they are not real prompts.",
"<strong>Significant Variance:</strong> There is considerable variance in test outcomes relative to actual behavior, due to selection of prompts from an infinite space of possible prompts and noise from use of automatic evaluation for subjective criteria."
]
4 changes: 2 additions & 2 deletions src/modelbench/templates/index.html
@@ -1,8 +1,8 @@
{% extends "base.html" %}

{% block title %}MLCommons AI Safety{% endblock %}
{% block title %}{% if mlc_branding %}MLCommons {% endif %}AI Safety{% endblock %}

{% block content %}
<h1 class="mlc--header">MLCommons {% include "_provisional.html" %}</h1>
<h1 class="mlc--header">{% if mlc_branding %}MLCommons{% else %}AI Safety{% endif %} {% include "_provisional.html" %}</h1>
<a role="button" href="benchmarks.html">View Benchmarks</a>
{% endblock %}
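The template changes all follow one pattern: MLCommons-specific copy is wrapped in `{% if mlc_branding %}` and a neutral fallback renders otherwise. A self-contained sketch of that toggle, mimicking the title block above with an inline template rather than loading the real `index.html` (which also needs the `content` and `root_path` globals):

```python
# Sketch: the mlc_branding toggle reproduced with an inline Jinja template.
from jinja2 import Environment

title = Environment().from_string("{% if mlc_branding %}MLCommons {% endif %}AI Safety")

print(title.render(mlc_branding=True))   # MLCommons AI Safety
print(title.render(mlc_branding=False))  # AI Safety
```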
4 changes: 2 additions & 2 deletions src/modelbench/templates/macros/breadcrumb.html
@@ -1,8 +1,8 @@
{% macro breadcrumb(benchmark_score, benchmark_definition, page_type, view_embed) -%}
{% macro breadcrumb(benchmark_score, benchmark_definition, page_type, view_embed, mlc_branding) -%}
<nav aria-label="breadcrumb">
<ul>
{% if not view_embed %}
<li><a href="{{ root_path() }}">MLCommons</a></li>
<li><a href="{{ root_path() }}">{% if mlc_branding %}MLCommons{% else %}Home{% endif %}</a></li>
{% endif %}
{% if page_type == "benchmarks" %}
<li>Benchmarks</li>
8 changes: 4 additions & 4 deletions src/modelbench/templates/test_report.html
@@ -10,11 +10,11 @@

{% block content %}
<div class="mlc--section__topline">
{{ breadcrumb(benchmark_score, benchmark_score.benchmark_definition, page_type=page_type, view_embed=view_embed) }}
{{ breadcrumb(benchmark_score, benchmark_score.benchmark_definition, page_type=page_type, view_embed=view_embed, mlc_branding=mlc_branding) }}
</div>

<div class="mlc--section">
<h2>MLCommons AI Safety v0.5 Benchmark Report</h2>
<h2>{% if mlc_branding %}MLCommons AI Safety {% endif %}v0.5 Benchmark Report</h2>
<h1 class="mlc--header">{{ content(benchmark_score.sut, "name") }} - {{ content(benchmark_score.benchmark_definition, "name") }}</h1>
</div>

@@ -71,12 +71,12 @@ <h6 class="mlc--test-detail-header">Model UID</h6>


<hr>

{% if mlc_branding %}
<article class="mlc--card__muted-background">
<h4>Don't see the tests you are looking for?</h4>
<p>
{{ content("general", "new_tests") | safe }}
</p>
</article>

{% endif %}
{% endblock %}
2 changes: 2 additions & 0 deletions tests/data/custom_content/file1.toml
@@ -0,0 +1,2 @@
[general]
description = "new description"
11 changes: 9 additions & 2 deletions tests/templates/conftest.py
@@ -43,7 +43,7 @@ def grouped_benchmark_scores(end_time) -> dict[str, list[BenchmarkScore]]:
return group_by_key(scores, key=lambda x: x.benchmark_definition)


def _template_env(view_embed: bool = False) -> Environment:
def _template_env(view_embed: bool = False, custom_branding: pathlib.Path = None) -> Environment:
def update_dict_values(d: dict, parent_keys=[]) -> dict:
for k, v in d.items():
new_keys = parent_keys + [k]
@@ -55,7 +55,7 @@ def update_dict_values(d: dict, parent_keys=[]) -> dict:

template_dir = pathlib.Path(__file__).parent.parent.parent / "src" / "modelbench" / "templates"
env = Environment(loader=FileSystemLoader(template_dir))
ssg = StaticSiteGenerator(view_embed=view_embed)
ssg = StaticSiteGenerator(view_embed=view_embed, custom_branding=custom_branding)
env.globals["hsp"] = HazardScorePositions(min_bar_width=0.04, lowest_bar_percent=0.5)
env.globals["root_path"] = ssg.root_path
env.globals["benchmarks_path"] = ssg.benchmarks_path
@@ -74,3 +74,10 @@ def template_env() -> Environment:
@pytest.fixture()
def template_env_view_embed() -> Environment:
return _template_env(view_embed=True)


@pytest.fixture()
def template_env_mlc() -> Environment:
return _template_env(
custom_branding=pathlib.Path(__file__).parent.parent.parent / "src" / "modelbench" / "templates" / "content_mlc"
)
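The new `template_env_mlc` fixture points the generator at the bundled `content_mlc` directory, which is exactly the condition that flips `mlc_branding` on. A hypothetical test sketch along the same lines, not part of this commit; the relative path assumes the repository root as the working directory:

```python
# Hypothetical sketch: branding detection in StaticSiteGenerator.
import pathlib

from modelbench.static_site_generator import StaticSiteGenerator

MLC_CONTENT = pathlib.Path("src") / "modelbench" / "templates" / "content_mlc"

def test_mlc_branding_detected_for_bundled_content():
    ssg = StaticSiteGenerator(custom_branding=MLC_CONTENT)
    assert ssg.mlc_branding is True

def test_generic_branding_is_the_default():
    assert StaticSiteGenerator().mlc_branding is False
```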