diff --git a/src/coffee/templates/_breadcrumb.html b/src/coffee/templates/_breadcrumb.html deleted file mode 100644 index 4326bd3a..00000000 --- a/src/coffee/templates/_breadcrumb.html +++ /dev/null @@ -1,14 +0,0 @@ - \ No newline at end of file diff --git a/src/coffee/templates/_interpret_safety_ratings.html b/src/coffee/templates/_interpret_safety_ratings.html deleted file mode 100644 index 62eb53f1..00000000 --- a/src/coffee/templates/_interpret_safety_ratings.html +++ /dev/null @@ -1,38 +0,0 @@ -
-

How to Interpret Safety Ratings?

-

Description goes here lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor - incididunt ut labore et dolore magna aliqua. - Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo - consequat.

-
- -
-   -
-
- {% for i in range(5, 2, -1) %} -
-

{{ stars_description[i]['rank'] }}

-
- {{ (i) | display_stars("sm") }} -
-

{{ stars_description[i]['short_description'] }}

-
- {% endfor %} -
- -
- {% for i in range(2, 0, -1) %} -
-

{{ stars_description[i]["rank"] }}

-
- {{ (i) | display_stars("sm") }} -
-

{{ stars_description[i]["short_description"] }}

-
- {% endfor %} -
-
-
-
-
\ No newline at end of file diff --git a/src/coffee/templates/_use_harms_limitations.html b/src/coffee/templates/_use_harms_limitations.html deleted file mode 100644 index 1411ba26..00000000 --- a/src/coffee/templates/_use_harms_limitations.html +++ /dev/null @@ -1,49 +0,0 @@ -{% if not benchmark_definition %} - {% set benchmark_definition = benchmark_score.benchmark_definition %} -{% endif %} - -
-
-

Use Case

-

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore - et dolore quis nostrud exercitation ullamco laboris magna aliqua.

- - -
-
-
-

Harms Tested

-

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore - et dolore quis nostrud exercitation ullamco laboris magna aliqua.

- - -
-
- -
-

- Limitations -

- -
\ No newline at end of file diff --git a/src/coffee/templates/benchmark.html b/src/coffee/templates/benchmark.html index dc08b2d8..cef6e20b 100644 --- a/src/coffee/templates/benchmark.html +++ b/src/coffee/templates/benchmark.html @@ -1,3 +1,7 @@ +{% from "macros/breadcrumb.html" import breadcrumb %} +{% from "macros/interpret_safety_ratings.html" import interpret_safety_ratings %} +{% from "macros/use_harms_limitations.html" import use_harms_limitations %} + {% extends "base.html" %} {% block title %}{{ benchmark_name }} Benchmark{% endblock %} @@ -5,7 +9,7 @@ {% block content %}
- {% include "_breadcrumb.html" %} + {{ breadcrumb(None, benchmark_definition) }}

{{ benchmark_definition.name() }} Provisional

@@ -16,9 +20,9 @@

{{ benchmark_definition.name() }} Provision

- {% include "_use_harms_limitations.html" %} + {{ use_harms_limitations(benchmark_definition) }} - {% include "_interpret_safety_ratings.html" %} + {{ interpret_safety_ratings(stars_description) }}  
diff --git a/src/coffee/templates/benchmarks.html b/src/coffee/templates/benchmarks.html index 036de67f..fcbbc18d 100644 --- a/src/coffee/templates/benchmarks.html +++ b/src/coffee/templates/benchmarks.html @@ -1,3 +1,6 @@ +{% from "macros/benchmark_card.html" import benchmark_card %} +{% from "macros/breadcrumb.html" import breadcrumb %} + {% extends "base.html" %} {% block title %}Benchmarks{% endblock %} @@ -5,7 +8,7 @@ {% block content %}
- {% include "_breadcrumb.html" %} + {{ breadcrumb(benchmark_score, benchmark_definition) }}

AIP Safety Benchmarks Provisional

@@ -21,7 +24,7 @@

AIP Safety Benchmarks Provisi

{% for benchmark_definition in grouped_benchmark_scores %} - {% include "_benchmark_card.html" %} + {{ benchmark_card(True, benchmark_definition) }}
 
{% endfor %} @@ -38,4 +41,4 @@

-{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/src/coffee/templates/_benchmark_card.html b/src/coffee/templates/macros/benchmark_card.html similarity index 91% rename from src/coffee/templates/_benchmark_card.html rename to src/coffee/templates/macros/benchmark_card.html index 5203fe0a..f50a5ecf 100644 --- a/src/coffee/templates/_benchmark_card.html +++ b/src/coffee/templates/macros/benchmark_card.html @@ -1,3 +1,4 @@ +{% macro benchmark_card(show_benchmark_header, benchmark_definition) %}
{% if show_benchmark_header %}
@@ -21,3 +22,4 @@

{{ benchmark_definition.name() }} Benchmark

+{%- endmacro %} diff --git a/src/coffee/templates/macros/breadcrumb.html b/src/coffee/templates/macros/breadcrumb.html new file mode 100644 index 00000000..7f6835a6 --- /dev/null +++ b/src/coffee/templates/macros/breadcrumb.html @@ -0,0 +1,16 @@ +{% macro breadcrumb(benchmark_score, benchmark_definition) %} + +{%- endmacro %} diff --git a/src/coffee/templates/macros/interpret_safety_ratings.html b/src/coffee/templates/macros/interpret_safety_ratings.html new file mode 100644 index 00000000..fc418527 --- /dev/null +++ b/src/coffee/templates/macros/interpret_safety_ratings.html @@ -0,0 +1,40 @@ +{% macro interpret_safety_ratings(stars_description) %} +
+

How to Interpret Safety Ratings?

+

Description goes here lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor + incididunt ut labore et dolore magna aliqua. + Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+
+ +
+   +
+
+ {% for i in range(5, 2, -1) %} +
+

{{ stars_description[i]['rank'] }}

+
+ {{ (i) | display_stars("sm") }} +
+

{{ stars_description[i]['short_description'] }}

+
+ {% endfor %} +
+ +
+ {% for i in range(2, 0, -1) %} +
+

{{ stars_description[i]["rank"] }}

+
+ {{ (i) | display_stars("sm") }} +
+

{{ stars_description[i]["short_description"] }}

+
+ {% endfor %} +
+
+
+
+
+{%- endmacro %} diff --git a/src/coffee/templates/_sut_card.html b/src/coffee/templates/macros/sut_card.html similarity index 96% rename from src/coffee/templates/_sut_card.html rename to src/coffee/templates/macros/sut_card.html index 3eeddd2d..cb383008 100644 --- a/src/coffee/templates/_sut_card.html +++ b/src/coffee/templates/macros/sut_card.html @@ -1,3 +1,4 @@ +{% macro sut_card(benchmark_score, stars_description) %}
@@ -38,3 +39,4 @@

{{ stars_description[benchmark_score.stars() | round | int]["ra

+{%- endmacro %} diff --git a/src/coffee/templates/macros/use_harms_limitations.html b/src/coffee/templates/macros/use_harms_limitations.html new file mode 100644 index 00000000..398fef15 --- /dev/null +++ b/src/coffee/templates/macros/use_harms_limitations.html @@ -0,0 +1,47 @@ +{% macro use_harms_limitations(benchmark_definition) %} +
+
+

Use Case

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore + et dolore quis nostrud exercitation ullamco laboris magna aliqua.

+ + +
+
+
+

Harms Tested

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore + et dolore quis nostrud exercitation ullamco laboris magna aliqua.

+ + +
+
+ +
+

+ Limitations +

+ +
+{%- endmacro %} diff --git a/src/coffee/templates/test_report.html b/src/coffee/templates/test_report.html index c6d6eadc..d7e425db 100644 --- a/src/coffee/templates/test_report.html +++ b/src/coffee/templates/test_report.html @@ -1,3 +1,8 @@ +{% from "macros/breadcrumb.html" import breadcrumb %} +{% from "macros/interpret_safety_ratings.html" import interpret_safety_ratings %} +{% from "macros/sut_card.html" import sut_card %} +{% from "macros/use_harms_limitations.html" import use_harms_limitations %} + {% extends "base.html" %} {% block title %}{{ benchmark_name }} Benchmark{% endblock %} @@ -5,7 +10,7 @@ {% block content %}
- {% include "_breadcrumb.html" %} + {{ breadcrumb(benchmark_score, benchmark_score.benchmark_definition) }}

Test Report

@@ -18,9 +23,9 @@

{{ benchmark_score.sut.name }} - {{ benchmark_score.benchmark_definition.nam

- {% include "_use_harms_limitations.html" %} + {{ use_harms_limitations(benchmark_score.benchmark_definition) }} - {% include "_interpret_safety_ratings.html" %} + {{ interpret_safety_ratings(stars_description) }}

Overall Safety Rating

@@ -29,7 +34,7 @@

Overall Safety Rating

nostrud

- {% include "_sut_card.html" %} + {{ sut_card(benchmark_score, stars_description) }}

Tests Run

diff --git a/tests/templates/conftest.py b/tests/templates/conftest.py new file mode 100644 index 00000000..cf347949 --- /dev/null +++ b/tests/templates/conftest.py @@ -0,0 +1,57 @@ +from itertools import groupby +import pytest +import pathlib + +from jinja2 import Environment, FileSystemLoader + +from coffee.helm import HelmSut +from coffee.benchmark import ( + GeneralChatBotBenchmarkDefinition, + BiasHarmDefinition, + HarmScore, + BenchmarkScore, + ToxicityHarmDefinition, +) +from coffee.static_site_generator import STARS_DESCRIPTION, display_stars + + +def _benchmark_score() -> BenchmarkScore: + bd = GeneralChatBotBenchmarkDefinition() + bs = BenchmarkScore( + bd, + HelmSut.GPT2, + [ + HarmScore(BiasHarmDefinition(), 0.5), + HarmScore(ToxicityHarmDefinition(), 0.8), + ], + ) + return bs + + +@pytest.fixture() +def benchmark_score() -> BenchmarkScore: + return _benchmark_score() + + +@pytest.fixture() +def grouped_benchmark_scores() -> dict[str, list[BenchmarkScore]]: + benchmark_scores_dict = {} + for benchmark_definition, grouped_benchmark_scores in groupby( + [_benchmark_score()], lambda x: x.benchmark_definition + ): + grouped_benchmark_scores_list: list = list(grouped_benchmark_scores) + benchmark_scores_dict[benchmark_definition] = grouped_benchmark_scores_list + return benchmark_scores_dict + + +@pytest.fixture() +def template_env() -> Environment: + template_dir = pathlib.Path(__file__).parent.parent.parent / "src" / "coffee" / "templates" + env = Environment(loader=FileSystemLoader(template_dir)) + env.filters["display_stars"] = display_stars + return env + + +@pytest.fixture() +def stars_description() -> dict[int, dict[str, str]]: + return STARS_DESCRIPTION diff --git a/tests/templates/macros/test_benchmark_card.py b/tests/templates/macros/test_benchmark_card.py new file mode 100644 index 00000000..53391e34 --- /dev/null +++ b/tests/templates/macros/test_benchmark_card.py @@ -0,0 +1,8 @@ +def test_display_benchmark_card(benchmark_score, template_env): + 
template = template_env.get_template("macros/benchmark_card.html") + result = template.module.benchmark_card(True, benchmark_score.benchmark_definition) + assert "General Chat Bot Benchmark" in result + assert "Lorem ipsum dolor sit amet" in result + result_no_header = template.module.benchmark_card(False, benchmark_score.benchmark_definition) + assert "General Chat Bot Benchmark" not in result_no_header + assert "Lorem ipsum dolor sit amet" in result_no_header diff --git a/tests/templates/macros/test_breadcrumb.py b/tests/templates/macros/test_breadcrumb.py new file mode 100644 index 00000000..4f982526 --- /dev/null +++ b/tests/templates/macros/test_breadcrumb.py @@ -0,0 +1,10 @@ +def test_display_breadcrumb(benchmark_score, template_env): + template = template_env.get_template("macros/breadcrumb.html") + result = template.module.breadcrumb(benchmark_score, benchmark_score.benchmark_definition) + assert "Benchmarks" in result + assert "General Chat Bot" in result + assert 'href="general_chat_bot_benchmark.html"' in result + result = template.module.breadcrumb(None, benchmark_score.benchmark_definition) + assert "Benchmarks" in result + assert 'href="general_chat_bot_benchmark.html"' not in result + assert "General Chat Bot" in result diff --git a/tests/templates/macros/test_interpret_safety_ratings.py b/tests/templates/macros/test_interpret_safety_ratings.py new file mode 100644 index 00000000..e082d302 --- /dev/null +++ b/tests/templates/macros/test_interpret_safety_ratings.py @@ -0,0 +1,8 @@ +def test_interpret_safety_ratings(stars_description, template_env): + template = template_env.get_template("macros/interpret_safety_ratings.html") + result = template.module.interpret_safety_ratings(stars_description) + assert "How to Interpret Safety Ratings?" in result + assert "Description goes here" in result + assert "At or above moderate quality reference system." in result + assert "Below minimum quality reference system." 
in result + assert "Excellent" in result diff --git a/tests/templates/macros/test_sut_card.py b/tests/templates/macros/test_sut_card.py new file mode 100644 index 00000000..0ae18a84 --- /dev/null +++ b/tests/templates/macros/test_sut_card.py @@ -0,0 +1,6 @@ +def test_display_sut(benchmark_score, stars_description, template_env): + template = template_env.get_template("macros/sut_card.html") + result = template.module.sut_card(benchmark_score, stars_description) + assert 'Model performance can be compared to a "moderate quality"' in result + assert "66% safe responses" in result + assert "What does 'Fair' mean?" in result diff --git a/tests/templates/macros/test_use_harms_limitations.py b/tests/templates/macros/test_use_harms_limitations.py new file mode 100644 index 00000000..2cc88544 --- /dev/null +++ b/tests/templates/macros/test_use_harms_limitations.py @@ -0,0 +1,7 @@ +def test_use_harms_limitations(benchmark_score, template_env): + template = template_env.get_template("macros/use_harms_limitations.html") + result = template.module.use_harms_limitations(benchmark_score.benchmark_definition) + assert "Use Case" in result + assert "Lorem ipsum dolor sit amet" in result + assert "Harms Tested" in result + assert "Limitations" in result diff --git a/tests/templates/test_benchmark.py b/tests/templates/test_benchmark.py new file mode 100644 index 00000000..a8c88e27 --- /dev/null +++ b/tests/templates/test_benchmark.py @@ -0,0 +1,14 @@ +def test_benchmark(benchmark_score, template_env, stars_description, grouped_benchmark_scores): + template = template_env.get_template("benchmark.html") + result = template.render( + benchmark_score=benchmark_score, + benchmark_definition=benchmark_score.benchmark_definition, + grouped_benchmark_scores=grouped_benchmark_scores, + stars_description=stars_description, + ) + assert "General Chat Bot" in result + assert "Harms Tested" in result + assert "How to Interpret Safety Ratings?" 
in result + assert "Below minimum quality reference system." in result + assert "AI Systems Evaluated" in result + assert "Privacy Policy" in result diff --git a/tests/templates/test_benchmarks.py b/tests/templates/test_benchmarks.py new file mode 100644 index 00000000..a0003533 --- /dev/null +++ b/tests/templates/test_benchmarks.py @@ -0,0 +1,11 @@ +def test_benchmarks(benchmark_score, template_env, stars_description, grouped_benchmark_scores): + template = template_env.get_template("benchmarks.html") + result = template.render( + benchmark_score=benchmark_score, + benchmark_definition=benchmark_score.benchmark_definition, + grouped_benchmark_scores=grouped_benchmark_scores, + stars_description=stars_description, + ) + assert "AIP Safety Benchmarks" in result + assert "General Chat Bot Benchmark" in result + assert "Privacy Policy" in result diff --git a/tests/templates/test_test_report.py b/tests/templates/test_test_report.py new file mode 100644 index 00000000..f2ea1904 --- /dev/null +++ b/tests/templates/test_test_report.py @@ -0,0 +1,13 @@ +def test_test_report(benchmark_score, template_env, stars_description): + template = template_env.get_template("test_report.html") + result = template.render( + benchmark_score=benchmark_score, + stars_description=stars_description, + ) + assert "Test Report" in result + assert "66% safe responses" in result + assert "Tests Run" in result + assert "Test Details" in result + assert "General Chat Bot" in result + assert "Don't see the tests you are looking for?" in result + assert "Privacy Policy" in result