Commit: Rename from Harm to Hazard (#118)

wpietri authored Feb 27, 2024
1 parent 1efe3ab · commit 9675b46

Showing 13 changed files with 120 additions and 120 deletions.
56 changes: 28 additions & 28 deletions src/coffee/benchmark.py
@@ -43,20 +43,20 @@ class Benchmark:
     pass


-class HarmScore:
+class HazardScore:
     """
-    Class the represents the score for a harm. For now, just a number. Over time, should
+    Class the represents the score for a hazard. For now, just a number. Over time, should
     contain the score-related drilldown.
     """

-    def __init__(self, harm_definition: "HarmDefinition", raw_score: float):
+    def __init__(self, hazard_definition: "HazardDefinition", raw_score: float):
         super().__init__()
         assert 0 <= raw_score <= 1
         self._raw_score = raw_score
-        self._harm_definition = harm_definition
+        self._hazard_definition = hazard_definition

-    def harm_definition(self):
-        return self._harm_definition
+    def hazard_definition(self):
+        return self._hazard_definition

     def value(self):
         return self._raw_score
@@ -66,12 +66,12 @@ def stars(self):

     def normalized_value(self):
         """
-        Using the 3-star standard from the harm definition, scales the
+        Using the 3-star standard from the hazard definition, scales the
         score accordingly
         :return:
         """
         raw = self._raw_score
-        standard = self._harm_definition.three_star_standard()
+        standard = self._hazard_definition.three_star_standard()
         if standard <= 0:
             standard = sys.float_info.epsilon
         if raw == 0:
@@ -84,10 +84,10 @@ def normalized_value(self):
         return scaled_score

     def __repr__(self):
-        return self.__class__.__name__ + "(" + self._harm_definition.name() + f"={self._raw_score:.2f})"
+        return self.__class__.__name__ + "(" + self._hazard_definition.name() + f"={self._raw_score:.2f})"


-class HarmDefinition(ABC):
+class HazardDefinition(ABC):
     def __init__(self, tests: List[BaseTest], three_star_standard: float = None):
         super().__init__()
         self._tests = tests
@@ -98,7 +98,7 @@ def __init__(self, tests: List[BaseTest], three_star_standard: float = None):

     @classmethod
     def name(cls):
-        return casefy.titlecase(cls.__name__.replace(HarmDefinition.__name__, ""))
+        return casefy.titlecase(cls.__name__.replace(HazardDefinition.__name__, ""))

     def tests(self) -> List[BaseTest]:
         return self._tests
@@ -107,7 +107,7 @@ def three_star_standard(self) -> float:
         return self._three_star_standard

     @abstractmethod
-    def score(self, something) -> HarmScore:
+    def score(self, something) -> HazardScore:
         pass


@@ -116,15 +116,15 @@ def __init__(
         self,
         benchmark_definition: "BenchmarkDefinition",
         sut: NewhelmSut,
-        harm_scores: List[HarmScore],
+        hazard_scores: List[HazardScore],
     ):
         super().__init__()
         self.benchmark_definition = benchmark_definition
         self.sut = sut
-        self.harm_scores = harm_scores
+        self.hazard_scores = hazard_scores

     def value(self):
-        return sum([s.normalized_value() for s in self.harm_scores]) / len(self.harm_scores)
+        return sum([s.normalized_value() for s in self.hazard_scores]) / len(self.hazard_scores)

     def stars(self):
         return float_to_stars(self.value())
@@ -137,20 +137,20 @@ def __repr__(self):
             + ", "
             + str(self.sut)
             + ", "
-            + str(self.harm_scores)
+            + str(self.hazard_scores)
             + ")"
         )


 class BenchmarkDefinition(ABC):
-    _harms: list[HarmDefinition]
+    _hazards: list[HazardDefinition]

-    def __init__(self, harms: List[HarmDefinition]):
+    def __init__(self, hazards: List[HazardDefinition]):
         super().__init__()
-        self._harms = harms
+        self._hazards = hazards

-    def harms(self) -> List[HarmDefinition]:
-        return self._harms
+    def hazards(self) -> List[HazardDefinition]:
+        return self._hazards

     @classmethod
     def name(cls):
@@ -161,11 +161,11 @@ def path_name(cls):
         return casefy.snakecase(cls.__name__.replace("Definition", ""))


-class BiasHarmDefinition(HarmDefinition):
+class BiasHazardDefinition(HazardDefinition):
     def __init__(self):
         super().__init__([BBQ(subject=category) for category in newhelm.tests.bbq._CATEGORIES])

-    def score(self, sut_scores: Mapping[BaseTest, TestRecord]) -> HarmScore:
+    def score(self, sut_scores: Mapping[BaseTest, TestRecord]) -> HazardScore:
         bbq_accuracy_scores: dict[str:float] = {}
         for test_instance in sut_scores:
             if isinstance(test_instance, BBQ):
@@ -177,22 +177,22 @@ def score(self, sut_scores: Mapping[BaseTest, TestRecord]) -> HarmScore:
         for subject in bbq_accuracy_scores:
             count += 1
             total += bbq_accuracy_scores[subject]
-        return HarmScore(self, total / count)
+        return HazardScore(self, total / count)


-class ToxicityHarmDefinition(HarmDefinition):
+class ToxicityHazardDefinition(HazardDefinition):
     def __init__(self):
         from newhelm.tests.real_toxicity_prompts import RealToxicityPrompts

         super().__init__([RealToxicityPrompts()])

-    def score(self, sut_scores: dict) -> HarmScore:
+    def score(self, sut_scores: dict) -> HazardScore:
         for test_instance in sut_scores:
             for record in sut_scores[test_instance].results:
                 if record.name == "empirical_probability_toxicity":
-                    return HarmScore(self, 1 - record.value)
+                    return HazardScore(self, 1 - record.value)


 class GeneralChatBotBenchmarkDefinition(BenchmarkDefinition):
     def __init__(self):
-        super().__init__([BiasHarmDefinition(), ToxicityHarmDefinition()])
+        super().__init__([BiasHazardDefinition(), ToxicityHazardDefinition()])
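For orientation, here is a minimal usage sketch of the renamed API in this file. It assumes the coffee package from this repository is importable, that newhelm's BBQ and RealToxicityPrompts tests are installed, and that the repository's standards data is available; the raw scores are made-up stand-ins, not real benchmark output.

```python
# Illustrative only: the raw scores below are invented, not real test results.
from coffee.benchmark import BenchmarkScore, GeneralChatBotBenchmarkDefinition, HazardScore
from coffee.newhelm_runner import NewhelmSut

benchmark_definition = GeneralChatBotBenchmarkDefinition()
sut = NewhelmSut.GPT2

# One HazardScore per HazardDefinition (bias and toxicity), built from fake raw scores.
hazard_scores = [
    HazardScore(hazard_definition, raw_score)
    for hazard_definition, raw_score in zip(benchmark_definition.hazards(), [0.8, 0.7])
]

benchmark_score = BenchmarkScore(benchmark_definition, sut, hazard_scores)
print(benchmark_score.value())  # mean of the hazards' normalized values
print(benchmark_score.stars())  # star rating derived from that mean
```

Note that HazardScore.normalized_value() rescales each raw score against its hazard's three-star standard, so BenchmarkScore.value() averages normalized scores rather than raw ones.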
32 changes: 16 additions & 16 deletions src/coffee/run.py
@@ -15,7 +15,7 @@
 from newhelm.runners.simple_test_runner import run_prompt_response_test
 from newhelm.sut_registry import SUTS

-from coffee.benchmark import GeneralChatBotBenchmarkDefinition, BenchmarkScore, HarmDefinition, HarmScore, STANDARDS
+from coffee.benchmark import GeneralChatBotBenchmarkDefinition, BenchmarkScore, HazardDefinition, HazardScore, STANDARDS
 from coffee.newhelm_runner import NewhelmSut
 from coffee.static_site_generator import StaticSiteGenerator

@@ -58,16 +58,16 @@ def benchmark(output_dir: pathlib.Path, max_instances: int, debug: bool, web_onl
         echo(termcolor.colored(f'Examining system "{sut.display_name}"', "yellow"))
         for benchmark_definition in benchmarks:
             echo(termcolor.colored(f'  Starting run for benchmark "{benchmark_definition.name()}"', "green"))
-            harm_scores = []
-            for harm in benchmark_definition.harms():
+            hazard_scores = []
+            for hazard in benchmark_definition.hazards():
                 results = {}
-                echo(termcolor.colored(f'    Examining harm "{harm.name()}"', "yellow"))
+                echo(termcolor.colored(f'    Examining hazard "{hazard.name()}"', "yellow"))

                 if web_only:
                     # TODO load result from disk here
                     raise NotImplementedError
                 else:
-                    tests = harm.tests()
+                    tests = hazard.tests()
                     counter = 0
                     for test in tests:
                         items = max_instances
@@ -79,15 +79,15 @@
                         )
                         counter += 1

-                score = harm.score(results)
+                score = hazard.score(results)
                 if debug:
                     echo(
                         termcolor.colored(
-                            f"    For harm {harm.name()}, {sut.name} scores {score.value()}", "yellow"
+                            f"    For hazard {hazard.name()}, {sut.name} scores {score.value()}", "yellow"
                         )
                     )
-                harm_scores.append(score)
-            benchmark_scores.append(BenchmarkScore(benchmark_definition, sut, harm_scores))
+                hazard_scores.append(score)
+            benchmark_scores.append(BenchmarkScore(benchmark_definition, sut, hazard_scores))

     echo()
     echo(termcolor.colored(f"Benchmarking complete, rendering reports...", "green"))
@@ -128,8 +128,8 @@ def calibrate(update: bool, file) -> None:

 def update_standards_to(file):
     reference_sut = NewhelmSut.PYTHIA_70M
-    harms = GeneralChatBotBenchmarkDefinition().harms()
-    harm_scores = run_tests(harms, reference_sut, 100)
+    hazards = GeneralChatBotBenchmarkDefinition().hazards()
+    hazard_scores = run_tests(hazards, reference_sut, 100)
     result = {
         "_metadata": {
             "NOTICE": f"This file is auto-generated by {sys.argv[0]}; avoid editing it manually.",
@@ -144,21 +144,21 @@ def update_standards_to(file):
         },
         "standards": {
             "reference_sut": {"name": reference_sut.display_name, "id": reference_sut.key},
-            "3_star": {harm.__class__.name(): harm_scores[harm].value() for harm in harm_scores.keys()},
+            "3_star": {hazard.__class__.name(): hazard_scores[hazard].value() for hazard in hazard_scores.keys()},
         },
     }
     with open(file, "w") as out:
         json.dump(result, out, indent=4)


-def run_tests(harms: List[HarmDefinition], sut: NewhelmSut, items: int) -> Mapping[HarmDefinition, HarmScore]:
+def run_tests(hazards: List[HazardDefinition], sut: NewhelmSut, items: int) -> Mapping[HazardDefinition, HazardScore]:
     result = {}
     sut_instance = SUTS.make_instance(sut.key)
-    for harm in harms:
+    for hazard in hazards:
         test_scores = {}
-        for count, test in enumerate(harm.tests()):
+        for count, test in enumerate(hazard.tests()):
             test_scores[test] = run_prompt_response_test(f"test-{count}", test, sut.key, sut_instance, "./run", items)
-        result[harm] = harm.score(test_scores)
+        result[hazard] = hazard.score(test_scores)
     return result

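Based on the keys visible in update_standards_to above, the generated standards file should look roughly like the sketch below. All concrete values are placeholders, _metadata may carry additional keys not shown in this diff, and the hazard names come from HazardDefinition.name().

```python
# Approximate shape of the JSON written by update_standards_to(file).
# Every value here is an illustrative placeholder, not real calibration output.
expected_standards = {
    "_metadata": {
        "NOTICE": "This file is auto-generated by <script path>; avoid editing it manually.",
        # ... additional metadata fields not shown in this diff ...
    },
    "standards": {
        "reference_sut": {"name": "<reference SUT display name>", "id": "<reference SUT key>"},
        "3_star": {
            "Bias": 0.42,      # BiasHazardDefinition.name() -> "Bias"
            "Toxicity": 0.57,  # ToxicityHazardDefinition.name() -> "Toxicity"
        },
    },
}
```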
4 changes: 2 additions & 2 deletions src/coffee/templates/benchmark.html
@@ -1,6 +1,6 @@
 {% from "macros/breadcrumb.html" import breadcrumb %}
 {% from "macros/interpret_safety_ratings.html" import interpret_safety_ratings %}
-{% from "macros/use_harms_limitations.html" import use_harms_limitations %}
+{% from "macros/use_hazards_limitations.html" import use_hazards_limitations %}

 {% extends "base.html" %}

@@ -20,7 +20,7 @@ <h1>{{ benchmark_definition.name() }} <span class="btn beta ms-4 mb-2">Provision
     </div>
 </div>

-{{ use_harms_limitations(benchmark_definition) }}
+{{ use_hazards_limitations(benchmark_definition) }}

 {{ interpret_safety_ratings(stars_description) }}
 &nbsp;
10 changes: 5 additions & 5 deletions src/coffee/templates/macros/sut_card.html
@@ -22,18 +22,18 @@ <h2 class="mb-0">{{ stars_description[benchmark_score.stars() | round | int]["ra
         <div class="col">Rating</div>
         <div class="col">Explanation</div>
     </div>
-    {% for harm_score in benchmark_score.harm_scores %}
+    {% for hazard_score in benchmark_score.hazard_scores %}
     <div class="row p-3{% if not loop.last %} border-bottom{% endif %}">
-        <div class="col">{{ harm_score.harm_definition().name() }}</div>
+        <div class="col">{{ hazard_score.hazard_definition().name() }}</div>
         <div class="col">
             <div class="d-flex gap-2">
                 <div class="d-flex gap-2 w-50">
-                    {{ harm_score.stars() | display_stars("sm") }}
+                    {{ hazard_score.stars() | display_stars("sm") }}
                 </div>
-                <div>{{ stars_description[harm_score.stars() | round | int]["rank"] }}</div>
+                <div>{{ stars_description[hazard_score.stars() | round | int]["rank"] }}</div>
             </div>
         </div>
-        <div class="col">{{ stars_description[harm_score.stars() | round | int]["explanation"] }}</div>
+        <div class="col">{{ stars_description[hazard_score.stars() | round | int]["explanation"] }}</div>
     </div>
     {% endfor %}
 </div>
src/coffee/templates/macros/use_harms_limitations.html → src/coffee/templates/macros/use_hazards_limitations.html
@@ -1,4 +1,4 @@
-{% macro use_harms_limitations(benchmark_definition) %}
+{% macro use_hazards_limitations(benchmark_definition) %}
 <div class="row text-start mb-5">
     <div class="col">
         <h2>Use Case</h2>
@@ -15,13 +15,13 @@ <h2>Use Case</h2>
     </div>
     <div class="col-2"></div>
     <div class="col">
-        <h2>Harms Tested</h2>
+        <h2>Hazards Tested</h2>
         <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore
             et dolore quis nostrud exercitation ullamco laboris magna aliqua.</p>
         <ul class="list-group">
-            {% for harm in benchmark_definition.harms() %}
-            <li class="list-group-item border-0 p-0 pb-1"><strong>{{ harm.name() }}:</strong> Explanation of
-                harm goes here
+            {% for hazard in benchmark_definition.hazards() %}
+            <li class="list-group-item border-0 p-0 pb-1"><strong>{{ hazard.name() }}:</strong> Explanation of
+                hazard goes here
             </li>
             {% endfor %}
         </ul>
4 changes: 2 additions & 2 deletions src/coffee/templates/test_report.html
@@ -1,7 +1,7 @@
 {% from "macros/breadcrumb.html" import breadcrumb %}
 {% from "macros/interpret_safety_ratings.html" import interpret_safety_ratings %}
 {% from "macros/sut_card.html" import sut_card %}
-{% from "macros/use_harms_limitations.html" import use_harms_limitations %}
+{% from "macros/use_hazards_limitations.html" import use_hazards_limitations %}

 {% extends "base.html" %}

@@ -23,7 +23,7 @@ <h1>{{ benchmark_score.sut.name }} - {{ benchmark_score.benchmark_definition.nam
     </div>
 </div>

-{{ use_harms_limitations(benchmark_score.benchmark_definition) }}
+{{ use_hazards_limitations(benchmark_score.benchmark_definition) }}

 {{ interpret_safety_ratings(stars_description) }}

14 changes: 7 additions & 7 deletions tests/make_data.py
@@ -7,34 +7,34 @@
 from newhelm.secrets_registry import SECRETS
 from newhelm.sut_registry import SUTS

-from coffee.benchmark import BiasHarmDefinition, ToxicityHarmDefinition
+from coffee.benchmark import BiasHazardDefinition, ToxicityHazardDefinition
 from coffee.newhelm_runner import NewhelmSut
 from test_benchmark import SIMPLE_BBQ_DATA, SIMPLE_TOXICITY_DATA


 def create_bbq_test_data():
-    harm = BiasHarmDefinition()
+    hazard = BiasHazardDefinition()
     max_instances = 5
     sut = NewhelmSut.GPT2
     path = SIMPLE_BBQ_DATA / "test_records.pickle"
-    results = run_tests_for_harm_on_sut(harm, sut, max_instances)
+    results = run_tests_for_hazard_on_sut(hazard, sut, max_instances)
     with open(path, "wb") as out:
         pickle.dump(results, out)


 def create_toxicity_test_data():
-    harm = ToxicityHarmDefinition()
+    hazard = ToxicityHazardDefinition()
     max_instances = 55
     sut = NewhelmSut.GPT2
-    results = run_tests_for_harm_on_sut(harm, sut, max_instances)
+    results = run_tests_for_hazard_on_sut(hazard, sut, max_instances)

     with open(SIMPLE_TOXICITY_DATA / "test_records.pickle", "wb") as out:
         pickle.dump(results, out)


-def run_tests_for_harm_on_sut(harm, sut, max_instances):
+def run_tests_for_hazard_on_sut(hazard, sut, max_instances):
     results = {}
-    for counter, test in enumerate(harm.tests()):
+    for counter, test in enumerate(hazard.tests()):
         items = max_instances
         results[test] = run_prompt_response_test(
             f"test-{counter}", test, sut.key, SUTS.make_instance(sut.key), "./run", items
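For context, a hedged sketch of how a test might consume the pickled records that create_bbq_test_data writes. SIMPLE_BBQ_DATA and BiasHazardDefinition are real names from this repository, but the test body itself is an assumption, not part of this commit.

```python
# Hypothetical consumer of the fixture written by create_bbq_test_data().
import pickle

from coffee.benchmark import BiasHazardDefinition
from test_benchmark import SIMPLE_BBQ_DATA


def test_bias_scoring_from_fixture():
    with open(SIMPLE_BBQ_DATA / "test_records.pickle", "rb") as fixture:
        test_records = pickle.load(fixture)  # Mapping[BaseTest, TestRecord]
    score = BiasHazardDefinition().score(test_records)
    assert 0 <= score.value() <= 1
```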
(Diffs for the remaining 6 changed files are not shown here.)
