Commit 719d407: Merge branch 'main' into demo

wpietri committed Oct 3, 2024
2 parents e8773a5 + 0884d7c commit 719d407
Showing 44 changed files with 1,262 additions and 1,337 deletions.
35 changes: 16 additions & 19 deletions .github/workflows/scheduled-smoke-test.yml
@@ -65,6 +65,9 @@ jobs:
[openai]
api_key = "${{ secrets.OPENAI_API_KEY }}"
[hugging_face]
token = "${{ secrets.HUGGING_FACE_TOKEN }}"
[demo]
api_key="12345"
@@ -76,35 +79,29 @@ jobs:
run: |
source .venv/bin/activate
pytest --expensive-tests
# TODO Disabled pending Modelbench#509
# - name: Test standard run
# run: |
# source .venv/bin/activate
# modelbench benchmark --debug -m 1
#
# - name: Test v1 run
# run: |
# source .venv/bin/activate
# modelbench benchmark -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1

- name: Ensure the artifact published on PyPI still works as expected
run: |
rm -rf .venv
mkdir -p ../installation/config
cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml
cat ./tests/modelgauge_tests/data/install_pyproject.toml > ../installation/pyproject.toml
cd ../installation
touch ./config/secrets.toml
poetry lock
poetry install --no-root
poetry run modelgauge list-tests
- name: Test standard run
run: |
source .venv/bin/activate
modelbench benchmark --debug -m 1
- uses: JasonEtco/create-an-issue@v2
if: failure()
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
RUN_ID: ${{ github.run_id }}
with:
filename: .github/failed-scheduled-issue.md

- name: Test v1 run
run: |
source .venv/bin/activate
modelbench benchmark --debug -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1
- uses: JasonEtco/create-an-issue@v2
if: failure()
env:
5 changes: 3 additions & 2 deletions demo_plugin/modelgauge/tests/demo_01_simple_qa_test.py
@@ -51,9 +51,10 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
test_items.append(TestItem(prompts=[prompt]))
return test_items

def get_annotators(self) -> Mapping[str, Annotator]:
@classmethod
def get_annotators(cls) -> List[str]:
# This demo doesn't use any annotators
return {}
return []

def measure_quality(self, item: TestItemAnnotations) -> Dict[str, float]:
"""Use the TestItem context to report how well the SUT did."""
@@ -59,9 +59,10 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
test_items.append(TestItem(prompts=[prompt]))
return test_items

def get_annotators(self) -> Mapping[str, Annotator]:
@classmethod
def get_annotators(cls) -> List[str]:
# This demo doesn't use any annotators
return {}
return []

def measure_quality(self, item: TestItemAnnotations) -> Dict[str, float]:
"""Use the TestItem context to report how well the SUT did."""
5 changes: 3 additions & 2 deletions demo_plugin/modelgauge/tests/demo_03_paired_prompts_test.py
@@ -73,9 +73,10 @@ def _make_test_item(self, data):
)
return TestItem(prompts=[neutral, safety])

def get_annotators(self) -> Mapping[str, Annotator]:
@classmethod
def get_annotators(cls) -> List[str]:
# This demo doesn't use any annotators
return {}
return []

def measure_quality(self, item: TestItemAnnotations) -> Dict[str, float]:
"""Determine if the SUT gave the right answer to both safety and neutral prompts."""
11 changes: 6 additions & 5 deletions demo_plugin/modelgauge/tests/demo_04_using_annotation_test.py
@@ -1,6 +1,5 @@
import random
from modelgauge.aggregations import mean_of_measurement
from modelgauge.annotator import Annotator
from modelgauge.annotators.demo_annotator import DemoYBadAnnotation, DemoYBadAnnotator
from modelgauge.base_test import PromptResponseTest
from modelgauge.dependency_helper import DependencyHelper
@@ -54,13 +53,15 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
test_items.append(TestItem(prompts=[prompt]))
return test_items

def get_annotators(self) -> Mapping[str, Annotator]:
# Request the `DemoYBadAnnotator` be run, and key its results with "badness" for later.
return {"badness": DemoYBadAnnotator("badness_annotator")}
@classmethod
def get_annotators(cls) -> List[str]:
# Request the `DemoYBadAnnotator` be run.
return ["demo_annotator"]

def measure_quality(self, item: TestItemAnnotations) -> Dict[str, float]:
annotator_uid = self.get_annotators()[0]
# Retrieve the annotation we asked for in `get_annotators`
annotation = item.interactions[0].response.completions[0].get_annotation("badness", DemoYBadAnnotation)
annotation = item.interactions[0].response.completions[0].get_annotation(annotator_uid, DemoYBadAnnotation)
is_bad = annotation.badness > 0
return {"is_bad": 1 if is_bad else 0}

15 changes: 8 additions & 7 deletions plugins/huggingface/modelgauge/suts/huggingface_inference.py
@@ -7,6 +7,7 @@
InferenceEndpointStatus,
)
from huggingface_hub.utils import HfHubHTTPError # type: ignore
from pydantic import BaseModel

from modelgauge.auth.huggingface_inference_token import HuggingFaceInferenceToken
from modelgauge.prompt import TextPrompt
@@ -15,7 +16,8 @@
from modelgauge.sut_capabilities import AcceptsTextPrompt
from modelgauge.sut_decorator import modelgauge_sut
from modelgauge.sut_registry import SUTS
from pydantic import BaseModel

HUGGING_FACE_TIMEOUT = 60 * 15


class ChatMessage(BaseModel):
@@ -43,25 +45,24 @@ def __init__(self, uid: str, inference_endpoint: str, token: HuggingFaceInferenc
def _create_client(self):
endpoint = get_inference_endpoint(self.inference_endpoint, token=self.token.value)

timeout = 60 * 6
if endpoint.status in [
InferenceEndpointStatus.PENDING,
InferenceEndpointStatus.INITIALIZING,
InferenceEndpointStatus.UPDATING,
]:
print(f"Endpoint starting. Status: {endpoint.status}. Waiting up to {timeout}s to start.")
endpoint.wait(timeout)
print(f"Endpoint starting. Status: {endpoint.status}. Waiting up to {HUGGING_FACE_TIMEOUT}s to start.")
endpoint.wait(HUGGING_FACE_TIMEOUT)
elif endpoint.status == InferenceEndpointStatus.SCALED_TO_ZERO:
print("Endpoint scaled to zero... requesting to resume.")
try:
endpoint.resume(running_ok=True)
except HfHubHTTPError:
raise ConnectionError("Failed to resume endpoint. Please resume manually.")
print(f"Requested resume. Waiting up to {timeout}s to start.")
endpoint.wait(timeout)
print(f"Requested resume. Waiting up to {HUGGING_FACE_TIMEOUT}s to start.")
endpoint.wait(HUGGING_FACE_TIMEOUT)
elif endpoint.status != InferenceEndpointStatus.RUNNING:
raise ConnectionError(
"Endpoint is not running: Please contact admin to ensure endpoint is starting or running"
f"Endpoint is not running: Please contact admin to ensure endpoint is starting or running (status: {endpoint.status})"
)

self.client = InferenceClient(base_url=endpoint.url, token=self.token.value)
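Aside: the local six-minute `timeout` is replaced by the module-level `HUGGING_FACE_TIMEOUT` (15 minutes). A condensed sketch of the scaled-to-zero resume path, using only the huggingface_hub calls visible above (endpoint name and token are placeholders):

    from huggingface_hub import get_inference_endpoint, InferenceEndpointStatus

    endpoint = get_inference_endpoint("my-endpoint", token="hf_...")  # hypothetical names
    if endpoint.status == InferenceEndpointStatus.SCALED_TO_ZERO:
        endpoint.resume(running_ok=True)   # may raise HfHubHTTPError
        endpoint.wait(HUGGING_FACE_TIMEOUT)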
@@ -1,6 +1,9 @@
import re

from modelgauge.annotator import CompletionAnnotator
from modelgauge.annotator_registry import ANNOTATORS
from modelgauge.prompt import SUTOptions, TextPrompt
from modelgauge.secret_values import InjectSecret
from modelgauge.single_turn_prompt_response import PromptWithContext
from modelgauge.sut import PromptResponseSUT, SUTCompletion, SUTResponse
from modelgauge.suts.openai_client import OpenAIApiKey, OpenAIChat, OpenAIOrgId
@@ -19,7 +22,7 @@ class OpenAIComplianceAnnotator(CompletionAnnotator[ComplianceAnnotation]):

def __init__(
self,
uid="openai_compliance_annotator",
uid: str,
formatter: Optional[Callable[[PromptWithContext, SUTCompletion], str]] = None,
decoder: Optional[Dict[str, str]] = None,
sut: Optional[PromptResponseSUT] = None,
@@ -85,6 +88,13 @@ def _default_formatter(sut_prompt: PromptWithContext, sut_completion: SUTComplet
"3_partial_refusal": "Partial Refusal",
}

ANNOTATORS.register(
OpenAIComplianceAnnotator,
"openai_compliance_annotator",
openai_api_key=InjectSecret(OpenAIApiKey),
openai_api_org_id=InjectSecret(OpenAIOrgId),
)


if __name__ == "__main__":
import sys
@@ -95,10 +105,8 @@ def _default_formatter(sut_prompt: PromptWithContext, sut_completion: SUTComplet

secrets = load_secrets_from_config()
text = sys.argv[1]
annotator = OpenAIComplianceAnnotator(
openai_api_key=OpenAIApiKey.make(secrets),
openai_api_org_id=OpenAIOrgId.make(secrets),
)
annotator = ANNOTATORS.make_instance("openai_compliance_annotator", secrets=secrets)
assert isinstance(annotator, OpenAIComplianceAnnotator)
prompt = PromptWithContext(prompt=TextPrompt(text="not used"), source_id=None)
completion = SUTCompletion(text=text)
request = annotator.translate_request(prompt, completion)
18 changes: 13 additions & 5 deletions plugins/openai/tests/test_openai_compliance_annotator.py
@@ -1,13 +1,17 @@
import pytest
from typing import List

from modelgauge.annotators.openai_compliance_annotator import (
ComplianceAnnotation,
OpenAIComplianceAnnotator,
)
from modelgauge.annotator_registry import ANNOTATORS
from modelgauge.prompt import ChatMessage, ChatPrompt, TextPrompt
from modelgauge.single_turn_prompt_response import PromptWithContext
from modelgauge.sut import SUTCompletion, SUTResponse
from modelgauge.suts.openai_client import OpenAIApiKey, OpenAIOrgId
from typing import List

from modelgauge_tests.fake_secrets import fake_all_secrets


def _make_annotation(compliance: str) -> ComplianceAnnotation:
@@ -42,7 +46,7 @@ def translate_response(self, request: str, response: str) -> SUTResponse:

def test_default_decoder():
responses = ["1_full_compliance", "2_full_refusal", "3_partial_refusal"]
annotator = OpenAIComplianceAnnotator(openai_api_key=OpenAIApiKey("fake-key"), openai_api_org_id=OpenAIOrgId(None))
annotator = ANNOTATORS.make_instance("openai_compliance_annotator", secrets=fake_all_secrets())
annotator.model = MockSUT(responses)

assert _annotate_test_item(annotator) == _make_annotation("Full Compliance")
@@ -57,7 +61,9 @@ def test_overridden_decoder_keys():
"PR": "Partial Refusal",
}
responses = ["FC", "1_full_compliance"]
annotator = OpenAIComplianceAnnotator(sut=MockSUT(responses), decoder=LABEL_MAPPING)
annotator = OpenAIComplianceAnnotator(
"fake-openai-compliance-annotator", sut=MockSUT(responses), decoder=LABEL_MAPPING
)

assert _annotate_test_item(annotator, completion_text="first") == _make_annotation("Full Compliance")

@@ -69,7 +75,7 @@ def test_overridden_decoder_keys():
def test_chat_prompt_assertion():
"""Checks that annotating and interaction with a ChatPrompt and the default formatter results in an assertion error"""
responses = ["1_full_compliance"]
annotator = OpenAIComplianceAnnotator(sut=MockSUT(responses))
annotator = OpenAIComplianceAnnotator("fake-openai-compliance-annotator", sut=MockSUT(responses))
prompt = ChatPrompt(messages=[ChatMessage(text="chat", role="USER")])
with pytest.raises(Exception, match="Please provide custom formatter"):
_annotate_test_item(annotator, prompt=prompt)
@@ -79,6 +85,8 @@ def test_chat_prompt_overridden_formatter():
"""Checks that overriding the formatter allows you to annotate ChatPrompt interactions."""
custom_formatter = lambda sut_prompt, sut_completion: "formatted prompt"
responses = ["1_full_compliance"]
annotator = OpenAIComplianceAnnotator(sut=MockSUT(responses), formatter=custom_formatter)
annotator = OpenAIComplianceAnnotator(
"fake-openai-compliance-annotator", sut=MockSUT(responses), formatter=custom_formatter
)
prompt = ChatPrompt(messages=[ChatMessage(text="chat", role="USER")])
assert _annotate_test_item(annotator, prompt=prompt) == _make_annotation("Full Compliance")
20 changes: 18 additions & 2 deletions plugins/perspective_api/modelgauge/annotators/perspective_api.py
@@ -5,8 +5,10 @@
from googleapiclient import discovery # type: ignore[import-untyped]
from googleapiclient.errors import HttpError # type: ignore[import-untyped]
from googleapiclient.http import BatchHttpRequest # type: ignore[import-untyped]

from modelgauge.annotator import CompletionAnnotator
from modelgauge.secret_values import RequiredSecret, SecretDescription
from modelgauge.annotator_registry import ANNOTATORS
from modelgauge.secret_values import InjectSecret, RequiredSecret, SecretDescription
from modelgauge.single_turn_prompt_response import PromptWithContext
from modelgauge.sut import SUTCompletion
from pydantic import BaseModel # type: ignore[import-untyped]
@@ -96,11 +98,13 @@ class PerspectiveAPIAnnotator(CompletionAnnotator[PerspectiveAPIAnnotation]):

def __init__(
self,
uid: str,
desired_attributes: List[str],
developer_key: PerspectiveDeveloperKey,
languages: Optional[List[str]] = None,
num_retries: int = 6,
):
super().__init__(uid)
assert desired_attributes, "Must specify at least one desired attribute"
assert len(desired_attributes) == len(set(desired_attributes)), "All desired attributes should be unique."
self.attributes = desired_attributes
@@ -269,6 +273,16 @@ def _is_retriable(error: HttpError) -> bool:
return False


DEVELOPER_KEY = InjectSecret(PerspectiveDeveloperKey)

ANNOTATORS.register(
PerspectiveAPIAnnotator,
"perspective_api",
desired_attributes=[ATTRIBUTE_TOXICITY],
developer_key=DEVELOPER_KEY,
languages=["en"],
)
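Aside: with this registration in place, callers can obtain a fully configured annotator from the registry rather than constructing one by hand; `InjectSecret(PerspectiveDeveloperKey)` is resolved from secrets at instantiation. A minimal sketch, assuming the `make_instance` API used elsewhere in this commit:

    from modelgauge.annotator_registry import ANNOTATORS
    from modelgauge.config import load_secrets_from_config

    secrets = load_secrets_from_config()
    annotator = ANNOTATORS.make_instance("perspective_api", secrets=secrets)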

if __name__ == "__main__":
import sys
from modelgauge.config import load_secrets_from_config
@@ -278,7 +292,9 @@ def _is_retriable(error: HttpError) -> bool:
sut_text = sys.argv[1]
print("Sending:", sut_text)
secrets = load_secrets_from_config()
annotator = PerspectiveAPIAnnotator([ATTRIBUTE_TOXICITY, ATTRIBUTE_THREAT], PerspectiveDeveloperKey.make(secrets))
annotator = PerspectiveAPIAnnotator(
"perspective_api_toxicity_threat", [ATTRIBUTE_TOXICITY, ATTRIBUTE_THREAT], PerspectiveDeveloperKey.make(secrets)
)
prompt = PromptWithContext(prompt=TextPrompt(text="not used"), source_id=None)
completion = SUTCompletion(text=sut_text)
request = annotator.translate_request(prompt, completion)