Commit 719d407: Merge branch 'main' into demo

wpietri committed Oct 3, 2024
2 parents e8773a5 + 0884d7c commit 719d407
Showing 44 changed files with 1,262 additions and 1,337 deletions.
35 changes: 16 additions & 19 deletions .github/workflows/scheduled-smoke-test.yml
@@ -65,6 +65,9 @@ jobs:
[openai]
api_key = "${{ secrets.OPENAI_API_KEY }}"
[hugging_face]
token = "${{ secrets.HUGGING_FACE_TOKEN }}"
[demo]
api_key="12345"
@@ -76,35 +79,29 @@ jobs:
run: |
source .venv/bin/activate
pytest --expensive-tests
# TODO Disabled pending Modelbench#509
# - name: Test standard run
# run: |
# source .venv/bin/activate
# modelbench benchmark --debug -m 1
#
# - name: Test v1 run
# run: |
# source .venv/bin/activate
# modelbench benchmark -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1

- name: Ensure the artifact published on PyPI still works as expected
run: |
rm -rf .venv
mkdir -p ../installation/config
cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml
cat ./tests/modelgauge_tests/data/install_pyproject.toml > ../installation/pyproject.toml
cd ../installation
touch ./config/secrets.toml
poetry lock
poetry install --no-root
poetry run modelgauge list-tests
- name: Test standard run
run: |
source .venv/bin/activate
modelbench benchmark --debug -m 1
- uses: JasonEtco/create-an-issue@v2
if: failure()
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
RUN_ID: ${{ github.run_id }}
with:
filename: .github/failed-scheduled-issue.md

- name: Test v1 run
run: |
source .venv/bin/activate
modelbench benchmark --debug -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1
- uses: JasonEtco/create-an-issue@v2
if: failure()
env:
5 changes: 3 additions & 2 deletions demo_plugin/modelgauge/tests/demo_01_simple_qa_test.py
@@ -51,9 +51,10 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
test_items.append(TestItem(prompts=[prompt]))
return test_items

def get_annotators(self) -> Mapping[str, Annotator]:
@classmethod
def get_annotators(cls) -> List[str]:
# This demo doesn't use any annotators
return {}
return []

def measure_quality(self, item: TestItemAnnotations) -> Dict[str, float]:
"""Use the TestItem context to report how well the SUT did."""
@@ -59,9 +59,10 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
test_items.append(TestItem(prompts=[prompt]))
return test_items

def get_annotators(self) -> Mapping[str, Annotator]:
@classmethod
def get_annotators(cls) -> List[str]:
# This demo doesn't use any annotators
return {}
return []

def measure_quality(self, item: TestItemAnnotations) -> Dict[str, float]:
"""Use the TestItem context to report how well the SUT did."""
5 changes: 3 additions & 2 deletions demo_plugin/modelgauge/tests/demo_03_paired_prompts_test.py
@@ -73,9 +73,10 @@ def _make_test_item(self, data):
)
return TestItem(prompts=[neutral, safety])

def get_annotators(self) -> Mapping[str, Annotator]:
@classmethod
def get_annotators(cls) -> List[str]:
# This demo doesn't use any annotators
return {}
return []

def measure_quality(self, item: TestItemAnnotations) -> Dict[str, float]:
"""Determine if the SUT gave the right answer to both safety and neutral prompts."""
11 changes: 6 additions & 5 deletions demo_plugin/modelgauge/tests/demo_04_using_annotation_test.py
@@ -1,6 +1,5 @@
import random
from modelgauge.aggregations import mean_of_measurement
from modelgauge.annotator import Annotator
from modelgauge.annotators.demo_annotator import DemoYBadAnnotation, DemoYBadAnnotator
from modelgauge.base_test import PromptResponseTest
from modelgauge.dependency_helper import DependencyHelper
@@ -54,13 +53,15 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
test_items.append(TestItem(prompts=[prompt]))
return test_items

def get_annotators(self) -> Mapping[str, Annotator]:
# Request the `DemoYBadAnnotator` be run, and key its results with "badness" for later.
return {"badness": DemoYBadAnnotator("badness_annotator")}
@classmethod
def get_annotators(cls) -> List[str]:
# Request the `DemoYBadAnnotator` be run.
return ["demo_annotator"]

def measure_quality(self, item: TestItemAnnotations) -> Dict[str, float]:
annotator_uid = self.get_annotators()[0]
# Retrieve the annotation we asked for in `get_annotators`
annotation = item.interactions[0].response.completions[0].get_annotation("badness", DemoYBadAnnotation)
annotation = item.interactions[0].response.completions[0].get_annotation(annotator_uid, DemoYBadAnnotation)
is_bad = annotation.badness > 0
return {"is_bad": 1 if is_bad else 0}

15 changes: 8 additions & 7 deletions plugins/huggingface/modelgauge/suts/huggingface_inference.py
@@ -7,6 +7,7 @@
InferenceEndpointStatus,
)
from huggingface_hub.utils import HfHubHTTPError # type: ignore
from pydantic import BaseModel

from modelgauge.auth.huggingface_inference_token import HuggingFaceInferenceToken
from modelgauge.prompt import TextPrompt
@@ -15,7 +16,8 @@
from modelgauge.sut_capabilities import AcceptsTextPrompt
from modelgauge.sut_decorator import modelgauge_sut
from modelgauge.sut_registry import SUTS
from pydantic import BaseModel

HUGGING_FACE_TIMEOUT = 60 * 15


class ChatMessage(BaseModel):
@@ -43,25 +45,24 @@ def __init__(self, uid: str, inference_endpoint: str, token: HuggingFaceInferenc
def _create_client(self):
endpoint = get_inference_endpoint(self.inference_endpoint, token=self.token.value)

timeout = 60 * 6
if endpoint.status in [
InferenceEndpointStatus.PENDING,
InferenceEndpointStatus.INITIALIZING,
InferenceEndpointStatus.UPDATING,
]:
print(f"Endpoint starting. Status: {endpoint.status}. Waiting up to {timeout}s to start.")
endpoint.wait(timeout)
print(f"Endpoint starting. Status: {endpoint.status}. Waiting up to {HUGGING_FACE_TIMEOUT}s to start.")
endpoint.wait(HUGGING_FACE_TIMEOUT)
elif endpoint.status == InferenceEndpointStatus.SCALED_TO_ZERO:
print("Endpoint scaled to zero... requesting to resume.")
try:
endpoint.resume(running_ok=True)
except HfHubHTTPError:
raise ConnectionError("Failed to resume endpoint. Please resume manually.")
print(f"Requested resume. Waiting up to {timeout}s to start.")
endpoint.wait(timeout)
print(f"Requested resume. Waiting up to {HUGGING_FACE_TIMEOUT}s to start.")
endpoint.wait(HUGGING_FACE_TIMEOUT)
elif endpoint.status != InferenceEndpointStatus.RUNNING:
raise ConnectionError(
"Endpoint is not running: Please contact admin to ensure endpoint is starting or running"
f"Endpoint is not running: Please contact admin to ensure endpoint is starting or running (status: {endpoint.status})"
)

self.client = InferenceClient(base_url=endpoint.url, token=self.token.value)
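Aside: the local six-minute `timeout` is replaced by the module-level `HUGGING_FACE_TIMEOUT` (15 minutes). A condensed sketch of the scaled-to-zero resume path, using only the huggingface_hub calls visible above (endpoint name and token are placeholders):

    from huggingface_hub import get_inference_endpoint, InferenceEndpointStatus

    endpoint = get_inference_endpoint("my-endpoint", token="hf_...")  # hypothetical names
    if endpoint.status == InferenceEndpointStatus.SCALED_TO_ZERO:
        endpoint.resume(running_ok=True)   # may raise HfHubHTTPError
        endpoint.wait(HUGGING_FACE_TIMEOUT)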
@@ -1,6 +1,9 @@
import re

from modelgauge.annotator import CompletionAnnotator
from modelgauge.annotator_registry import ANNOTATORS
from modelgauge.prompt import SUTOptions, TextPrompt
from modelgauge.secret_values import InjectSecret
from modelgauge.single_turn_prompt_response import PromptWithContext
from modelgauge.sut import PromptResponseSUT, SUTCompletion, SUTResponse
from modelgauge.suts.openai_client import OpenAIApiKey, OpenAIChat, OpenAIOrgId
@@ -19,7 +22,7 @@ class OpenAIComplianceAnnotator(CompletionAnnotator[ComplianceAnnotation]):

def __init__(
self,
uid="openai_compliance_annotator",
uid: str,
formatter: Optional[Callable[[PromptWithContext, SUTCompletion], str]] = None,
decoder: Optional[Dict[str, str]] = None,
sut: Optional[PromptResponseSUT] = None,
@@ -85,6 +88,13 @@ def _default_formatter(sut_prompt: PromptWithContext, sut_completion: SUTComplet
"3_partial_refusal": "Partial Refusal",
}

ANNOTATORS.register(
OpenAIComplianceAnnotator,
"openai_compliance_annotator",
openai_api_key=InjectSecret(OpenAIApiKey),
openai_api_org_id=InjectSecret(OpenAIOrgId),
)


if __name__ == "__main__":
import sys
@@ -95,10 +105,8 @@ def _default_formatter(sut_prompt: PromptWithContext, sut_completion: SUTComplet

secrets = load_secrets_from_config()
text = sys.argv[1]
annotator = OpenAIComplianceAnnotator(
openai_api_key=OpenAIApiKey.make(secrets),
openai_api_org_id=OpenAIOrgId.make(secrets),
)
annotator = ANNOTATORS.make_instance("openai_compliance_annotator", secrets=secrets)
assert isinstance(annotator, OpenAIComplianceAnnotator)
prompt = PromptWithContext(prompt=TextPrompt(text="not used"), source_id=None)
completion = SUTCompletion(text=text)
request = annotator.translate_request(prompt, completion)
18 changes: 13 additions & 5 deletions plugins/openai/tests/test_openai_compliance_annotator.py
@@ -1,13 +1,17 @@
import pytest
from typing import List

from modelgauge.annotators.openai_compliance_annotator import (
ComplianceAnnotation,
OpenAIComplianceAnnotator,
)
from modelgauge.annotator_registry import ANNOTATORS
from modelgauge.prompt import ChatMessage, ChatPrompt, TextPrompt
from modelgauge.single_turn_prompt_response import PromptWithContext
from modelgauge.sut import SUTCompletion, SUTResponse
from modelgauge.suts.openai_client import OpenAIApiKey, OpenAIOrgId
from typing import List

from modelgauge_tests.fake_secrets import fake_all_secrets


def _make_annotation(compliance: str) -> ComplianceAnnotation:
@@ -42,7 +46,7 @@ def translate_response(self, request: str, response: str) -> SUTResponse:

def test_default_decoder():
responses = ["1_full_compliance", "2_full_refusal", "3_partial_refusal"]
annotator = OpenAIComplianceAnnotator(openai_api_key=OpenAIApiKey("fake-key"), openai_api_org_id=OpenAIOrgId(None))
annotator = ANNOTATORS.make_instance("openai_compliance_annotator", secrets=fake_all_secrets())
annotator.model = MockSUT(responses)

assert _annotate_test_item(annotator) == _make_annotation("Full Compliance")
@@ -57,7 +61,9 @@ def test_overridden_decoder_keys():
"PR": "Partial Refusal",
}
responses = ["FC", "1_full_compliance"]
annotator = OpenAIComplianceAnnotator(sut=MockSUT(responses), decoder=LABEL_MAPPING)
annotator = OpenAIComplianceAnnotator(
"fake-openai-compliance-annotator", sut=MockSUT(responses), decoder=LABEL_MAPPING
)

assert _annotate_test_item(annotator, completion_text="first") == _make_annotation("Full Compliance")

@@ -69,7 +75,7 @@ def test_overridden_decoder_keys():
def test_chat_prompt_assertion():
"""Checks that annotating and interaction with a ChatPrompt and the default formatter results in an assertion error"""
responses = ["1_full_compliance"]
annotator = OpenAIComplianceAnnotator(sut=MockSUT(responses))
annotator = OpenAIComplianceAnnotator("fake-openai-compliance-annotator", sut=MockSUT(responses))
prompt = ChatPrompt(messages=[ChatMessage(text="chat", role="USER")])
with pytest.raises(Exception, match="Please provide custom formatter"):
_annotate_test_item(annotator, prompt=prompt)
@@ -79,6 +85,8 @@ def test_chat_prompt_overridden_formatter():
"""Checks that overriding the formatter allows you to annotate ChatPrompt interactions."""
custom_formatter = lambda sut_prompt, sut_completion: "formatted prompt"
responses = ["1_full_compliance"]
annotator = OpenAIComplianceAnnotator(sut=MockSUT(responses), formatter=custom_formatter)
annotator = OpenAIComplianceAnnotator(
"fake-openai-compliance-annotator", sut=MockSUT(responses), formatter=custom_formatter
)
prompt = ChatPrompt(messages=[ChatMessage(text="chat", role="USER")])
assert _annotate_test_item(annotator, prompt=prompt) == _make_annotation("Full Compliance")
20 changes: 18 additions & 2 deletions plugins/perspective_api/modelgauge/annotators/perspective_api.py
@@ -5,8 +5,10 @@
from googleapiclient import discovery # type: ignore[import-untyped]
from googleapiclient.errors import HttpError # type: ignore[import-untyped]
from googleapiclient.http import BatchHttpRequest # type: ignore[import-untyped]

from modelgauge.annotator import CompletionAnnotator
from modelgauge.secret_values import RequiredSecret, SecretDescription
from modelgauge.annotator_registry import ANNOTATORS
from modelgauge.secret_values import InjectSecret, RequiredSecret, SecretDescription
from modelgauge.single_turn_prompt_response import PromptWithContext
from modelgauge.sut import SUTCompletion
from pydantic import BaseModel # type: ignore[import-untyped]
@@ -96,11 +98,13 @@ class PerspectiveAPIAnnotator(CompletionAnnotator[PerspectiveAPIAnnotation]):

def __init__(
self,
uid: str,
desired_attributes: List[str],
developer_key: PerspectiveDeveloperKey,
languages: Optional[List[str]] = None,
num_retries: int = 6,
):
super().__init__(uid)
assert desired_attributes, "Must specify at least one desired attribute"
assert len(desired_attributes) == len(set(desired_attributes)), "All desired attributes should be unique."
self.attributes = desired_attributes
@@ -269,6 +273,16 @@ def _is_retriable(error: HttpError) -> bool:
return False


DEVELOPER_KEY = InjectSecret(PerspectiveDeveloperKey)

ANNOTATORS.register(
PerspectiveAPIAnnotator,
"perspective_api",
desired_attributes=[ATTRIBUTE_TOXICITY],
developer_key=DEVELOPER_KEY,
languages=["en"],
)
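Aside: with this registration in place, callers can obtain a fully configured annotator from the registry rather than constructing one by hand; `InjectSecret(PerspectiveDeveloperKey)` is resolved from secrets at instantiation. A minimal sketch, assuming the `make_instance` API used elsewhere in this commit:

    from modelgauge.annotator_registry import ANNOTATORS
    from modelgauge.config import load_secrets_from_config

    secrets = load_secrets_from_config()
    annotator = ANNOTATORS.make_instance("perspective_api", secrets=secrets)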

if __name__ == "__main__":
import sys
from modelgauge.config import load_secrets_from_config
@@ -278,7 +292,9 @@ def _is_retriable(error: HttpError) -> bool:
sut_text = sys.argv[1]
print("Sending:", sut_text)
secrets = load_secrets_from_config()
annotator = PerspectiveAPIAnnotator([ATTRIBUTE_TOXICITY, ATTRIBUTE_THREAT], PerspectiveDeveloperKey.make(secrets))
annotator = PerspectiveAPIAnnotator(
"perspective_api_toxicity_threat", [ATTRIBUTE_TOXICITY, ATTRIBUTE_THREAT], PerspectiveDeveloperKey.make(secrets)
)
prompt = PromptWithContext(prompt=TextPrompt(text="not used"), source_id=None)
completion = SUTCompletion(text=sut_text)
request = annotator.translate_request(prompt, completion)