create-docker-image (#405)

* testing workflow * testing workflow * testing workflow * testing workflow * testing workflow * testing workflow * testing workflow * testing workflow * testing workflow * testing workflow * testing workflow * this should be it, thank goodness * use new modelgauge without extras; bump version; fix tests * update lock file * add .dockerignore * add .dockerignore
mlcommons · Aug 12, 2024 · 59110d4 · 59110d4
1 parent bed7ee4
commit 59110d4
Show file tree

Hide file tree

Showing 7 changed files with 567 additions and 1,007 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,9 @@
+dist
+run
+embed
+web
+tests
+docs
+.github
+.venv
+config
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -0,0 +1,45 @@
+# Builds a multi-platform image and pushes it to GitHub Container Registry
+# whenever a release is published.
+name: Build and Publish Docker Image
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    # Pushing to ghcr.io with GITHUB_TOKEN requires packages: write; request it
+    # explicitly rather than relying on the repository's default token scope.
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      # Emulation so the runner can build the non-native platform below.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # No checkout step and no `context` input: the action then builds from
+      # the Git ref that triggered the workflow.
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          push: true
+          # github.ref is the full ref ("refs/tags/<tag>") and its slashes make
+          # an invalid image tag; github.ref_name is the bare tag name.
+          tags: |
+            ghcr.io/${{ github.repository }}:latest
+            ghcr.io/${{ github.repository }}:${{ github.ref_name }}
+          platforms: |
+            linux/arm64/v8
+            linux/amd64
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,52 @@
+# Multi-stage build for the modelbench CLI image.
+
+# Base Stage: Python runtime settings shared by the build and final stages.
+FROM python:3.10-slim AS base
+
+ENV PYTHONFAULTHANDLER=1 \
+    PYTHONHASHSEED=random \
+    PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+# Build Stage: installs Poetry, installs the locked dependencies with /venv
+# activated, and builds the project into dist/. Only what the final stage
+# explicitly copies or mounts from here ends up in the published image.
+FROM base AS builder
+
+ENV PIP_DEFAULT_TIMEOUT=100 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    POETRY_VERSION=1.8.3
+
+RUN pip install "poetry==$POETRY_VERSION"
+RUN python -m venv /venv
+
+# Copy only the dependency manifests first so the dependency layer stays cached
+# until pyproject.toml or poetry.lock changes.
+COPY pyproject.toml poetry.lock ./
+RUN . /venv/bin/activate && poetry install --without=dev --no-root --no-interaction --no-ansi
+
+COPY . .
+RUN . /venv/bin/activate && poetry build
+
+# Final Stage: runtime image holding the virtualenv with modelbench installed.
+FROM base AS final
+
+WORKDIR /app
+
+COPY --from=builder /venv /venv
+
+# Install the wheel through a bind mount of the builder's dist/ directory so
+# the build artifacts are never written into a layer of the final image.
+RUN --mount=type=bind,from=builder,source=/app/dist,target=/tmp/dist \
+    /venv/bin/pip install --no-cache-dir /tmp/dist/*.whl
+
+# NOTE(review): the image still runs as root; consider adding a dedicated USER
+# once it is confirmed which paths modelbench needs to write to.
+
+# Keep the executable in ENTRYPOINT and the default argument in CMD: arguments
+# given to `docker run` replace CMD, so `--help` is only the fallback instead
+# of being prepended to every invocation.
+ENTRYPOINT ["/venv/bin/modelbench"]
+CMD ["--help"]
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ requires-python = ">=3.10, <3.13"
 
 [tool.poetry]
 name = "modelbench"
-version = "0.5.1"
+version = "0.6.0"
 description = "Run benchmarks and generate reports measuring the behavior of many AI Systems."
 license = "Apache-2.0"
 authors = ["MLCommons AI Safety <[email protected]>"]
@@ -47,7 +47,7 @@ packages = [
 
 [tool.poetry.dependencies]
 python = ">=3.10,<3.13"
-modelgauge = { version = ">=0.6.0", extras = ["perspective_api", "standard_tests"] }
+modelgauge = ">=0.6.0"
 jq = "^1.6.0"
 click = "^8.1.7"
 casefy = "^0.1.7"

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
@@ -109,7 +109,7 @@ def test_text_grade(self, probability, expected, hazard_score):
 
 
 def test_hazard_definition_basics(fake_secrets):
-    import modelgauge.tests.bbq
+    import modelgauge.tests.safe
 
     h = SafeCaeHazard()
     assert h.uid == "safe_cae_hazard-0.5"
@@ -124,7 +124,7 @@ def test_hazard_definition_basics(fake_secrets):
 
 
 def test_hazard_v1_definition_basics(fake_secrets):
-    import modelgauge.tests.bbq
+    import modelgauge.tests.safe
 
     h = SafeDfmHazardV1()
     assert h.uid == "safe_dfm_hazard_v_1-1.0"

diff --git a/tests/test_static_site_generator.py b/tests/test_static_site_generator.py
@@ -4,20 +4,13 @@
 from unittest.mock import MagicMock
 from unittest.mock import patch
 
-from modelgauge.tests.bbq import BBQ
-
-
 import pytest
 
+from modelbench.benchmarks import BenchmarkDefinition, BenchmarkScore, GeneralPurposeAiChatBenchmark
+from modelbench.hazards import HazardScore, SafeCaeHazard, SafeCbrHazard, SafeHazard
 from modelbench.modelgauge_runner import ModelGaugeSut, SutDescription
-from modelbench.benchmarks import (
-    BenchmarkDefinition,
-    GeneralPurposeAiChatBenchmark,
-    BenchmarkScore,
-)
 from modelbench.scoring import ValueEstimate
-from modelbench.hazards import HazardScore, SafeCaeHazard, SafeCbrHazard, SafeHazard
-from modelbench.static_site_generator import HazardScorePositions, StaticContent, StaticSiteGenerator
+from modelbench.static_site_generator import HazardScorePositions, StaticSiteGenerator
 
 
 @pytest.fixture()
@@ -107,6 +100,16 @@ class TestObjectContentKeysExist:
     Tests to ensure that appropriate presentation-layer content exists for objects that are added to modelbench.
     """
 
+    @pytest.fixture
+    def fake_test(self):
+        from modelgauge.base_test import BaseTest
+
+        class FakeTest(BaseTest):
+            def __init__(self, uid):
+                self.uid = uid
+
+        return FakeTest
+
     @pytest.fixture
     def ssg(self):
         _ssg = StaticSiteGenerator()
@@ -198,14 +201,13 @@ def test_safe_hazard_definitions(self, ssg, hazard, required_template_content_ke
         for key in required_template_content_keys["SafeHazard"]:
             assert ssg.content(hazard(), key)
 
-    def test_tests(self, ssg):
+    def test_tests(self, ssg, fake_test):
         # todo: This is the most naive version of this test, but we'll want a way to check all of the tests modelbench cares about at some point
-
-        test = BBQ(uid="bbq")
+        test = fake_test(uid="bbq")
         assert ssg.content(test, "display_name") == "BBQ: Bias Benchmark for QA"
 
-    def test_test_defaults(self, ssg):
-        test = BBQ(uid="not_a_real_uid")
+    def test_test_defaults(self, ssg, fake_test):
+        test = fake_test(uid="not_a_real_uid")
         assert ssg.content(test, "display_name") == "not_a_real_uid"
         assert ssg.content(test, "not_a_real_key") == ""
-Original file line number
+Diff line change
@@ -0,0 +1,9 @@
+    dist
+    run
+    embed
+    web
+    tests
+    docs
+    .github
+    .venv
+    config