diff --git a/.github/failed-scheduled-issue.md b/.github/failed-scheduled-issue.md new file mode 100644 index 00000000..cb20ccda --- /dev/null +++ b/.github/failed-scheduled-issue.md @@ -0,0 +1,8 @@ +--- +title: Daily Scheduled Test Failure +labels: bug +--- +## ❌ Daily Scheduled Test Failure ❌ + +Commit: [{{ env.GIT_COMMIT }}](https://github.com/mlcommons/modelgauge/commit/{{ env.GIT_COMMIT }}) +Run Id: [{{ env.RUN_ID }}](https://github.com/mlcommons/modelgauge/actions/runs/{{ env.RUN_ID }}) diff --git a/.github/workflows/scheduled-tests.yml b/.github/workflows/scheduled-tests.yml new file mode 100644 index 00000000..f0a93be3 --- /dev/null +++ b/.github/workflows/scheduled-tests.yml @@ -0,0 +1,84 @@ +name: Scheduled tests + +on: +# schedule: + # Every day at 18:15 UTC +# - cron: '15 18 * * *' + push: + +permissions: + contents: read + issues: write + +jobs: + test: + runs-on: ubuntu-latest + environment: Scheduled Testing + + steps: + - uses: actions/checkout@v3 + + - name: Store commit + run: | + echo "GIT_COMMIT=$(git rev-parse HEAD)" >> $GITHUB_ENV + + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + + - name: cache poetry install + uses: actions/cache@v3 + id: cache-poetry + with: + path: ~/.local + key: poetry-1.7.1-0 + + - name: Install and configure Poetry + uses: snok/install-poetry@v1 + with: + version: 1.7.1 + virtualenvs-create: true + virtualenvs-in-project: true + + - name: cache deps + id: cache-deps + uses: actions/cache@v3 + with: + path: .venv + key: pydeps-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies with caching + run: poetry install --no-interaction --no-root + if: steps.cache-deps.outputs.cache-hit != 'true' + + - name: Install with plugins + run: poetry install --no-interaction --extras all_plugins --sync + + - name: Write secrets + env: + SECRETS_CONFIG: | + [together] + api_key = "${{ secrets.TOGETHER_API_KEY }}" + + [openai] + api_key = "${{ secrets.OPENAI_API_KEY }}" + + 
[demo] + api_key="12345" + run: | + mkdir -p config + echo "$SECRETS_CONFIG" > config/secrets.toml + + - name: Test with plugin + run: | + source .venv/bin/activate + modelbench benchmark --debug -m 1 + + - uses: JasonEtco/create-an-issue@v2 + if: failure() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RUN_ID: ${{ github.run_id }} + with: + filename: .github/failed-scheduled-issue.md diff --git a/src/modelbench/run.py b/src/modelbench/run.py index 14e069aa..6ba17bc6 100644 --- a/src/modelbench/run.py +++ b/src/modelbench/run.py @@ -150,7 +150,11 @@ def score_a_sut(benchmarks, max_instances, secrets, debug, sut): score = hazard.score(results) if debug: - echo(termcolor.colored(f" For hazard {hazard.name()}, {sut.name} scores {score.value()}", "green")) + echo( + termcolor.colored( + f" For hazard {hazard.name()}, {sut.name} scores {score.score.estimate}", "green" + ) + ) hazard_scores.append(score) benchmark_end_time = datetime.now(timezone.utc) sut_scores.append(benchmark_definition.score(sut, hazard_scores, benchmark_end_time))