Resurrect ao benchmark on AWS A100 runner (#2561)

Summary:
I'm bringing back some example models first, one for each set (TIMM, HF, TorchBench), to have some data to unblock our TorchAO ClickHouse migration.  More can be added later if we decide to keep this workflow.

### Testing

https://github.com/pytorch/benchmark/actions/runs/12388956432/job/34581035274

The results are now available in the `oss_ci_benchmark_v3` table: `select * from oss_ci_benchmark_v3 where workflow_id = 12388956432`.

Pull Request resolved: #2561

Reviewed By: xuzhao9

Differential Revision: D67383198

Pulled By: huydhn

fbshipit-source-id: cbe3e4c6517c656981c7e00d8564451a4d3654d0
huydhn authored and facebook-github-bot committed Dec 19, 2024
1 parent 49cb63b commit 65789d4
Showing 3 changed files with 76 additions and 78 deletions.
120 changes: 57 additions & 63 deletions .github/workflows/_linux-benchmark-cuda.yml
@@ -10,92 +10,86 @@ on:
         required: true
         type: string
         description: Userbenchmark run command line arguments
+      only:
+        required: False
+        type: string
+        default: ''
+        description: Only run the selected model, used for testing
     secrets:
       HUGGING_FACE_HUB_TOKEN:
         required: false
         description: |
           HF auth token to avoid rate limits when downloading models or datasets from hub
-      AWS_ACCESS_KEY_ID:
-        required: true
-        description: |
-          AWS access token for S3 uploading
-      AWS_SECRET_ACCESS_KEY:
-        required: true
-        description: |
-          AWS secret access key for S3 uploading
 jobs:
   # Run a specific userbenchmark with given arguments
   # Need to pass in userbenchmark name and arguments
   benchmark:
     # Don't run on forked repos
     if: github.repository_owner == 'pytorch'
-    runs-on: [a100-runner]
-    timeout-minutes: 1440 # 24 hours
-    environment: docker-s3-upload
+    runs-on: linux.aws.a100
+    timeout-minutes: 1440
     env:
-      BASE_CONDA_ENV: "torchbench"
       CONDA_ENV: "userbenchmark"
-      SETUP_SCRIPT: "/workspace/setup_instance.sh"
+      OUTPUT_DIR: .userbenchmark
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      ONLY: ${{ inputs.only || '' }}
     steps:
       - name: Checkout TorchBench
         uses: actions/checkout@v3
         with:
           path: benchmark
       - name: Tune Nvidia GPU
         run: |
           sudo nvidia-smi -pm 1
           sudo nvidia-smi -ac 1215,1410
           sudo ldconfig
           nvidia-smi

       - name: Remove result if it already exists
-        if: always()
         shell: bash
+        working-directory: benchmark
         run: |
-          # remove old results if exists
-          if [ -d benchmark-output ]; then rm -Rf benchmark-output; fi
-          pushd benchmark
-          if [ -d .userbenchmark ]; then rm -Rf .userbenchmark; fi
-      - name: Clone and setup conda env
+          set -eux
+          if [[ -d "${OUTPUT_DIR}" ]]; then
+            rm -rf "${OUTPUT_DIR}"
+          fi
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: "3.9"

+      - name: Install torch dependencies
         shell: bash
+        working-directory: benchmark
         run: |
-          CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
-          conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
+          set -eux
+          ${CONDA_RUN} pip3 install --pre torch torchvision torchaudio torchao \
+            --index-url https://download.pytorch.org/whl/nightly/cu124
       - name: Install benchmark
         shell: bash
+        working-directory: benchmark
         run: |
-          . "${SETUP_SCRIPT}"
-          pushd benchmark
-          python install.py
+          set -eux
+          if [[ -z "${ONLY}" ]]; then
+            ${CONDA_RUN} python install.py --numpy
+          else
+            ${CONDA_RUN} python install.py --numpy --models "${ONLY}"
+          fi
       - name: Run benchmark
         shell: bash
+        working-directory: benchmark
         run: |
-          . "${SETUP_SCRIPT}"
-          pushd benchmark
-          python run_benchmark.py ${{ inputs.userbenchmark }} ${{ inputs.userbenchmark-run-args }}
-      - name: Copy benchmark logs
-        if: always()
-        run: |
-          pushd benchmark
-          cp -r ./.userbenchmark/${{ inputs.userbenchmark }} ../benchmark-output
-      - name: Upload benchmark result to GH Actions Artifact
-        uses: actions/upload-artifact@v3
-        if: always()
+          set -eux
+          if [[ -z "${ONLY}" ]]; then
+            ${CONDA_RUN} python run_benchmark.py ${{ inputs.userbenchmark }} ${{ inputs.userbenchmark-run-args }}
+          else
+            ${CONDA_RUN} python run_benchmark.py ${{ inputs.userbenchmark }} ${{ inputs.userbenchmark-run-args }} \
+              --only "${ONLY}"
+          fi
+      - name: Upload the benchmark results to OSS benchmark database for the dashboard
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
         with:
-          name: ${{ inputs.userbenchmark }} benchmarking result
-          path: benchmark-output/
-      - name: Copy artifact and upload to Amazon S3
-        env:
-          WORKFLOW_RUN_ID: ${{ github.run_id }}
-          WORKFLOW_RUN_ATTEMPT: ${{ github.run_attempt }}
-        run: |
-          . "${SETUP_SCRIPT}"
-          pushd benchmark
-          # Upload the result json to Amazon S3
-          python ./scripts/userbenchmark/upload_s3_csv.py --s3-prefix torchbench-csv --userbenchmark ${{ inputs.userbenchmark }} \
-            --upload-path ../benchmark-output --match-filename "^${{ inputs.userbenchmark }}.*\.csv"
-      - name: Clean up Conda env
-        if: always()
-        run: |
-          . "${SETUP_SCRIPT}"
-          conda deactivate && conda deactivate
-          conda remove -n "${CONDA_ENV}" --all
+          benchmark-results-dir: benchmark/${{ env.OUTPUT_DIR }}/${{ inputs.userbenchmark }}
+          dry-run: false
+          schema-version: v3
+          github-token: ${{ secrets.GITHUB_TOKEN }}
20 changes: 10 additions & 10 deletions .github/workflows/torchao.yml
@@ -2,36 +2,36 @@ name: Torchao nightly workflow (A100)
 on:
   workflow_dispatch:
   schedule:
-    - cron: '00 18 * * *' # run at 6:00 PM UTC, K8s containers will roll out at 12PM EST
+    - cron: '00 18 * * *' # run at 6:00 PM UTC

 jobs:
   timm:
     uses: ./.github/workflows/_linux-benchmark-cuda.yml
     with:
-      userbenchmark: "torchao"
+      userbenchmark: torchao
       userbenchmark-run-args: "--ci --dashboard --timm"
+      # TODO (huydhn): Bring back the rest of them later
+      only: adv_inception_v3
     secrets:
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
   torchbench:
     uses: ./.github/workflows/_linux-benchmark-cuda.yml
     with:
-      userbenchmark: "torchao"
+      userbenchmark: torchao
       userbenchmark-run-args: "--ci --dashboard --torchbench"
+      # TODO (huydhn): Bring back the rest of them later
+      only: BERT_pytorch
     secrets:
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
   huggingface:
     uses: ./.github/workflows/_linux-benchmark-cuda.yml
     with:
-      userbenchmark: "torchao"
+      userbenchmark: torchao
       userbenchmark-run-args: "--ci --dashboard --huggingface"
+      # TODO (huydhn): Bring back the rest of them later
+      only: AlbertForMaskedLM
     secrets:
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
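Besides the nightly cron, the `workflow_dispatch` trigger allows kicking this workflow off by hand, which is handy while only one model per suite is enabled. A hedged example with the GitHub CLI, assuming the file lives in pytorch/benchmark:

```sh
# Trigger the torchao workflow manually instead of waiting for the cron
gh workflow run torchao.yml --repo pytorch/benchmark
# List recent runs to grab the workflow_id used in the ClickHouse query above
gh run list --workflow torchao.yml --repo pytorch/benchmark --limit 3
```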
14 changes: 9 additions & 5 deletions userbenchmark/torchao/run.py
@@ -1,5 +1,7 @@
 import argparse
+import glob
 import itertools
+from multiprocessing import Process
 from typing import List

 from userbenchmark.utils import get_output_dir
@@ -89,11 +91,13 @@ def run(args: List[str]):
             raise RuntimeError(
                 "CI mode must run with --timm, --huggingface, or --torchbench"
             )
+        for params in benchmark_args:
+            params.extend(pt2_args)
     else:
         benchmark_args = [pt2_args]

-    output_files = [_run_pt2_args(args) for args in benchmark_args]
-    # Post-processing
-    if args.dashboard:
-        post_ci_process(output_files)
-    print("\n".join(output_files))
+    for params in benchmark_args:
+        # TODO (huydhn): Figure out why it crashes when running in the same process
+        p = Process(target=_run_pt2_args, args=(params,))
+        p.start()
+        p.join()
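For context on why `glob` joins the imports: once `_run_pt2_args` executes in a child process, its return value no longer reaches the parent, so output files have to be rediscovered on disk rather than collected from the old list comprehension. A minimal sketch of the resulting pattern — the `run_isolated` helper and the `.userbenchmark/torchao` path are illustrative assumptions, not code from this file:

```python
import glob
from multiprocessing import Process
from typing import List


def _run_pt2_args(params: List[str]) -> None:
    """Stand-in for the real benchmark entry point."""
    print("running with", params)


def run_isolated(benchmark_args: List[List[str]]) -> List[str]:
    for params in benchmark_args:
        # A fresh process per config keeps one model run's CUDA/compile
        # state from crashing the next (the TODO above tracks the crash).
        p = Process(target=_run_pt2_args, args=(params,))
        p.start()
        p.join()
    # Return values do not cross the process boundary, so collect the
    # result files from disk instead (path is an assumption).
    return sorted(glob.glob(".userbenchmark/torchao/*.json"))
```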
