userbenchmark-regression-detector.yml
name: TorchBench Optim Regression Detector on A100
on:
  schedule:
    - cron: '0 4 * * *' # run at 4 AM UTC = midnight ET
  workflow_dispatch:
    inputs:
      userbenchmark_name:
        description: "Name of the user benchmark to run"
      userbenchmark_options:
        description: "Options of the user benchmark to run"
jobs:
  run-userbenchmark:
    runs-on: [a100-runner]
    timeout-minutes: 1440 # 24 hours
    environment: docker-s3-upload
    env:
      BASE_CONDA_ENV: "torchbench"
      CONDA_ENV: "optim"
      PLATFORM_NAME: "gcp_a100"
      TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      SETUP_SCRIPT: "/workspace/setup_instance.sh"
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
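    # NOTE: ${SETUP_SCRIPT} is assumed to activate the conda environment named by ${CONDA_ENV};
    # each step re-sources it because every step runs in a fresh shell.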
    steps:
      - name: Checkout TorchBench
        uses: actions/checkout@v3
        with:
          path: benchmark
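      # Enable persistence mode and pin the A100 application clocks (memory 1215 MHz, graphics 1410 MHz)
      # so measurements are not skewed by clock ramp-up or downclocking between runs.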
      - name: Tune Nvidia GPU
        run: |
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
      - name: Clone and setup Conda env
        run: |
          CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
          conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
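      # The per-run "optim" environment is cloned from the prebuilt base env so the job never
      # mutates "torchbench" directly; the clone is removed again in the final cleanup step.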
      - name: Install TorchBench
        run: |
          set -x
          . "${SETUP_SCRIPT}"
          pushd benchmark
          # only install the subset of models currently running.
          python install.py BERT_pytorch DALLE2_pytorch hf_GPT2_large hf_T5_large resnet50 timm_vision_transformer_large yolov3
      - name: Print torch.version.git_version
        run: |
          set -x
          . "${SETUP_SCRIPT}"
          python -c "import torch; print(torch.version.git_version)"
      - name: Run optim user benchmark
        run: |
          set -x
          . "${SETUP_SCRIPT}"
          # remove old results
          if [ -d benchmark-output ]; then rm -Rf benchmark-output; fi
          pushd benchmark
          if [ -d .userbenchmark ]; then rm -Rf .userbenchmark; fi
          # TODO: scale this to run other benchmarks, but let's start with optim
          # Only run a subset, see if this fixes the workflow
          python -m userbenchmark.optim.run_optim_benchmarks -s -c ${{ github.event.inputs.userbenchmark_options }}
          cp -r ./.userbenchmark/optim ../benchmark-output
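      # regression_detector.py is expected to write a draft issue body to gh-issue.md and any
      # failures to errors.txt; both files are consumed by later steps in this workflow.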
      - name: Detect potential regressions
        continue-on-error: true
        run: |
          . "${SETUP_SCRIPT}"
          pushd benchmark
          RESULTS=($(find ${PWD}/../benchmark-output -maxdepth 2 -name "metrics-*.json" | sort -r))
          # TODO: the following assumes only one metrics-*.json is found. It will keep
          # overwriting gh-issue.md if multiple are found. Scaling this up is a potential next step.
          for r in ${RESULTS[@]}; do
            python regression_detector.py --platform "${PLATFORM_NAME}" --treatment "${r}" --owner @janeyx99 \
              --gh-issue-path gh-issue.md --errors-path errors.txt
          done
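      # This step is currently disabled via `if: false`; when re-enabled it files the generated
      # gh-issue.md as a GitHub issue. TORCHBENCH_REGRESSION_DETECTED is not set anywhere in this
      # workflow, so it is presumably exported to $GITHUB_ENV by regression_detector.py.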
      - name: Create the github issue
        continue-on-error: true
        if: false
        uses: peter-evans/create-issue-from-file@v4
        with:
          title: Optim Perf Signal Detected by TorchBench CI on ${{ env.TORCHBENCH_REGRESSION_DETECTED }}
          token: ${{ secrets.TORCHBENCH_ACCESS_TOKEN }}
          content-filepath: ./benchmark/gh-issue.md
          labels: |
            torchbench-perf-report
      - name: Upload result jsons to Scribe and S3
        run: |
          . "${SETUP_SCRIPT}"
          pushd benchmark
          RESULTS=($(find ${PWD}/../benchmark-output -maxdepth 2 -name "metrics-*.json" | sort -r))
          echo "Uploading result jsons: ${RESULTS}"
          for r in ${RESULTS[@]}; do
            python ./scripts/userbenchmark/upload_scribe.py --userbenchmark_json "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
            python ./scripts/userbenchmark/upload_s3.py --upload-file "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
          done
      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: TorchBench result
          path: benchmark-output/
      - name: Finally, error if errors.txt exists
        if: always()
        run: |
          # Do not error earlier as we want all artifacts and regressions to be reported first
          # TODO: potentially move errors.txt to benchmark-output so it gets uploaded to S3
          pushd benchmark
          if [ -e errors.txt ]; then cat errors.txt && exit 1; fi
      - name: Remove conda environment
        if: always()
        run: |
          . "${SETUP_SCRIPT}"
          conda deactivate && conda deactivate
          conda remove -n "${CONDA_ENV}" --all