Migrate to AWS A100 runner (#2549)

Summary: Now that PyTorch provides an A100 runner on AWS, we can deprecate TorchBench's own A100 testing infra and inherit the testing infra from PyTorch. Pull Request resolved: #2549 Reviewed By: kit1980 Differential Revision: D66450676 Pulled By: xuzhao9 fbshipit-source-id: 33b8b7547688be0b96703dcd696641c13e079f5f
pytorch · Nov 25, 2024 · 341ad14 · 341ad14
1 parent 820f213
commit 341ad14
Show file tree

Hide file tree

Showing 17 changed files with 67 additions and 56 deletions.
diff --git a/.ci/torchbench/install-conda.sh b/.ci/torchbench/install-conda.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+if [ -z "${CONDA_ENV}" ]; then
+  echo "ERROR: CONDA_ENV is not set"
+  exit 1
+fi
+
+mkdir workspace
+cd workspace
+wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3-latest-Linux-x86_64.sh
+chmod +x Miniconda3-latest-Linux-x86_64.sh && \
+bash ./Miniconda3-latest-Linux-x86_64.sh -b -u
+cd ..
+
+. "${HOME}"/miniconda3/etc/profile.d/conda.sh
+conda activate base
+conda init
+
+python utils/python_utils.py --create-conda-env "${CONDA_ENV}"
+
+conda activate "${CONDA_ENV}"
+
+python utils/cuda_utils.py --install-torch-deps
+python utils/cuda_utils.py --install-torch-nightly
+python utils/cuda_utils.py --install-torchbench-deps
+
+# use the same numpy version as the build environment
+pip install -r utils/build_requirements.txt
diff --git a/.ci/torchbench/install.sh b/.ci/torchbench/install.sh
@@ -1,25 +1,16 @@
 . ${HOME}/miniconda3/etc/profile.d/conda.sh
 
-if [ -z "${BASE_CONDA_ENV}" ]; then
-  echo "ERROR: BASE_CONDA_ENV is not set"
-  exit 1
-fi
-
 if [ -z "${CONDA_ENV}" ]; then
   echo "ERROR: CONDA_ENV is not set"
   exit 1
 fi
 
-if [ -z "${SETUP_SCRIPT}" ]; then
-  echo "ERROR: SETUP_SCRIPT is not set"
-  exit 1
+if [[ -n "${SETUP_SCRIPT}" && -e "${SETUP_SCRIPT}" ]]; then
+  . "${SETUP_SCRIPT}"
 fi
 
-CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
-conda activate "${BASE_CONDA_ENV}"
-# Remove the conda env if exists
-conda remove --name "${CONDA_ENV}" -y --all || true
-conda create --name "${CONDA_ENV}" -y --clone "${BASE_CONDA_ENV}"
+. "${HOME}"/miniconda3/etc/profile.d/conda.sh
+
 conda activate "${CONDA_ENV}"
 
 parent_dir=$(dirname "$(readlink -f "$0")")/../..

diff --git a/.ci/torchbench/test.sh b/.ci/torchbench/test.sh
@@ -10,12 +10,12 @@ if [ -z "${TEST_CONFIG}" ]; then
   exit 1
 fi
 
-if [ -z "${SETUP_SCRIPT}" ]; then
-  echo "ERROR: SETUP_SCRIPT is not set"
-  exit 1
+if [[ -n "${SETUP_SCRIPT}" && -e "${SETUP_SCRIPT}" ]]; then
+  . "${SETUP_SCRIPT}"
 fi
 
-. "${SETUP_SCRIPT}"
+. ${HOME}/miniconda3/etc/profile.d/conda.sh
+
 conda activate "${CONDA_ENV}"
 
 parent_dir=$(dirname "$(readlink -f "$0")")/../..

diff --git a/.github/workflows/_linux-test-cpu.yml b/.github/workflows/_linux-test-cpu.yml
@@ -15,8 +15,7 @@ jobs:
     timeout-minutes: 240
     environment: docker-s3-upload
     env:
-      BASE_CONDA_ENV: "torchbench"
-      CONDA_ENV: "pr-test-cpu"
+      CONDA_ENV: "torchbench"
       DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"
       SETUP_SCRIPT: "/workspace/setup_instance.sh"
       TEST_CONFIG: "cpu"

diff --git a/.github/workflows/_linux-test-cuda.yml b/.github/workflows/_linux-test-cuda.yml
@@ -11,24 +11,19 @@ jobs:
   linux-test-cuda:
     # Don't run on forked repos
     if: github.repository_owner == 'pytorch'
-    runs-on: [a100-runner]
+    runs-on: linux.aws.a100
     timeout-minutes: 240
     environment: docker-s3-upload
     env:
-      BASE_CONDA_ENV: "torchbench"
       CONDA_ENV: "pr-test-cuda"
-      SETUP_SCRIPT: "/workspace/setup_instance.sh"
       TEST_CONFIG: "cuda"
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
     steps:
       - name: Checkout TorchBench
         uses: actions/checkout@v3
-      - name: Tune Nvidia GPU
+      - name: Install Conda
         run: |
-          sudo nvidia-smi -pm 1
-          sudo nvidia-smi -ac 1215,1410
-          sudo ldconfig
-          nvidia-smi
+          bash ./.ci/torchbench/install-conda.sh
       - name: Install TorchBench
         run: |
           bash ./.ci/torchbench/install.sh
@@ -38,6 +33,5 @@ jobs:
       - name: Clean up Conda env
         if: always()
         run: |
-          . "${SETUP_SCRIPT}"
-          conda deactivate && conda deactivate
+          . ${HOME}/miniconda3/etc/profile.d/conda.sh
           conda remove -n "${CONDA_ENV}" --all
diff --git a/docker/infra/daemonset.yaml b/docker/infra/daemonset.yaml
@@ -68,9 +68,9 @@ spec:
           mountPath: /dev
         - name: root-mount
           mountPath: /root
-        env:
-        - name: NVIDIA_DRIVER_VERSION
-          value: "535.161.07"
+        # env:
+        # - name: NVIDIA_DRIVER_VERSION
+        #   value: latest
       containers:
       - image: "gcr.io/google-containers/pause:2.0"
         name: pause
diff --git a/requirements.txt b/requirements.txt
@@ -22,3 +22,4 @@ submitit
 pynvml
 pandas
 scipy
+numba
diff --git a/torchbenchmark/data/index.yaml b/torchbenchmark/data/index.yaml
@@ -11,6 +11,7 @@ INPUT_TARBALLS:
   - Super_SloMo_inputs.tar.gz
   - speech_transformer_inputs.tar.gz
   - Reddit_minimal.tar.gz
+  - sam_inputs.tar.gz
 MODEL_PKLS:
   - drq/obs.pkl
   - maml_omniglot/batch.pt
diff --git a/torchbenchmark/models/detectron2_maskrcnn/__init__.py b/torchbenchmark/models/detectron2_maskrcnn/__init__.py
@@ -76,7 +76,7 @@ def __init__(self, test, device, batch_size=None, extra_args=[]):
             self.model = instantiate(model_cfg).to(self.device)
             train_loader = instantiate(data_cfg.train)
             self.example_inputs = prefetch(
-                itertools.islice(train_loader, 100), self.device
+                itertools.islice(train_loader, 1), self.device
             )
             self.optimizer = torch.optim.SGD(self.model.parameters(), 0.0)
         elif test == "eval":
@@ -97,7 +97,7 @@ def __init__(self, test, device, batch_size=None, extra_args=[]):
             self.model.eval()
             test_loader = instantiate(data_cfg.test)
             self.example_inputs = prefetch(
-                itertools.islice(test_loader, 100), self.device
+                itertools.islice(test_loader, 1), self.device
             )
 
     def get_module(self):

diff --git a/torchbenchmark/models/hf_Whisper/install.py b/torchbenchmark/models/hf_Whisper/install.py
@@ -4,10 +4,8 @@
     cache_model,
     patch_transformers,
 )
-from utils.python_utils import pip_install_requirements
 
 if __name__ == "__main__":
-    pip_install_requirements()
     patch_transformers()
     model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
     cache_model(model_name)
diff --git a/torchbenchmark/models/hf_Whisper/requirements.txt b/torchbenchmark/models/hf_Whisper/requirements.txt
diff --git a/torchbenchmark/models/sam/__init__.py b/torchbenchmark/models/sam/__init__.py
@@ -6,6 +6,7 @@
 import cv2
 import numpy as np
 import torch
+from torchbenchmark import DATA_PATH
 from torchbenchmark.tasks import COMPUTER_VISION
 
 from ...util.model import BenchmarkModel
@@ -33,9 +34,7 @@ def __init__(self, test, device, batch_size=1, extra_args=[]):
 
         self.model = sam_model_registry[model_type](checkpoint=sam_checkpoint)
         self.model.to(device=device)
-        data_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".data")
-
-        image_path = os.path.join(data_folder, "truck.jpg")
+        image_path = os.path.join(DATA_PATH, "sam_inputs", "truck.jpg")
         if not os.path.exists(image_path):
             from torchbenchmark.util.framework.fb.installer import install_data
 

diff --git a/torchbenchmark/models/sam/install.py b/torchbenchmark/models/sam/install.py
@@ -1,8 +1,8 @@
 import os
-import subprocess
-import sys
 
 import requests
+
+from utils import s3_utils
 from utils.python_utils import pip_install_requirements
 
 
@@ -27,9 +27,7 @@ def download_checkpoint():
 
 
 def download_data():
-    download(
-        "https://github.com/facebookresearch/segment-anything/raw/main/notebooks/images/truck.jpg"
-    )
+    s3_utils.checkout_s3_data("INPUT_TARBALLS", "sam_inputs.tar.gz", decompress=True)
 
 
 if __name__ == "__main__":

diff --git a/torchbenchmark/models/sam_fast/__init__.py b/torchbenchmark/models/sam_fast/__init__.py
@@ -8,6 +8,8 @@
 import torch
 from segment_anything_fast.build_sam import sam_model_fast_registry
 from segment_anything_fast.predictor import SamPredictor
+
+from torchbenchmark import DATA_PATH
 from torchbenchmark.tasks import COMPUTER_VISION
 
 from ...util.model import BenchmarkModel
@@ -29,9 +31,10 @@ def __init__(self, test, device, batch_size=1, extra_args=[]):
 
         self.model = sam_model_fast_registry[model_type](checkpoint=sam_checkpoint)
         self.model.to(device=device)
-        data_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".data")
-
-        image_path = os.path.join(data_folder, "truck.jpg")
+        image_path = os.path.join(DATA_PATH, "sam_inputs", "truck.jpg")
+        assert os.path.exists(
+            image_path
+        ), f"Expected image file exists at {image_path} but not found."
         self.image = cv2.imread(image_path)
         self.image = cv2.cvtColor(self.image, cv2.COLOR_BGR2RGB)
         self.sample_image = torch.randn((3, 256, 256)).to(device)

diff --git a/torchbenchmark/models/sam_fast/install.py b/torchbenchmark/models/sam_fast/install.py
@@ -1,6 +1,8 @@
 import os
 
 import requests
+
+from utils import s3_utils
 from utils.python_utils import pip_install_requirements
 
 
@@ -25,9 +27,7 @@ def download_checkpoint():
 
 
 def download_data():
-    download(
-        "https://github.com/facebookresearch/segment-anything/raw/main/notebooks/images/truck.jpg"
-    )
+    s3_utils.checkout_s3_data("INPUT_TARBALLS", "sam_inputs.tar.gz", decompress=True)
 
 
 if __name__ == "__main__":

diff --git a/torchbenchmark/util/framework/detectron2/model_factory.py b/torchbenchmark/util/framework/detectron2/model_factory.py
@@ -147,7 +147,7 @@ def __init__(self, variant, test, device, batch_size=None, extra_args=[]):
         elif self.test == "eval":
             loader = self.setup_eval(cfg, args)
 
-        self.example_inputs = prefetch(itertools.islice(loader, 100), self.device)
+        self.example_inputs = prefetch(itertools.islice(loader, 1), self.device)
 
     def setup_train(self):
         if hasattr(self, "FCOS_USE_BN") and self.FCOS_USE_BN:

diff --git a/utils/python_utils.py b/utils/python_utils.py
@@ -4,18 +4,18 @@
 
 from typing import List, Optional
 
-DEFAULT_PYTHON_VERSION = "3.11"
+DEFAULT_PYTHON_VERSION = "3.12"
 
 PYTHON_VERSION_MAP = {
-    "3.8": {
-        "pytorch_url": "cp38",
-    },
     "3.10": {
         "pytorch_url": "cp310",
     },
     "3.11": {
         "pytorch_url": "cp311",
     },
+    "3.12": {
+        "pytorch_url": "cp312",
+    },
 }
 REPO_DIR = Path(__file__).parent.parent