Skip to content

Commit

Permalink
Migrate to AWS A100 runner (#2549)
Browse files Browse the repository at this point in the history
Summary:
Now that PyTorch provides an A100 runner on AWS, we can deprecate TorchBench's own A100 testing infra and inherit the testing infra from PyTorch.

Pull Request resolved: #2549

Reviewed By: kit1980

Differential Revision: D66450676

Pulled By: xuzhao9

fbshipit-source-id: 33b8b7547688be0b96703dcd696641c13e079f5f
  • Loading branch information
xuzhao9 authored and facebook-github-bot committed Nov 25, 2024
1 parent 820f213 commit 341ad14
Show file tree
Hide file tree
Showing 17 changed files with 67 additions and 56 deletions.
28 changes: 28 additions & 0 deletions .ci/torchbench/install-conda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Bootstrap Miniconda and the TorchBench conda environment on a CI runner.
#
# Required environment:
#   CONDA_ENV - name of the conda environment to create and activate.
#
# The utils/ paths below are relative, so this script must be invoked from
# the repository root.

if [ -z "${CONDA_ENV}" ]; then
  echo "ERROR: CONDA_ENV is not set"
  exit 1
fi

# -p: do not error out when the directory is left over from a previous run.
mkdir -p workspace
# Stop here if the directory cannot be entered; otherwise the `cd ..` below
# would leave the repository root and the relative utils/ paths would break.
cd workspace || exit 1
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3-latest-Linux-x86_64.sh
# Installer flags: -b runs without prompts, -u updates an existing install.
chmod +x Miniconda3-latest-Linux-x86_64.sh && \
  bash ./Miniconda3-latest-Linux-x86_64.sh -b -u
cd ..

# Load conda's shell functions so `conda activate` works in this
# non-interactive shell.
. "${HOME}"/miniconda3/etc/profile.d/conda.sh
conda activate base
conda init

# Create the target environment from the base interpreter, then switch to it
# so the installs below land inside it.
python utils/python_utils.py --create-conda-env "${CONDA_ENV}"

conda activate "${CONDA_ENV}"

python utils/cuda_utils.py --install-torch-deps
python utils/cuda_utils.py --install-torch-nightly
python utils/cuda_utils.py --install-torchbench-deps

# use the same numpy version as the build environment
pip install -r utils/build_requirements.txt
17 changes: 4 additions & 13 deletions .ci/torchbench/install.sh
Original file line number Diff line number Diff line change
@@ -1,25 +1,16 @@
. ${HOME}/miniconda3/etc/profile.d/conda.sh

if [ -z "${BASE_CONDA_ENV}" ]; then
echo "ERROR: BASE_CONDA_ENV is not set"
exit 1
fi

if [ -z "${CONDA_ENV}" ]; then
echo "ERROR: CONDA_ENV is not set"
exit 1
fi

if [ -z "${SETUP_SCRIPT}" ]; then
echo "ERROR: SETUP_SCRIPT is not set"
exit 1
if [[ -n "${SETUP_SCRIPT}" && -e "${SETUP_SCRIPT}" ]]; then
. "${SETUP_SCRIPT}"
fi

CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
conda activate "${BASE_CONDA_ENV}"
# Remove the conda env if exists
conda remove --name "${CONDA_ENV}" -y --all || true
conda create --name "${CONDA_ENV}" -y --clone "${BASE_CONDA_ENV}"
. "${HOME}"/miniconda3/etc/profile.d/conda.sh

conda activate "${CONDA_ENV}"

parent_dir=$(dirname "$(readlink -f "$0")")/../..
Expand Down
8 changes: 4 additions & 4 deletions .ci/torchbench/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ if [ -z "${TEST_CONFIG}" ]; then
exit 1
fi

if [ -z "${SETUP_SCRIPT}" ]; then
echo "ERROR: SETUP_SCRIPT is not set"
exit 1
if [[ -n "${SETUP_SCRIPT}" && -e "${SETUP_SCRIPT}" ]]; then
. "${SETUP_SCRIPT}"
fi

. "${SETUP_SCRIPT}"
. ${HOME}/miniconda3/etc/profile.d/conda.sh

conda activate "${CONDA_ENV}"

parent_dir=$(dirname "$(readlink -f "$0")")/../..
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/_linux-test-cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ jobs:
timeout-minutes: 240
environment: docker-s3-upload
env:
BASE_CONDA_ENV: "torchbench"
CONDA_ENV: "pr-test-cpu"
CONDA_ENV: "torchbench"
DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"
SETUP_SCRIPT: "/workspace/setup_instance.sh"
TEST_CONFIG: "cpu"
Expand Down
14 changes: 4 additions & 10 deletions .github/workflows/_linux-test-cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,19 @@ jobs:
linux-test-cuda:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: [a100-runner]
runs-on: linux.aws.a100
timeout-minutes: 240
environment: docker-s3-upload
env:
BASE_CONDA_ENV: "torchbench"
CONDA_ENV: "pr-test-cuda"
SETUP_SCRIPT: "/workspace/setup_instance.sh"
TEST_CONFIG: "cuda"
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
steps:
- name: Checkout TorchBench
uses: actions/checkout@v3
- name: Tune Nvidia GPU
- name: Install Conda
run: |
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
sudo ldconfig
nvidia-smi
bash ./.ci/torchbench/install-conda.sh
- name: Install TorchBench
run: |
bash ./.ci/torchbench/install.sh
Expand All @@ -38,6 +33,5 @@ jobs:
- name: Clean up Conda env
if: always()
run: |
. "${SETUP_SCRIPT}"
conda deactivate && conda deactivate
. ${HOME}/miniconda3/etc/profile.d/conda.sh
conda remove -n "${CONDA_ENV}" --all
6 changes: 3 additions & 3 deletions docker/infra/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@ spec:
mountPath: /dev
- name: root-mount
mountPath: /root
env:
- name: NVIDIA_DRIVER_VERSION
value: "535.161.07"
# env:
# - name: NVIDIA_DRIVER_VERSION
# value: latest
containers:
- image: "gcr.io/google-containers/pause:2.0"
name: pause
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ submitit
pynvml
pandas
scipy
numba
1 change: 1 addition & 0 deletions torchbenchmark/data/index.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ INPUT_TARBALLS:
- Super_SloMo_inputs.tar.gz
- speech_transformer_inputs.tar.gz
- Reddit_minimal.tar.gz
- sam_inputs.tar.gz
MODEL_PKLS:
- drq/obs.pkl
- maml_omniglot/batch.pt
4 changes: 2 additions & 2 deletions torchbenchmark/models/detectron2_maskrcnn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def __init__(self, test, device, batch_size=None, extra_args=[]):
self.model = instantiate(model_cfg).to(self.device)
train_loader = instantiate(data_cfg.train)
self.example_inputs = prefetch(
itertools.islice(train_loader, 100), self.device
itertools.islice(train_loader, 1), self.device
)
self.optimizer = torch.optim.SGD(self.model.parameters(), 0.0)
elif test == "eval":
Expand All @@ -97,7 +97,7 @@ def __init__(self, test, device, batch_size=None, extra_args=[]):
self.model.eval()
test_loader = instantiate(data_cfg.test)
self.example_inputs = prefetch(
itertools.islice(test_loader, 100), self.device
itertools.islice(test_loader, 1), self.device
)

def get_module(self):
Expand Down
2 changes: 0 additions & 2 deletions torchbenchmark/models/hf_Whisper/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
cache_model,
patch_transformers,
)
from utils.python_utils import pip_install_requirements

if __name__ == "__main__":
pip_install_requirements()
patch_transformers()
model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
cache_model(model_name)
1 change: 0 additions & 1 deletion torchbenchmark/models/hf_Whisper/requirements.txt

This file was deleted.

5 changes: 2 additions & 3 deletions torchbenchmark/models/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import cv2
import numpy as np
import torch
from torchbenchmark import DATA_PATH
from torchbenchmark.tasks import COMPUTER_VISION

from ...util.model import BenchmarkModel
Expand Down Expand Up @@ -33,9 +34,7 @@ def __init__(self, test, device, batch_size=1, extra_args=[]):

self.model = sam_model_registry[model_type](checkpoint=sam_checkpoint)
self.model.to(device=device)
data_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".data")

image_path = os.path.join(data_folder, "truck.jpg")
image_path = os.path.join(DATA_PATH, "sam_inputs", "truck.jpg")
if not os.path.exists(image_path):
from torchbenchmark.util.framework.fb.installer import install_data

Expand Down
8 changes: 3 additions & 5 deletions torchbenchmark/models/sam/install.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import os
import subprocess
import sys

import requests

from utils import s3_utils
from utils.python_utils import pip_install_requirements


Expand All @@ -27,9 +27,7 @@ def download_checkpoint():


def download_data():
download(
"https://github.com/facebookresearch/segment-anything/raw/main/notebooks/images/truck.jpg"
)
s3_utils.checkout_s3_data("INPUT_TARBALLS", "sam_inputs.tar.gz", decompress=True)


if __name__ == "__main__":
Expand Down
9 changes: 6 additions & 3 deletions torchbenchmark/models/sam_fast/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import torch
from segment_anything_fast.build_sam import sam_model_fast_registry
from segment_anything_fast.predictor import SamPredictor

from torchbenchmark import DATA_PATH
from torchbenchmark.tasks import COMPUTER_VISION

from ...util.model import BenchmarkModel
Expand All @@ -29,9 +31,10 @@ def __init__(self, test, device, batch_size=1, extra_args=[]):

self.model = sam_model_fast_registry[model_type](checkpoint=sam_checkpoint)
self.model.to(device=device)
data_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".data")

image_path = os.path.join(data_folder, "truck.jpg")
image_path = os.path.join(DATA_PATH, "sam_inputs", "truck.jpg")
assert os.path.exists(
image_path
), f"Expected image file exists at {image_path} but not found."
self.image = cv2.imread(image_path)
self.image = cv2.cvtColor(self.image, cv2.COLOR_BGR2RGB)
self.sample_image = torch.randn((3, 256, 256)).to(device)
Expand Down
6 changes: 3 additions & 3 deletions torchbenchmark/models/sam_fast/install.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os

import requests

from utils import s3_utils
from utils.python_utils import pip_install_requirements


Expand All @@ -25,9 +27,7 @@ def download_checkpoint():


def download_data():
download(
"https://github.com/facebookresearch/segment-anything/raw/main/notebooks/images/truck.jpg"
)
s3_utils.checkout_s3_data("INPUT_TARBALLS", "sam_inputs.tar.gz", decompress=True)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion torchbenchmark/util/framework/detectron2/model_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def __init__(self, variant, test, device, batch_size=None, extra_args=[]):
elif self.test == "eval":
loader = self.setup_eval(cfg, args)

self.example_inputs = prefetch(itertools.islice(loader, 100), self.device)
self.example_inputs = prefetch(itertools.islice(loader, 1), self.device)

def setup_train(self):
if hasattr(self, "FCOS_USE_BN") and self.FCOS_USE_BN:
Expand Down
8 changes: 4 additions & 4 deletions utils/python_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@

from typing import List, Optional

DEFAULT_PYTHON_VERSION = "3.11"
DEFAULT_PYTHON_VERSION = "3.12"

PYTHON_VERSION_MAP = {
"3.8": {
"pytorch_url": "cp38",
},
"3.10": {
"pytorch_url": "cp310",
},
"3.11": {
"pytorch_url": "cp311",
},
"3.12": {
"pytorch_url": "cp312",
},
}
REPO_DIR = Path(__file__).parent.parent

Expand Down

0 comments on commit 341ad14

Please sign in to comment.