diff --git a/.ci/torchbench/install-conda.sh b/.ci/torchbench/install-conda.sh new file mode 100644 index 0000000000..c16ad45abc --- /dev/null +++ b/.ci/torchbench/install-conda.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +if [ -z "${CONDA_ENV}" ]; then + echo "ERROR: CONDA_ENV is not set" + exit 1 +fi + +mkdir workspace +cd workspace +wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3-latest-Linux-x86_64.sh +chmod +x Miniconda3-latest-Linux-x86_64.sh && \ +bash ./Miniconda3-latest-Linux-x86_64.sh -b -u +cd .. + +. "${HOME}"/miniconda3/etc/profile.d/conda.sh +conda activate base +conda init + +python utils/python_utils.py --create-conda-env "${CONDA_ENV}" + +conda activate "${CONDA_ENV}" + +python utils/cuda_utils.py --install-torch-deps +python utils/cuda_utils.py --install-torch-nightly +python utils/cuda_utils.py --install-torchbench-deps + +# use the same numpy version as the build environment +pip install -r utils/build_requirements.txt diff --git a/.ci/torchbench/install.sh b/.ci/torchbench/install.sh index 16dcfc6e8d..66bb507623 100644 --- a/.ci/torchbench/install.sh +++ b/.ci/torchbench/install.sh @@ -1,25 +1,16 @@ . ${HOME}/miniconda3/etc/profile.d/conda.sh -if [ -z "${BASE_CONDA_ENV}" ]; then - echo "ERROR: BASE_CONDA_ENV is not set" - exit 1 -fi - if [ -z "${CONDA_ENV}" ]; then echo "ERROR: CONDA_ENV is not set" exit 1 fi -if [ -z "${SETUP_SCRIPT}" ]; then - echo "ERROR: SETUP_SCRIPT is not set" - exit 1 +if [[ -n "${SETUP_SCRIPT}" && -e "${SETUP_SCRIPT}" ]]; then + . "${SETUP_SCRIPT}" fi -CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}" -conda activate "${BASE_CONDA_ENV}" -# Remove the conda env if exists -conda remove --name "${CONDA_ENV}" -y --all || true -conda create --name "${CONDA_ENV}" -y --clone "${BASE_CONDA_ENV}" +. "${HOME}"/miniconda3/etc/profile.d/conda.sh + conda activate "${CONDA_ENV}" parent_dir=$(dirname "$(readlink -f "$0")")/../.. diff --git a/.ci/torchbench/test.sh b/.ci/torchbench/test.sh index 9cba55cda7..fa82b9718e 100644 --- a/.ci/torchbench/test.sh +++ b/.ci/torchbench/test.sh @@ -10,12 +10,12 @@ if [ -z "${TEST_CONFIG}" ]; then exit 1 fi -if [ -z "${SETUP_SCRIPT}" ]; then - echo "ERROR: SETUP_SCRIPT is not set" - exit 1 +if [[ -n "${SETUP_SCRIPT}" && -e "${SETUP_SCRIPT}" ]]; then + . "${SETUP_SCRIPT}" fi -. "${SETUP_SCRIPT}" +. ${HOME}/miniconda3/etc/profile.d/conda.sh + conda activate "${CONDA_ENV}" parent_dir=$(dirname "$(readlink -f "$0")")/../.. diff --git a/.github/workflows/_linux-test-cpu.yml b/.github/workflows/_linux-test-cpu.yml index 7aa1569797..8886d75287 100644 --- a/.github/workflows/_linux-test-cpu.yml +++ b/.github/workflows/_linux-test-cpu.yml @@ -15,8 +15,7 @@ jobs: timeout-minutes: 240 environment: docker-s3-upload env: - BASE_CONDA_ENV: "torchbench" - CONDA_ENV: "pr-test-cpu" + CONDA_ENV: "torchbench" DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest" SETUP_SCRIPT: "/workspace/setup_instance.sh" TEST_CONFIG: "cpu" diff --git a/.github/workflows/_linux-test-cuda.yml b/.github/workflows/_linux-test-cuda.yml index 5f59a48083..245c993497 100644 --- a/.github/workflows/_linux-test-cuda.yml +++ b/.github/workflows/_linux-test-cuda.yml @@ -11,24 +11,19 @@ jobs: linux-test-cuda: # Don't run on forked repos if: github.repository_owner == 'pytorch' - runs-on: [a100-runner] + runs-on: linux.aws.a100 timeout-minutes: 240 environment: docker-s3-upload env: - BASE_CONDA_ENV: "torchbench" CONDA_ENV: "pr-test-cuda" - SETUP_SCRIPT: "/workspace/setup_instance.sh" TEST_CONFIG: "cuda" HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Checkout TorchBench uses: actions/checkout@v3 - - name: Tune Nvidia GPU + - name: Install Conda run: | - sudo nvidia-smi -pm 1 - sudo nvidia-smi -ac 1215,1410 - sudo ldconfig - nvidia-smi + bash ./.ci/torchbench/install-conda.sh - name: Install TorchBench run: | bash ./.ci/torchbench/install.sh @@ -38,6 +33,5 @@ jobs: - name: Clean up Conda env if: always() run: | - . "${SETUP_SCRIPT}" - conda deactivate && conda deactivate + . ${HOME}/miniconda3/etc/profile.d/conda.sh conda remove -n "${CONDA_ENV}" --all diff --git a/docker/infra/daemonset.yaml b/docker/infra/daemonset.yaml index 0de2922aec..4a7a4c0613 100644 --- a/docker/infra/daemonset.yaml +++ b/docker/infra/daemonset.yaml @@ -68,9 +68,9 @@ spec: mountPath: /dev - name: root-mount mountPath: /root - env: - - name: NVIDIA_DRIVER_VERSION - value: "535.161.07" + # env: + # - name: NVIDIA_DRIVER_VERSION + # value: latest containers: - image: "gcr.io/google-containers/pause:2.0" name: pause diff --git a/requirements.txt b/requirements.txt index e7bd1bc19d..e49a935e6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ submitit pynvml pandas scipy +numba diff --git a/torchbenchmark/data/index.yaml b/torchbenchmark/data/index.yaml index 1fb65abd07..90c4aa7930 100644 --- a/torchbenchmark/data/index.yaml +++ b/torchbenchmark/data/index.yaml @@ -11,6 +11,7 @@ INPUT_TARBALLS: - Super_SloMo_inputs.tar.gz - speech_transformer_inputs.tar.gz - Reddit_minimal.tar.gz + - sam_inputs.tar.gz MODEL_PKLS: - drq/obs.pkl - maml_omniglot/batch.pt diff --git a/torchbenchmark/models/detectron2_maskrcnn/__init__.py b/torchbenchmark/models/detectron2_maskrcnn/__init__.py index b60019b815..c6215faa74 100644 --- a/torchbenchmark/models/detectron2_maskrcnn/__init__.py +++ b/torchbenchmark/models/detectron2_maskrcnn/__init__.py @@ -76,7 +76,7 @@ def __init__(self, test, device, batch_size=None, extra_args=[]): self.model = instantiate(model_cfg).to(self.device) train_loader = instantiate(data_cfg.train) self.example_inputs = prefetch( - itertools.islice(train_loader, 100), self.device + itertools.islice(train_loader, 1), self.device ) self.optimizer = torch.optim.SGD(self.model.parameters(), 0.0) elif test == "eval": @@ -97,7 +97,7 @@ def __init__(self, test, device, batch_size=None, extra_args=[]): self.model.eval() test_loader = instantiate(data_cfg.test) self.example_inputs = prefetch( - itertools.islice(test_loader, 100), self.device + itertools.islice(test_loader, 1), self.device ) def get_module(self): diff --git a/torchbenchmark/models/hf_Whisper/install.py b/torchbenchmark/models/hf_Whisper/install.py index a02c6fdde9..4297eb3df8 100644 --- a/torchbenchmark/models/hf_Whisper/install.py +++ b/torchbenchmark/models/hf_Whisper/install.py @@ -4,10 +4,8 @@ cache_model, patch_transformers, ) -from utils.python_utils import pip_install_requirements if __name__ == "__main__": - pip_install_requirements() patch_transformers() model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__))) cache_model(model_name) diff --git a/torchbenchmark/models/hf_Whisper/requirements.txt b/torchbenchmark/models/hf_Whisper/requirements.txt deleted file mode 100644 index c3db4451e8..0000000000 --- a/torchbenchmark/models/hf_Whisper/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -numba diff --git a/torchbenchmark/models/sam/__init__.py b/torchbenchmark/models/sam/__init__.py index 80f6e0e95a..b416a01451 100644 --- a/torchbenchmark/models/sam/__init__.py +++ b/torchbenchmark/models/sam/__init__.py @@ -6,6 +6,7 @@ import cv2 import numpy as np import torch +from torchbenchmark import DATA_PATH from torchbenchmark.tasks import COMPUTER_VISION from ...util.model import BenchmarkModel @@ -33,9 +34,7 @@ def __init__(self, test, device, batch_size=1, extra_args=[]): self.model = sam_model_registry[model_type](checkpoint=sam_checkpoint) self.model.to(device=device) - data_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".data") - - image_path = os.path.join(data_folder, "truck.jpg") + image_path = os.path.join(DATA_PATH, "sam_inputs", "truck.jpg") if not os.path.exists(image_path): from torchbenchmark.util.framework.fb.installer import install_data diff --git a/torchbenchmark/models/sam/install.py b/torchbenchmark/models/sam/install.py index c6fb763ec4..227079456a 100644 --- a/torchbenchmark/models/sam/install.py +++ b/torchbenchmark/models/sam/install.py @@ -1,8 +1,8 @@ import os -import subprocess -import sys import requests + +from utils import s3_utils from utils.python_utils import pip_install_requirements @@ -27,9 +27,7 @@ def download_checkpoint(): def download_data(): - download( - "https://github.com/facebookresearch/segment-anything/raw/main/notebooks/images/truck.jpg" - ) + s3_utils.checkout_s3_data("INPUT_TARBALLS", "sam_inputs.tar.gz", decompress=True) if __name__ == "__main__": diff --git a/torchbenchmark/models/sam_fast/__init__.py b/torchbenchmark/models/sam_fast/__init__.py index 98fa58de89..c00c9556a1 100644 --- a/torchbenchmark/models/sam_fast/__init__.py +++ b/torchbenchmark/models/sam_fast/__init__.py @@ -8,6 +8,8 @@ import torch from segment_anything_fast.build_sam import sam_model_fast_registry from segment_anything_fast.predictor import SamPredictor + +from torchbenchmark import DATA_PATH from torchbenchmark.tasks import COMPUTER_VISION from ...util.model import BenchmarkModel @@ -29,9 +31,10 @@ def __init__(self, test, device, batch_size=1, extra_args=[]): self.model = sam_model_fast_registry[model_type](checkpoint=sam_checkpoint) self.model.to(device=device) - data_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".data") - - image_path = os.path.join(data_folder, "truck.jpg") + image_path = os.path.join(DATA_PATH, "sam_inputs", "truck.jpg") + assert os.path.exists( + image_path + ), f"Expected image file exists at {image_path} but not found." self.image = cv2.imread(image_path) self.image = cv2.cvtColor(self.image, cv2.COLOR_BGR2RGB) self.sample_image = torch.randn((3, 256, 256)).to(device) diff --git a/torchbenchmark/models/sam_fast/install.py b/torchbenchmark/models/sam_fast/install.py index a072a1fc1f..227079456a 100644 --- a/torchbenchmark/models/sam_fast/install.py +++ b/torchbenchmark/models/sam_fast/install.py @@ -1,6 +1,8 @@ import os import requests + +from utils import s3_utils from utils.python_utils import pip_install_requirements @@ -25,9 +27,7 @@ def download_checkpoint(): def download_data(): - download( - "https://github.com/facebookresearch/segment-anything/raw/main/notebooks/images/truck.jpg" - ) + s3_utils.checkout_s3_data("INPUT_TARBALLS", "sam_inputs.tar.gz", decompress=True) if __name__ == "__main__": diff --git a/torchbenchmark/util/framework/detectron2/model_factory.py b/torchbenchmark/util/framework/detectron2/model_factory.py index 889aed107c..dc321a6d78 100644 --- a/torchbenchmark/util/framework/detectron2/model_factory.py +++ b/torchbenchmark/util/framework/detectron2/model_factory.py @@ -147,7 +147,7 @@ def __init__(self, variant, test, device, batch_size=None, extra_args=[]): elif self.test == "eval": loader = self.setup_eval(cfg, args) - self.example_inputs = prefetch(itertools.islice(loader, 100), self.device) + self.example_inputs = prefetch(itertools.islice(loader, 1), self.device) def setup_train(self): if hasattr(self, "FCOS_USE_BN") and self.FCOS_USE_BN: diff --git a/utils/python_utils.py b/utils/python_utils.py index 41a18f96e1..e1815079c2 100644 --- a/utils/python_utils.py +++ b/utils/python_utils.py @@ -4,18 +4,18 @@ from typing import List, Optional -DEFAULT_PYTHON_VERSION = "3.11" +DEFAULT_PYTHON_VERSION = "3.12" PYTHON_VERSION_MAP = { - "3.8": { - "pytorch_url": "cp38", - }, "3.10": { "pytorch_url": "cp310", }, "3.11": { "pytorch_url": "cp311", }, + "3.12": { + "pytorch_url": "cp312", + }, } REPO_DIR = Path(__file__).parent.parent