From 521d329b975de97ec0b52395f02bb32466b8dc35 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 8 Nov 2022 10:17:03 -0800 Subject: [PATCH] Fix CI issues related to cupy install (#2483) * remove any cupy install when setting up environments * revert previous changes to run on cu111 runners * fix for when no cupy is installed * remove cupy uninstall for workflows not using latest torch version * update to cu116 for inference tests * fix pip uninstall line * move python environment list to after DS install * remove cupy uninstall * re-add --forked * fix how we get cupy version (should be based on nvcc version) --- .github/workflows/amd.yml | 8 ++++---- .github/workflows/nv-accelerate-v100.yml | 8 ++++---- .github/workflows/nv-inference.yml | 16 ++++++++-------- .github/workflows/nv-lightning-v100.yml | 8 ++++---- .github/workflows/nv-nightly.yml | 10 +++++++--- .github/workflows/nv-torch-latest-v100.yml | 17 ++++++++--------- .github/workflows/nv-torch-nightly-v100.yml | 12 ++++++------ .github/workflows/nv-torch18-p40.yml | 10 +++++----- .github/workflows/nv-torch18-v100.yml | 8 ++++---- .github/workflows/nv-transformers-v100.yml | 8 ++++---- setup.py | 3 ++- 11 files changed, 56 insertions(+), 52 deletions(-) diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml index 97ffb9b21482..84a46c1942ad 100644 --- a/.github/workflows/amd.yml +++ b/.github/workflows/amd.yml @@ -42,10 +42,6 @@ jobs: sudo apt-get update sudo apt-get install -y libaio-dev - - name: Python environment - run: | - pip list - - name: Install transformers run: | git clone https://github.com/huggingface/transformers @@ -62,6 +58,10 @@ jobs: #python -c "from deepspeed.env_report import cli_main; cli_main()" ds_report + - name: Python environment + run: | + pip list + # Runs a set of commands using the runners shell - name: Unit tests run: | diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 5f9b1c39a13f..ed836aa04ba8 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -36,16 +36,16 @@ jobs: python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: HF Accelerate tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index a0d2d4df10f6..9879279ab1ef 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -17,7 +17,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v2 @@ -32,7 +32,7 @@ jobs: nvcc --version pip install --upgrade pip pip uninstall --yes torch torchvision triton - pip install torch==1.12.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu113 + pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -44,20 +44,20 @@ jobs: pip uninstall --yes transformers pip install . 
- - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning,inf] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --cuda_ver="11" - TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 -n 4 --verbose -m 'inference' unit/ --cuda_ver="11" + TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6" + TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6" diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index a63f6b75e769..0c5ad4184f05 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -36,16 +36,16 @@ jobs: python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: PyTorch Lightning Tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index 3f8b9414bc8e..bfa4d9ba6bfb 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -10,7 +10,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v2 @@ -25,7 +25,7 @@ jobs: nvcc --version pip install --upgrade pip pip uninstall --yes torch torchvision triton - pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html + pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -45,9 +45,13 @@ jobs: pip install .[dev,1bit,autotuning,inf] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.8" --cuda_ver="11" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 178976eb056e..488874a13c4b 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -17,7 +17,7 @@ concurrency: jobs: unit-tests: - runs-on: 
[self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v2 @@ -32,8 +32,7 @@ jobs: nvcc --version pip install --upgrade pip pip uninstall --yes torch torchvision triton - pip install torch==1.12.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu113 # Need to resolve errors with torch==1.13.0 - pip install cupy-cuda113 + pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -47,20 +46,20 @@ jobs: pip uninstall --yes transformers pip install . - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -n 4 unit/ --torch_ver="1.12" --cuda_ver="11" - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/ --torch_ver="1.12" --cuda_ver="11" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -n 4 unit/ --torch_ver="1.13" --cuda_ver="11.6" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -m 'sequential' unit/ --torch_ver="1.13" --cuda_ver="11.6" diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index c8380a544c45..5804f65f73f5 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -10,7 +10,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu113, v100] + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v2 @@ -25,7 +25,7 @@ jobs: nvcc --version pip install --upgrade pip pip uninstall --yes torch torchvision triton - pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu113 + pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -39,16 +39,16 @@ jobs: pip uninstall --yes transformers pip install . 
- - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch diff --git a/.github/workflows/nv-torch18-p40.yml b/.github/workflows/nv-torch18-p40.yml index fbb3a36da6ef..6be052f350b4 100644 --- a/.github/workflows/nv-torch18-p40.yml +++ b/.github/workflows/nv-torch18-p40.yml @@ -36,10 +36,6 @@ jobs: python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Python environment - run: | - pip list - - name: Install transformers run: | git clone https://github.com/huggingface/transformers @@ -56,8 +52,12 @@ jobs: pip install .[dev,1bit,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10.1" diff --git a/.github/workflows/nv-torch18-v100.yml b/.github/workflows/nv-torch18-v100.yml index 994f8ac3b051..eb7a577d5608 100644 --- a/.github/workflows/nv-torch18-v100.yml +++ b/.github/workflows/nv-torch18-v100.yml @@ -46,16 +46,16 @@ jobs: pip uninstall --yes transformers pip install . - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 5a213068bc91..55a887c8ed80 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -38,16 +38,16 @@ jobs: sudo apt-get update sudo apt-get install -y libaio-dev - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: HF transformers tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi diff --git a/setup.py b/setup.py index a159ece2bdef..fa60986055be 100755 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ 'Please visit https://pytorch.org/ to see how to properly install torch on your system.') from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder +from op_builder.builder import installed_cuda_version # fetch rocm state is_rocm_pytorch = OpBuilder.is_rocm_pytorch() @@ -74,7 +75,7 @@ def fetch_requirements(path): if rocm_major <= 4: cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}" else: - cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}" + cupy = f"cupy-cuda{''.join(map(str,installed_cuda_version()))}" if cupy: extras_require['1bit'].append(cupy) extras_require['1bit_mpi'].append(cupy)
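
Note on the setup.py change above: the 1-bit extras previously derived the cupy wheel name from torch.version.cuda, i.e. the CUDA version PyTorch was built against, which can differ from the CUDA toolkit actually installed on the runner. Building the name from the nvcc-reported version keeps the cupy wheel matched to the toolkit. A minimal sketch of that logic follows; the nvcc-parsing helper is illustrative only (the patch itself reuses installed_cuda_version() from op_builder.builder) and assumes nvcc is on PATH:

    # Illustrative sketch, not DeepSpeed's implementation: build the cupy wheel
    # name from the CUDA toolkit version that nvcc reports.
    import re
    import subprocess

    def nvcc_cuda_version():
        # `nvcc --version` prints a line like "... release 11.6, V11.6.124".
        out = subprocess.check_output(["nvcc", "--version"]).decode()
        match = re.search(r"release (\d+)\.(\d+)", out)
        if match is None:
            raise RuntimeError("could not parse CUDA version from nvcc output")
        return int(match.group(1)), int(match.group(2))

    def cupy_package_name():
        major, minor = nvcc_cuda_version()
        # e.g. CUDA 11.6 -> "cupy-cuda116", matching the f-string in setup.py above.
        return f"cupy-cuda{major}{minor}"

    if __name__ == "__main__":
        print(cupy_package_name())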