Skip to content

Commit

Permalink
Fix CI issues related to cupy install (#2483)
Browse files Browse the repository at this point in the history
* remove any cupy install when setting up environments

* revert previous changes to run on cu111 runners

* fix for when no cupy is installed

* remove cupy uninstall for workflows not using latest torch version

* update to cu116 for inference tests

* fix pip uninstall line

* move the Python environment listing (`pip list`) to after the DeepSpeed install

* remove cupy uninstall

* re-add --forked

* fix how we pick the cupy package version (it should be based on the installed nvcc/CUDA version, not on torch's CUDA version)
  • Loading branch information
mrwyattii authored Nov 8, 2022
1 parent 9cfcf74 commit 521d329
Show file tree
Hide file tree
Showing 11 changed files with 56 additions and 52 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ jobs:
sudo apt-get update
sudo apt-get install -y libaio-dev
- name: Python environment
run: |
pip list
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
Expand All @@ -62,6 +58,10 @@ jobs:
#python -c "from deepspeed.env_report import cli_main; cli_main()"
ds_report
- name: Python environment
run: |
pip list
# Runs a set of commands using the runners shell
- name: Unit tests
run: |
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/nv-accelerate-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@ jobs:
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Python environment
run: |
pip list
- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,autotuning]
ds_report
- name: Python environment
run: |
pip list
- name: HF Accelerate tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
Expand Down
16 changes: 8 additions & 8 deletions .github/workflows/nv-inference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]
runs-on: [self-hosted, nvidia, cu116, v100]

steps:
- uses: actions/checkout@v2
Expand All @@ -32,7 +32,7 @@ jobs:
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision triton
pip install torch==1.12.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu113
pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -44,20 +44,20 @@ jobs:
pip uninstall --yes transformers
pip install .
- name: Python environment
run: |
pip list
- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment
run: |
pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --cuda_ver="11"
TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 -n 4 --verbose -m 'inference' unit/ --cuda_ver="11"
TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
8 changes: 4 additions & 4 deletions .github/workflows/nv-lightning-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@ jobs:
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Python environment
run: |
pip list
- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,autotuning]
ds_report
- name: Python environment
run: |
pip list
- name: PyTorch Lightning Tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/nv-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]
runs-on: [self-hosted, nvidia, cu116, v100]

steps:
- uses: actions/checkout@v2
Expand All @@ -25,7 +25,7 @@ jobs:
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision triton
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -45,9 +45,13 @@ jobs:
pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment
run: |
pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.8" --cuda_ver="11"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
17 changes: 8 additions & 9 deletions .github/workflows/nv-torch-latest-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]
runs-on: [self-hosted, nvidia, cu116, v100]

steps:
- uses: actions/checkout@v2
Expand All @@ -32,8 +32,7 @@ jobs:
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision triton
pip install torch==1.12.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu113 # Need to resolve errors with torch==1.13.0
pip install cupy-cuda113
pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -47,20 +46,20 @@ jobs:
pip uninstall --yes transformers
pip install .
- name: Python environment
run: |
pip list
- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -n 4 unit/ --torch_ver="1.12" --cuda_ver="11"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/ --torch_ver="1.12" --cuda_ver="11"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -n 4 unit/ --torch_ver="1.13" --cuda_ver="11.6"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -m 'sequential' unit/ --torch_ver="1.13" --cuda_ver="11.6"
12 changes: 6 additions & 6 deletions .github/workflows/nv-torch-nightly-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu113, v100]
runs-on: [self-hosted, nvidia, cu116, v100]

steps:
- uses: actions/checkout@v2
Expand All @@ -25,7 +25,7 @@ jobs:
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision triton
pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu113
pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -39,16 +39,16 @@ jobs:
pip uninstall --yes transformers
pip install .
- name: Python environment
run: |
pip list
- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/nv-torch18-p40.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ jobs:
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Python environment
run: |
pip list
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
Expand All @@ -56,8 +52,12 @@ jobs:
pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
pip list
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10.1"
8 changes: 4 additions & 4 deletions .github/workflows/nv-torch18-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ jobs:
pip uninstall --yes transformers
pip install .
- name: Python environment
run: |
pip list
- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/nv-transformers-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,16 @@ jobs:
sudo apt-get update
sudo apt-get install -y libaio-dev
- name: Python environment
run: |
pip list
- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,autotuning]
ds_report
- name: Python environment
run: |
pip list
- name: HF transformers tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
'Please visit https://pytorch.org/ to see how to properly install torch on your system.')

from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder
from op_builder.builder import installed_cuda_version

# fetch rocm state
is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
Expand Down Expand Up @@ -74,7 +75,7 @@ def fetch_requirements(path):
if rocm_major <= 4:
cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
else:
cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
cupy = f"cupy-cuda{''.join(map(str,installed_cuda_version()))}"
if cupy:
extras_require['1bit'].append(cupy)
extras_require['1bit_mpi'].append(cupy)
Expand Down

0 comments on commit 521d329

Please sign in to comment.