From 521d329b975de97ec0b52395f02bb32466b8dc35 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 8 Nov 2022 10:17:03 -0800 Subject: [PATCH] Fix CI issues related to cupy install (#2483) * remove any cupy install when setting up environments * revert previous changes to run on cu111 runners * fix for when no cupy is installed * remove cupy uninstall for workflows not using latest torch version * update to cu116 for inference tests * fix pip uninstall line * move python environment list to after DS install * remove cupy uninstall * re-add --forked * fix how we get cupy version (should be based on nvcc version) --- .github/workflows/amd.yml | 8 ++++---- .github/workflows/nv-accelerate-v100.yml | 8 ++++---- .github/workflows/nv-inference.yml | 16 ++++++++-------- .github/workflows/nv-lightning-v100.yml | 8 ++++---- .github/workflows/nv-nightly.yml | 10 +++++++--- .github/workflows/nv-torch-latest-v100.yml | 17 ++++++++--------- .github/workflows/nv-torch-nightly-v100.yml | 12 ++++++------ .github/workflows/nv-torch18-p40.yml | 10 +++++----- .github/workflows/nv-torch18-v100.yml | 8 ++++---- .github/workflows/nv-transformers-v100.yml | 8 ++++---- setup.py | 3 ++- 11 files changed, 56 insertions(+), 52 deletions(-) diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml index 97ffb9b21482..84a46c1942ad 100644 --- a/.github/workflows/amd.yml +++ b/.github/workflows/amd.yml @@ -42,10 +42,6 @@ jobs: sudo apt-get update sudo apt-get install -y libaio-dev - - name: Python environment - run: | - pip list - - name: Install transformers run: | git clone https://github.com/huggingface/transformers @@ -62,6 +58,10 @@ jobs: #python -c "from deepspeed.env_report import cli_main; cli_main()" ds_report + - name: Python environment + run: | + pip list + # Runs a set of commands using the runners shell - name: Unit tests run: | diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 5f9b1c39a13f..ed836aa04ba8 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -36,16 +36,16 @@ jobs: python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: HF Accelerate tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index a0d2d4df10f6..9879279ab1ef 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -17,7 +17,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v2 @@ -32,7 +32,7 @@ jobs: nvcc --version pip install --upgrade pip pip uninstall --yes torch torchvision triton - pip install torch==1.12.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu113 + pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -44,20 +44,20 @@ jobs: pip uninstall --yes transformers pip install . 
- - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning,inf] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --cuda_ver="11" - TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 -n 4 --verbose -m 'inference' unit/ --cuda_ver="11" + TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6" + TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6" diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index a63f6b75e769..0c5ad4184f05 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -36,16 +36,16 @@ jobs: python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: PyTorch Lightning Tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index 3f8b9414bc8e..bfa4d9ba6bfb 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -10,7 +10,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v2 @@ -25,7 +25,7 @@ jobs: nvcc --version pip install --upgrade pip pip uninstall --yes torch torchvision triton - pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html + pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -45,9 +45,13 @@ jobs: pip install .[dev,1bit,autotuning,inf] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.8" --cuda_ver="11" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 178976eb056e..488874a13c4b 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -17,7 +17,7 @@ concurrency: jobs: unit-tests: - runs-on: 
[self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v2 @@ -32,8 +32,7 @@ jobs: nvcc --version pip install --upgrade pip pip uninstall --yes torch torchvision triton - pip install torch==1.12.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu113 # Need to resolve errors with torch==1.13.0 - pip install cupy-cuda113 + pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -47,20 +46,20 @@ jobs: pip uninstall --yes transformers pip install . - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -n 4 unit/ --torch_ver="1.12" --cuda_ver="11" - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/ --torch_ver="1.12" --cuda_ver="11" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -n 4 unit/ --torch_ver="1.13" --cuda_ver="11.6" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -m 'sequential' unit/ --torch_ver="1.13" --cuda_ver="11.6" diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index c8380a544c45..5804f65f73f5 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -10,7 +10,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu113, v100] + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v2 @@ -25,7 +25,7 @@ jobs: nvcc --version pip install --upgrade pip pip uninstall --yes torch torchvision triton - pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu113 + pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -39,16 +39,16 @@ jobs: pip uninstall --yes transformers pip install . 
- - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch diff --git a/.github/workflows/nv-torch18-p40.yml b/.github/workflows/nv-torch18-p40.yml index fbb3a36da6ef..6be052f350b4 100644 --- a/.github/workflows/nv-torch18-p40.yml +++ b/.github/workflows/nv-torch18-p40.yml @@ -36,10 +36,6 @@ jobs: python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Python environment - run: | - pip list - - name: Install transformers run: | git clone https://github.com/huggingface/transformers @@ -56,8 +52,12 @@ jobs: pip install .[dev,1bit,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10.1" diff --git a/.github/workflows/nv-torch18-v100.yml b/.github/workflows/nv-torch18-v100.yml index 994f8ac3b051..eb7a577d5608 100644 --- a/.github/workflows/nv-torch18-v100.yml +++ b/.github/workflows/nv-torch18-v100.yml @@ -46,16 +46,16 @@ jobs: pip uninstall --yes transformers pip install . - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 5a213068bc91..55a887c8ed80 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -38,16 +38,16 @@ jobs: sudo apt-get update sudo apt-get install -y libaio-dev - - name: Python environment - run: | - pip list - - name: Install deepspeed run: | pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report + - name: Python environment + run: | + pip list + - name: HF transformers tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi diff --git a/setup.py b/setup.py index a159ece2bdef..fa60986055be 100755 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ 'Please visit https://pytorch.org/ to see how to properly install torch on your system.') from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder +from op_builder.builder import installed_cuda_version # fetch rocm state is_rocm_pytorch = OpBuilder.is_rocm_pytorch() @@ -74,7 +75,7 @@ def fetch_requirements(path): if rocm_major <= 4: cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}" else: - cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}" + cupy = f"cupy-cuda{''.join(map(str,installed_cuda_version()))}" if cupy: extras_require['1bit'].append(cupy) extras_require['1bit_mpi'].append(cupy)
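
Note on the setup.py change above: the 1-bit extras previously derived the cupy wheel name from torch.version.cuda, i.e. the CUDA version PyTorch was built against, which can differ from the CUDA toolkit actually installed on the runner. Building the name from the nvcc-reported version keeps the cupy wheel matched to the toolkit. A minimal sketch of that logic follows; the nvcc-parsing helper is illustrative only (the patch itself reuses installed_cuda_version() from op_builder.builder) and assumes nvcc is on PATH:

    # Illustrative sketch, not DeepSpeed's implementation: build the cupy wheel
    # name from the CUDA toolkit version that nvcc reports.
    import re
    import subprocess

    def nvcc_cuda_version():
        # `nvcc --version` prints a line like "... release 11.6, V11.6.124".
        out = subprocess.check_output(["nvcc", "--version"]).decode()
        match = re.search(r"release (\d+)\.(\d+)", out)
        if match is None:
            raise RuntimeError("could not parse CUDA version from nvcc output")
        return int(match.group(1)), int(match.group(2))

    def cupy_package_name():
        major, minor = nvcc_cuda_version()
        # e.g. CUDA 11.6 -> "cupy-cuda116", matching the f-string in setup.py above.
        return f"cupy-cuda{major}{minor}"

    if __name__ == "__main__":
        print(cupy_package_name())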