From 545b76e108c24a37d7970f6324eb1d47b61ff720 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Wed, 24 Aug 2022 15:03:11 +0200
Subject: [PATCH 01/18] reinstall things in the same manner as in docker

---
 .azure/gpu-tests.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index f19c5bafc7814..b1fb63c4b9f51 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -72,12 +72,16 @@ jobs:
         set -e
         python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)"
+        PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")  # installed torch version, passed to adjust-versions.py below
         CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
         CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
+        python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
+        python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION}
+        python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
         pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0"
-        pip install -e .[strategies]
+        pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
         pip install -U deepspeed  # TODO: remove when docker images are upgraded
-        pip install --requirement requirements/pytorch/devel.txt
+        pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
         pip list
       env:
         PACKAGE_NAME: pytorch

From 958b0971b0e0fdd46167c9f6d1a49e86da658695 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Wed, 24 Aug 2022 15:09:53 +0200
Subject: [PATCH 02/18] remove test to trigger GPU CI. revert this later

---
 tests/tests_pytorch/models/test_horovod.py | 26 +++++++++++-----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py
index 6cd354ef22cfe..7fb0a7493e7ed 100644
--- a/tests/tests_pytorch/models/test_horovod.py
+++ b/tests/tests_pytorch/models/test_horovod.py
@@ -76,19 +76,19 @@ def _run_horovod(trainer_options):
     assert exit_code == 0
 
 
-@RunIf(horovod=True, skip_windows=True)
-def test_horovod_cpu(tmpdir):
-    """Test Horovod running multi-process on CPU."""
-    trainer_options = dict(
-        default_root_dir=str(tmpdir),
-        gradient_clip_val=1.0,
-        enable_progress_bar=False,
-        max_epochs=1,
-        limit_train_batches=0.4,
-        limit_val_batches=0.2,
-        strategy="horovod",
-    )
-    _run_horovod(trainer_options)
+# @RunIf(horovod=True, skip_windows=True)
+# def test_horovod_cpu(tmpdir):
+#     """Test Horovod running multi-process on CPU."""
+#     trainer_options = dict(
+#         default_root_dir=str(tmpdir),
+#         gradient_clip_val=1.0,
+#         enable_progress_bar=False,
+#         max_epochs=1,
+#         limit_train_batches=0.4,
+#         limit_val_batches=0.2,
+#         strategy="horovod",
+#     )
+#     _run_horovod(trainer_options)
 
 
 @RunIf(horovod=True, skip_windows=True)

From 40146ce357a85ecf55f17c9c5755def2b8158ec5 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Wed, 24 Aug 2022 15:19:08 +0200
Subject: [PATCH 03/18] .


From 97708a824d67116ecbd30032716bdcea197bf50c Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Wed, 24 Aug 2022 15:31:43 +0200
Subject: [PATCH 04/18] .


From 052fe4ca70529dd6859737fe54ac0e49ec328fb6 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Wed, 24 Aug 2022 15:50:19 +0200
Subject: [PATCH 05/18] Revert "remove test to trigger GPU CI. revert this
 later"

This reverts commit 958b0971b0e0fdd46167c9f6d1a49e86da658695.
---
 tests/tests_pytorch/models/test_horovod.py | 26 +++++++++++-----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py
index 7fb0a7493e7ed..6cd354ef22cfe 100644
--- a/tests/tests_pytorch/models/test_horovod.py
+++ b/tests/tests_pytorch/models/test_horovod.py
@@ -76,19 +76,19 @@ def _run_horovod(trainer_options):
     assert exit_code == 0
 
 
-# @RunIf(horovod=True, skip_windows=True)
-# def test_horovod_cpu(tmpdir):
-#     """Test Horovod running multi-process on CPU."""
-#     trainer_options = dict(
-#         default_root_dir=str(tmpdir),
-#         gradient_clip_val=1.0,
-#         enable_progress_bar=False,
-#         max_epochs=1,
-#         limit_train_batches=0.4,
-#         limit_val_batches=0.2,
-#         strategy="horovod",
-#     )
-#     _run_horovod(trainer_options)
+@RunIf(horovod=True, skip_windows=True)
+def test_horovod_cpu(tmpdir):
+    """Test Horovod running multi-process on CPU."""
+    trainer_options = dict(
+        default_root_dir=str(tmpdir),
+        gradient_clip_val=1.0,
+        enable_progress_bar=False,
+        max_epochs=1,
+        limit_train_batches=0.4,
+        limit_val_batches=0.2,
+        strategy="horovod",
+    )
+    _run_horovod(trainer_options)
 
 
 @RunIf(horovod=True, skip_windows=True)

From 9af84b2e9f9da00107ab820b65027197eff6f2cb Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Wed, 24 Aug 2022 15:52:27 +0200
Subject: [PATCH 06/18] add debug prints to CI

---
 tests/tests_pytorch/models/test_horovod.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py
index 6cd354ef22cfe..1c7ec83ceab23 100644
--- a/tests/tests_pytorch/models/test_horovod.py
+++ b/tests/tests_pytorch/models/test_horovod.py
@@ -55,6 +55,7 @@ def test_nccl_is_available_on_gpu_environment():
 def _run_horovod(trainer_options):
     """Execute the training script across multiple workers in parallel."""
     devices = trainer_options.get("devices", 1)
+    os.environ["NCCL_DEBUG"] = "INFO"
     tutils.reset_seed()
     # TODO: Find out why coverage breaks CI.
     # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''

From bfc62fa2cdf913cb3eefa5d43c45e24bd365c6e7 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 05:00:31 -0400
Subject: [PATCH 07/18] explicitly install latest working nccl version into
 docker

---
 dockers/base-cuda/Dockerfile               | 23 ++++++++++------------
 tests/tests_pytorch/models/test_horovod.py |  1 -
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index be613f3b6415f..5fd3ec52ec9b1 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -37,7 +37,11 @@ RUN \
     # https://github.com/NVIDIA/nvidia-docker/issues/1631
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
     apt-get update -qq --fix-missing && \
-    apt-get install -y --no-install-recommends \
+    NCCL_VER=$(apt list libnccl2 | awk -F ' ' '{print $2}' | grep -ve '^\s*$') && \
+    CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
+    MAX_ALLOWED_NCCL=2.11.4-1+cuda${CUDA_VERSION_MM} && \
+    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort | head -n1) && \
+    apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
         build-essential \
         pkg-config \
         cmake \
@@ -50,8 +54,10 @@ RUN \
         libopenmpi-dev \
         openmpi-bin \
         ssh \
-    && \
+        libnccl2=$TO_INSTALL_NCCL \
+        libnccl-dev=$TO_INSTALL_NCCL
 
+RUN \
 # Install python
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get install -y \
@@ -61,8 +67,9 @@ RUN \
     && \
 
     update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1
 
+RUN \
 # Cleaning
     apt-get autoremove -y && \
     apt-get clean && \
@@ -91,16 +98,6 @@ RUN \
     pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
     rm assistant.py
 
-RUN \
-    apt-get purge -y cmake && \
-    wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
-    tar -zxvf cmake-3.20.2.tar.gz && \
-    cd cmake-3.20.2 && \
-    ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
-    make && \
-    make install && \
-    cmake  --version
-
 ENV \
     HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
     HOROVOD_GPU_OPERATIONS=NCCL \
diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py
index 1c7ec83ceab23..6cd354ef22cfe 100644
--- a/tests/tests_pytorch/models/test_horovod.py
+++ b/tests/tests_pytorch/models/test_horovod.py
@@ -55,7 +55,6 @@ def test_nccl_is_available_on_gpu_environment():
 def _run_horovod(trainer_options):
     """Execute the training script across multiple workers in parallel."""
     devices = trainer_options.get("devices", 1)
-    os.environ["NCCL_DEBUG"] = "INFO"
     tutils.reset_seed()
     # TODO: Find out why coverage breaks CI.
     # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''

From 1f1e69ebd63587ae8991bfca4fb8123450484f79 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 05:02:34 -0400
Subject: [PATCH 08/18] PUSH TO HUB. REVERT THIS AT THE END

---
 .github/workflows/ci-pytorch-dockers.yml | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml
index 6cb28885e79ef..d79a7d18f7d7f 100644
--- a/.github/workflows/ci-pytorch-dockers.yml
+++ b/.github/workflows/ci-pytorch-dockers.yml
@@ -1,28 +1,14 @@
 name: Docker
 
 on:
-  push:
-    branches: [master, "release/*"]
-  pull_request:
-    branches: [master, "release/*"]
-    paths:
-      - "dockers/**"
-      - "!dockers/README.md"
-      - "requirements.txt"
-      - "requirements/*.txt"
-      - "requirements/pytorch/*"
-      - "environment.yml"
-      - ".github/workflows/*docker*.yml"
-      - "setup.py"
-  schedule:
-    - cron: "0 0 * * *"  # at the end of every day
+  pull_request: {}
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }}
   cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
 
 env:
-  PUSH_TO_HUB: ${{ github.event_name == 'schedule' }}
+  PUSH_TO_HUB: true
 
 jobs:
   build-pl:

From 6ddf0c613aec4af88b2a5a4354baadd0bf5ff951 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 05:11:02 -0400
Subject: [PATCH 09/18] sort by version

---
 dockers/base-cuda/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 5fd3ec52ec9b1..2e251294185a8 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -40,7 +40,7 @@ RUN \
     NCCL_VER=$(apt list libnccl2 | awk -F ' ' '{print $2}' | grep -ve '^\s*$') && \
     CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
     MAX_ALLOWED_NCCL=2.11.4-1+cuda${CUDA_VERSION_MM} && \
-    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort | head -n1) && \
+    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1) && \
     apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
         build-essential \
         pkg-config \

From d6712680c01db08dfa4955f7513716359ab64632 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 05:16:50 -0400
Subject: [PATCH 10/18] sort by version

---
 dockers/base-cuda/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 2e251294185a8..38d6dc4531c8f 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -40,7 +40,7 @@ RUN \
     NCCL_VER=$(apt list libnccl2 | awk -F ' ' '{print $2}' | grep -ve '^\s*$') && \
     CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
     MAX_ALLOWED_NCCL=2.11.4-1+cuda${CUDA_VERSION_MM} && \
-    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1) && \
+    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -t '.' -k 1,1 -k 2,2 -k 3,3 -g | head -n1) && \
     apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
         build-essential \
         pkg-config \

From c588f511594af80ec7561bfaa0da37e810efcb4e Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 05:30:16 -0400
Subject: [PATCH 11/18] get correct version from dpkg

---
 dockers/base-cuda/Dockerfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 38d6dc4531c8f..511295d8a1fa3 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -37,10 +37,10 @@ RUN \
     # https://github.com/NVIDIA/nvidia-docker/issues/1631
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
     apt-get update -qq --fix-missing && \
-    NCCL_VER=$(apt list libnccl2 | awk -F ' ' '{print $2}' | grep -ve '^\s*$') && \
+    NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
     CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
-    MAX_ALLOWED_NCCL=2.11.4-1+cuda${CUDA_VERSION_MM} && \
-    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -t '.' -k 1,1 -k 2,2 -k 3,3 -g | head -n1) && \
+    MAX_ALLOWED_NCCL=2.11.4 && \
+    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V  | head -n1)-1+cuda${CUDA_VERSION_MM} && \
     apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
         build-essential \
         pkg-config \

From 41f0e1a2fe2d31f118eb4003ee4ed7abaec6549b Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 07:37:53 -0400
Subject: [PATCH 12/18] .


From e1f81f631d8509b7b296f8d746defcdcfa411893 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 08:19:52 -0400
Subject: [PATCH 13/18] apply suggestions + port NCCL  trick to conda images

---
 .azure/gpu-tests.yml          |  1 -
 dockers/base-conda/Dockerfile | 14 ++++++++------
 dockers/base-cuda/Dockerfile  | 10 ++--------
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index b1fb63c4b9f51..ef65c76e823aa 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -80,7 +80,6 @@ jobs:
         python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
         pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0"
         pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
-        pip install -U deepspeed  # TODO: remove when docker images are upgraded
         pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
         pip list
       env:
diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile
index d6bfeee90d561..9bb75e34b8ff6 100644
--- a/dockers/base-conda/Dockerfile
+++ b/dockers/base-conda/Dockerfile
@@ -34,6 +34,10 @@ RUN \
     # https://github.com/NVIDIA/nvidia-docker/issues/1631
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
     apt-get update -qq --fix-missing && \
+    NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
+    CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
+    MAX_ALLOWED_NCCL=2.11.4 && \
+    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V  | head -n1)-1+cuda${CUDA_VERSION_MM} && \
     apt-get install -y --no-install-recommends \
         build-essential \
         cmake \
@@ -42,17 +46,15 @@ RUN \
         curl \
         unzip \
         ca-certificates \
-        libopenmpi-dev
-
-RUN \
+        libopenmpi-dev \
+        libnccl2=$TO_INSTALL_NCCL \
+        libnccl-dev=$TO_INSTALL_NCCL && \
 # Install conda and python.
 # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
     curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
     chmod +x ~/miniconda.sh && \
     ~/miniconda.sh -b && \
-    rm ~/miniconda.sh
-
-RUN \
+    rm ~/miniconda.sh && \
 # Cleaning
     apt-get autoremove -y && \
     apt-get clean && \
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 511295d8a1fa3..08692ff00ab78 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -55,9 +55,7 @@ RUN \
         openmpi-bin \
         ssh \
         libnccl2=$TO_INSTALL_NCCL \
-        libnccl-dev=$TO_INSTALL_NCCL
-
-RUN \
+        libnccl-dev=$TO_INSTALL_NCCL && \
 # Install python
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get install -y \
@@ -65,11 +63,8 @@ RUN \
         python${PYTHON_VERSION}-distutils \
         python${PYTHON_VERSION}-dev \
     && \
-
     update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1
-
-RUN \
+    update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \
 # Cleaning
     apt-get autoremove -y && \
     apt-get clean && \
@@ -85,7 +80,6 @@ RUN \
     wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
     python${PYTHON_VERSION} get-pip.py && \
     rm get-pip.py && \
-
     pip install -q fire && \
     # Disable cache \
     CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \

From e32e0f52e261d48575e75a5d2c0085a92c542311 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 09:24:13 -0400
Subject: [PATCH 14/18] Revert "PUSH TO HUB. REVERT THIS AT THE END"

This reverts commit 1f1e69ebd63587ae8991bfca4fb8123450484f79.
---
 .github/workflows/ci-pytorch-dockers.yml | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml
index d79a7d18f7d7f..6cb28885e79ef 100644
--- a/.github/workflows/ci-pytorch-dockers.yml
+++ b/.github/workflows/ci-pytorch-dockers.yml
@@ -1,14 +1,28 @@
 name: Docker
 
 on:
-  pull_request: {}
+  push:
+    branches: [master, "release/*"]
+  pull_request:
+    branches: [master, "release/*"]
+    paths:
+      - "dockers/**"
+      - "!dockers/README.md"
+      - "requirements.txt"
+      - "requirements/*.txt"
+      - "requirements/pytorch/*"
+      - "environment.yml"
+      - ".github/workflows/*docker*.yml"
+      - "setup.py"
+  schedule:
+    - cron: "0 0 * * *"  # at the end of every day
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }}
   cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
 
 env:
-  PUSH_TO_HUB: true
+  PUSH_TO_HUB: ${{ github.event_name == 'schedule' }}
 
 jobs:
   build-pl:

From c3e2fd1fc3f3396231ac151a0ae2d38c95a59665 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 09:43:25 -0400
Subject: [PATCH 15/18] .


From f8649c6f52fd0eeee170fafbf08848b5855b01b1 Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 17:17:06 +0200
Subject: [PATCH 16/18] remove test to trigger GPU CI. revert this later

---
 tests/tests_pytorch/models/test_amp.py | 56 +++++++++++++-------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/tests/tests_pytorch/models/test_amp.py b/tests/tests_pytorch/models/test_amp.py
index 786de99f59714..a569d15d8787d 100644
--- a/tests/tests_pytorch/models/test_amp.py
+++ b/tests/tests_pytorch/models/test_amp.py
@@ -67,34 +67,34 @@ def _assert_autocast_enabled(self):
             assert torch.is_autocast_enabled()
 
 
-@RunIf(min_torch="1.10")
-@pytest.mark.parametrize(
-    "strategy",
-    [
-        None,
-        pytest.param("dp", marks=pytest.mark.skip("dp + amp not supported on CPU currently")),  # TODO
-        "ddp_spawn",
-    ],
-)
-@pytest.mark.parametrize("precision", [16, "bf16"])
-@pytest.mark.parametrize("devices", [1, 2])
-def test_amp_cpus(tmpdir, strategy, precision, devices):
-    """Make sure combinations of AMP and strategies work if supported."""
-    tutils.reset_seed()
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        accelerator="cpu",
-        devices=devices,
-        max_epochs=1,
-        strategy=strategy,
-        precision=precision,
-    )
-
-    model = AMPTestModel()
-    trainer.fit(model)
-    trainer.test(model)
-    trainer.predict(model, DataLoader(RandomDataset(32, 64)))
+# @RunIf(min_torch="1.10")
+# @pytest.mark.parametrize(
+#     "strategy",
+#     [
+#         None,
+#         pytest.param("dp", marks=pytest.mark.skip("dp + amp not supported on CPU currently")),  # TODO
+#         "ddp_spawn",
+#     ],
+# )
+# @pytest.mark.parametrize("precision", [16, "bf16"])
+# @pytest.mark.parametrize("devices", [1, 2])
+# def test_amp_cpus(tmpdir, strategy, precision, devices):
+#     """Make sure combinations of AMP and strategies work if supported."""
+#     tutils.reset_seed()
+
+#     trainer = Trainer(
+#         default_root_dir=tmpdir,
+#         accelerator="cpu",
+#         devices=devices,
+#         max_epochs=1,
+#         strategy=strategy,
+#         precision=precision,
+#     )
+
+#     model = AMPTestModel()
+#     trainer.fit(model)
+#     trainer.test(model)
+#     trainer.predict(model, DataLoader(RandomDataset(32, 64)))
 
 
 @RunIf(min_cuda_gpus=2, min_torch="1.10")

From a634f219c9e02632e072c1ab2de08ccf22fd8fbf Mon Sep 17 00:00:00 2001
From: otaj <ota@lightning.ai>
Date: Thu, 25 Aug 2022 17:25:01 +0200
Subject: [PATCH 17/18] Revert "remove test to trigger GPU CI. revert this
 later"

This reverts commit f8649c6f52fd0eeee170fafbf08848b5855b01b1.
---
 tests/tests_pytorch/models/test_amp.py | 56 +++++++++++++-------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/tests/tests_pytorch/models/test_amp.py b/tests/tests_pytorch/models/test_amp.py
index a569d15d8787d..786de99f59714 100644
--- a/tests/tests_pytorch/models/test_amp.py
+++ b/tests/tests_pytorch/models/test_amp.py
@@ -67,34 +67,34 @@ def _assert_autocast_enabled(self):
             assert torch.is_autocast_enabled()
 
 
-# @RunIf(min_torch="1.10")
-# @pytest.mark.parametrize(
-#     "strategy",
-#     [
-#         None,
-#         pytest.param("dp", marks=pytest.mark.skip("dp + amp not supported on CPU currently")),  # TODO
-#         "ddp_spawn",
-#     ],
-# )
-# @pytest.mark.parametrize("precision", [16, "bf16"])
-# @pytest.mark.parametrize("devices", [1, 2])
-# def test_amp_cpus(tmpdir, strategy, precision, devices):
-#     """Make sure combinations of AMP and strategies work if supported."""
-#     tutils.reset_seed()
-
-#     trainer = Trainer(
-#         default_root_dir=tmpdir,
-#         accelerator="cpu",
-#         devices=devices,
-#         max_epochs=1,
-#         strategy=strategy,
-#         precision=precision,
-#     )
-
-#     model = AMPTestModel()
-#     trainer.fit(model)
-#     trainer.test(model)
-#     trainer.predict(model, DataLoader(RandomDataset(32, 64)))
+@RunIf(min_torch="1.10")
+@pytest.mark.parametrize(
+    "strategy",
+    [
+        None,
+        pytest.param("dp", marks=pytest.mark.skip("dp + amp not supported on CPU currently")),  # TODO
+        "ddp_spawn",
+    ],
+)
+@pytest.mark.parametrize("precision", [16, "bf16"])
+@pytest.mark.parametrize("devices", [1, 2])
+def test_amp_cpus(tmpdir, strategy, precision, devices):
+    """Make sure combinations of AMP and strategies work if supported."""
+    tutils.reset_seed()
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        accelerator="cpu",
+        devices=devices,
+        max_epochs=1,
+        strategy=strategy,
+        precision=precision,
+    )
+
+    model = AMPTestModel()
+    trainer.fit(model)
+    trainer.test(model)
+    trainer.predict(model, DataLoader(RandomDataset(32, 64)))
 
 
 @RunIf(min_cuda_gpus=2, min_torch="1.10")

From 9e0849514811c12efab04c0b3bbab8e798ec065a Mon Sep 17 00:00:00 2001
From: Jirka <jirka.borovec@seznam.cz>
Date: Thu, 25 Aug 2022 18:41:49 +0200
Subject: [PATCH 18/18] azure

---
 .azure/gpu-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index ef65c76e823aa..2da30c0dd66ab 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -44,7 +44,7 @@ jobs:
 
     - bash: |
         CHANGED_FILES=$(git diff --name-status origin/master -- . | awk  '{print $2}')
-        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
+        FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
         echo $CHANGED_FILES > changed_files.txt
         MATCHES=$(cat changed_files.txt | grep -E $FILTER)
         echo $MATCHES