From 545b76e108c24a37d7970f6324eb1d47b61ff720 Mon Sep 17 00:00:00 2001 From: otaj Date: Wed, 24 Aug 2022 15:03:11 +0200 Subject: [PATCH 01/18] reinstall things in the same manner as in docker --- .azure/gpu-tests.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f19c5bafc7814..b1fb63c4b9f51 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -72,12 +72,16 @@ jobs: set -e python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" + TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0" - pip install -e .[strategies] + pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html pip install -U deepspeed # TODO: remove when docker images are upgraded - pip install --requirement requirements/pytorch/devel.txt + pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html pip list env: PACKAGE_NAME: pytorch From 958b0971b0e0fdd46167c9f6d1a49e86da658695 Mon Sep 17 00:00:00 2001 From: otaj Date: Wed, 24 Aug 2022 15:09:53 +0200 Subject: [PATCH 02/18] remove test to trigger GPU CI. revert this later --- tests/tests_pytorch/models/test_horovod.py | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py index 6cd354ef22cfe..7fb0a7493e7ed 100644 --- a/tests/tests_pytorch/models/test_horovod.py +++ b/tests/tests_pytorch/models/test_horovod.py @@ -76,19 +76,19 @@ def _run_horovod(trainer_options): assert exit_code == 0 -@RunIf(horovod=True, skip_windows=True) -def test_horovod_cpu(tmpdir): - """Test Horovod running multi-process on CPU.""" - trainer_options = dict( - default_root_dir=str(tmpdir), - gradient_clip_val=1.0, - enable_progress_bar=False, - max_epochs=1, - limit_train_batches=0.4, - limit_val_batches=0.2, - strategy="horovod", - ) - _run_horovod(trainer_options) +# @RunIf(horovod=True, skip_windows=True) +# def test_horovod_cpu(tmpdir): +# """Test Horovod running multi-process on CPU.""" +# trainer_options = dict( +# default_root_dir=str(tmpdir), +# gradient_clip_val=1.0, +# enable_progress_bar=False, +# max_epochs=1, +# limit_train_batches=0.4, +# limit_val_batches=0.2, +# strategy="horovod", +# ) +# _run_horovod(trainer_options) @RunIf(horovod=True, skip_windows=True) From 40146ce357a85ecf55f17c9c5755def2b8158ec5 Mon Sep 17 00:00:00 2001 From: otaj Date: Wed, 24 Aug 2022 15:19:08 +0200 Subject: [PATCH 03/18] . From 97708a824d67116ecbd30032716bdcea197bf50c Mon Sep 17 00:00:00 2001 From: otaj Date: Wed, 24 Aug 2022 15:31:43 +0200 Subject: [PATCH 04/18] . From 052fe4ca70529dd6859737fe54ac0e49ec328fb6 Mon Sep 17 00:00:00 2001 From: otaj Date: Wed, 24 Aug 2022 15:50:19 +0200 Subject: [PATCH 05/18] Revert "remove test to trigger GPU CI. revert this later" This reverts commit 958b0971b0e0fdd46167c9f6d1a49e86da658695. --- tests/tests_pytorch/models/test_horovod.py | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py index 7fb0a7493e7ed..6cd354ef22cfe 100644 --- a/tests/tests_pytorch/models/test_horovod.py +++ b/tests/tests_pytorch/models/test_horovod.py @@ -76,19 +76,19 @@ def _run_horovod(trainer_options): assert exit_code == 0 -# @RunIf(horovod=True, skip_windows=True) -# def test_horovod_cpu(tmpdir): -# """Test Horovod running multi-process on CPU.""" -# trainer_options = dict( -# default_root_dir=str(tmpdir), -# gradient_clip_val=1.0, -# enable_progress_bar=False, -# max_epochs=1, -# limit_train_batches=0.4, -# limit_val_batches=0.2, -# strategy="horovod", -# ) -# _run_horovod(trainer_options) +@RunIf(horovod=True, skip_windows=True) +def test_horovod_cpu(tmpdir): + """Test Horovod running multi-process on CPU.""" + trainer_options = dict( + default_root_dir=str(tmpdir), + gradient_clip_val=1.0, + enable_progress_bar=False, + max_epochs=1, + limit_train_batches=0.4, + limit_val_batches=0.2, + strategy="horovod", + ) + _run_horovod(trainer_options) @RunIf(horovod=True, skip_windows=True) From 9af84b2e9f9da00107ab820b65027197eff6f2cb Mon Sep 17 00:00:00 2001 From: otaj Date: Wed, 24 Aug 2022 15:52:27 +0200 Subject: [PATCH 06/18] add debug prints to CI --- tests/tests_pytorch/models/test_horovod.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py index 6cd354ef22cfe..1c7ec83ceab23 100644 --- a/tests/tests_pytorch/models/test_horovod.py +++ b/tests/tests_pytorch/models/test_horovod.py @@ -55,6 +55,7 @@ def test_nccl_is_available_on_gpu_environment(): def _run_horovod(trainer_options): """Execute the training script across multiple workers in parallel.""" devices = trainer_options.get("devices", 1) + os.environ["NCCL_DEBUG"] = "INFO" tutils.reset_seed() # TODO: Find out why coverage breaks CI. # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' From bfc62fa2cdf913cb3eefa5d43c45e24bd365c6e7 Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 05:00:31 -0400 Subject: [PATCH 07/18] explicitly install latest working nccl version into docker --- dockers/base-cuda/Dockerfile | 23 ++++++++++------------ tests/tests_pytorch/models/test_horovod.py | 1 - 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index be613f3b6415f..5fd3ec52ec9b1 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -37,7 +37,11 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ - apt-get install -y --no-install-recommends \ + NCCL_VER=$(apt list libnccl2 | awk -F ' ' '{print $2}' | grep -ve '^\s*$') && \ + CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ + MAX_ALLOWED_NCCL=2.11.4-1+cuda${CUDA_VERSION_MM} && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort | head -n1) && \ + apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ cmake \ @@ -50,8 +54,10 @@ RUN \ libopenmpi-dev \ openmpi-bin \ ssh \ - && \ + libnccl2=$TO_INSTALL_NCCL \ + libnccl-dev=$TO_INSTALL_NCCL +RUN \ # Install python add-apt-repository ppa:deadsnakes/ppa && \ apt-get install -y \ @@ -61,8 +67,9 @@ RUN \ && \ update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \ - update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 +RUN \ # Cleaning apt-get autoremove -y && \ apt-get clean && \ @@ -91,16 +98,6 @@ RUN \ pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \ rm assistant.py -RUN \ - apt-get purge -y cmake && \ - wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \ - tar -zxvf cmake-3.20.2.tar.gz && \ - cd cmake-3.20.2 && \ - ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \ - make && \ - make install && \ - cmake --version - ENV \ HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ HOROVOD_GPU_OPERATIONS=NCCL \ diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py index 1c7ec83ceab23..6cd354ef22cfe 100644 --- a/tests/tests_pytorch/models/test_horovod.py +++ b/tests/tests_pytorch/models/test_horovod.py @@ -55,7 +55,6 @@ def test_nccl_is_available_on_gpu_environment(): def _run_horovod(trainer_options): """Execute the training script across multiple workers in parallel.""" devices = trainer_options.get("devices", 1) - os.environ["NCCL_DEBUG"] = "INFO" tutils.reset_seed() # TODO: Find out why coverage breaks CI. # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' From 1f1e69ebd63587ae8991bfca4fb8123450484f79 Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 05:02:34 -0400 Subject: [PATCH 08/18] PUSH TO HUB. REVERT THIS AT THE END --- .github/workflows/ci-pytorch-dockers.yml | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 6cb28885e79ef..d79a7d18f7d7f 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -1,28 +1,14 @@ name: Docker on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - paths: - - "dockers/**" - - "!dockers/README.md" - - "requirements.txt" - - "requirements/*.txt" - - "requirements/pytorch/*" - - "environment.yml" - - ".github/workflows/*docker*.yml" - - "setup.py" - schedule: - - cron: "0 0 * * *" # at the end of every day + pull_request: {} concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }} cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} env: - PUSH_TO_HUB: ${{ github.event_name == 'schedule' }} + PUSH_TO_HUB: true jobs: build-pl: From 6ddf0c613aec4af88b2a5a4354baadd0bf5ff951 Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 05:11:02 -0400 Subject: [PATCH 09/18] sort by version --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 5fd3ec52ec9b1..2e251294185a8 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -40,7 +40,7 @@ RUN \ NCCL_VER=$(apt list libnccl2 | awk -F ' ' '{print $2}' | grep -ve '^\s*$') && \ CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ MAX_ALLOWED_NCCL=2.11.4-1+cuda${CUDA_VERSION_MM} && \ - TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort | head -n1) && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1) && \ apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ From d6712680c01db08dfa4955f7513716359ab64632 Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 05:16:50 -0400 Subject: [PATCH 10/18] sort by version --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 2e251294185a8..38d6dc4531c8f 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -40,7 +40,7 @@ RUN \ NCCL_VER=$(apt list libnccl2 | awk -F ' ' '{print $2}' | grep -ve '^\s*$') && \ CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ MAX_ALLOWED_NCCL=2.11.4-1+cuda${CUDA_VERSION_MM} && \ - TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1) && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -t '.' -k 1,1 -k 2,2 -k 3,3 -g | head -n1) && \ apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ From c588f511594af80ec7561bfaa0da37e810efcb4e Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 05:30:16 -0400 Subject: [PATCH 11/18] get correct version from dpkg --- dockers/base-cuda/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 38d6dc4531c8f..511295d8a1fa3 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -37,10 +37,10 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ - NCCL_VER=$(apt list libnccl2 | awk -F ' ' '{print $2}' | grep -ve '^\s*$') && \ + NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ - MAX_ALLOWED_NCCL=2.11.4-1+cuda${CUDA_VERSION_MM} && \ - TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -t '.' -k 1,1 -k 2,2 -k 3,3 -g | head -n1) && \ + MAX_ALLOWED_NCCL=2.11.4 && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ From 41f0e1a2fe2d31f118eb4003ee4ed7abaec6549b Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 07:37:53 -0400 Subject: [PATCH 12/18] . From e1f81f631d8509b7b296f8d746defcdcfa411893 Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 08:19:52 -0400 Subject: [PATCH 13/18] apply suggestions + port NCCL trick to conda images --- .azure/gpu-tests.yml | 1 - dockers/base-conda/Dockerfile | 14 ++++++++------ dockers/base-cuda/Dockerfile | 10 ++-------- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index b1fb63c4b9f51..ef65c76e823aa 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -80,7 +80,6 @@ jobs: python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0" pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html - pip install -U deepspeed # TODO: remove when docker images are upgraded pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html pip list env: diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index d6bfeee90d561..9bb75e34b8ff6 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -34,6 +34,10 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ + NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ + CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ + MAX_ALLOWED_NCCL=2.11.4 && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ apt-get install -y --no-install-recommends \ build-essential \ cmake \ @@ -42,17 +46,15 @@ RUN \ curl \ unzip \ ca-certificates \ - libopenmpi-dev - -RUN \ + libopenmpi-dev \ + libnccl2=$TO_INSTALL_NCCL \ + libnccl-dev=$TO_INSTALL_NCCL && \ # Install conda and python. # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ - rm ~/miniconda.sh - -RUN \ + rm ~/miniconda.sh && \ # Cleaning apt-get autoremove -y && \ apt-get clean && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 511295d8a1fa3..08692ff00ab78 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -55,9 +55,7 @@ RUN \ openmpi-bin \ ssh \ libnccl2=$TO_INSTALL_NCCL \ - libnccl-dev=$TO_INSTALL_NCCL - -RUN \ + libnccl-dev=$TO_INSTALL_NCCL && \ # Install python add-apt-repository ppa:deadsnakes/ppa && \ apt-get install -y \ @@ -65,11 +63,8 @@ RUN \ python${PYTHON_VERSION}-distutils \ python${PYTHON_VERSION}-dev \ && \ - update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \ - update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 - -RUN \ + update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \ # Cleaning apt-get autoremove -y && \ apt-get clean && \ @@ -85,7 +80,6 @@ RUN \ wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ python${PYTHON_VERSION} get-pip.py && \ rm get-pip.py && \ - pip install -q fire && \ # Disable cache \ CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ From e32e0f52e261d48575e75a5d2c0085a92c542311 Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 09:24:13 -0400 Subject: [PATCH 14/18] Revert "PUSH TO HUB. REVERT THIS AT THE END" This reverts commit 1f1e69ebd63587ae8991bfca4fb8123450484f79. --- .github/workflows/ci-pytorch-dockers.yml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index d79a7d18f7d7f..6cb28885e79ef 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -1,14 +1,28 @@ name: Docker on: - pull_request: {} + push: + branches: [master, "release/*"] + pull_request: + branches: [master, "release/*"] + paths: + - "dockers/**" + - "!dockers/README.md" + - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" + - "environment.yml" + - ".github/workflows/*docker*.yml" + - "setup.py" + schedule: + - cron: "0 0 * * *" # at the end of every day concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }} cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} env: - PUSH_TO_HUB: true + PUSH_TO_HUB: ${{ github.event_name == 'schedule' }} jobs: build-pl: From c3e2fd1fc3f3396231ac151a0ae2d38c95a59665 Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 09:43:25 -0400 Subject: [PATCH 15/18] . From f8649c6f52fd0eeee170fafbf08848b5855b01b1 Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 17:17:06 +0200 Subject: [PATCH 16/18] remove test to trigger GPU CI. revert this later --- tests/tests_pytorch/models/test_amp.py | 56 +++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/tests/tests_pytorch/models/test_amp.py b/tests/tests_pytorch/models/test_amp.py index 786de99f59714..a569d15d8787d 100644 --- a/tests/tests_pytorch/models/test_amp.py +++ b/tests/tests_pytorch/models/test_amp.py @@ -67,34 +67,34 @@ def _assert_autocast_enabled(self): assert torch.is_autocast_enabled() -@RunIf(min_torch="1.10") -@pytest.mark.parametrize( - "strategy", - [ - None, - pytest.param("dp", marks=pytest.mark.skip("dp + amp not supported on CPU currently")), # TODO - "ddp_spawn", - ], -) -@pytest.mark.parametrize("precision", [16, "bf16"]) -@pytest.mark.parametrize("devices", [1, 2]) -def test_amp_cpus(tmpdir, strategy, precision, devices): - """Make sure combinations of AMP and strategies work if supported.""" - tutils.reset_seed() - - trainer = Trainer( - default_root_dir=tmpdir, - accelerator="cpu", - devices=devices, - max_epochs=1, - strategy=strategy, - precision=precision, - ) - - model = AMPTestModel() - trainer.fit(model) - trainer.test(model) - trainer.predict(model, DataLoader(RandomDataset(32, 64))) +# @RunIf(min_torch="1.10") +# @pytest.mark.parametrize( +# "strategy", +# [ +# None, +# pytest.param("dp", marks=pytest.mark.skip("dp + amp not supported on CPU currently")), # TODO +# "ddp_spawn", +# ], +# ) +# @pytest.mark.parametrize("precision", [16, "bf16"]) +# @pytest.mark.parametrize("devices", [1, 2]) +# def test_amp_cpus(tmpdir, strategy, precision, devices): +# """Make sure combinations of AMP and strategies work if supported.""" +# tutils.reset_seed() + +# trainer = Trainer( +# default_root_dir=tmpdir, +# accelerator="cpu", +# devices=devices, +# max_epochs=1, +# strategy=strategy, +# precision=precision, +# ) + +# model = AMPTestModel() +# trainer.fit(model) +# trainer.test(model) +# trainer.predict(model, DataLoader(RandomDataset(32, 64))) @RunIf(min_cuda_gpus=2, min_torch="1.10") From a634f219c9e02632e072c1ab2de08ccf22fd8fbf Mon Sep 17 00:00:00 2001 From: otaj Date: Thu, 25 Aug 2022 17:25:01 +0200 Subject: [PATCH 17/18] Revert "remove test to trigger GPU CI. revert this later" This reverts commit f8649c6f52fd0eeee170fafbf08848b5855b01b1. --- tests/tests_pytorch/models/test_amp.py | 56 +++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/tests/tests_pytorch/models/test_amp.py b/tests/tests_pytorch/models/test_amp.py index a569d15d8787d..786de99f59714 100644 --- a/tests/tests_pytorch/models/test_amp.py +++ b/tests/tests_pytorch/models/test_amp.py @@ -67,34 +67,34 @@ def _assert_autocast_enabled(self): assert torch.is_autocast_enabled() -# @RunIf(min_torch="1.10") -# @pytest.mark.parametrize( -# "strategy", -# [ -# None, -# pytest.param("dp", marks=pytest.mark.skip("dp + amp not supported on CPU currently")), # TODO -# "ddp_spawn", -# ], -# ) -# @pytest.mark.parametrize("precision", [16, "bf16"]) -# @pytest.mark.parametrize("devices", [1, 2]) -# def test_amp_cpus(tmpdir, strategy, precision, devices): -# """Make sure combinations of AMP and strategies work if supported.""" -# tutils.reset_seed() - -# trainer = Trainer( -# default_root_dir=tmpdir, -# accelerator="cpu", -# devices=devices, -# max_epochs=1, -# strategy=strategy, -# precision=precision, -# ) - -# model = AMPTestModel() -# trainer.fit(model) -# trainer.test(model) -# trainer.predict(model, DataLoader(RandomDataset(32, 64))) +@RunIf(min_torch="1.10") +@pytest.mark.parametrize( + "strategy", + [ + None, + pytest.param("dp", marks=pytest.mark.skip("dp + amp not supported on CPU currently")), # TODO + "ddp_spawn", + ], +) +@pytest.mark.parametrize("precision", [16, "bf16"]) +@pytest.mark.parametrize("devices", [1, 2]) +def test_amp_cpus(tmpdir, strategy, precision, devices): + """Make sure combinations of AMP and strategies work if supported.""" + tutils.reset_seed() + + trainer = Trainer( + default_root_dir=tmpdir, + accelerator="cpu", + devices=devices, + max_epochs=1, + strategy=strategy, + precision=precision, + ) + + model = AMPTestModel() + trainer.fit(model) + trainer.test(model) + trainer.predict(model, DataLoader(RandomDataset(32, 64))) @RunIf(min_cuda_gpus=2, min_torch="1.10") From 9e0849514811c12efab04c0b3bbab8e798ec065a Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 25 Aug 2022 18:41:49 +0200 Subject: [PATCH 18/18] azure --- .azure/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index ef65c76e823aa..2da30c0dd66ab 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES