From 49ed2b102befd4c9876e2d14badeb1251af5b2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 29 Apr 2024 13:16:13 +0200 Subject: [PATCH] Add PyTorch 2.3 to CI matrix (#19708) --- .azure/gpu-benchmarks.yml | 3 +-- .azure/gpu-tests-fabric.yml | 5 ++--- .azure/gpu-tests-pytorch.yml | 5 ++--- .github/checkgroup.yml | 6 ++++++ .github/workflows/ci-tests-fabric.yml | 7 ++++++- .github/workflows/ci-tests-pytorch.yml | 7 ++++++- .github/workflows/docker-build.yml | 1 + dockers/base-cuda/Dockerfile | 6 ------ requirements/fabric/base.txt | 2 +- requirements/fabric/examples.txt | 2 +- requirements/pytorch/base.txt | 2 +- requirements/pytorch/examples.txt | 3 +-- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/fabric/accelerators/mps.py | 4 +++- src/lightning/pytorch/CHANGELOG.md | 3 ++- tests/tests_fabric/plugins/precision/test_fsdp.py | 4 +++- tests/tests_pytorch/callbacks/test_finetuning_callback.py | 5 +++++ tests/tests_pytorch/conftest.py | 1 + tests/tests_pytorch/plugins/precision/test_fsdp.py | 4 +++- 19 files changed, 46 insertions(+), 26 deletions(-) diff --git a/.azure/gpu-benchmarks.yml b/.azure/gpu-benchmarks.yml index bae7babf69266..d869084fb351b 100644 --- a/.azure/gpu-benchmarks.yml +++ b/.azure/gpu-benchmarks.yml @@ -46,8 +46,7 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: - # TODO: Upgrade to Python 3.11 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" options: "--gpus=all --shm-size=32g" strategy: matrix: diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ba86449e92355..1a854604606aa 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -56,12 +56,11 @@ jobs: options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp" strategy: matrix: - # TODO: Upgrade to Python 3.11 "Fabric | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "fabric" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "lightning" workspace: clean: all diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b9ab6ead7f0d1..156513d604210 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -48,12 +48,11 @@ jobs: cancelTimeoutInMinutes: "2" strategy: matrix: - # TODO: Upgrade to Python 3.11 "PyTorch | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "pytorch" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "lightning" pool: lit-rtx-3090 variables: diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 37f1e3cd844d2..0c5e7e4579ccb 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -23,14 +23,17 @@ subprojects: - "pl-cpu (macOS-11, lightning, 3.10, 2.0)" - "pl-cpu (macOS-11, lightning, 3.10, 2.1)" - "pl-cpu (macOS-11, lightning, 3.10, 2.2)" + - "pl-cpu (macOS-14, lightning, 3.10, 2.3)" - "pl-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.2)" + - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.3)" - "pl-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" - "pl-cpu (windows-2022, lightning, 3.10, 2.0)" - "pl-cpu (windows-2022, lightning, 3.10, 2.1)" - "pl-cpu (windows-2022, lightning, 3.10, 2.2)" + - "pl-cpu (windows-2022, lightning, 3.10, 2.3)" - "pl-cpu (macOS-11, pytorch, 3.8, 2.0)" - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 2.0)" - "pl-cpu (windows-2022, pytorch, 3.8, 2.0)" @@ -171,14 +174,17 @@ subprojects: - "fabric-cpu (macOS-11, lightning, 3.10, 2.0)" - "fabric-cpu (macOS-11, lightning, 3.11, 2.1)" - "fabric-cpu (macOS-11, lightning, 3.11, 2.2)" + - "fabric-cpu (macOS-14, lightning, 3.10, 2.3)" - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.1)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2)" + - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.3)" - "fabric-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" - "fabric-cpu (windows-2022, lightning, 3.10, 2.0)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.2)" + - "fabric-cpu (windows-2022, lightning, 3.11, 2.3)" - "fabric-cpu (macOS-11, fabric, 3.8, 2.0)" - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 2.0)" - "fabric-cpu (windows-2022, fabric, 3.8, 2.0)" diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 61c60889a5aa0..082636a617227 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -49,6 +49,9 @@ jobs: - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues - { os: "macOS-12", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" } - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" } @@ -83,6 +86,8 @@ jobs: PYPI_CACHE_DIR: "_pip-wheels" TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html" TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch_test.html" + # TODO: Remove this - Enable running MPS tests on this platform + DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: - uses: actions/checkout@v4 @@ -119,7 +124,7 @@ jobs: - name: Env. variables run: | # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.2' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.3' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index b7f5b14baf255..b0b10ef4acea8 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -53,6 +53,9 @@ jobs: - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues - { os: "macOS-12", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" } - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" } @@ -88,6 +91,8 @@ jobs: TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch_test.html" FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} PYPI_CACHE_DIR: "_pip-wheels" + # TODO: Remove this - Enable running MPS tests on this platform + DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: - uses: actions/checkout@v4 @@ -125,7 +130,7 @@ jobs: - name: Env. variables run: | # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.2' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.3' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 7ea9f824bb6b1..73c6e7496f9fa 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -109,6 +109,7 @@ jobs: - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" } # - { python_version: "3.12", pytorch_version: "2.2", cuda_version: "12.1.0" } # todo: pending on `onnxruntime` steps: - uses: actions/checkout@v4 diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index d5b72768148ed..b8c29d01b0f77 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -34,13 +34,7 @@ ENV \ MAKEFLAGS="-j2" RUN \ - # TODO: Remove the manual key installation once the base image is updated. - # https://github.com/NVIDIA/nvidia-docker/issues/1631 - # https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214 apt-get update && apt-get install -y wget && \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - mkdir -p /etc/apt/keyrings/ && mv 3bf863cc.pub /etc/apt/keyrings/ && \ - echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" /etc/apt/sources.list.d/cuda.list && \ apt-get update -qq --fix-missing && \ NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ CUDA_VERSION_MM=${CUDA_VERSION%.*} && \ diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 3a6cdbacd302f..4cac78e19bf23 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.17.2, <1.27.0 -torch >=2.0.0, <2.3.0 +torch >=2.0.0, <2.4.0 fsspec[http] >=2022.5.0, <2023.11.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt index d0be7e3af8496..0e2feb97eccc4 100644 --- a/requirements/fabric/examples.txt +++ b/requirements/fabric/examples.txt @@ -1,6 +1,6 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torchvision >=0.15.0, <0.18.0 +torchvision >=0.15.0, <0.19.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 3578917e2cdf0..9af0b13c15ccd 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.17.2, <1.27.0 -torch >=2.0.0, <2.3.0 +torch >=2.0.0, <2.4.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 fsspec[http] >=2022.5.0, <2023.11.0 diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 716b033def533..55b85025bddb2 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -2,8 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment requests <2.32.0 -torchvision >=0.15.0, <0.18.0 -gym[classic_control] >=0.17.0, <0.27.0 +torchvision >=0.15.0, <0.19.0 ipython[all] <8.15.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 154433a1c101d..15e8ba16b6c72 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -11,7 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enabled consolidating distributed checkpoints through `fabric consolidate` in the new CLI [#19560](https://github.com/Lightning-AI/pytorch-lightning/pull/19560)) -- +- Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) - diff --git a/src/lightning/fabric/accelerators/mps.py b/src/lightning/fabric/accelerators/mps.py index d0f36698616d4..75497169cda0f 100644 --- a/src/lightning/fabric/accelerators/mps.py +++ b/src/lightning/fabric/accelerators/mps.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import platform from functools import lru_cache from typing import List, Optional, Union @@ -70,7 +71,8 @@ def auto_device_count() -> int: @lru_cache(1) def is_available() -> bool: """MPS is only available on a machine with the ARM-based Apple Silicon processors.""" - return torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64") + mps_disabled = os.getenv("DISABLE_MPS", "0") == "1" + return not mps_disabled and torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64") @classmethod @override diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index e4ae5a29c336c..11c9238a6d409 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -16,7 +16,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `on_exception` hook to `LightningDataModule` ([#19601](https://github.com/Lightning-AI/pytorch-lightning/pull/19601)) -- +- Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) + ### Changed diff --git a/tests/tests_fabric/plugins/precision/test_fsdp.py b/tests/tests_fabric/plugins/precision/test_fsdp.py index 148292dcd48df..e42df493dd725 100644 --- a/tests/tests_fabric/plugins/precision/test_fsdp.py +++ b/tests/tests_fabric/plugins/precision/test_fsdp.py @@ -58,8 +58,10 @@ def test_fsdp_precision_scaler_with_bf16(): @RunIf(min_cuda_gpus=1) def test_fsdp_precision_forward_context(): """Test to ensure that the context manager correctly is set to bfloat16.""" + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler + precision = FSDPPrecision(precision="16-mixed") - assert isinstance(precision.scaler, torch.cuda.amp.GradScaler) + assert isinstance(precision.scaler, ShardedGradScaler) assert torch.get_default_dtype() == torch.float32 with precision.forward_context(): assert torch.get_autocast_gpu_dtype() == torch.float16 diff --git a/tests/tests_pytorch/callbacks/test_finetuning_callback.py b/tests/tests_pytorch/callbacks/test_finetuning_callback.py index 56d46a62048cd..0c09ae5d5042a 100644 --- a/tests/tests_pytorch/callbacks/test_finetuning_callback.py +++ b/tests/tests_pytorch/callbacks/test_finetuning_callback.py @@ -15,6 +15,7 @@ import pytest import torch +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.pytorch import LightningModule, Trainer, seed_everything from lightning.pytorch.callbacks import BackboneFinetuning, BaseFinetuning, ModelCheckpoint from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset @@ -359,6 +360,8 @@ def test_callbacks_restore(tmp_path): "foreach": None, "differentiable": False, } + if _TORCH_GREATER_EQUAL_2_3: + expected["fused"] = None assert callback._internal_optimizer_metadata[0][0] == expected @@ -374,6 +377,8 @@ def test_callbacks_restore(tmp_path): "foreach": None, "differentiable": False, } + if _TORCH_GREATER_EQUAL_2_3: + expected["fused"] = None assert callback._internal_optimizer_metadata[0][1] == expected diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index be05768cab430..8b9ca14684db0 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -88,6 +88,7 @@ def restore_env_variables(): "KMP_DUPLICATE_LIB_OK", # leaked by PyTorch "CRC32C_SW_MODE", # leaked by tensorboardX "TRITON_CACHE_DIR", # leaked by torch.compile + "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR", # leaked by torch.compile "OMP_NUM_THREADS", # set by our launchers # leaked by XLA "ALLOW_MULTIPLE_LIBTPU_LOAD", diff --git a/tests/tests_pytorch/plugins/precision/test_fsdp.py b/tests/tests_pytorch/plugins/precision/test_fsdp.py index 6b19fdabdf6d6..8b595c2c74a32 100644 --- a/tests/tests_pytorch/plugins/precision/test_fsdp.py +++ b/tests/tests_pytorch/plugins/precision/test_fsdp.py @@ -58,8 +58,10 @@ def test_fsdp_precision_scaler_with_bf16(): @RunIf(min_cuda_gpus=1) def test_fsdp_precision_forward_context(): """Test to ensure that the context manager correctly is set to bfloat16.""" + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler + precision = FSDPPrecision(precision="16-mixed") - assert isinstance(precision.scaler, torch.cuda.amp.GradScaler) + assert isinstance(precision.scaler, ShardedGradScaler) assert torch.get_default_dtype() == torch.float32 with precision.forward_context(): assert torch.get_autocast_gpu_dtype() == torch.float16