From 0f4f809a7c1855fe7e7304b9bfb97533687c0f08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 17 Jan 2023 02:29:06 +0100 Subject: [PATCH] Deprecate the FairScale integration (#16353) --- .../advanced/model_parallel.rst | 172 +----------------- docs/source-pytorch/extensions/strategy.rst | 11 +- src/pytorch_lightning/CHANGELOG.md | 9 +- src/pytorch_lightning/overrides/fairscale.py | 7 + .../precision/fully_sharded_native_amp.py | 11 ++ .../plugins/precision/sharded_native_amp.py | 8 + .../strategies/fully_sharded.py | 9 +- src/pytorch_lightning/strategies/sharded.py | 13 +- .../strategies/sharded_spawn.py | 13 +- src/pytorch_lightning/strategies/strategy.py | 2 +- .../callbacks/test_stochastic_weight_avg.py | 1 - .../deprecated_api/test_remove_2-0.py | 14 +- .../precision/test_sharded_precision.py | 3 +- .../plugins/test_cluster_integration.py | 22 ++- ..._ddp_fully_sharded_with_full_state_dict.py | 111 +++++------ .../strategies/test_ddp_strategy.py | 31 ---- .../tests_pytorch/strategies/test_registry.py | 9 +- .../strategies/test_sharded_strategy.py | 146 +++++++-------- .../connectors/test_accelerator_connector.py | 32 +++- tests/tests_pytorch/trainer/test_trainer.py | 14 +- 20 files changed, 251 insertions(+), 387 deletions(-) diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst index c1e68b780f186..d7660680eaccf 100644 --- a/docs/source-pytorch/advanced/model_parallel.rst +++ b/docs/source-pytorch/advanced/model_parallel.rst @@ -6,8 +6,6 @@ Train 1 trillion+ parameter models When training large models, fitting larger batch sizes, or trying to increase throughput using multi-GPU compute, Lightning provides advanced optimized distributed training strategies to support these cases and offer substantial improvements in memory usage. -In many cases these strategies are some flavour of model parallelism however we only introduce concepts at a high level to get you started. Refer to the `FairScale documentation `_ for more information about model parallelism. - Note that some of the extreme memory saving configurations will affect the speed of training. This Speed/Memory trade-off in most cases can be adjusted. Some of these memory-efficient strategies rely on offloading onto other forms of memory, such as CPU RAM or NVMe. This means you can even see memory benefits on a **single GPU**, using a strategy such as :ref:`deepspeed-zero-stage-3-offload`. @@ -40,7 +38,7 @@ Overall: * When **fine-tuning** a model, use advanced memory efficient strategies such as :ref:`deepspeed-zero-stage-3` or :ref:`deepspeed-zero-stage-3-offload`, allowing you to fine-tune larger models if you are limited on compute * When **pre-training** a model, use simpler optimizations such :ref:`sharded-training`, :ref:`deepspeed-zero-stage-2` or :ref:`fully-sharded-training`, scaling the number of GPUs to reach larger parameter sizes -* For both fine-tuning and pre-training, use :ref:`deepspeed-activation-checkpointing` or :ref:`fairscale-activation-checkpointing` as the throughput degradation is not significant +* For both fine-tuning and pre-training, use :ref:`deepspeed-activation-checkpointing` as the throughput degradation is not significant For example when using 128 GPUs, you can **pre-train** large 10 to 20 Billion parameter models using :ref:`deepspeed-zero-stage-2` without having to take a performance hit with more advanced optimized multi-gpu strategy. @@ -153,11 +151,10 @@ Here's an example of changing the placement policy to "cpu". .. _sharded-training: -************************** -FairScale Sharded Training -************************** +**************** +Sharded Training +**************** -Lightning integration of optimizer sharded training provided by `FairScale `_. The technique can be found within `DeepSpeed ZeRO `_ and `ZeRO-2 `_, however the implementation is built from the ground up to be PyTorch compatible and standalone. @@ -171,178 +168,25 @@ these benefits in multi-GPU setups are almost free and throughput scales well wi It is highly recommended to use Sharded Training in multi-GPU environments where memory is limited, or where training larger models are beneficial (500M+ parameter models). A technical note: as batch size scales, storing activations for the backwards pass becomes the bottleneck in training. As a result, sharding optimizer state and gradients becomes less impactful. -Use :ref:`fairscale-activation-checkpointing` to see even more benefit at the cost of some throughput. - -To use Sharded Training, you need to first install FairScale using the command below. - -.. code-block:: bash - - pip install fairscale - .. code-block:: python # train using Sharded DDP trainer = Trainer(strategy="ddp_sharded") -Sharded Training can work across all DDP variants by adding the additional ``--strategy ddp_sharded`` flag via command line using a PyTorch Lightning script. - Internally we re-initialize your optimizers and shard them across your machines and processes. We handle all communication using PyTorch distributed, so no code changes are required. ---- .. _fully-sharded-training: -FairScale Fully Sharded Training -================================ - -.. warning:: - FairScale Fully Sharded Training is in BETA and the API is subject to change. Please create an `issue `_ if you run into any problems. - -`Fully Sharded `_ shards optimizer state, gradients, and parameters across data parallel workers. This allows you to fit much larger models onto multiple GPUs into memory. - -Fully Sharded Training alleviates the need to worry about balancing layers onto specific devices using some form of pipe parallelism, and optimizes for distributed communication with minimal effort. - -Shard Parameters to Reach 10+ Billion Parameters ------------------------------------------------- - -To reach larger parameter sizes and to be memory efficient, we have to shard parameters. There are various ways to enable this. - -.. note:: - Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``. - This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``. - This is a limitation of Fully Sharded Training that will be resolved in the future. - -Enabling Module Sharding for Maximum Memory Efficiency ------------------------------------------------------- - -Auto Wrapping -^^^^^^^^^^^^^ - -Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The -simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't -have to ``wrap`` layers manually as in the case of manual wrapping. - -.. note:: - While initializing the optimizers inside ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``, else - PyTorch will raise an error. This is required because when you use auto-wrap, the model layers are sharded and your - ``lightning_module.parameters()`` will return a generator with no params. This inconvenience will be addressed in the future. - -.. code-block:: python - - class MyModel(BoringModel): - def configure_optimizers(self): - return torch.optim.AdamW(self.trainer.model.parameters(), lr=1e-2) - - - model = MyModel() - trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp", precision=16) - trainer.fit(model) - - -Manual Wrapping -^^^^^^^^^^^^^^^ - -Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate -parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly. - -When not using Fully Sharded Training these wrap functions are a no-op. That means once the changes have been made, there is no need to remove the changes for other strategies. - -``auto_wrap`` recursively wraps :class:`~torch.nn.Module` within the ``LightningModule`` with nested Fully Sharded Wrappers, -signalling that we'd like to partition these modules across data parallel devices, discarding the full weights when not required (information :class:`here `). - -``auto_wrap`` can have varying levels of success based on the complexity of your model. **Auto Wrap does not support models with shared parameters**. - -``wrap`` simply wraps the module with a Fully Sharded Parallel class with the correct parameters from the Lightning context manager. - -Here's an example using both ``wrap`` and ``auto_wrap`` to create your model: - -.. code-block:: python - - import torch - import torch.nn as nn - import pytorch_lightning as pl - from pytorch_lightning import Trainer - from fairscale.nn import checkpoint_wrapper, auto_wrap, wrap - - - class MyModel(pl.LightningModule): - def __init__(self): - super().__init__() - self.linear_layer = nn.Linear(32, 32) - self.block = nn.Sequential(nn.Linear(32, 32), nn.ReLU()) - self.final_block = nn.Sequential(nn.Linear(32, 32), nn.ReLU()) - - def configure_sharded_model(self): - # modules are sharded across processes - # as soon as they are wrapped with `wrap` or `auto_wrap`. - # During the forward/backward passes, weights get synced across processes - # and de-allocated once computation is complete, saving memory. - - # Wraps the layer in a Fully Sharded Wrapper automatically - linear_layer = wrap(self.linear_layer) - - # Wraps the module recursively - # based on a minimum number of parameters (default 100M parameters) - block = auto_wrap(self.block) - - # For best memory efficiency, - # add FairScale activation checkpointing - final_block = auto_wrap(checkpoint_wrapper(self.final_block)) - self.model = nn.Sequential(linear_layer, nn.ReLU(), block, final_block) - - def configure_optimizers(self): - return torch.optim.AdamW(self.model.parameters(), lr=1e-2) - - - model = MyModel() - trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp", precision=16) - trainer.fit(model) - - trainer.test() - trainer.predict() - ----- - -.. _fairscale-activation-checkpointing: - -Activation Checkpointing ------------------------- - -Activation checkpointing frees activations from memory as soon as they are not needed during the forward pass. They are then re-computed for the backwards pass as needed. Activation checkpointing is very useful when you have intermediate layers that produce large activations. - -FairScale's checkpointing wrapper also handles batch norm layers correctly, unlike the PyTorch implementation, ensuring stats are tracked correctly due to the multiple forward passes. - -This saves memory when training larger models, however it requires wrapping modules you'd like to use activation checkpointing on. See :class:`here ` for more information. - -.. warning:: - - Do not wrap the entire model with activation checkpointing. This is not the intended use of activation checkpointing, and will lead to failures as seen in `this discussion `_. - -.. code-block:: python - - from pytorch_lightning import Trainer - from fairscale.nn import checkpoint_wrapper - - - class MyModel(pl.LightningModule): - def __init__(self): - super().__init__() - # Wrap layers using checkpoint_wrapper - self.block_1 = checkpoint_wrapper(nn.Sequential(nn.Linear(32, 32), nn.ReLU())) - self.block_2 = nn.Linear(32, 2) - ----- - -.. _fully-sharded-native-training: - -****************************** -PyTorch Fully Sharded Training -****************************** +********************** +Fully Sharded Training +********************** PyTorch has it's own version of `FSDP `_ which is upstreamed from their `fairscale `__ project. It was introduced in their `v1.11.0 release `_ but it is recommended to use it with PyTorch v1.12 or more and that's what -Lightning supports. The API is pretty similar to that of FairScale. +Lightning supports. Auto Wrapping diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst index 3d97a14946ebd..82d1d5e103564 100644 --- a/docs/source-pytorch/extensions/strategy.rst +++ b/docs/source-pytorch/extensions/strategy.rst @@ -80,16 +80,7 @@ The below table lists all relevant strategies available in Lightning with their - Colossal-AI provides a collection of parallel components for you. It aims to support you to write your distributed deep learning models just like how you write your model on your laptop. `Learn more. `__ * - fsdp_native - :class:`~pytorch_lightning.strategies.DDPFullyShardedNativeStrategy` - - Strategy for Fully Sharded Data Parallel provided by PyTorch. :ref:`Learn more. ` - * - fsdp - - :class:`~pytorch_lightning.strategies.DDPFullyShardedStrategy` - - Strategy for Fully Sharded Data Parallel provided by FairScale. :ref:`Learn more. ` - * - ddp_sharded - - :class:`~pytorch_lightning.strategies.DDPShardedStrategy` - - Optimizer and gradient sharded training provided by FairScale. :ref:`Learn more. ` - * - ddp_sharded_spawn - - :class:`~pytorch_lightning.strategies.DDPSpawnShardedStrategy` - - Optimizer sharded training provided by FairScale. :ref:`Learn more. ` + - Strategy for Fully Sharded Data Parallel. :ref:`Learn more. ` * - ddp_spawn - :class:`~pytorch_lightning.strategies.DDPSpawnStrategy` - Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training finishes. :ref:`Learn more. ` diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5306790c61c04..39087a2af617f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -51,12 +51,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Deprecated the `Trainer.amp_backend` property * Deprecated the `Trainer(amp_level=...)` argument * Deprecated the `pytorch_lightning.plugins.ApexMixedPrecisionPlugin` class - * Deprecates the `pytorch_lightning.utilities.enum.sAMPType` enum + * Deprecates the `pytorch_lightning.utilities.enums.AMPType` enum * Deprecates the `DeepSpeedPrecisionPlugin(amp_type=..., amp_level=...)` arguments - `horovod` deprecation ([#16141](https://github.com/PyTorchLightning/pytorch-lightning/pull/16141)) * Deprecated `Trainer(strategy="horovod")` * Deprecated the `HorovodStrategy` class - Deprecated `pytorch_lightning.lite.LightningLite` in favor of `lightning.fabric.Fabric` ([#16314](https://github.com/Lightning-AI/lightning/pull/16314)) +- `FairScale` deprecation (in favor of PyTorch's FSDP implementation) ([#16353](https://github.com/PyTorchLightning/pytorch-lightning/pull/16353)) + * Deprecated the `pytorch_lightning.overrides.fairscale.LightningShardedDataParallel` class + * Deprecated the `pytorch_lightning.plugins.precision.fully_sharded_native_amp.FullyShardedNativeMixedPrecisionPlugin` class + * Deprecated the `pytorch_lightning.plugins.precision.sharded_native_amp.ShardedNativeMixedPrecisionPlugin` class + * Deprecated the `pytorch_lightning.strategies.fully_sharded.DDPFullyShardedStrategy` class + * Deprecated the `pytorch_lightning.strategies.sharded.DDPShardedStrategy` class + * Deprecated the `pytorch_lightning.strategies.sharded_spawn.DDPSpawnShardedStrategy` class ### Removed diff --git a/src/pytorch_lightning/overrides/fairscale.py b/src/pytorch_lightning/overrides/fairscale.py index d9ebb6345f215..f818792e575a9 100644 --- a/src/pytorch_lightning/overrides/fairscale.py +++ b/src/pytorch_lightning/overrides/fairscale.py @@ -41,6 +41,13 @@ def __init__( forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, ) -> None: + rank_zero_deprecation( + "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" + " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." + " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" + " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" + " the native version by default." + ) self._validate_init_arguments(pl_module, forward_module) super().__init__(forward_module=(pl_module or forward_module)) diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py index 870e658bfc9c3..904d61f4dffc3 100644 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py @@ -15,11 +15,22 @@ from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): """Native AMP for Fully Sharded Training.""" + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" + " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." + " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" + " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" + " the native version by default." + ) + super().__init__(*args, **kwargs) + def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: # see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html # section `Gradient Clipping`, using `torch.nn.utils.clip_grad_norm_` is incorrect diff --git a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py index 077b1e6679113..f4f646b4239a2 100644 --- a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -18,6 +18,7 @@ from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.precision.native_amp import MixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -32,6 +33,13 @@ class ShardedNativeMixedPrecisionPlugin(MixedPrecisionPlugin): def __init__( self, precision: Literal["16", 16, "bf16"], device: str, scaler: Optional[ShardedGradScaler] = None ) -> None: + rank_zero_deprecation( + "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" + " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." + " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" + " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" + " the native version by default." + ) if not _FAIRSCALE_AVAILABLE: raise MisconfigurationException( "You have asked for sharded AMP but you have not installed it." diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 64ddd1272a546..534fdf8dbbe32 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -28,6 +28,7 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation from pytorch_lightning.utilities.types import STEP_OUTPUT if _FAIRSCALE_AVAILABLE: @@ -117,7 +118,13 @@ def __init__( If ``False``, this will default to ``compute_device``. (Default: True). """ - + rank_zero_deprecation( + "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" + " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." + " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" + " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" + " the native version by default." + ) super().__init__( accelerator=accelerator, parallel_devices=parallel_devices, diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py index 922a4d70d92c4..e8749d53cac08 100644 --- a/src/pytorch_lightning/strategies/sharded.py +++ b/src/pytorch_lightning/strategies/sharded.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Dict, Generator, List, Tuple +from typing import Any, Dict, Generator, List, Tuple from torch import Tensor from torch.nn import Module @@ -26,6 +26,7 @@ from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel @@ -40,6 +41,16 @@ class DDPShardedStrategy(DDPStrategy): strategy_name = "ddp_sharded" _REDUCE_BUFFER_SIZE_DEFAULT: int = 2**23 # 8M + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" + " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." + " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" + " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" + " the native version by default." + ) + super().__init__(*args, **kwargs) + def connect(self, model: "pl.LightningModule") -> None: if not _FAIRSCALE_AVAILABLE: # pragma: no cover raise MisconfigurationException( diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py index cf12b3b71c32a..74fb1f4026ec0 100644 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ b/src/pytorch_lightning/strategies/sharded_spawn.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Dict, Generator, List, Tuple +from typing import Any, Dict, Generator, List, Tuple from torch import Tensor from torch.nn import Module @@ -26,6 +26,7 @@ from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel @@ -40,6 +41,16 @@ class DDPSpawnShardedStrategy(DDPSpawnStrategy): strategy_name = "ddp_sharded_spawn" + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" + " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." + " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" + " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" + " the native version by default." + ) + super().__init__(*args, **kwargs) + def connect(self, model: "pl.LightningModule") -> None: if not _FAIRSCALE_AVAILABLE: # pragma: no cover raise MisconfigurationException( diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index d7d3005e8fd98..415d50177ae2e 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -174,7 +174,7 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]: optimizer = optimizer._optimizer if hasattr(optimizer, "consolidate_state_dict"): - # there are optimizers like Fairscale's OSS or PyTorch's ZeroRedundancyOptimizer that shard their + # there are optimizers like PyTorch's ZeroRedundancyOptimizer that shard their # states, and to avoid OOM we consolidate the full state on rank 0 only optimizer.consolidate_state_dict() return optimizer.state_dict() if self.is_global_zero else {} diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index d1f5004bcbb35..c009fbd53fa82 100644 --- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -361,7 +361,6 @@ def test_swa_resume_training_from_checkpoint_ddp(tmpdir): @pytest.mark.parametrize( "strategy", [ - pytest.param("fsdp", marks=RunIf(fairscale=True, min_cuda_gpus=1)), pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)), pytest.param("fsdp_native", marks=RunIf(min_cuda_gpus=1, skip_windows=True, min_torch="1.12")), ], diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py index d5348b70c7728..1283ff0991f4e 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py @@ -398,7 +398,6 @@ def test_rename_lightning_lite(): LightningParallelModule, LightningDistributedModule, LightningBaguaModule, - pytest.param(LightningShardedDataParallel, marks=RunIf(fairscale=True)), ], ) def test_v1_10_deprecated_pl_module_init_parameter(wrapper_class): @@ -413,6 +412,19 @@ def test_v1_10_deprecated_pl_module_init_parameter(wrapper_class): wrapper_class(pl_module=BoringModel()) +@RunIf(fairscale=True) +def test_v1_10_deprecated_fairscale_pl_module_init_parameter(): + with no_warning_call( + DeprecationWarning, match=r"The argument `pl_module` in `LightningShardedDataParallel` is deprecated in v1.8.0" + ), pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + LightningShardedDataParallel(BoringModel()) + + with pytest.deprecated_call( + match=r"The argument `pl_module` in `LightningShardedDataParallel` is deprecated in v1.8.0" + ): + LightningShardedDataParallel(pl_module=BoringModel()) + + def test_v1_10_deprecated_unwrap_lightning_module(): with pytest.deprecated_call(match=r"The function `unwrap_lightning_module` is deprecated in v1.8.0"): unwrap_lightning_module(BoringModel()) diff --git a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py index 7d6cc87da54c0..e040523c1e9c9 100644 --- a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py +++ b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py @@ -35,7 +35,8 @@ ], ) def test_sharded_precision_scaler(precision, scaler, expected): - plugin = ShardedNativeMixedPrecisionPlugin(precision=precision, scaler=scaler, device="cuda") + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + plugin = ShardedNativeMixedPrecisionPlugin(precision=precision, scaler=scaler, device="cuda") if expected: assert isinstance(plugin.scaler, expected) else: diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index e8beecf15020a..8a96bd8fdd90c 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -65,11 +65,17 @@ def environment_combinations(): def test_ranks_available_manual_strategy_selection(_, strategy_cls): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 - for cluster, variables, expected in environment_combinations(): + for i, (cluster, variables, expected) in enumerate(environment_combinations()): with mock.patch.dict(os.environ, variables): - strategy = strategy_cls( - parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster - ) + if strategy_cls is DDPShardedStrategy and i == 0: + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + strategy = strategy_cls( + parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster + ) + else: + strategy = strategy_cls( + parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster + ) trainer = Trainer(strategy=strategy, num_nodes=num_nodes) assert rank_zero_only.rank == expected["global_rank"] assert trainer.global_rank == expected["global_rank"] @@ -93,7 +99,7 @@ def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwar num_nodes = 2 trainer_kwargs.update(num_nodes=num_nodes) - for cluster, variables, expected in environment_combinations(): + for i, (cluster, variables, expected) in enumerate(environment_combinations()): if trainer_kwargs["strategy"] == "ddp_spawn": if isinstance(cluster, (SLURMEnvironment, TorchElasticEnvironment)): # slurm and torchelastic do not work with spawn strategies @@ -102,7 +108,11 @@ def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwar expected.update(global_rank=(expected["node_rank"] * 2), local_rank=0) with mock.patch.dict(os.environ, variables): - trainer = Trainer(**trainer_kwargs) + if "sharded" in trainer_kwargs["strategy"] and i == 0: + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(**trainer_kwargs) + else: + trainer = Trainer(**trainer_kwargs) assert type(trainer.strategy.cluster_environment) is type(cluster) assert rank_zero_only.rank == expected["global_rank"] assert trainer.global_rank == expected["global_rank"] diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 0afd24ba798db..a60c4d8cb8ecf 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -6,7 +6,6 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin @@ -149,11 +148,12 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): def test_invalid_on_cpu(tmpdir): """Test to ensure that to raise Misconfiguration for FSDP on CPU.""" + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") + assert isinstance(trainer.strategy, DDPFullyShardedStrategy) with pytest.raises( MisconfigurationException, match="You selected strategy to be `ddp_fully_sharded`, but GPU is not available." ): - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) trainer.strategy.setup_environment() @@ -161,9 +161,10 @@ def test_invalid_on_cpu(tmpdir): @RunIf(fairscale=True) def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - trainer = Trainer( - default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 - ) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 + ) assert isinstance(trainer.strategy, DDPFullyShardedStrategy) assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin) @@ -173,65 +174,37 @@ def test_fully_sharded_strategy_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" model = TestFSDPModelManualWrapped() - trainer = Trainer( - default_root_dir=tmpdir, - accelerator="gpu", - devices=1, - strategy="fsdp", - precision=16, - max_epochs=1, - enable_progress_bar=False, - enable_model_summary=False, - ) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + default_root_dir=tmpdir, + accelerator="gpu", + devices=1, + strategy="fsdp", + precision=16, + max_epochs=1, + enable_progress_bar=False, + enable_model_summary=False, + ) _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_cuda_gpus=2, standalone=True, fairscale=True) -@pytest.mark.parametrize( - "model, strategy", - [ - (TestFSDPModelManualWrapped(), DDPFullyShardedStrategy(min_num_params=2)), - (TestFSDPModelAutoWrapped(), "fsdp"), - ], -) -def test_fully_sharded_strategy_checkpoint_multi_gpus(tmpdir, model, strategy): - """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" - - ck = ModelCheckpoint(save_last=True) - trainer = Trainer( - default_root_dir=tmpdir, - accelerator="gpu", - devices=2, - strategy=strategy, - precision=16, - max_epochs=1, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - limit_predict_batches=2, - callbacks=[ck], - enable_progress_bar=False, - enable_model_summary=False, - ) - _run_multiple_stages(trainer, model) - - @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fsdp_gradient_clipping_raises(tmpdir): """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" model = TestFSDPModelManualWrapped() - trainer = Trainer( - default_root_dir=tmpdir, - strategy="fsdp", - fast_dev_run=True, - accelerator="gpu", - devices=1, - precision=16, - gradient_clip_val=1, - gradient_clip_algorithm="norm", - enable_progress_bar=False, - enable_model_summary=False, - ) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + default_root_dir=tmpdir, + strategy="fsdp", + fast_dev_run=True, + accelerator="gpu", + devices=1, + precision=16, + gradient_clip_val=1, + gradient_clip_algorithm="norm", + enable_progress_bar=False, + enable_model_summary=False, + ) with pytest.raises( MisconfigurationException, match="gradient_clip_algorithm='norm'` is currently not supported for `FullySharded" ): @@ -240,15 +213,16 @@ def test_fsdp_gradient_clipping_raises(tmpdir): @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fsdp_rewrap_limitation(tmpdir): - trainer = Trainer( - default_root_dir=tmpdir, - accelerator="gpu", - devices=1, - max_steps=1, - limit_val_batches=0, - limit_test_batches=1, - strategy="fsdp", - ) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + default_root_dir=tmpdir, + accelerator="gpu", + devices=1, + max_steps=1, + limit_val_batches=0, + limit_test_batches=1, + strategy="fsdp", + ) model = TestFSDPModelAutoWrapped() trainer.fit(model) @@ -258,7 +232,8 @@ def test_fsdp_rewrap_limitation(tmpdir): @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_invalid_parameters_in_optimizer(): - trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) class EmptyParametersModel(BoringModel): def configure_optimizers(self): diff --git a/tests/tests_pytorch/strategies/test_ddp_strategy.py b/tests/tests_pytorch/strategies/test_ddp_strategy.py index fcdc683ec9bc3..035b7e47b372b 100644 --- a/tests/tests_pytorch/strategies/test_ddp_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_strategy.py @@ -23,14 +23,10 @@ from lightning_fabric.plugins.environments import ClusterEnvironment, LightningEnvironment from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from tests_pytorch.helpers.runif import RunIf -if _FAIRSCALE_AVAILABLE: - from fairscale.optim import OSS - class BoringModelGPU(BoringModel): def on_train_start(self) -> None: @@ -256,33 +252,6 @@ def test_ddp_strategy_set_timeout(mock_init_process_group): ) -class BoringFairScaleOptimizerModel(BoringModel): - def configure_optimizers(self): - base_optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) - - -@RunIf(min_cuda_gpus=2, fairscale=True) -@pytest.mark.parametrize("strategy", (pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn")) -def test_ddp_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy): - """Test to ensure that checkpoint is saved correctly when using fairscale optimizer.""" - model = BoringFairScaleOptimizerModel() - trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - # need to broadcast because tmpdir is different on each process - checkpoint_path = trainer.strategy.broadcast(checkpoint_path) - trainer.save_checkpoint(checkpoint_path) - trainer.strategy.barrier() # ensure the checkpoint is saved before load - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) - - # Assert model parameters are identical after loading - for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(trained_param.to("cpu"), loaded_param) - - class BoringZeroRedundancyOptimizerModel(BoringModel): def configure_optimizers(self): return ZeroRedundancyOptimizer(self.layer.parameters(), optimizer_class=torch.optim.Adam, lr=0.1) diff --git a/tests/tests_pytorch/strategies/test_registry.py b/tests/tests_pytorch/strategies/test_registry.py index 8536e0b8b3438..39e10a05fc328 100644 --- a/tests/tests_pytorch/strategies/test_registry.py +++ b/tests/tests_pytorch/strategies/test_registry.py @@ -74,7 +74,8 @@ def test_fsdp_strategy_registry(tmpdir): assert strategy in StrategyRegistry assert StrategyRegistry[strategy]["strategy"] == DDPFullyShardedStrategy - trainer = Trainer(strategy=strategy) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(strategy=strategy) assert isinstance(trainer.strategy, DDPFullyShardedStrategy) @@ -117,7 +118,11 @@ def test_fsdp_strategy_registry(tmpdir): ], ) def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params): - trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) + if "sharded" in strategy_name: + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) + else: + trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) assert isinstance(trainer.strategy, strategy) assert strategy_name in StrategyRegistry assert StrategyRegistry[strategy_name]["init_params"] == expected_init_params diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py index b8db7d1c786a8..29fd4607c521b 100644 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py @@ -9,7 +9,7 @@ from torch import Tensor from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.demos.boring_classes import BoringModel, ManualOptimBoringModel +from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins import MixedPrecisionPlugin from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy @@ -58,15 +58,16 @@ def _is_equal(self, a, b): def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_val, tmpdir): """Ensure that clip gradients is only called if the value is greater than 0.""" model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - strategy="ddp_sharded", - accelerator="gpu", - devices=1, - precision=16, - fast_dev_run=True, - gradient_clip_val=clip_val, - ) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + default_root_dir=tmpdir, + strategy="ddp_sharded", + accelerator="gpu", + devices=1, + precision=16, + fast_dev_run=True, + gradient_clip_val=clip_val, + ) trainer.fit(model) if clip_val > 0: mock_oss_clip_grad_norm.assert_called() @@ -80,7 +81,8 @@ def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_v ) def test_sharded_ddp_choice(strategy, expected): """Test to ensure that strategy is correctly chosen.""" - trainer = Trainer(fast_dev_run=True, strategy=strategy) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(fast_dev_run=True, strategy=strategy) assert isinstance(trainer.strategy, expected) @@ -90,7 +92,8 @@ def test_sharded_ddp_choice(strategy, expected): ) def test_ddp_choice_sharded_amp(strategy, expected): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=1, precision=16, strategy=strategy) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=1, precision=16, strategy=strategy) assert isinstance(trainer.strategy, expected) assert isinstance(trainer.precision_plugin, MixedPrecisionPlugin) @@ -99,7 +102,8 @@ def test_ddp_choice_sharded_amp(strategy, expected): def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir): """Test to ensure that checkpoint is saved correctly.""" model = BoringModel() - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) trainer.fit(model) @@ -116,7 +120,8 @@ def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir): def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs.""" model = BoringModel() - trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True) trainer.fit(model) @@ -133,7 +138,8 @@ def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir): def test_ddp_sharded_strategy_finetune(tmpdir): """Test to ensure that we can save and restart training (simulate fine-tuning)""" model = BoringModel() - trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True) trainer.fit(model) checkpoint_path = os.path.join(tmpdir, "model.pt") @@ -148,7 +154,8 @@ def test_ddp_sharded_strategy_finetune(tmpdir): def test_ddp_sharded_strategy_fit_ckpt_path(tmpdir): """Test to ensure that resuming from checkpoint works.""" model = BoringModel() - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) trainer.fit(model) @@ -166,7 +173,8 @@ def test_ddp_sharded_strategy_fit_ckpt_path(tmpdir): def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir): """Test to ensure that resuming from checkpoint works when going from GPUs- > CPU.""" model = BoringModel() - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=1, fast_dev_run=True) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=1, fast_dev_run=True) trainer.fit(model) @@ -175,7 +183,8 @@ def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir): model = BoringModel() - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) trainer.fit(model, ckpt_path=checkpoint_path) @@ -191,34 +200,19 @@ def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir): def test_ddp_sharded_strategy_test_multigpu(trainer_kwargs): """Test to ensure we can use validate and test without fit.""" model = BoringModel() - trainer = Trainer( - strategy="ddp_sharded_spawn", - fast_dev_run=True, - enable_progress_bar=False, - enable_model_summary=False, - **trainer_kwargs, - ) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + strategy="ddp_sharded_spawn", + fast_dev_run=True, + enable_progress_bar=False, + enable_model_summary=False, + **trainer_kwargs, + ) trainer.validate(model) trainer.test(model) -@RunIf(min_cuda_gpus=2, standalone=True, fairscale=True) -@pytest.mark.parametrize("strategy", ("ddp_sharded", "ddp_sharded_spawn")) -def test_ddp_sharded_strategy_manual_optimization(tmpdir, strategy): - model = ManualOptimBoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - strategy=strategy, - fast_dev_run=2, - accelerator="gpu", - devices=2, - enable_progress_bar=False, - enable_model_summary=False, - ) - trainer.fit(model) - - class BoringModelSharded(BoringModel): def on_train_start(self) -> None: """Check if trainer module is wrapped as ShardedDataParallel during training stage.""" @@ -243,7 +237,8 @@ def on_predict_start(self) -> None: @RunIf(fairscale=True) def test_configure_ddp(tmpdir): """Tests with ddp sharded strategy.""" - trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=True) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=True) model = BoringModelSharded() @@ -258,7 +253,8 @@ def test_configure_ddp(tmpdir): @pytest.mark.parametrize("cls", [DDPShardedStrategy, DDPSpawnShardedStrategy]) def test_custom_kwargs_sharded(_, cls): """Tests to ensure that if custom kwargs are passed, they are set correctly.""" - strategy = cls(reduce_fp16=True) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + strategy = cls(reduce_fp16=True) strategy._lightning_module = Mock(spec=LightningModule) strategy._lightning_module.trainer = Mock() strategy.parallel_devices = [Mock()] @@ -277,7 +273,8 @@ def test_custom_kwargs_sharded(_, cls): @pytest.mark.parametrize("num_nodes", [1, 2]) def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_size, num_nodes): """Tests to ensure that ``reduce_buffer_size`` is correctly set based on user kwargs.""" - strategy = DDPShardedStrategy(**params) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + strategy = DDPShardedStrategy(**params) strategy.num_nodes = num_nodes strategy._lightning_module = Mock(spec=LightningModule) strategy._lightning_module.trainer = Mock() @@ -297,7 +294,8 @@ def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_siz @RunIf(fairscale=True) def test_block_backward_sync(): - strategy = DDPShardedStrategy() + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + strategy = DDPShardedStrategy() model = mock.MagicMock(spec=ShardedDataParallel) with mock.patch.object(strategy, "_model", model): with strategy.block_backward_sync(): @@ -315,7 +313,8 @@ def test_block_backward_sync(): ], ) def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): - trainer = Trainer(strategy=strategy_name) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(strategy=strategy_name) assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs @@ -325,38 +324,18 @@ def configure_optimizers(self): return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) -@RunIf(min_cuda_gpus=2, fairscale=True) -@pytest.mark.parametrize("strategy", (pytest.param("ddp_sharded", marks=RunIf(standalone=True)), "ddp_sharded_spawn")) -def test_ddp_sharded_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy): - """Test to ensure that checkpoint is saved correctly when using fairscale optimizers.""" - model = BoringFairScaleOptimizerModel() - trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - # need to broadcast because tmpdir is different on each process - checkpoint_path = trainer.strategy.broadcast(checkpoint_path) - trainer.save_checkpoint(checkpoint_path) - trainer.strategy.barrier() # ensure the checkpoint is saved before load - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) - - # Assert model parameters are identical after loading - for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(trained_param.to("cpu"), loaded_param) - - @RunIf(min_cuda_gpus=2, fairscale=True) def test_ddp_sharded_strategy_fit_ckpt_path_downsize_gpus(tmpdir): model = ModelWithAdamOptimizer() - trainer = Trainer( - strategy="ddp_sharded_spawn", - max_epochs=1, - limit_train_batches=1, - limit_val_batches=0, - accelerator="gpu", - devices=2, - ) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + strategy="ddp_sharded_spawn", + max_epochs=1, + limit_train_batches=1, + limit_val_batches=0, + accelerator="gpu", + devices=2, + ) trainer.fit(model) checkpoint_path = trainer.checkpoint_callback.best_model_path @@ -365,12 +344,13 @@ def test_ddp_sharded_strategy_fit_ckpt_path_downsize_gpus(tmpdir): old_optimizer_states = deepcopy(ckpt["optimizer_states"]) model = CheckModelRestore(old_model_state_dict, old_optimizer_states) - trainer = Trainer( - strategy="ddp_sharded_spawn", - max_epochs=2, - limit_train_batches=1, - limit_val_batches=0, - accelerator="gpu", - devices=1, - ) + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + strategy="ddp_sharded_spawn", + max_epochs=2, + limit_train_batches=1, + limit_val_batches=0, + accelerator="gpu", + devices=1, + ) trainer.fit(model, ckpt_path=checkpoint_path) diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index c0da8086a8b84..067cc811bd3e9 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -241,7 +241,9 @@ def test_interactive_incompatible_backend_error(cuda_count_2, monkeypatch): with pytest.raises(MisconfigurationException, match=r"strategy='ddp_spawn'\)`.*is not compatible"): Trainer(strategy="ddp_spawn", accelerator="gpu", devices=2) - with pytest.raises(MisconfigurationException, match=r"strategy='ddp_sharded_spawn'\)`.*is not compatible"): + with pytest.raises( + MisconfigurationException, match=r"strategy='ddp_sharded_spawn'\)`.*is not compatible" + ), pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=2) with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): @@ -282,7 +284,13 @@ def test_interactive_compatible_strategy_ddp_fork(monkeypatch): ) @pytest.mark.parametrize("devices", [1, 2]) def test_accelerator_choice_multi_node_gpu(cuda_count_2, tmpdir, strategy, strategy_class, devices): - trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) + if "sharded" in strategy: + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer( + default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices + ) + else: + trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) assert isinstance(trainer.strategy, strategy_class) @@ -386,10 +394,16 @@ def test_exception_invalid_strategy(): ) @pytest.mark.parametrize("accelerator", ["mps", "auto", "gpu", None, MPSAccelerator()]) def test_invalid_ddp_strategy_with_mps(accelerator, strategy, strategy_class, mps_count_1, cuda_count_0): - with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): - Trainer(accelerator=accelerator, strategy=strategy) - - with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + if "sharded" in strategy: + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + Trainer(accelerator=accelerator, strategy=strategy) + else: + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + Trainer(accelerator=accelerator, strategy=strategy) + + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"), pytest.deprecated_call( + match="FairScale has been deprecated in v1.9.0" + ): Trainer(accelerator="mps", strategy=strategy_class()) @@ -428,7 +442,11 @@ def test_strategy_choice_cpu_instance(strategy_class): ], ) def test_strategy_choice_gpu_str(strategy, strategy_class): - trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) + if "sharded" in strategy: + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) + else: + trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) assert isinstance(trainer.strategy, strategy_class) diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index edace5429a531..231ed0a415bc7 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -2100,13 +2100,6 @@ def training_step(self, batch, batch_idx): CUDAAccelerator, 2, ), - ( - {"strategy": DDPShardedStrategy(), "accelerator": "cuda", "devices": 2}, - DDPShardedStrategy, - "ddp_sharded", - CUDAAccelerator, - 2, - ), ( {"strategy": "ddp_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, DDPSpawnStrategy, @@ -2141,7 +2134,12 @@ def test_trainer_config_strategy(monkeypatch, trainer_kwargs, strategy_cls, stra if trainer_kwargs.get("accelerator") == "cuda": mock_cuda_count(monkeypatch, trainer_kwargs["devices"]) - trainer = Trainer(**trainer_kwargs) + strategy = trainer_kwargs.get("strategy") + if (isinstance(strategy, str) and "sharded" in strategy) or isinstance(strategy, (DDPShardedStrategy)): + with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): + trainer = Trainer(**trainer_kwargs) + else: + trainer = Trainer(**trainer_kwargs) assert isinstance(trainer.strategy, strategy_cls) assert strategy_cls.strategy_name == strategy_name