From f67a6fee58361ecf87a03d5094ea0cab8aedd3e9 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Tue, 25 Oct 2022 17:29:19 +0530
Subject: [PATCH 01/11] Add info about optimizer initialization for native fsdp

---
 docs/source-pytorch/advanced/model_parallel.rst           | 6 ++++++
 src/pytorch_lightning/strategies/fully_sharded_native.py  | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst
index 3a57f7b9499bc..ca6d2be30faa5 100644
--- a/docs/source-pytorch/advanced/model_parallel.rst
+++ b/docs/source-pytorch/advanced/model_parallel.rst
@@ -352,6 +352,12 @@ Model layers should be wrapped in FSDP in a nested way to save peak memory and e
 simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the
 code. You don't have to ``wrap`` layers manually as in the case of manual wrapping.
 
+.. note::
+    When initializing optimizers inside the ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``,
+    otherwise PyTorch will raise an error. This is required because when you use auto-wrap, the model layers are sharded and
+    ``lightning_module.parameters()`` will return an empty generator. This inconvenience will be addressed in the future.
+
+
 .. code-block:: python
 
     model = BoringModel()
diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py
index 3c095dcc01403..c7022bcc4cf77 100644
--- a/src/pytorch_lightning/strategies/fully_sharded_native.py
+++ b/src/pytorch_lightning/strategies/fully_sharded_native.py
@@ -212,6 +212,12 @@ def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel:
             del self.kwargs["auto_wrap_policy"]
 
         log.detail(f"setting up FSDP model with device id: {self.root_device.index}, kwargs: {self.kwargs}")
+
+        rank_zero_info(
+            "When using PyTorch FSDP auto-wrap, make sure to initalize your model using trainer else"
+            " you will get an error.\ntorch.optim.Optimizer(self.trainer.model.parameters(), ...)"
+        )
+
         return FullyShardedDataParallel(
             module=model,
             process_group=self.process_group,

From 7f69cd2be7747e595b80bcbe181cc0d25bd58ddc Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Mon, 31 Oct 2022 18:27:42 +0100
Subject: [PATCH 02/11] validate optimizer parameters in fsdp strategy

---
 src/lightning_lite/strategies/fsdp.py                  | 20 +++++++++++++++++++
 .../strategies/fully_sharded_native.py                 | 15 +++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 src/lightning_lite/strategies/fsdp.py

diff --git a/src/lightning_lite/strategies/fsdp.py b/src/lightning_lite/strategies/fsdp.py
new file mode 100644
index 0000000000000..9e70400e476dc
--- /dev/null
+++ b/src/lightning_lite/strategies/fsdp.py
@@ -0,0 +1,20 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from torch.optim import Optimizer + + +def _optimizer_has_flat_params(optimizer: Optimizer) -> bool: + from torch.distributed.fsdp import FlatParameter + + return any(isinstance(param, FlatParameter) for param in optimizer.param_groups[0]["params"]) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index c7022bcc4cf77..7c71b7fdc175e 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,13 +13,15 @@ # limitations under the License. import contextlib import logging -from typing import Any, Dict, Generator, List, Optional, Union +from typing import Any, Dict, Generator, List, Optional, Union, Iterable import torch from torch import Tensor +from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment +from lightning_lite.strategies.fsdp import _optimizer_has_flat_params from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device from lightning_lite.utilities.distributed import group as _group from lightning_lite.utilities.distributed import init_dist_connection, sync_ddp_if_available @@ -254,6 +256,7 @@ def setup(self, trainer: "pl.Trainer") -> None: self.barrier() self.setup_optimizers(trainer) + _validate_optimizers(self.optimizers) optimizers_to_device(self.optimizers, self.root_device) self.setup_precision_plugin() @@ -374,3 +377,13 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cpu_offload=CPUOffload(offload_params=True), ) cls._registered_strategies.append("fsdp_native_full_shard_offload") + + +def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: + for optimizer in optimizers: + if not _optimizer_has_flat_params(optimizer): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." + ) From e1a273e13bf718c0fa70b5b568e167a78ec46806 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 31 Oct 2022 18:48:21 +0000 Subject: [PATCH 03/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/pytorch_lightning/strategies/fully_sharded_native.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 8351aa82e2356..b50f08269e0da 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,7 +13,7 @@ # limitations under the License. 
import contextlib import logging -from typing import Any, Dict, Generator, List, Optional, Union, Iterable +from typing import Any, Dict, Generator, Iterable, List, Optional, Union import torch from torch import Tensor From 595b896d75c525a4f1b27e7ffaa3ffb90e05fa89 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 31 Oct 2022 17:15:24 -0400 Subject: [PATCH 04/11] add a test --- .../strategies/fully_sharded.py | 15 +- .../strategies/fully_sharded_native.py | 14 +- .../test_ddp_fully_sharded_native.py | 154 ++++++++++-------- ..._ddp_fully_sharded_with_full_state_dict.py | 146 +++++++++-------- 4 files changed, 180 insertions(+), 149 deletions(-) diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index d5716d6e45f00..6b9065a7b0099 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -13,13 +13,15 @@ # limitations under the License. import contextlib import logging -from typing import Any, Dict, Generator, List, Optional +from typing import Any, Dict, Generator, Iterable, List, Optional import torch +from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE +from lightning_lite.strategies.fsdp import _optimizer_has_flat_params from lightning_lite.utilities.enums import PrecisionType from lightning_lite.utilities.optimizer import _optimizers_to_device from pytorch_lightning.overrides.base import _LightningModuleWrapperBase @@ -186,6 +188,7 @@ def setup(self, trainer: "pl.Trainer") -> None: if not is_overridden("configure_sharded_model", self.lightning_module): self.model = self._setup_model(self.model) self.setup_optimizers(self.lightning_module.trainer) + _validate_optimizers(self.optimizers) _optimizers_to_device(self.optimizers, self.root_device) self.barrier() @@ -288,3 +291,13 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) + + +def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: + for optimizer in optimizers: + if not _optimizer_has_flat_params(optimizer): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." + ) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index b50f08269e0da..c381af3ba16fc 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,11 +13,10 @@ # limitations under the License. 
import contextlib import logging -from typing import Any, Dict, Generator, Iterable, List, Optional, Union +from typing import Any, Dict, Generator, List, Optional, Union import torch from torch import Tensor -from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment @@ -34,6 +33,7 @@ from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin +from pytorch_lightning.strategies.fully_sharded import _validate_optimizers from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast @@ -380,13 +380,3 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cpu_offload=CPUOffload(offload_params=True), ) cls._registered_strategies.append("fsdp_native_full_shard_offload") - - -def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: - for optimizer in optimizers: - if not _optimizer_has_flat_params(optimizer): - raise ValueError( - "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" - " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" - " `configure_optimizers()` hook." - ) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index be8bced2cbf5f..673fce60f3747 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -18,46 +18,6 @@ from torch.distributed.fsdp.wrap import wrap -def custom_auto_wrap_policy( - module, - recurse, - unwrapped_params: int, - min_num_params: int = int(1e8), -) -> bool: - return unwrapped_params >= 2 - - -@RunIf(min_torch="1.12") -def test_invalid_on_cpu(tmpdir): - """Test to ensure that we raise Misconfiguration for Native FSDP on CPU.""" - with pytest.raises( - MisconfigurationException, - match=f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, " - "but GPU accelerator is not used.", - ): - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp_native") - assert isinstance(trainer.strategy, DDPFullyShardedNativeStrategy) - trainer.strategy.setup_environment() - - -@RunIf(min_torch="1.12", min_cuda_gpus=1) -@pytest.mark.parametrize("precision, expected", [(16, torch.float16), ("bf16", torch.bfloat16)]) -def test_precision_plugin_config(precision, expected): - plugin = FullyShardedNativeNativeMixedPrecisionPlugin(precision=precision, device="cuda") - config = plugin.mixed_precision_config - assert config.param_dtype == expected - assert config.buffer_dtype == expected - assert config.reduce_dtype == expected - - -@RunIf(min_torch="1.12") -def test_fsdp_custom_mixed_precision(tmpdir): - """Test to ensure that passing a custom mixed precision config works.""" - config = MixedPrecision() - strategy = DDPFullyShardedNativeStrategy(mixed_precision=config) - assert strategy.mixed_precision_config == config - - class TestFSDPModel(BoringModel): def __init__(self): super().__init__() @@ -154,6 +114,80 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer[layer_num].mixed_precision.buffer_dtype == precision 
+def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): + trainer.fit(model) + model_path = trainer.strategy.broadcast(model_path) + model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path + + trainer.save_checkpoint(model_path, weights_only=True) + + _assert_save_equality(trainer, model_path, cls=model.__class__) + + # Test entry point + trainer.test(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + trainer.test(ckpt_path=model_path) + + # Predict entry point + trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + trainer.predict(ckpt_path=model_path) + + +def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel): + # Use FullySharded to get the state dict for the sake of comparison + model_state_dict = trainer.strategy.lightning_module_state_dict() + + if trainer.is_global_zero: + saved_model = cls.load_from_checkpoint(ckpt_path) + + # Assert model parameters are identical after loading + for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): + assert torch.equal(ddp_param.float().cpu(), shard_param) + + +def custom_auto_wrap_policy( + module, + recurse, + unwrapped_params: int, + min_num_params: int = int(1e8), +) -> bool: + return unwrapped_params >= 2 + + +@RunIf(min_torch="1.12") +def test_invalid_on_cpu(tmpdir): + """Test to ensure that we raise Misconfiguration for Native FSDP on CPU.""" + with pytest.raises( + MisconfigurationException, + match=f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, " + "but GPU accelerator is not used.", + ): + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp_native") + assert isinstance(trainer.strategy, DDPFullyShardedNativeStrategy) + trainer.strategy.setup_environment() + + +@RunIf(min_torch="1.12", min_cuda_gpus=1) +@pytest.mark.parametrize("precision, expected", [(16, torch.float16), ("bf16", torch.bfloat16)]) +def test_precision_plugin_config(precision, expected): + plugin = FullyShardedNativeNativeMixedPrecisionPlugin(precision=precision, device="cuda") + config = plugin.mixed_precision_config + assert config.param_dtype == expected + assert config.buffer_dtype == expected + assert config.reduce_dtype == expected + + +@RunIf(min_torch="1.12") +def test_fsdp_custom_mixed_precision(tmpdir): + """Test to ensure that passing a custom mixed precision config works.""" + config = MixedPrecision() + strategy = DDPFullyShardedNativeStrategy(mixed_precision=config) + assert strategy.mixed_precision_config == config + + @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12") def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run.""" @@ -214,35 +248,15 @@ def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir, model, stra _run_multiple_stages(trainer, model) -def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): - trainer.fit(model) - model_path = trainer.strategy.broadcast(model_path) - model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path - - trainer.save_checkpoint(model_path, weights_only=True) - - 
_assert_save_equality(trainer, model_path, cls=model.__class__) - - # Test entry point - trainer.test(model) # model is wrapped, will not call `configure_sharded_model` - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - trainer.test(ckpt_path=model_path) - - # Predict entry point - trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - trainer.predict(ckpt_path=model_path) - - -def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel): - # Use FullySharded to get the state dict for the sake of comparison - model_state_dict = trainer.strategy.lightning_module_state_dict() +@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, min_torch="1.12") +def test_invalid_parameters_in_optimizer(tmpdir): + class CustomModel(BoringModel): + def configure_optimizers(self): + layer = torch.nn.Linear(4, 5) + return torch.optim.Adam(layer.parameters(), lr=1e-2) - if trainer.is_global_zero: - saved_model = cls.load_from_checkpoint(ckpt_path) + trainer = Trainer(strategy="fsdp_native", accelerator="gpu", devices=1) + model = CustomModel() - # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): - assert torch.equal(ddp_param.float().cpu(), shard_param) + with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): + trainer.fit(model) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 5043d3a8c4aa3..0b3ce9947cb06 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -18,27 +18,6 @@ from fairscale.nn import FullyShardedDataParallel, wrap -def test_invalid_on_cpu(tmpdir): - """Test to ensure that to raise Misconfiguration for FSDP on CPU.""" - with pytest.raises( - MisconfigurationException, match="You selected strategy to be `ddp_fully_sharded`, but GPU is not available." 
- ): - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) - trainer.strategy.setup_environment() - - -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@RunIf(fairscale=True) -def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir): - """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - trainer = Trainer( - default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 - ) - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) - assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin) - - class TestFSDPModelManualWrapped(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -123,6 +102,72 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.trainer.model.mixed_precision +def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModelManualWrapped): + # Use FullySharded to get the state dict for the sake of comparison + model_state_dict = trainer.strategy.lightning_module_state_dict() + + if trainer.is_global_zero: + saved_model = cls.load_from_checkpoint(ckpt_path) + + # Assert model parameters are identical after loading + for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): + assert torch.equal(ddp_param.float().cpu(), shard_param) + + +def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): + trainer.fit(model) + + model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path + + trainer.save_checkpoint(model_path, weights_only=True) + + _assert_save_equality(trainer, model_path, cls=model.__class__) + + # Test entry point + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.test(model) # model is wrapped, will not call configure_shared_model + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.test(model, ckpt_path=model_path) + + # Predict entry point + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.predict(model, ckpt_path=model_path) + + +def test_invalid_on_cpu(tmpdir): + """Test to ensure that to raise Misconfiguration for FSDP on CPU.""" + with pytest.raises( + MisconfigurationException, match="You selected strategy to be `ddp_fully_sharded`, but GPU is not available." 
+ ): + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") + assert isinstance(trainer.strategy, DDPFullyShardedStrategy) + trainer.strategy.setup_environment() + + +@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) +@RunIf(fairscale=True) +def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir): + """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" + trainer = Trainer( + default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 + ) + assert isinstance(trainer.strategy, DDPFullyShardedStrategy) + assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin) + + @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fully_sharded_strategy_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" @@ -171,51 +216,6 @@ def test_fully_sharded_strategy_checkpoint_multi_gpus(tmpdir, model, strategy): _run_multiple_stages(trainer, model) -def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModelManualWrapped): - # Use FullySharded to get the state dict for the sake of comparison - model_state_dict = trainer.strategy.lightning_module_state_dict() - - if trainer.is_global_zero: - saved_model = cls.load_from_checkpoint(ckpt_path) - - # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): - assert torch.equal(ddp_param.float().cpu(), shard_param) - - -def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): - trainer.fit(model) - - model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path - - trainer.save_checkpoint(model_path, weights_only=True) - - _assert_save_equality(trainer, model_path, cls=model.__class__) - - # Test entry point - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.test(model) # model is wrapped, will not call configure_shared_model - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.test(model, ckpt_path=model_path) - - # Predict entry point - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.predict(model, ckpt_path=model_path) - - @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fsdp_gradient_clipping_raises(tmpdir): """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" @@ -254,3 +254,17 @@ def test_fsdp_rewrap_limitation(tmpdir): with pytest.raises(MisconfigurationException, match="Using the same instance of model .* not supported"): trainer.test(model) + + +@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, fairscale_fully_sharded=True) +def test_invalid_parameters_in_optimizer(tmpdir): + class CustomModel(BoringModel): + def configure_optimizers(self): + layer = torch.nn.Linear(4, 5) + return 
torch.optim.Adam(layer.parameters(), lr=1e-2) + + trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) + model = CustomModel() + + with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): + trainer.fit(model) From f2c7542428b1a921b44c6846a8c312dff626d370 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 31 Oct 2022 17:29:53 -0400 Subject: [PATCH 05/11] add a test --- src/lightning_lite/strategies/fairscale.py | 6 ++++++ .../strategies/{fsdp.py => fsdp_native.py} | 0 .../strategies/fully_sharded.py | 3 +-- .../strategies/fully_sharded_native.py | 16 +++++++++++++--- 4 files changed, 20 insertions(+), 5 deletions(-) rename src/lightning_lite/strategies/{fsdp.py => fsdp_native.py} (100%) diff --git a/src/lightning_lite/strategies/fairscale.py b/src/lightning_lite/strategies/fairscale.py index f0157edbf336f..12895bcee5466 100644 --- a/src/lightning_lite/strategies/fairscale.py +++ b/src/lightning_lite/strategies/fairscale.py @@ -196,3 +196,9 @@ def no_backward_sync(self, module: Module) -> Generator: ) with module.no_sync(): yield None + + +def _optimizer_has_flat_params(optimizer: Optimizer) -> bool: + from fairscale.nn.misc.flatten_params_wrapper import FlatParameter + + return any(isinstance(param, FlatParameter) for param in optimizer.param_groups[0]["params"]) diff --git a/src/lightning_lite/strategies/fsdp.py b/src/lightning_lite/strategies/fsdp_native.py similarity index 100% rename from src/lightning_lite/strategies/fsdp.py rename to src/lightning_lite/strategies/fsdp_native.py diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 6b9065a7b0099..50f83a28dec03 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -20,8 +20,7 @@ import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment -from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE -from lightning_lite.strategies.fsdp import _optimizer_has_flat_params +from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE, _optimizer_has_flat_params from lightning_lite.utilities.enums import PrecisionType from lightning_lite.utilities.optimizer import _optimizers_to_device from pytorch_lightning.overrides.base import _LightningModuleWrapperBase diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index c381af3ba16fc..8168839a4607c 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,14 +13,15 @@ # limitations under the License. 
import contextlib import logging -from typing import Any, Dict, Generator, List, Optional, Union +from typing import Any, Dict, Generator, Iterable, List, Optional, Union import torch from torch import Tensor +from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment -from lightning_lite.strategies.fsdp import _optimizer_has_flat_params +from lightning_lite.strategies.fsdp_native import _optimizer_has_flat_params from lightning_lite.utilities.distributed import ( _get_default_process_group_backend_for_device, _init_dist_connection, @@ -33,7 +34,6 @@ from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin -from pytorch_lightning.strategies.fully_sharded import _validate_optimizers from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast @@ -380,3 +380,13 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cpu_offload=CPUOffload(offload_params=True), ) cls._registered_strategies.append("fsdp_native_full_shard_offload") + + +def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: + for optimizer in optimizers: + if not _optimizer_has_flat_params(optimizer): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." + ) From 039e65b4ac73182216380744225b858c02b6443b Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 31 Oct 2022 17:31:02 -0400 Subject: [PATCH 06/11] chlog --- src/pytorch_lightning/CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index f3e1f93d24b70..df80a70734b57 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added an error message when attempting to launch processes with `python -i` and an interactive-incompatible strategy ([#15293](https://github.com/Lightning-AI/lightning/pull/15293)) +- Added a check to validate that wrapped FSDP models are used while initializing optimizers ([#15319](https://github.com/Lightning-AI/lightning/pull/15319)) + + ### Changed - The `NeptuneLogger` now uses `neptune.init_run` instead of the deprecated `neptune.init` to initialize a run ([#15393](https://github.com/Lightning-AI/lightning/pull/15393)) From 6f332cbd64930ab5e3985f1ba95b99d211b046ff Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 1 Nov 2022 04:52:35 -0400 Subject: [PATCH 07/11] flaky --- tests/tests_pytorch/models/test_restore.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tests_pytorch/models/test_restore.py b/tests/tests_pytorch/models/test_restore.py index c640e2268ad64..73fe6c6894f75 100644 --- a/tests/tests_pytorch/models/test_restore.py +++ b/tests/tests_pytorch/models/test_restore.py @@ -391,6 +391,8 @@ def test_callbacks_references_fit_ckpt_path(tmpdir): @RunIf(min_cuda_gpus=2) def test_running_test_pretrained_model_distrib_dp(tmpdir): """Verify `test()` on pretrained model.""" + seed_everything(7) + dm = ClassifDataModule() model = CustomClassificationModelDP(lr=0.1) From 130e0b1d71790b9f34ecfcda61d0a39cb969d5c1 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 1 Nov 2022 18:47:23 +0100 Subject: [PATCH 08/11] error handling --- .../strategies/fully_sharded.py | 28 ++++++++-------- .../strategies/fully_sharded_native.py | 33 +++++++++---------- .../test_ddp_fully_sharded_native.py | 16 ++++++--- ..._ddp_fully_sharded_with_full_state_dict.py | 16 ++++++--- 4 files changed, 54 insertions(+), 39 deletions(-) diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 50f83a28dec03..0ca35adf0002a 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -13,10 +13,9 @@ # limitations under the License. import contextlib import logging -from typing import Any, Dict, Generator, Iterable, List, Optional +from typing import Any, Dict, Generator, List, Optional import torch -from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment @@ -187,12 +186,25 @@ def setup(self, trainer: "pl.Trainer") -> None: if not is_overridden("configure_sharded_model", self.lightning_module): self.model = self._setup_model(self.model) self.setup_optimizers(self.lightning_module.trainer) - _validate_optimizers(self.optimizers) _optimizers_to_device(self.optimizers, self.root_device) self.barrier() self.setup_precision_plugin() + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + error = False + try: + super().setup_optimizers(trainer) + except ValueError as e: + error = "optimizer got an empty parameter list" in str(e) + + if error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." 
+ ) + def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: """Wraps the model into a :class:`~fairscale.nn.data_parallel.fully_sharded_data_parallel.FullyShardedDataParallel` module.""" @@ -290,13 +302,3 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) - - -def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: - for optimizer in optimizers: - if not _optimizer_has_flat_params(optimizer): - raise ValueError( - "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" - " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" - " `configure_optimizers()` hook." - ) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 8168839a4607c..d5efd61884bd6 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,11 +13,10 @@ # limitations under the License. import contextlib import logging -from typing import Any, Dict, Generator, Iterable, List, Optional, Union +from typing import Any, Dict, Generator, List, Optional, Union import torch from torch import Tensor -from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment @@ -218,11 +217,6 @@ def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: log.detail(f"setting up FSDP model with device id: {self.root_device.index}, kwargs: {self.kwargs}") - rank_zero_info( - "When using PyTorch FSDP auto-wrap, make sure to initalize your model using trainer else" - " you will get an error.\ntorch.optim.Optimizer(self.trainer.model.parameters(), ...)" - ) - return FullyShardedDataParallel( module=model, process_group=self.process_group, @@ -259,11 +253,24 @@ def setup(self, trainer: "pl.Trainer") -> None: self.barrier() self.setup_optimizers(trainer) - _validate_optimizers(self.optimizers) _optimizers_to_device(self.optimizers, self.root_device) self.setup_precision_plugin() + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + error = False + try: + super().setup_optimizers(trainer) + except ValueError as e: + error = "optimizer got an empty parameter list" in str(e) + + if error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." + ) + def model_to_device(self) -> None: pass @@ -380,13 +387,3 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cpu_offload=CPUOffload(offload_params=True), ) cls._registered_strategies.append("fsdp_native_full_shard_offload") - - -def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: - for optimizer in optimizers: - if not _optimizer_has_flat_params(optimizer): - raise ValueError( - "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" - " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" - " `configure_optimizers()` hook." 
- ) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index 673fce60f3747..4c1e1a306d540 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -250,13 +250,21 @@ def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir, model, stra @RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, min_torch="1.12") def test_invalid_parameters_in_optimizer(tmpdir): - class CustomModel(BoringModel): + trainer = Trainer(strategy="fsdp_native", accelerator="cuda", devices=1) + + class EmptyParametersModel(BoringModel): + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=1e-2) + + model = EmptyParametersModel() + with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): + trainer.fit(model) + + class NoFlatParametersModel(BoringModel): def configure_optimizers(self): layer = torch.nn.Linear(4, 5) return torch.optim.Adam(layer.parameters(), lr=1e-2) - trainer = Trainer(strategy="fsdp_native", accelerator="gpu", devices=1) - model = CustomModel() - + model = NoFlatParametersModel() with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): trainer.fit(model) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 0b3ce9947cb06..ba77fc561db1f 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -258,13 +258,21 @@ def test_fsdp_rewrap_limitation(tmpdir): @RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, fairscale_fully_sharded=True) def test_invalid_parameters_in_optimizer(tmpdir): - class CustomModel(BoringModel): + trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) + + class EmptyParametersModel(BoringModel): + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=1e-2) + + model = EmptyParametersModel() + with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): + trainer.fit(model) + + class NoFlatParametersModel(BoringModel): def configure_optimizers(self): layer = torch.nn.Linear(4, 5) return torch.optim.Adam(layer.parameters(), lr=1e-2) - trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) - model = CustomModel() - + model = NoFlatParametersModel() with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): trainer.fit(model) From fe37081f09ebcdef1c9d51c99c7eabe3b1aab803 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 2 Nov 2022 10:42:49 +0100 Subject: [PATCH 09/11] fix --- .github/workflows/docs-checks.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index f7c75e52cde18..c4f159442b105 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -79,6 +79,10 @@ jobs: pip install -e . 
-U --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --find-links pypi git checkout -- setup.py MANIFEST.in + - name: Adjust examples + if: ${{ matrix.pkg-name == 'lightning' }} + run: python .actions/assistant.py copy_replace_imports --source_dir="./src" --source_import="pytorch_lightning,lightning_lite" --target_import="lightning.pytorch,lightning.lite" + - name: Install this package env: PACKAGE_NAME: ${{ matrix.pkg }} From 7ed9cbf066ccc9b71431008012a9f387c8627ca4 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 2 Nov 2022 07:14:01 -0400 Subject: [PATCH 10/11] catch and raise other errors --- src/pytorch_lightning/strategies/fully_sharded.py | 14 +++++--------- .../strategies/fully_sharded_native.py | 8 +++++--- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 0ca35adf0002a..9801cff27f9ae 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -28,7 +28,6 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.rank_zero import rank_zero_info from pytorch_lightning.utilities.types import STEP_OUTPUT if _FAIRSCALE_AVAILABLE: @@ -192,13 +191,15 @@ def setup(self, trainer: "pl.Trainer") -> None: self.setup_precision_plugin() def setup_optimizers(self, trainer: "pl.Trainer") -> None: - error = False + invalid_params_error = False try: super().setup_optimizers(trainer) except ValueError as e: - error = "optimizer got an empty parameter list" in str(e) + if "optimizer got an empty parameter list" not in str(e): + raise + invalid_params_error = True - if error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): + if invalid_params_error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): raise ValueError( "The optimizer does not seem to reference any FSDP parameters. 
HINT: Make sure to create the"
                 " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the"
                 " `configure_optimizers()` hook."
             )

From 8a5268e63267deaf2d98339ad9fe27cbb1d5b16d Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Mon, 7 Nov 2022 14:56:32 +0900
Subject: [PATCH 11/11] Update src/pytorch_lightning/CHANGELOG.md

---
 src/pytorch_lightning/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index b2de64593e8ff..7e0413d9a2cca 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -17,7 +17,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 -
 
-- Added a check to validate that wrapped FSDP models are used while initializing optimizers ([#15319](https://github.com/Lightning-AI/lightning/pull/15319))
+- Added a check to validate that wrapped FSDP models are used while initializing optimizers ([#15301](https://github.com/Lightning-AI/lightning/pull/15301))
 
 
 ### Changed