From f67a6fee58361ecf87a03d5094ea0cab8aedd3e9 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Tue, 25 Oct 2022 17:29:19 +0530
Subject: [PATCH 01/11] Add info about optimizer initialization for native fsdp

---
 docs/source-pytorch/advanced/model_parallel.rst           | 6 ++++++
 src/pytorch_lightning/strategies/fully_sharded_native.py  | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst
index 3a57f7b9499bc..ca6d2be30faa5 100644
--- a/docs/source-pytorch/advanced/model_parallel.rst
+++ b/docs/source-pytorch/advanced/model_parallel.rst
@@ -352,6 +352,12 @@ Model layers should be wrapped in FSDP in a nested way to save peak memory and e
 simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the
 code. You don't have to ``wrap`` layers manually as in the case of manual wrapping.
 
+.. note::
+    When initializing optimizers inside the ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``,
+    otherwise PyTorch will raise an error. This is required because when you use auto-wrap, the model layers are sharded and
+    ``lightning_module.parameters()`` will return an empty generator. This inconvenience will be addressed in the future.
+
+
 .. code-block:: python
 
     model = BoringModel()
diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py
index 3c095dcc01403..c7022bcc4cf77 100644
--- a/src/pytorch_lightning/strategies/fully_sharded_native.py
+++ b/src/pytorch_lightning/strategies/fully_sharded_native.py
@@ -212,6 +212,12 @@ def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel:
             del self.kwargs["auto_wrap_policy"]
 
         log.detail(f"setting up FSDP model with device id: {self.root_device.index}, kwargs: {self.kwargs}")
+
+        rank_zero_info(
+            "When using PyTorch FSDP auto-wrap, make sure to initalize your model using trainer else"
+            " you will get an error.\ntorch.optim.Optimizer(self.trainer.model.parameters(), ...)"
+        )
+
         return FullyShardedDataParallel(
             module=model,
             process_group=self.process_group,

From 7f69cd2be7747e595b80bcbe181cc0d25bd58ddc Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Mon, 31 Oct 2022 18:27:42 +0100
Subject: [PATCH 02/11] validate optimizer parameters in fsdp strategy

---
 src/lightning_lite/strategies/fsdp.py                  | 20 +++++++++++++++++++
 .../strategies/fully_sharded_native.py                 | 15 +++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 src/lightning_lite/strategies/fsdp.py

diff --git a/src/lightning_lite/strategies/fsdp.py b/src/lightning_lite/strategies/fsdp.py
new file mode 100644
index 0000000000000..9e70400e476dc
--- /dev/null
+++ b/src/lightning_lite/strategies/fsdp.py
@@ -0,0 +1,20 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from torch.optim import Optimizer + + +def _optimizer_has_flat_params(optimizer: Optimizer) -> bool: + from torch.distributed.fsdp import FlatParameter + + return any(isinstance(param, FlatParameter) for param in optimizer.param_groups[0]["params"]) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index c7022bcc4cf77..7c71b7fdc175e 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,13 +13,15 @@ # limitations under the License. import contextlib import logging -from typing import Any, Dict, Generator, List, Optional, Union +from typing import Any, Dict, Generator, List, Optional, Union, Iterable import torch from torch import Tensor +from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment +from lightning_lite.strategies.fsdp import _optimizer_has_flat_params from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device from lightning_lite.utilities.distributed import group as _group from lightning_lite.utilities.distributed import init_dist_connection, sync_ddp_if_available @@ -254,6 +256,7 @@ def setup(self, trainer: "pl.Trainer") -> None: self.barrier() self.setup_optimizers(trainer) + _validate_optimizers(self.optimizers) optimizers_to_device(self.optimizers, self.root_device) self.setup_precision_plugin() @@ -374,3 +377,13 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cpu_offload=CPUOffload(offload_params=True), ) cls._registered_strategies.append("fsdp_native_full_shard_offload") + + +def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: + for optimizer in optimizers: + if not _optimizer_has_flat_params(optimizer): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." + ) From e1a273e13bf718c0fa70b5b568e167a78ec46806 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 31 Oct 2022 18:48:21 +0000 Subject: [PATCH 03/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/pytorch_lightning/strategies/fully_sharded_native.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 8351aa82e2356..b50f08269e0da 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,7 +13,7 @@ # limitations under the License. 
import contextlib import logging -from typing import Any, Dict, Generator, List, Optional, Union, Iterable +from typing import Any, Dict, Generator, Iterable, List, Optional, Union import torch from torch import Tensor From 595b896d75c525a4f1b27e7ffaa3ffb90e05fa89 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 31 Oct 2022 17:15:24 -0400 Subject: [PATCH 04/11] add a test --- .../strategies/fully_sharded.py | 15 +- .../strategies/fully_sharded_native.py | 14 +- .../test_ddp_fully_sharded_native.py | 154 ++++++++++-------- ..._ddp_fully_sharded_with_full_state_dict.py | 146 +++++++++-------- 4 files changed, 180 insertions(+), 149 deletions(-) diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index d5716d6e45f00..6b9065a7b0099 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -13,13 +13,15 @@ # limitations under the License. import contextlib import logging -from typing import Any, Dict, Generator, List, Optional +from typing import Any, Dict, Generator, Iterable, List, Optional import torch +from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE +from lightning_lite.strategies.fsdp import _optimizer_has_flat_params from lightning_lite.utilities.enums import PrecisionType from lightning_lite.utilities.optimizer import _optimizers_to_device from pytorch_lightning.overrides.base import _LightningModuleWrapperBase @@ -186,6 +188,7 @@ def setup(self, trainer: "pl.Trainer") -> None: if not is_overridden("configure_sharded_model", self.lightning_module): self.model = self._setup_model(self.model) self.setup_optimizers(self.lightning_module.trainer) + _validate_optimizers(self.optimizers) _optimizers_to_device(self.optimizers, self.root_device) self.barrier() @@ -288,3 +291,13 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) + + +def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: + for optimizer in optimizers: + if not _optimizer_has_flat_params(optimizer): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." + ) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index b50f08269e0da..c381af3ba16fc 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,11 +13,10 @@ # limitations under the License. 
import contextlib import logging -from typing import Any, Dict, Generator, Iterable, List, Optional, Union +from typing import Any, Dict, Generator, List, Optional, Union import torch from torch import Tensor -from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment @@ -34,6 +33,7 @@ from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin +from pytorch_lightning.strategies.fully_sharded import _validate_optimizers from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast @@ -380,13 +380,3 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cpu_offload=CPUOffload(offload_params=True), ) cls._registered_strategies.append("fsdp_native_full_shard_offload") - - -def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: - for optimizer in optimizers: - if not _optimizer_has_flat_params(optimizer): - raise ValueError( - "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" - " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" - " `configure_optimizers()` hook." - ) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index be8bced2cbf5f..673fce60f3747 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -18,46 +18,6 @@ from torch.distributed.fsdp.wrap import wrap -def custom_auto_wrap_policy( - module, - recurse, - unwrapped_params: int, - min_num_params: int = int(1e8), -) -> bool: - return unwrapped_params >= 2 - - -@RunIf(min_torch="1.12") -def test_invalid_on_cpu(tmpdir): - """Test to ensure that we raise Misconfiguration for Native FSDP on CPU.""" - with pytest.raises( - MisconfigurationException, - match=f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, " - "but GPU accelerator is not used.", - ): - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp_native") - assert isinstance(trainer.strategy, DDPFullyShardedNativeStrategy) - trainer.strategy.setup_environment() - - -@RunIf(min_torch="1.12", min_cuda_gpus=1) -@pytest.mark.parametrize("precision, expected", [(16, torch.float16), ("bf16", torch.bfloat16)]) -def test_precision_plugin_config(precision, expected): - plugin = FullyShardedNativeNativeMixedPrecisionPlugin(precision=precision, device="cuda") - config = plugin.mixed_precision_config - assert config.param_dtype == expected - assert config.buffer_dtype == expected - assert config.reduce_dtype == expected - - -@RunIf(min_torch="1.12") -def test_fsdp_custom_mixed_precision(tmpdir): - """Test to ensure that passing a custom mixed precision config works.""" - config = MixedPrecision() - strategy = DDPFullyShardedNativeStrategy(mixed_precision=config) - assert strategy.mixed_precision_config == config - - class TestFSDPModel(BoringModel): def __init__(self): super().__init__() @@ -154,6 +114,80 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer[layer_num].mixed_precision.buffer_dtype == precision 
+def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): + trainer.fit(model) + model_path = trainer.strategy.broadcast(model_path) + model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path + + trainer.save_checkpoint(model_path, weights_only=True) + + _assert_save_equality(trainer, model_path, cls=model.__class__) + + # Test entry point + trainer.test(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + trainer.test(ckpt_path=model_path) + + # Predict entry point + trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + trainer.predict(ckpt_path=model_path) + + +def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel): + # Use FullySharded to get the state dict for the sake of comparison + model_state_dict = trainer.strategy.lightning_module_state_dict() + + if trainer.is_global_zero: + saved_model = cls.load_from_checkpoint(ckpt_path) + + # Assert model parameters are identical after loading + for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): + assert torch.equal(ddp_param.float().cpu(), shard_param) + + +def custom_auto_wrap_policy( + module, + recurse, + unwrapped_params: int, + min_num_params: int = int(1e8), +) -> bool: + return unwrapped_params >= 2 + + +@RunIf(min_torch="1.12") +def test_invalid_on_cpu(tmpdir): + """Test to ensure that we raise Misconfiguration for Native FSDP on CPU.""" + with pytest.raises( + MisconfigurationException, + match=f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, " + "but GPU accelerator is not used.", + ): + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp_native") + assert isinstance(trainer.strategy, DDPFullyShardedNativeStrategy) + trainer.strategy.setup_environment() + + +@RunIf(min_torch="1.12", min_cuda_gpus=1) +@pytest.mark.parametrize("precision, expected", [(16, torch.float16), ("bf16", torch.bfloat16)]) +def test_precision_plugin_config(precision, expected): + plugin = FullyShardedNativeNativeMixedPrecisionPlugin(precision=precision, device="cuda") + config = plugin.mixed_precision_config + assert config.param_dtype == expected + assert config.buffer_dtype == expected + assert config.reduce_dtype == expected + + +@RunIf(min_torch="1.12") +def test_fsdp_custom_mixed_precision(tmpdir): + """Test to ensure that passing a custom mixed precision config works.""" + config = MixedPrecision() + strategy = DDPFullyShardedNativeStrategy(mixed_precision=config) + assert strategy.mixed_precision_config == config + + @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12") def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run.""" @@ -214,35 +248,15 @@ def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir, model, stra _run_multiple_stages(trainer, model) -def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): - trainer.fit(model) - model_path = trainer.strategy.broadcast(model_path) - model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path - - trainer.save_checkpoint(model_path, weights_only=True) - - 
_assert_save_equality(trainer, model_path, cls=model.__class__) - - # Test entry point - trainer.test(model) # model is wrapped, will not call `configure_sharded_model` - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - trainer.test(ckpt_path=model_path) - - # Predict entry point - trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - trainer.predict(ckpt_path=model_path) - - -def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel): - # Use FullySharded to get the state dict for the sake of comparison - model_state_dict = trainer.strategy.lightning_module_state_dict() +@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, min_torch="1.12") +def test_invalid_parameters_in_optimizer(tmpdir): + class CustomModel(BoringModel): + def configure_optimizers(self): + layer = torch.nn.Linear(4, 5) + return torch.optim.Adam(layer.parameters(), lr=1e-2) - if trainer.is_global_zero: - saved_model = cls.load_from_checkpoint(ckpt_path) + trainer = Trainer(strategy="fsdp_native", accelerator="gpu", devices=1) + model = CustomModel() - # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): - assert torch.equal(ddp_param.float().cpu(), shard_param) + with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): + trainer.fit(model) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 5043d3a8c4aa3..0b3ce9947cb06 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -18,27 +18,6 @@ from fairscale.nn import FullyShardedDataParallel, wrap -def test_invalid_on_cpu(tmpdir): - """Test to ensure that to raise Misconfiguration for FSDP on CPU.""" - with pytest.raises( - MisconfigurationException, match="You selected strategy to be `ddp_fully_sharded`, but GPU is not available." 
- ): - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) - trainer.strategy.setup_environment() - - -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@RunIf(fairscale=True) -def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir): - """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - trainer = Trainer( - default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 - ) - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) - assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin) - - class TestFSDPModelManualWrapped(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -123,6 +102,72 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.trainer.model.mixed_precision +def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModelManualWrapped): + # Use FullySharded to get the state dict for the sake of comparison + model_state_dict = trainer.strategy.lightning_module_state_dict() + + if trainer.is_global_zero: + saved_model = cls.load_from_checkpoint(ckpt_path) + + # Assert model parameters are identical after loading + for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): + assert torch.equal(ddp_param.float().cpu(), shard_param) + + +def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): + trainer.fit(model) + + model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path + + trainer.save_checkpoint(model_path, weights_only=True) + + _assert_save_equality(trainer, model_path, cls=model.__class__) + + # Test entry point + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.test(model) # model is wrapped, will not call configure_shared_model + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.test(model, ckpt_path=model_path) + + # Predict entry point + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.predict(model, ckpt_path=model_path) + + +def test_invalid_on_cpu(tmpdir): + """Test to ensure that to raise Misconfiguration for FSDP on CPU.""" + with pytest.raises( + MisconfigurationException, match="You selected strategy to be `ddp_fully_sharded`, but GPU is not available." 
+ ): + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") + assert isinstance(trainer.strategy, DDPFullyShardedStrategy) + trainer.strategy.setup_environment() + + +@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) +@RunIf(fairscale=True) +def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir): + """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" + trainer = Trainer( + default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 + ) + assert isinstance(trainer.strategy, DDPFullyShardedStrategy) + assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin) + + @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fully_sharded_strategy_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" @@ -171,51 +216,6 @@ def test_fully_sharded_strategy_checkpoint_multi_gpus(tmpdir, model, strategy): _run_multiple_stages(trainer, model) -def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModelManualWrapped): - # Use FullySharded to get the state dict for the sake of comparison - model_state_dict = trainer.strategy.lightning_module_state_dict() - - if trainer.is_global_zero: - saved_model = cls.load_from_checkpoint(ckpt_path) - - # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): - assert torch.equal(ddp_param.float().cpu(), shard_param) - - -def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): - trainer.fit(model) - - model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path - - trainer.save_checkpoint(model_path, weights_only=True) - - _assert_save_equality(trainer, model_path, cls=model.__class__) - - # Test entry point - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.test(model) # model is wrapped, will not call configure_shared_model - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.test(model, ckpt_path=model_path) - - # Predict entry point - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.predict(model, ckpt_path=model_path) - - @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fsdp_gradient_clipping_raises(tmpdir): """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" @@ -254,3 +254,17 @@ def test_fsdp_rewrap_limitation(tmpdir): with pytest.raises(MisconfigurationException, match="Using the same instance of model .* not supported"): trainer.test(model) + + +@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, fairscale_fully_sharded=True) +def test_invalid_parameters_in_optimizer(tmpdir): + class CustomModel(BoringModel): + def configure_optimizers(self): + layer = torch.nn.Linear(4, 5) + return 
torch.optim.Adam(layer.parameters(), lr=1e-2) + + trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) + model = CustomModel() + + with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): + trainer.fit(model) From f2c7542428b1a921b44c6846a8c312dff626d370 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 31 Oct 2022 17:29:53 -0400 Subject: [PATCH 05/11] add a test --- src/lightning_lite/strategies/fairscale.py | 6 ++++++ .../strategies/{fsdp.py => fsdp_native.py} | 0 .../strategies/fully_sharded.py | 3 +-- .../strategies/fully_sharded_native.py | 16 +++++++++++++--- 4 files changed, 20 insertions(+), 5 deletions(-) rename src/lightning_lite/strategies/{fsdp.py => fsdp_native.py} (100%) diff --git a/src/lightning_lite/strategies/fairscale.py b/src/lightning_lite/strategies/fairscale.py index f0157edbf336f..12895bcee5466 100644 --- a/src/lightning_lite/strategies/fairscale.py +++ b/src/lightning_lite/strategies/fairscale.py @@ -196,3 +196,9 @@ def no_backward_sync(self, module: Module) -> Generator: ) with module.no_sync(): yield None + + +def _optimizer_has_flat_params(optimizer: Optimizer) -> bool: + from fairscale.nn.misc.flatten_params_wrapper import FlatParameter + + return any(isinstance(param, FlatParameter) for param in optimizer.param_groups[0]["params"]) diff --git a/src/lightning_lite/strategies/fsdp.py b/src/lightning_lite/strategies/fsdp_native.py similarity index 100% rename from src/lightning_lite/strategies/fsdp.py rename to src/lightning_lite/strategies/fsdp_native.py diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 6b9065a7b0099..50f83a28dec03 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -20,8 +20,7 @@ import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment -from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE -from lightning_lite.strategies.fsdp import _optimizer_has_flat_params +from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE, _optimizer_has_flat_params from lightning_lite.utilities.enums import PrecisionType from lightning_lite.utilities.optimizer import _optimizers_to_device from pytorch_lightning.overrides.base import _LightningModuleWrapperBase diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index c381af3ba16fc..8168839a4607c 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,14 +13,15 @@ # limitations under the License. 
import contextlib import logging -from typing import Any, Dict, Generator, List, Optional, Union +from typing import Any, Dict, Generator, Iterable, List, Optional, Union import torch from torch import Tensor +from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment -from lightning_lite.strategies.fsdp import _optimizer_has_flat_params +from lightning_lite.strategies.fsdp_native import _optimizer_has_flat_params from lightning_lite.utilities.distributed import ( _get_default_process_group_backend_for_device, _init_dist_connection, @@ -33,7 +34,6 @@ from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin -from pytorch_lightning.strategies.fully_sharded import _validate_optimizers from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast @@ -380,3 +380,13 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cpu_offload=CPUOffload(offload_params=True), ) cls._registered_strategies.append("fsdp_native_full_shard_offload") + + +def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: + for optimizer in optimizers: + if not _optimizer_has_flat_params(optimizer): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." + ) From 039e65b4ac73182216380744225b858c02b6443b Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 31 Oct 2022 17:31:02 -0400 Subject: [PATCH 06/11] chlog --- src/pytorch_lightning/CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index f3e1f93d24b70..df80a70734b57 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added an error message when attempting to launch processes with `python -i` and an interactive-incompatible strategy ([#15293](https://github.com/Lightning-AI/lightning/pull/15293)) +- Added a check to validate that wrapped FSDP models are used while initializing optimizers ([#15319](https://github.com/Lightning-AI/lightning/pull/15319)) + + ### Changed - The `NeptuneLogger` now uses `neptune.init_run` instead of the deprecated `neptune.init` to initialize a run ([#15393](https://github.com/Lightning-AI/lightning/pull/15393)) From 6f332cbd64930ab5e3985f1ba95b99d211b046ff Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 1 Nov 2022 04:52:35 -0400 Subject: [PATCH 07/11] flaky --- tests/tests_pytorch/models/test_restore.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tests_pytorch/models/test_restore.py b/tests/tests_pytorch/models/test_restore.py index c640e2268ad64..73fe6c6894f75 100644 --- a/tests/tests_pytorch/models/test_restore.py +++ b/tests/tests_pytorch/models/test_restore.py @@ -391,6 +391,8 @@ def test_callbacks_references_fit_ckpt_path(tmpdir): @RunIf(min_cuda_gpus=2) def test_running_test_pretrained_model_distrib_dp(tmpdir): """Verify `test()` on pretrained model.""" + seed_everything(7) + dm = ClassifDataModule() model = CustomClassificationModelDP(lr=0.1) From 130e0b1d71790b9f34ecfcda61d0a39cb969d5c1 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 1 Nov 2022 18:47:23 +0100 Subject: [PATCH 08/11] error handling --- .../strategies/fully_sharded.py | 28 ++++++++-------- .../strategies/fully_sharded_native.py | 33 +++++++++---------- .../test_ddp_fully_sharded_native.py | 16 ++++++--- ..._ddp_fully_sharded_with_full_state_dict.py | 16 ++++++--- 4 files changed, 54 insertions(+), 39 deletions(-) diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 50f83a28dec03..0ca35adf0002a 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -13,10 +13,9 @@ # limitations under the License. import contextlib import logging -from typing import Any, Dict, Generator, Iterable, List, Optional +from typing import Any, Dict, Generator, List, Optional import torch -from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment @@ -187,12 +186,25 @@ def setup(self, trainer: "pl.Trainer") -> None: if not is_overridden("configure_sharded_model", self.lightning_module): self.model = self._setup_model(self.model) self.setup_optimizers(self.lightning_module.trainer) - _validate_optimizers(self.optimizers) _optimizers_to_device(self.optimizers, self.root_device) self.barrier() self.setup_precision_plugin() + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + error = False + try: + super().setup_optimizers(trainer) + except ValueError as e: + error = "optimizer got an empty parameter list" in str(e) + + if error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." 
+ ) + def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: """Wraps the model into a :class:`~fairscale.nn.data_parallel.fully_sharded_data_parallel.FullyShardedDataParallel` module.""" @@ -290,13 +302,3 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) - - -def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: - for optimizer in optimizers: - if not _optimizer_has_flat_params(optimizer): - raise ValueError( - "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" - " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" - " `configure_optimizers()` hook." - ) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 8168839a4607c..d5efd61884bd6 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -13,11 +13,10 @@ # limitations under the License. import contextlib import logging -from typing import Any, Dict, Generator, Iterable, List, Optional, Union +from typing import Any, Dict, Generator, List, Optional, Union import torch from torch import Tensor -from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.plugins import CheckpointIO, ClusterEnvironment @@ -218,11 +217,6 @@ def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: log.detail(f"setting up FSDP model with device id: {self.root_device.index}, kwargs: {self.kwargs}") - rank_zero_info( - "When using PyTorch FSDP auto-wrap, make sure to initalize your model using trainer else" - " you will get an error.\ntorch.optim.Optimizer(self.trainer.model.parameters(), ...)" - ) - return FullyShardedDataParallel( module=model, process_group=self.process_group, @@ -259,11 +253,24 @@ def setup(self, trainer: "pl.Trainer") -> None: self.barrier() self.setup_optimizers(trainer) - _validate_optimizers(self.optimizers) _optimizers_to_device(self.optimizers, self.root_device) self.setup_precision_plugin() + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + error = False + try: + super().setup_optimizers(trainer) + except ValueError as e: + error = "optimizer got an empty parameter list" in str(e) + + if error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): + raise ValueError( + "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" + " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" + " `configure_optimizers()` hook." + ) + def model_to_device(self) -> None: pass @@ -380,13 +387,3 @@ def register_strategies(cls, strategy_registry: Dict) -> None: cpu_offload=CPUOffload(offload_params=True), ) cls._registered_strategies.append("fsdp_native_full_shard_offload") - - -def _validate_optimizers(optimizers: Iterable[Optimizer]) -> None: - for optimizer in optimizers: - if not _optimizer_has_flat_params(optimizer): - raise ValueError( - "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the" - " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" - " `configure_optimizers()` hook." 
- ) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index 673fce60f3747..4c1e1a306d540 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -250,13 +250,21 @@ def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir, model, stra @RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, min_torch="1.12") def test_invalid_parameters_in_optimizer(tmpdir): - class CustomModel(BoringModel): + trainer = Trainer(strategy="fsdp_native", accelerator="cuda", devices=1) + + class EmptyParametersModel(BoringModel): + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=1e-2) + + model = EmptyParametersModel() + with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): + trainer.fit(model) + + class NoFlatParametersModel(BoringModel): def configure_optimizers(self): layer = torch.nn.Linear(4, 5) return torch.optim.Adam(layer.parameters(), lr=1e-2) - trainer = Trainer(strategy="fsdp_native", accelerator="gpu", devices=1) - model = CustomModel() - + model = NoFlatParametersModel() with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): trainer.fit(model) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 0b3ce9947cb06..ba77fc561db1f 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -258,13 +258,21 @@ def test_fsdp_rewrap_limitation(tmpdir): @RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, fairscale_fully_sharded=True) def test_invalid_parameters_in_optimizer(tmpdir): - class CustomModel(BoringModel): + trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) + + class EmptyParametersModel(BoringModel): + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=1e-2) + + model = EmptyParametersModel() + with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): + trainer.fit(model) + + class NoFlatParametersModel(BoringModel): def configure_optimizers(self): layer = torch.nn.Linear(4, 5) return torch.optim.Adam(layer.parameters(), lr=1e-2) - trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) - model = CustomModel() - + model = NoFlatParametersModel() with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): trainer.fit(model) From fe37081f09ebcdef1c9d51c99c7eabe3b1aab803 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 2 Nov 2022 10:42:49 +0100 Subject: [PATCH 09/11] fix --- .github/workflows/docs-checks.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index f7c75e52cde18..c4f159442b105 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -79,6 +79,10 @@ jobs: pip install -e . 
-U --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --find-links pypi git checkout -- setup.py MANIFEST.in + - name: Adjust examples + if: ${{ matrix.pkg-name == 'lightning' }} + run: python .actions/assistant.py copy_replace_imports --source_dir="./src" --source_import="pytorch_lightning,lightning_lite" --target_import="lightning.pytorch,lightning.lite" + - name: Install this package env: PACKAGE_NAME: ${{ matrix.pkg }} From 7ed9cbf066ccc9b71431008012a9f387c8627ca4 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 2 Nov 2022 07:14:01 -0400 Subject: [PATCH 10/11] catch and raise other errors --- src/pytorch_lightning/strategies/fully_sharded.py | 14 +++++--------- .../strategies/fully_sharded_native.py | 8 +++++--- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 0ca35adf0002a..9801cff27f9ae 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -28,7 +28,6 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.rank_zero import rank_zero_info from pytorch_lightning.utilities.types import STEP_OUTPUT if _FAIRSCALE_AVAILABLE: @@ -192,13 +191,15 @@ def setup(self, trainer: "pl.Trainer") -> None: self.setup_precision_plugin() def setup_optimizers(self, trainer: "pl.Trainer") -> None: - error = False + invalid_params_error = False try: super().setup_optimizers(trainer) except ValueError as e: - error = "optimizer got an empty parameter list" in str(e) + if "optimizer got an empty parameter list" not in str(e): + raise + invalid_params_error = True - if error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): + if invalid_params_error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): raise ValueError( "The optimizer does not seem to reference any FSDP parameters. 
HINT: Make sure to create the"
                 " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the"
                 " `configure_optimizers()` hook."
             )

From 8a5268e63267deaf2d98339ad9fe27cbb1d5b16d Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Mon, 7 Nov 2022 14:56:32 +0900
Subject: [PATCH 11/11] Update src/pytorch_lightning/CHANGELOG.md

---
 src/pytorch_lightning/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index b2de64593e8ff..7e0413d9a2cca 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -17,7 +17,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 -
 
-- Added a check to validate that wrapped FSDP models are used while initializing optimizers ([#15319](https://github.com/Lightning-AI/lightning/pull/15319))
+- Added a check to validate that wrapped FSDP models are used while initializing optimizers ([#15301](https://github.com/Lightning-AI/lightning/pull/15301))
 
 
 ### Changed