FSDP (native) support for LightningLite #14967

Merged · Nov 21, 2022 · 108 commits

Changes from 102 commits

Commits
57774ed
wip
awaelchli Oct 1, 2022
043783e
wip precision
awaelchli Oct 1, 2022
0caf973
fsdp
awaelchli Oct 1, 2022
a2130b9
fsdp support in lite
awaelchli Oct 1, 2022
80d24fe
typing fixes
awaelchli Oct 1, 2022
cc65718
imports
awaelchli Oct 1, 2022
9d91f86
import fixes
awaelchli Oct 1, 2022
b535621
fix test
awaelchli Oct 1, 2022
8e85f69
more tests
awaelchli Oct 2, 2022
5385a19
integration tests
awaelchli Oct 2, 2022
de24f12
debug
awaelchli Oct 2, 2022
1d39715
Merge branch 'lite/fsdp-debug' into lite/fsdp
awaelchli Oct 2, 2022
9051a13
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 2, 2022
782bd0a
fix autowrap policy
awaelchli Oct 2, 2022
5f9d1a1
Merge remote-tracking branch 'origin/lite/fsdp' into lite/fsdp
awaelchli Oct 2, 2022
34251a2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 2, 2022
c8eb2b1
debug
awaelchli Oct 2, 2022
27e0151
Merge remote-tracking branch 'origin/lite/fsdp' into lite/fsdp
awaelchli Oct 2, 2022
c9dd26e
debug
awaelchli Oct 2, 2022
4832c9e
simplify
awaelchli Oct 3, 2022
01e9b56
Merge branch 'master' into lite/fsdp
Borda Oct 5, 2022
70437da
support individual setup of model and optimizer
awaelchli Oct 19, 2022
50981a3
error messaging
awaelchli Oct 19, 2022
8e51e33
Merge branch 'master' into lite/fsdp-debug3
awaelchli Oct 19, 2022
c61a2b7
Merge branch 'lite/individual-setup' into lite/fsdp-debug3
awaelchli Oct 19, 2022
4eadd24
test
awaelchli Oct 19, 2022
dec4f9c
debug
awaelchli Oct 19, 2022
d5f1c9e
debug
awaelchli Oct 19, 2022
559187b
debug
awaelchli Oct 19, 2022
e286dd9
debug
awaelchli Oct 19, 2022
230dc03
Merge branch 'master' into lite/individual-setup
awaelchli Oct 20, 2022
b4613ec
wip
awaelchli Oct 20, 2022
281e26c
wip
awaelchli Oct 20, 2022
9d6971b
update structure
awaelchli Oct 20, 2022
20bc93b
tests
awaelchli Oct 21, 2022
7dc0aa5
error messages
awaelchli Oct 22, 2022
0190b50
test errors
awaelchli Oct 22, 2022
1e3e7b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 22, 2022
472b605
mypy
awaelchli Oct 22, 2022
b1fe35b
Merge branch 'lite/individual-setup' of github.com:Lightning-AI/light…
awaelchli Oct 22, 2022
3630d05
add changelog
awaelchli Oct 22, 2022
3509abe
Merge branch 'lite/individual-setup' into lite/fsdp-debug3
awaelchli Oct 23, 2022
8d035fa
messaging
awaelchli Oct 23, 2022
fadb2b6
debug
awaelchli Oct 23, 2022
39e7c09
debug
awaelchli Oct 23, 2022
7bc9421
fix
awaelchli Oct 23, 2022
9d086f2
udpate test
awaelchli Oct 23, 2022
ac29054
remove done todo
awaelchli Oct 23, 2022
5e1f433
missing err message
awaelchli Oct 23, 2022
a900207
tests
awaelchli Oct 23, 2022
a41eef9
flake
awaelchli Oct 23, 2022
38fbd24
Merge branch 'master' into lite/individual-setup
awaelchli Oct 23, 2022
9cd50b4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 23, 2022
564ab17
docstrings
awaelchli Oct 23, 2022
11e7feb
Merge branch 'lite/individual-setup' of github.com:Lightning-AI/light…
awaelchli Oct 23, 2022
6afc0ae
doc fix
awaelchli Oct 23, 2022
8b999b1
support python < 3.10
awaelchli Oct 23, 2022
3036840
validation
awaelchli Oct 26, 2022
d02d71c
Merge branch 'lite/fsdp-debug3' into lite/fsdp
awaelchli Oct 26, 2022
f25674e
Merge branch 'lite/fsdp' into lite/fsdp-debug3
awaelchli Oct 26, 2022
4bb5d56
debug
awaelchli Oct 26, 2022
7f944cc
debug
awaelchli Oct 26, 2022
070b828
update
awaelchli Oct 26, 2022
a63b3c9
debug
awaelchli Oct 26, 2022
3e20de2
validate
awaelchli Oct 26, 2022
cf9b92f
revert
awaelchli Oct 26, 2022
2f54259
Merge branch 'lite/fsdp-debug3' into lite/fsdp
awaelchli Oct 26, 2022
47dae76
Merge branch 'lite/individual-setup' into lite/fsdp
awaelchli Oct 26, 2022
8601ad5
Merge branch 'master' into lite/individual-setup
awaelchli Oct 26, 2022
103096a
Merge branch 'lite/individual-setup' into lite/fsdp
awaelchli Oct 26, 2022
fc34be3
x
awaelchli Oct 26, 2022
3088273
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 26, 2022
d49ac0a
debug
awaelchli Oct 26, 2022
7bb7bb8
Merge remote-tracking branch 'origin/lite/fsdp' into lite/fsdp
awaelchli Oct 26, 2022
469cc2d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 26, 2022
c138716
debug
awaelchli Oct 26, 2022
23aa1ba
Merge remote-tracking branch 'origin/lite/fsdp' into lite/fsdp
awaelchli Oct 26, 2022
c8f5a67
debug
awaelchli Oct 26, 2022
304bac0
simplify
awaelchli Oct 26, 2022
28e64b7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 26, 2022
ef7fb0e
typo
awaelchli Oct 26, 2022
7126b1e
Merge remote-tracking branch 'origin/lite/fsdp' into lite/fsdp
awaelchli Oct 26, 2022
f44c9b0
Merge branch 'master' into lite/fsdp
awaelchli Nov 11, 2022
9bf3531
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 11, 2022
b96496f
Merge branch 'master' into lite/fsdp
awaelchli Nov 13, 2022
0750bfe
changelog
awaelchli Nov 13, 2022
b96e140
fix setup_module call
awaelchli Nov 13, 2022
2b3f618
fix
awaelchli Nov 13, 2022
a1eed5a
fix test
awaelchli Nov 13, 2022
612dc3d
update
awaelchli Nov 13, 2022
6b4fc35
Merge branch 'master' into lite/fsdp
awaelchli Nov 20, 2022
190c5d2
update
awaelchli Nov 20, 2022
3c67190
fix
awaelchli Nov 20, 2022
239a674
fix duplicate import
awaelchli Nov 20, 2022
ad770e6
add no_backward_sync for FSDP
awaelchli Nov 20, 2022
1c19b96
fix
awaelchli Nov 20, 2022
1d2fa56
fix literal import
awaelchli Nov 20, 2022
fac33ac
fix
awaelchli Nov 20, 2022
56f2109
manual wrap
awaelchli Nov 20, 2022
17bd714
avoid double wrap
awaelchli Nov 20, 2022
a97c56e
fix mypy
awaelchli Nov 20, 2022
fac22b4
revert original test
awaelchli Nov 20, 2022
ffaba94
skip import on torch <1.12
awaelchli Nov 20, 2022
133bb9c
torch compatibility
awaelchli Nov 20, 2022
07d4998
fix
awaelchli Nov 20, 2022
0ece17d
revert comments in pytorch tests
awaelchli Nov 21, 2022
6bb5ee6
Merge branch 'master' into lite/fsdp
awaelchli Nov 21, 2022
e47d3c8
fix copy-paste error in docstring
awaelchli Nov 21, 2022
3 changes: 3 additions & 0 deletions src/lightning_lite/CHANGELOG.md
@@ -20,6 +20,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `LightningLite.setup_module()` and `LightningLite.setup_optimizers()` to support strategies that need to set up the model before an optimizer can be created ([#15185](https://github.com/Lightning-AI/lightning/pull/15185))


- Added support for Fully Sharded Data Parallel (FSDP) training in Lightning Lite ([#14967](https://github.com/Lightning-AI/lightning/issues/14967))


### Changed

- The `LightningLite.run()` method is no longer abstract ([#14992](https://github.com/Lightning-AI/lightning/issues/14992))
15 changes: 13 additions & 2 deletions src/lightning_lite/connector.py
@@ -40,6 +40,7 @@
    TorchElasticEnvironment,
)
from lightning_lite.plugins.precision.double import DoublePrecision
from lightning_lite.plugins.precision.fsdp import FSDPPrecision
from lightning_lite.strategies import (
    DDPShardedStrategy,
    DDPSpawnShardedStrategy,
@@ -53,6 +54,7 @@
    XLAStrategy,
)
from lightning_lite.strategies.ddp_spawn import _DDP_FORK_ALIASES
from lightning_lite.strategies.fsdp import _FSDP_ALIASES, FSDPStrategy
from lightning_lite.utilities import _StrategyType, rank_zero_info, rank_zero_warn
from lightning_lite.utilities.device_parser import _determine_root_gpu_device
from lightning_lite.utilities.imports import _IS_INTERACTIVE
@@ -417,6 +419,13 @@ def _check_strategy_and_fallback(self) -> None:
                f"You selected `Lite(strategy='{strategy_flag}')` but process forking is not supported on this"
                f" platform. We recommend `Lite(strategy='ddp_spawn')` instead."
            )
        if (
            strategy_flag in _FSDP_ALIASES or isinstance(self._strategy_flag, FSDPStrategy)
        ) and self._accelerator_flag not in ("cuda", "gpu"):
            raise ValueError(
                "You selected the FSDP strategy but FSDP is only available on GPU. Set `Lite(accelerator='gpu', ...)`"
                " to continue or select a different strategy."
            )
        if strategy_flag:
            self._strategy_flag = strategy_flag

@@ -465,9 +474,11 @@ def _check_and_init_precision(self) -> Precision:
                if self._precision_input == 16
                else "Using bfloat16 Automatic Mixed Precision (AMP)"
            )

            device = "cpu" if self._accelerator_flag == "cpu" else "cuda"
            return NativeMixedPrecision(self._precision_input, device)

            if isinstance(self.strategy, FSDPStrategy):
                return FSDPPrecision(precision=self._precision_input, device=device)
            return NativeMixedPrecision(precision=self._precision_input, device=device)

        raise RuntimeError("No precision set")

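For context, a minimal sketch of how these connector changes surface to a user. This is illustrative and not part of the diff; the `"fsdp"` string alias and the top-level `lightning_lite` import path are assumptions.

```python
# Illustrative sketch only: selecting the native FSDP strategy in Lite.
# Assumes "fsdp" is among the aliases registered via `_FSDP_ALIASES`.
from lightning_lite import LightningLite  # import path assumed


class FSDPDemo(LightningLite):
    def run(self) -> None:
        ...  # training code goes here


# Passing accelerator="cpu" here would now raise the ValueError added above; with a
# CUDA accelerator and precision=16 or "bf16", the connector picks `FSDPPrecision`.
FSDPDemo(strategy="fsdp", accelerator="cuda", devices=2, precision=16).run()
```
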
11 changes: 9 additions & 2 deletions src/lightning_lite/lite.py
@@ -35,6 +35,7 @@
    DDPShardedStrategy,
    DDPSpawnShardedStrategy,
    DeepSpeedStrategy,
    FSDPStrategy,
    SingleDeviceStrategy,
    Strategy,
    XLAStrategy,
@@ -593,14 +594,20 @@ def _prepare_run_method(self) -> None:
        # wrap the run method, so we can inject setup logic or spawn processes for the user
        setattr(self, "run", partial(self._run_impl, self.run))

    @staticmethod
    def _validate_setup(module: nn.Module, optimizers: Sequence[Optimizer]) -> None:
    def _validate_setup(self, module: nn.Module, optimizers: Sequence[Optimizer]) -> None:
        if isinstance(module, _LiteModule):
            raise ValueError("A model should be passed only once to the `setup` method.")

        if any(isinstance(opt, _LiteOptimizer) for opt in optimizers):
            raise ValueError("An optimizer should be passed only once to the `setup` method.")

        if isinstance(self._strategy, FSDPStrategy):
            raise RuntimeError(
                f"The `{type(self).__name__}` requires the model and optimizer(s) to be set up separately."
                " Create and set up the model first through `model = self.setup_model(model)`. Then create the"
                " optimizer and set it up: `optimizer = self.setup_optimizer(optimizer)`."
            )

    def _validate_setup_module(self, module: nn.Module) -> None:
        if isinstance(module, _LiteModule):
            raise ValueError("A model should be passed only once to the `setup_module` method.")
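
A sketch of the two-step setup flow this new check enforces under FSDP. Method names follow the `setup_module()` / `setup_optimizers()` changelog entry above; treat the exact call sequence as an assumption rather than the definitive API.

```python
# Illustrative sketch of the separate setup that the FSDP check requires.
import torch
from lightning_lite import LightningLite  # import path assumed


class FSDPTwoStep(LightningLite):
    def run(self) -> None:
        model = torch.nn.Linear(32, 2)

        # 1) Set up (wrap/shard) the module first ...
        model = self.setup_module(model)

        # 2) ... then build the optimizer over the sharded parameters and set it up.
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        optimizer = self.setup_optimizers(optimizer)

        # Calling the combined `self.setup(model, optimizer)` under FSDP now raises
        # the RuntimeError added in this hunk.
```
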
2 changes: 2 additions & 0 deletions src/lightning_lite/plugins/__init__.py
@@ -17,6 +17,7 @@
from lightning_lite.plugins.io.xla import XLACheckpointIO
from lightning_lite.plugins.precision.deepspeed import DeepSpeedPrecision
from lightning_lite.plugins.precision.double import DoublePrecision
from lightning_lite.plugins.precision.fsdp import FSDPPrecision
from lightning_lite.plugins.precision.native_amp import NativeMixedPrecision
from lightning_lite.plugins.precision.precision import Precision
from lightning_lite.plugins.precision.tpu import TPUPrecision
@@ -33,4 +34,5 @@
    "NativeMixedPrecision",
    "TPUPrecision",
    "TPUBf16Precision",
    "FSDPPrecision",
]
2 changes: 2 additions & 0 deletions src/lightning_lite/plugins/precision/__init__.py
@@ -13,6 +13,7 @@
# limitations under the License.
from lightning_lite.plugins.precision.deepspeed import DeepSpeedPrecision
from lightning_lite.plugins.precision.double import DoublePrecision
from lightning_lite.plugins.precision.fsdp import FSDPPrecision
from lightning_lite.plugins.precision.native_amp import NativeMixedPrecision
from lightning_lite.plugins.precision.precision import Precision
from lightning_lite.plugins.precision.tpu import TPUPrecision
@@ -25,4 +26,5 @@
    "Precision",
    "TPUPrecision",
    "TPUBf16Precision",
    "FSDPPrecision",
]
59 changes: 59 additions & 0 deletions src/lightning_lite/plugins/precision/fsdp.py
@@ -0,0 +1,59 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, TYPE_CHECKING

import torch
from typing_extensions import Literal

from lightning_lite.plugins.precision.native_amp import NativeMixedPrecision
from lightning_lite.utilities.enums import PrecisionType
from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_12

if TYPE_CHECKING:
    from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
    from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler


class FSDPPrecision(NativeMixedPrecision):
    """AMP for Fully Sharded Data Parallel training."""

    def __init__(
        self, precision: Literal[16, "bf16"], device: str, scaler: Optional["ShardedGradScaler"] = None
    ) -> None:
        if not _TORCH_GREATER_EQUAL_1_12:
            raise NotImplementedError("`FSDPPrecision` is supported from PyTorch v1.12.0 onwards.")

        from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

        super().__init__(
            precision=precision,
            device=device,
            scaler=(ShardedGradScaler() if scaler is None and precision == 16 else None),
        )

    @property
    def mixed_precision_config(self) -> "MixedPrecision":
        from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision

        if self.precision == PrecisionType.HALF:
            dtype = torch.float16
        elif self.precision == PrecisionType.BFLOAT:
            dtype = torch.bfloat16
        else:
            raise ValueError(f"Was unable to infer precision type, received {self.precision!r}.")
        return MixedPrecision(
            param_dtype=dtype,
            reduce_dtype=dtype,
            buffer_dtype=dtype,
        )
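
To illustrate how the `mixed_precision_config` property is typically consumed, a hedged sketch follows. The exact wiring inside `FSDPStrategy` is not shown in this diff; the snippet assumes torch >= 1.12, an initialized process group, and a CUDA device.

```python
# Illustrative sketch: feeding FSDPPrecision's config into a torch FSDP wrapper.
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

from lightning_lite.plugins.precision.fsdp import FSDPPrecision

precision = FSDPPrecision(precision=16, device="cuda")
module = torch.nn.Linear(32, 2).cuda()

# `mixed_precision_config` returns a torch `MixedPrecision` dataclass with
# param/reduce/buffer dtypes set to float16 (or bfloat16 for precision="bf16").
wrapped = FSDP(module, mixed_precision=precision.mixed_precision_config)
```
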
1 change: 1 addition & 0 deletions src/lightning_lite/strategies/__init__.py
@@ -17,6 +17,7 @@
from lightning_lite.strategies.dp import DataParallelStrategy # noqa: F401
from lightning_lite.strategies.fairscale import DDPShardedStrategy # noqa: F401
from lightning_lite.strategies.fairscale import DDPSpawnShardedStrategy # noqa: F401
from lightning_lite.strategies.fsdp import FSDPStrategy # noqa: F401
from lightning_lite.strategies.parallel import ParallelStrategy # noqa: F401
from lightning_lite.strategies.registry import _call_register_strategies, _StrategyRegistry
from lightning_lite.strategies.single_device import SingleDeviceStrategy # noqa: F401
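
With `FSDPStrategy` exported here, it can also be passed to Lite as an instance rather than a string alias; a minimal sketch, with constructor arguments omitted because they are not shown in this diff. Instantiating `LightningLite` directly assumes `run()` is no longer abstract, per the changelog entry for #14992.

```python
# Illustrative sketch: passing an FSDPStrategy instance directly. The connector's
# `isinstance(self._strategy_flag, FSDPStrategy)` check above covers this path too.
from lightning_lite import LightningLite  # import path assumed
from lightning_lite.strategies import FSDPStrategy

lite = LightningLite(strategy=FSDPStrategy(), accelerator="cuda", devices=2, precision=16)
```
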
3 changes: 1 addition & 2 deletions src/lightning_lite/strategies/ddp.py
@@ -92,8 +92,7 @@ def num_processes(self) -> int:

    @property
    def distributed_sampler_kwargs(self) -> Dict[str, Any]:
        distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)
        return distributed_sampler_kwargs
        return dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)

    @property
    def process_group_backend(self) -> Optional[str]:
3 changes: 1 addition & 2 deletions src/lightning_lite/strategies/ddp_spawn.py
@@ -99,8 +99,7 @@ def num_processes(self) -> int:

    @property
    def distributed_sampler_kwargs(self) -> Dict[str, int]:
        distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)
        return distributed_sampler_kwargs
        return dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)

    @property
    def process_group_backend(self) -> Optional[str]:
3 changes: 1 addition & 2 deletions src/lightning_lite/strategies/deepspeed.py
@@ -297,8 +297,7 @@ def zero_stage_3(self) -> bool:

    @property
    def distributed_sampler_kwargs(self) -> Dict[str, int]:
        distributed_sampler_kwargs = dict(num_replicas=self.world_size, rank=self.global_rank)
        return distributed_sampler_kwargs
        return dict(num_replicas=self.world_size, rank=self.global_rank)

    @property
    def model(self) -> "deepspeed.DeepSpeedEngine":
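
The `distributed_sampler_kwargs` properties touched above are typically used to construct a `DistributedSampler`; a minimal, illustrative sketch (not code from this PR):

```python
# Illustrative sketch: how distributed_sampler_kwargs-style values feed a sampler.
import torch
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset

dataset = TensorDataset(torch.randn(100, 32))
kwargs = dict(num_replicas=4, rank=0)  # shape of what the strategy property returns
sampler = DistributedSampler(dataset, **kwargs)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=8)
```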