From 0f4f809a7c1855fe7e7304b9bfb97533687c0f08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= <carlossmocholi@gmail.com>
Date: Tue, 17 Jan 2023 02:29:06 +0100
Subject: [PATCH] Deprecate the FairScale integration (#16353)

---
 .../advanced/model_parallel.rst               | 172 +-----------------
 docs/source-pytorch/extensions/strategy.rst   |  11 +-
 src/pytorch_lightning/CHANGELOG.md            |   9 +-
 src/pytorch_lightning/overrides/fairscale.py  |   7 +
 .../precision/fully_sharded_native_amp.py     |  11 ++
 .../plugins/precision/sharded_native_amp.py   |   8 +
 .../strategies/fully_sharded.py               |   9 +-
 src/pytorch_lightning/strategies/sharded.py   |  13 +-
 .../strategies/sharded_spawn.py               |  13 +-
 src/pytorch_lightning/strategies/strategy.py  |   2 +-
 .../callbacks/test_stochastic_weight_avg.py   |   1 -
 .../deprecated_api/test_remove_2-0.py         |  14 +-
 .../precision/test_sharded_precision.py       |   3 +-
 .../plugins/test_cluster_integration.py       |  22 ++-
 ..._ddp_fully_sharded_with_full_state_dict.py | 111 +++++------
 .../strategies/test_ddp_strategy.py           |  31 ----
 .../tests_pytorch/strategies/test_registry.py |   9 +-
 .../strategies/test_sharded_strategy.py       | 146 +++++++--------
 .../connectors/test_accelerator_connector.py  |  32 +++-
 tests/tests_pytorch/trainer/test_trainer.py   |  14 +-
 20 files changed, 251 insertions(+), 387 deletions(-)

diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst
index c1e68b780f186..d7660680eaccf 100644
--- a/docs/source-pytorch/advanced/model_parallel.rst
+++ b/docs/source-pytorch/advanced/model_parallel.rst
@@ -6,8 +6,6 @@ Train 1 trillion+ parameter models
 
 When training large models, fitting larger batch sizes, or trying to increase throughput using multi-GPU compute, Lightning provides advanced optimized distributed training strategies to support these cases and offer substantial improvements in memory usage.
 
-In many cases these strategies are some flavour of model parallelism however we only introduce concepts at a high level to get you started. Refer to the `FairScale documentation <https://fairscale.readthedocs.io/en/latest/deep_dive/oss_sdp_fsdp.html>`_ for more information about model parallelism.
-
 Note that some of the extreme memory saving configurations will affect the speed of training. This Speed/Memory trade-off in most cases can be adjusted.
 
 Some of these memory-efficient strategies rely on offloading onto other forms of memory, such as CPU RAM or NVMe. This means you can even see memory benefits on a **single GPU**, using a strategy such as :ref:`deepspeed-zero-stage-3-offload`.
@@ -40,7 +38,7 @@ Overall:
 
 * When **fine-tuning** a model, use advanced memory efficient strategies such as :ref:`deepspeed-zero-stage-3` or :ref:`deepspeed-zero-stage-3-offload`, allowing you to fine-tune larger models if you are limited on compute
 * When **pre-training** a model, use simpler optimizations such :ref:`sharded-training`, :ref:`deepspeed-zero-stage-2` or :ref:`fully-sharded-training`, scaling the number of GPUs to reach larger parameter sizes
-* For both fine-tuning and pre-training, use :ref:`deepspeed-activation-checkpointing` or :ref:`fairscale-activation-checkpointing` as the throughput degradation is not significant
+* For both fine-tuning and pre-training, use :ref:`deepspeed-activation-checkpointing` as the throughput degradation is not significant
 
 For example when using 128 GPUs, you can **pre-train** large 10 to 20 Billion parameter models using :ref:`deepspeed-zero-stage-2` without having to take a performance hit with more advanced optimized multi-gpu strategy.
 
@@ -153,11 +151,10 @@ Here's an example of changing the placement policy to "cpu".
 
 .. _sharded-training:
 
-**************************
-FairScale Sharded Training
-**************************
+****************
+Sharded Training
+****************
 
-Lightning integration of optimizer sharded training provided by `FairScale <https://github.com/facebookresearch/fairscale>`_.
 The technique can be found within `DeepSpeed ZeRO <https://arxiv.org/abs/1910.02054>`_ and
 `ZeRO-2 <https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/>`_,
 however the implementation is built from the ground up to be PyTorch compatible and standalone.
@@ -171,178 +168,25 @@ these benefits in multi-GPU setups are almost free and throughput scales well wi
 
 It is highly recommended to use Sharded Training in multi-GPU environments where memory is limited, or where training larger models are beneficial (500M+ parameter models).
 A technical note: as batch size scales, storing activations for the backwards pass becomes the bottleneck in training. As a result, sharding optimizer state and gradients becomes less impactful.
-Use :ref:`fairscale-activation-checkpointing` to see even more benefit at the cost of some throughput.
-
-To use Sharded Training, you need to first install FairScale using the command below.
-
-.. code-block:: bash
-
-    pip install fairscale
-
 
 .. code-block:: python
 
     # train using Sharded DDP
     trainer = Trainer(strategy="ddp_sharded")
 
-Sharded Training can work across all DDP variants by adding the additional ``--strategy ddp_sharded`` flag via command line using a PyTorch Lightning script.
-
 Internally we re-initialize your optimizers and shard them across your machines and processes. We handle all communication using PyTorch distributed, so no code changes are required.
 
 ----
 
 .. _fully-sharded-training:
 
-FairScale Fully Sharded Training
-================================
-
-.. warning::
-    FairScale Fully Sharded Training is in BETA and the API is subject to change. Please create an `issue <https://github.com/Lightning-AI/lightning/issues>`_ if you run into any problems.
-
-`Fully Sharded <https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html>`_ shards optimizer state, gradients, and parameters across data parallel workers. This allows you to fit much larger models onto multiple GPUs into memory.
-
-Fully Sharded Training alleviates the need to worry about balancing layers onto specific devices using some form of pipe parallelism, and optimizes for distributed communication with minimal effort.
-
-Shard Parameters to Reach 10+ Billion Parameters
-------------------------------------------------
-
-To reach larger parameter sizes and to be memory efficient, we have to shard parameters. There are various ways to enable this.
-
-.. note::
-    Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``.
-    This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``.
-    This is a limitation of Fully Sharded Training that will be resolved in the future.
-
-Enabling Module Sharding for Maximum Memory Efficiency
-------------------------------------------------------
-
-Auto Wrapping
-^^^^^^^^^^^^^
-
-Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The
-simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't
-have to ``wrap`` layers manually as in the case of manual wrapping.
-
-.. note::
-    While initializing the optimizers inside ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``, else
-    PyTorch will raise an error. This is required because when you use auto-wrap, the model layers are sharded and your
-    ``lightning_module.parameters()`` will return a generator with no params. This inconvenience will be addressed in the future.
-
-.. code-block:: python
-
-    class MyModel(BoringModel):
-        def configure_optimizers(self):
-            return torch.optim.AdamW(self.trainer.model.parameters(), lr=1e-2)
-
-
-    model = MyModel()
-    trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp", precision=16)
-    trainer.fit(model)
-
-
-Manual Wrapping
-^^^^^^^^^^^^^^^
-
-Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate
-parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
-
-When not using Fully Sharded Training these wrap functions are a no-op. That means once the changes have been made, there is no need to remove the changes for other strategies.
-
-``auto_wrap`` recursively wraps :class:`~torch.nn.Module` within the ``LightningModule`` with nested Fully Sharded Wrappers,
-signalling that we'd like to partition these modules across data parallel devices, discarding the full weights when not required (information :class:`here <fairscale.nn.fsdp>`).
-
-``auto_wrap`` can have varying levels of success based on the complexity of your model. **Auto Wrap does not support models with shared parameters**.
-
-``wrap`` simply wraps the module with a Fully Sharded Parallel class with the correct parameters from the Lightning context manager.
-
-Here's an example using both ``wrap`` and ``auto_wrap`` to create your model:
-
-.. code-block:: python
-
-    import torch
-    import torch.nn as nn
-    import pytorch_lightning as pl
-    from pytorch_lightning import Trainer
-    from fairscale.nn import checkpoint_wrapper, auto_wrap, wrap
-
-
-    class MyModel(pl.LightningModule):
-        def __init__(self):
-            super().__init__()
-            self.linear_layer = nn.Linear(32, 32)
-            self.block = nn.Sequential(nn.Linear(32, 32), nn.ReLU())
-            self.final_block = nn.Sequential(nn.Linear(32, 32), nn.ReLU())
-
-        def configure_sharded_model(self):
-            # modules are sharded across processes
-            # as soon as they are wrapped with `wrap` or `auto_wrap`.
-            # During the forward/backward passes, weights get synced across processes
-            # and de-allocated once computation is complete, saving memory.
-
-            # Wraps the layer in a Fully Sharded Wrapper automatically
-            linear_layer = wrap(self.linear_layer)
-
-            # Wraps the module recursively
-            # based on a minimum number of parameters (default 100M parameters)
-            block = auto_wrap(self.block)
-
-            # For best memory efficiency,
-            # add FairScale activation checkpointing
-            final_block = auto_wrap(checkpoint_wrapper(self.final_block))
-            self.model = nn.Sequential(linear_layer, nn.ReLU(), block, final_block)
-
-        def configure_optimizers(self):
-            return torch.optim.AdamW(self.model.parameters(), lr=1e-2)
-
-
-    model = MyModel()
-    trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp", precision=16)
-    trainer.fit(model)
-
-    trainer.test()
-    trainer.predict()
-
-----
-
-.. _fairscale-activation-checkpointing:
-
-Activation Checkpointing
-------------------------
-
-Activation checkpointing frees activations from memory as soon as they are not needed during the forward pass. They are then re-computed for the backwards pass as needed. Activation checkpointing is very useful when you have intermediate layers that produce large activations.
-
-FairScale's checkpointing wrapper also handles batch norm layers correctly, unlike the PyTorch implementation, ensuring stats are tracked correctly due to the multiple forward passes.
-
-This saves memory when training larger models, however it requires wrapping modules you'd like to use activation checkpointing on. See :class:`here <fairscale.nn.checkpoint.checkpoint_wrapper>` for more information.
-
-.. warning::
-
-    Do not wrap the entire model with activation checkpointing. This is not the intended use of activation checkpointing, and will lead to failures as seen in `this discussion <https://github.com/Lightning-AI/lightning/discussions/9144>`_.
-
-.. code-block:: python
-
-    from pytorch_lightning import Trainer
-    from fairscale.nn import checkpoint_wrapper
-
-
-    class MyModel(pl.LightningModule):
-        def __init__(self):
-            super().__init__()
-            # Wrap layers using checkpoint_wrapper
-            self.block_1 = checkpoint_wrapper(nn.Sequential(nn.Linear(32, 32), nn.ReLU()))
-            self.block_2 = nn.Linear(32, 2)
-
-----
-
-.. _fully-sharded-native-training:
-
-******************************
-PyTorch Fully Sharded Training
-******************************
+**********************
+Fully Sharded Training
+**********************
 
 PyTorch has it's own version of `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_ which is upstreamed from their `fairscale <https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html>`__ project.
 It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_ but it is recommended to use it with PyTorch v1.12 or more and that's what
-Lightning supports. The API is pretty similar to that of FairScale.
+Lightning supports.
 
 
 Auto Wrapping
diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst
index 3d97a14946ebd..82d1d5e103564 100644
--- a/docs/source-pytorch/extensions/strategy.rst
+++ b/docs/source-pytorch/extensions/strategy.rst
@@ -80,16 +80,7 @@ The below table lists all relevant strategies available in Lightning with their
      - Colossal-AI provides a collection of parallel components for you. It aims to support you to write your distributed deep learning models just like how you write your model on your laptop. `Learn more. <https://www.colossalai.org/>`__
    * - fsdp_native
      - :class:`~pytorch_lightning.strategies.DDPFullyShardedNativeStrategy`
-     - Strategy for Fully Sharded Data Parallel provided by PyTorch. :ref:`Learn more. <advanced/model_parallel:PyTorch Fully Sharded Training>`
-   * - fsdp
-     - :class:`~pytorch_lightning.strategies.DDPFullyShardedStrategy`
-     - Strategy for Fully Sharded Data Parallel provided by FairScale. :ref:`Learn more. <advanced/model_parallel:FairScale Fully Sharded Training>`
-   * - ddp_sharded
-     - :class:`~pytorch_lightning.strategies.DDPShardedStrategy`
-     - Optimizer and gradient sharded training provided by FairScale. :ref:`Learn more. <advanced/model_parallel:FairScale Sharded Training>`
-   * - ddp_sharded_spawn
-     - :class:`~pytorch_lightning.strategies.DDPSpawnShardedStrategy`
-     - Optimizer sharded training provided by FairScale. :ref:`Learn more. <advanced/model_parallel:FairScale Sharded Training>`
+     - Strategy for Fully Sharded Data Parallel. :ref:`Learn more. <advanced/model_parallel:Fully Sharded Training>`
    * - ddp_spawn
      - :class:`~pytorch_lightning.strategies.DDPSpawnStrategy`
      - Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training finishes. :ref:`Learn more. <accelerators/gpu_intermediate:Distributed Data Parallel Spawn>`
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 5306790c61c04..39087a2af617f 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -51,12 +51,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
   * Deprecated the `Trainer.amp_backend` property
   * Deprecated the `Trainer(amp_level=...)` argument
   * Deprecated the `pytorch_lightning.plugins.ApexMixedPrecisionPlugin` class
-  * Deprecates the `pytorch_lightning.utilities.enum.sAMPType` enum
+  * Deprecated the `pytorch_lightning.utilities.enums.AMPType` enum
   * Deprecates the `DeepSpeedPrecisionPlugin(amp_type=..., amp_level=...)` arguments
 - `horovod` deprecation ([#16141](https://github.com/PyTorchLightning/pytorch-lightning/pull/16141))
   * Deprecated `Trainer(strategy="horovod")`
   * Deprecated the `HorovodStrategy` class
 - Deprecated `pytorch_lightning.lite.LightningLite` in favor of `lightning.fabric.Fabric` ([#16314](https://github.com/Lightning-AI/lightning/pull/16314))
+- `FairScale` deprecation (in favor of PyTorch's FSDP implementation) ([#16353](https://github.com/Lightning-AI/lightning/pull/16353))
+  * Deprecated the `pytorch_lightning.overrides.fairscale.LightningShardedDataParallel` class
+  * Deprecated the `pytorch_lightning.plugins.precision.fully_sharded_native_amp.FullyShardedNativeMixedPrecisionPlugin` class
+  * Deprecated the `pytorch_lightning.plugins.precision.sharded_native_amp.ShardedNativeMixedPrecisionPlugin` class
+  * Deprecated the `pytorch_lightning.strategies.fully_sharded.DDPFullyShardedStrategy` class
+  * Deprecated the `pytorch_lightning.strategies.sharded.DDPShardedStrategy` class
+  * Deprecated the `pytorch_lightning.strategies.sharded_spawn.DDPSpawnShardedStrategy` class
 
 
 ### Removed
diff --git a/src/pytorch_lightning/overrides/fairscale.py b/src/pytorch_lightning/overrides/fairscale.py
index d9ebb6345f215..f818792e575a9 100644
--- a/src/pytorch_lightning/overrides/fairscale.py
+++ b/src/pytorch_lightning/overrides/fairscale.py
@@ -41,6 +41,13 @@ def __init__(
         forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
         pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
     ) -> None:
+        rank_zero_deprecation(
+            "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be"
+            " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead."
+            " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses"
+            " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use"
+            " the native version by default."
+        )
         self._validate_init_arguments(pl_module, forward_module)
         super().__init__(forward_module=(pl_module or forward_module))
 
diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py
index 870e658bfc9c3..904d61f4dffc3 100644
--- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py
+++ b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py
@@ -15,11 +15,22 @@
 
 from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation
 
 
 class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin):
     """Native AMP for Fully Sharded Training."""
 
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        rank_zero_deprecation(
+            "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be"
+            " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead."
+            " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses"
+            " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use"
+            " the native version by default."
+        )
+        super().__init__(*args, **kwargs)
+
     def clip_grad_by_norm(self, *_: Any, **__: Any) -> None:
         # see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html
         # section `Gradient Clipping`, using `torch.nn.utils.clip_grad_norm_` is incorrect
diff --git a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py
index 077b1e6679113..f4f646b4239a2 100644
--- a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py
+++ b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py
@@ -18,6 +18,7 @@
 from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE
 from pytorch_lightning.plugins.precision.native_amp import MixedPrecisionPlugin
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation
 
 if _FAIRSCALE_AVAILABLE:
     from fairscale.optim import OSS
@@ -32,6 +33,13 @@ class ShardedNativeMixedPrecisionPlugin(MixedPrecisionPlugin):
     def __init__(
         self, precision: Literal["16", 16, "bf16"], device: str, scaler: Optional[ShardedGradScaler] = None
     ) -> None:
+        rank_zero_deprecation(
+            "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be"
+            " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead."
+            " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses"
+            " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use"
+            " the native version by default."
+        )
         if not _FAIRSCALE_AVAILABLE:
             raise MisconfigurationException(
                 "You have asked for sharded AMP but you have not installed it."
diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py
index 64ddd1272a546..534fdf8dbbe32 100644
--- a/src/pytorch_lightning/strategies/fully_sharded.py
+++ b/src/pytorch_lightning/strategies/fully_sharded.py
@@ -28,6 +28,7 @@
 from pytorch_lightning.trainer.states import TrainerFn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.model_helpers import is_overridden
+from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation
 from pytorch_lightning.utilities.types import STEP_OUTPUT
 
 if _FAIRSCALE_AVAILABLE:
@@ -117,7 +118,13 @@ def __init__(
                 If ``False``, this will default to ``compute_device``.
                 (Default: True).
         """
-
+        rank_zero_deprecation(
+            "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be"
+            " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead."
+            " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses"
+            " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use"
+            " the native version by default."
+        )
         super().__init__(
             accelerator=accelerator,
             parallel_devices=parallel_devices,
diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py
index 922a4d70d92c4..e8749d53cac08 100644
--- a/src/pytorch_lightning/strategies/sharded.py
+++ b/src/pytorch_lightning/strategies/sharded.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from contextlib import contextmanager
-from typing import Dict, Generator, List, Tuple
+from typing import Any, Dict, Generator, List, Tuple
 
 from torch import Tensor
 from torch.nn import Module
@@ -26,6 +26,7 @@
 from pytorch_lightning.strategies.ddp import DDPStrategy
 from pytorch_lightning.trainer.states import TrainerFn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation
 
 if _FAIRSCALE_AVAILABLE:
     from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel
@@ -40,6 +41,16 @@ class DDPShardedStrategy(DDPStrategy):
     strategy_name = "ddp_sharded"
     _REDUCE_BUFFER_SIZE_DEFAULT: int = 2**23  # 8M
 
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        rank_zero_deprecation(
+            "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be"
+            " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead."
+            " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses"
+            " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use"
+            " the native version by default."
+        )
+        super().__init__(*args, **kwargs)
+
     def connect(self, model: "pl.LightningModule") -> None:
         if not _FAIRSCALE_AVAILABLE:  # pragma: no cover
             raise MisconfigurationException(
diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py
index cf12b3b71c32a..74fb1f4026ec0 100644
--- a/src/pytorch_lightning/strategies/sharded_spawn.py
+++ b/src/pytorch_lightning/strategies/sharded_spawn.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from contextlib import contextmanager
-from typing import Dict, Generator, List, Tuple
+from typing import Any, Dict, Generator, List, Tuple
 
 from torch import Tensor
 from torch.nn import Module
@@ -26,6 +26,7 @@
 from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy
 from pytorch_lightning.trainer.states import TrainerFn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation
 
 if _FAIRSCALE_AVAILABLE:
     from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel
@@ -40,6 +41,16 @@ class DDPSpawnShardedStrategy(DDPSpawnStrategy):
 
     strategy_name = "ddp_sharded_spawn"
 
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        rank_zero_deprecation(
+            "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be"
+            " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead."
+            " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses"
+            " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use"
+            " the native version by default."
+        )
+        super().__init__(*args, **kwargs)
+
     def connect(self, model: "pl.LightningModule") -> None:
         if not _FAIRSCALE_AVAILABLE:  # pragma: no cover
             raise MisconfigurationException(
diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py
index d7d3005e8fd98..415d50177ae2e 100644
--- a/src/pytorch_lightning/strategies/strategy.py
+++ b/src/pytorch_lightning/strategies/strategy.py
@@ -174,7 +174,7 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]:
             optimizer = optimizer._optimizer
 
         if hasattr(optimizer, "consolidate_state_dict"):
-            # there are optimizers like Fairscale's OSS or PyTorch's ZeroRedundancyOptimizer that shard their
+            # there are optimizers like PyTorch's ZeroRedundancyOptimizer that shard their
             # states, and to avoid OOM we consolidate the full state on rank 0 only
             optimizer.consolidate_state_dict()
             return optimizer.state_dict() if self.is_global_zero else {}
diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
index d1f5004bcbb35..c009fbd53fa82 100644
--- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
+++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
@@ -361,7 +361,6 @@ def test_swa_resume_training_from_checkpoint_ddp(tmpdir):
 @pytest.mark.parametrize(
     "strategy",
     [
-        pytest.param("fsdp", marks=RunIf(fairscale=True, min_cuda_gpus=1)),
         pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)),
         pytest.param("fsdp_native", marks=RunIf(min_cuda_gpus=1, skip_windows=True, min_torch="1.12")),
     ],
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py
index d5348b70c7728..1283ff0991f4e 100644
--- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py
+++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py
@@ -398,7 +398,6 @@ def test_rename_lightning_lite():
         LightningParallelModule,
         LightningDistributedModule,
         LightningBaguaModule,
-        pytest.param(LightningShardedDataParallel, marks=RunIf(fairscale=True)),
     ],
 )
 def test_v1_10_deprecated_pl_module_init_parameter(wrapper_class):
@@ -413,6 +412,19 @@ def test_v1_10_deprecated_pl_module_init_parameter(wrapper_class):
         wrapper_class(pl_module=BoringModel())
 
 
+@RunIf(fairscale=True)
+def test_v1_10_deprecated_fairscale_pl_module_init_parameter():
+    with no_warning_call(
+        DeprecationWarning, match=r"The argument `pl_module` in `LightningShardedDataParallel` is deprecated in v1.8.0"
+    ), pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        LightningShardedDataParallel(BoringModel())
+
+    with pytest.deprecated_call(
+        match=r"The argument `pl_module` in `LightningShardedDataParallel` is deprecated in v1.8.0"
+    ):
+        LightningShardedDataParallel(pl_module=BoringModel())
+
+
 def test_v1_10_deprecated_unwrap_lightning_module():
     with pytest.deprecated_call(match=r"The function `unwrap_lightning_module` is deprecated in v1.8.0"):
         unwrap_lightning_module(BoringModel())
diff --git a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py
index 7d6cc87da54c0..e040523c1e9c9 100644
--- a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py
+++ b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py
@@ -35,7 +35,8 @@
     ],
 )
 def test_sharded_precision_scaler(precision, scaler, expected):
-    plugin = ShardedNativeMixedPrecisionPlugin(precision=precision, scaler=scaler, device="cuda")
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        plugin = ShardedNativeMixedPrecisionPlugin(precision=precision, scaler=scaler, device="cuda")
     if expected:
         assert isinstance(plugin.scaler, expected)
     else:
diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py
index e8beecf15020a..8a96bd8fdd90c 100644
--- a/tests/tests_pytorch/plugins/test_cluster_integration.py
+++ b/tests/tests_pytorch/plugins/test_cluster_integration.py
@@ -65,11 +65,17 @@ def environment_combinations():
 def test_ranks_available_manual_strategy_selection(_, strategy_cls):
     """Test that the rank information is readily available after Trainer initialization."""
     num_nodes = 2
-    for cluster, variables, expected in environment_combinations():
+    for i, (cluster, variables, expected) in enumerate(environment_combinations()):
         with mock.patch.dict(os.environ, variables):
-            strategy = strategy_cls(
-                parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster
-            )
+            if strategy_cls is DDPShardedStrategy and i == 0:
+                with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+                    strategy = strategy_cls(
+                        parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster
+                    )
+            else:
+                strategy = strategy_cls(
+                    parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster
+                )
             trainer = Trainer(strategy=strategy, num_nodes=num_nodes)
             assert rank_zero_only.rank == expected["global_rank"]
             assert trainer.global_rank == expected["global_rank"]
@@ -93,7 +99,7 @@ def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwar
     num_nodes = 2
     trainer_kwargs.update(num_nodes=num_nodes)
 
-    for cluster, variables, expected in environment_combinations():
+    for i, (cluster, variables, expected) in enumerate(environment_combinations()):
         if trainer_kwargs["strategy"] == "ddp_spawn":
             if isinstance(cluster, (SLURMEnvironment, TorchElasticEnvironment)):
                 # slurm and torchelastic do not work with spawn strategies
@@ -102,7 +108,11 @@ def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwar
             expected.update(global_rank=(expected["node_rank"] * 2), local_rank=0)
 
         with mock.patch.dict(os.environ, variables):
-            trainer = Trainer(**trainer_kwargs)
+            if "sharded" in trainer_kwargs["strategy"] and i == 0:
+                with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+                    trainer = Trainer(**trainer_kwargs)
+            else:
+                trainer = Trainer(**trainer_kwargs)
             assert type(trainer.strategy.cluster_environment) is type(cluster)
             assert rank_zero_only.rank == expected["global_rank"]
             assert trainer.global_rank == expected["global_rank"]
diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py
index 0afd24ba798db..a60c4d8cb8ecf 100644
--- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py
+++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py
@@ -6,7 +6,6 @@
 import torch
 
 from pytorch_lightning import Trainer
-from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.demos.boring_classes import BoringModel
 from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE
 from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin
@@ -149,11 +148,12 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None):
 
 def test_invalid_on_cpu(tmpdir):
     """Test to ensure that to raise Misconfiguration for FSDP on CPU."""
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp")
+    assert isinstance(trainer.strategy, DDPFullyShardedStrategy)
     with pytest.raises(
         MisconfigurationException, match="You selected strategy to be `ddp_fully_sharded`, but GPU is not available."
     ):
-        trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp")
-        assert isinstance(trainer.strategy, DDPFullyShardedStrategy)
         trainer.strategy.setup_environment()
 
 
@@ -161,9 +161,10 @@ def test_invalid_on_cpu(tmpdir):
 @RunIf(fairscale=True)
 def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir):
     """Test to ensure that plugin native amp plugin is correctly chosen when using sharded."""
-    trainer = Trainer(
-        default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16
-    )
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(
+            default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16
+        )
     assert isinstance(trainer.strategy, DDPFullyShardedStrategy)
     assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin)
 
@@ -173,65 +174,37 @@ def test_fully_sharded_strategy_checkpoint(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
 
     model = TestFSDPModelManualWrapped()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        accelerator="gpu",
-        devices=1,
-        strategy="fsdp",
-        precision=16,
-        max_epochs=1,
-        enable_progress_bar=False,
-        enable_model_summary=False,
-    )
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(
+            default_root_dir=tmpdir,
+            accelerator="gpu",
+            devices=1,
+            strategy="fsdp",
+            precision=16,
+            max_epochs=1,
+            enable_progress_bar=False,
+            enable_model_summary=False,
+        )
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
 
 
-@RunIf(min_cuda_gpus=2, standalone=True, fairscale=True)
-@pytest.mark.parametrize(
-    "model, strategy",
-    [
-        (TestFSDPModelManualWrapped(), DDPFullyShardedStrategy(min_num_params=2)),
-        (TestFSDPModelAutoWrapped(), "fsdp"),
-    ],
-)
-def test_fully_sharded_strategy_checkpoint_multi_gpus(tmpdir, model, strategy):
-    """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""
-
-    ck = ModelCheckpoint(save_last=True)
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        accelerator="gpu",
-        devices=2,
-        strategy=strategy,
-        precision=16,
-        max_epochs=1,
-        limit_train_batches=2,
-        limit_val_batches=2,
-        limit_test_batches=2,
-        limit_predict_batches=2,
-        callbacks=[ck],
-        enable_progress_bar=False,
-        enable_model_summary=False,
-    )
-    _run_multiple_stages(trainer, model)
-
-
 @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True)
 def test_fsdp_gradient_clipping_raises(tmpdir):
     """Test to ensure that an exception is raised when clipping gradients by value with FSDP."""
     model = TestFSDPModelManualWrapped()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        strategy="fsdp",
-        fast_dev_run=True,
-        accelerator="gpu",
-        devices=1,
-        precision=16,
-        gradient_clip_val=1,
-        gradient_clip_algorithm="norm",
-        enable_progress_bar=False,
-        enable_model_summary=False,
-    )
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(
+            default_root_dir=tmpdir,
+            strategy="fsdp",
+            fast_dev_run=True,
+            accelerator="gpu",
+            devices=1,
+            precision=16,
+            gradient_clip_val=1,
+            gradient_clip_algorithm="norm",
+            enable_progress_bar=False,
+            enable_model_summary=False,
+        )
     with pytest.raises(
         MisconfigurationException, match="gradient_clip_algorithm='norm'` is currently not supported for `FullySharded"
     ):
@@ -240,15 +213,16 @@ def test_fsdp_gradient_clipping_raises(tmpdir):
 
 @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True)
 def test_fsdp_rewrap_limitation(tmpdir):
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        accelerator="gpu",
-        devices=1,
-        max_steps=1,
-        limit_val_batches=0,
-        limit_test_batches=1,
-        strategy="fsdp",
-    )
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(
+            default_root_dir=tmpdir,
+            accelerator="gpu",
+            devices=1,
+            max_steps=1,
+            limit_val_batches=0,
+            limit_test_batches=1,
+            strategy="fsdp",
+        )
     model = TestFSDPModelAutoWrapped()
     trainer.fit(model)
 
@@ -258,7 +232,8 @@ def test_fsdp_rewrap_limitation(tmpdir):
 
 @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True)
 def test_invalid_parameters_in_optimizer():
-    trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1)
 
     class EmptyParametersModel(BoringModel):
         def configure_optimizers(self):
diff --git a/tests/tests_pytorch/strategies/test_ddp_strategy.py b/tests/tests_pytorch/strategies/test_ddp_strategy.py
index fcdc683ec9bc3..035b7e47b372b 100644
--- a/tests/tests_pytorch/strategies/test_ddp_strategy.py
+++ b/tests/tests_pytorch/strategies/test_ddp_strategy.py
@@ -23,14 +23,10 @@
 from lightning_fabric.plugins.environments import ClusterEnvironment, LightningEnvironment
 from pytorch_lightning import LightningModule, Trainer
 from pytorch_lightning.demos.boring_classes import BoringModel
-from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE
 from pytorch_lightning.strategies import DDPStrategy
 from pytorch_lightning.trainer.states import TrainerFn
 from tests_pytorch.helpers.runif import RunIf
 
-if _FAIRSCALE_AVAILABLE:
-    from fairscale.optim import OSS
-
 
 class BoringModelGPU(BoringModel):
     def on_train_start(self) -> None:
@@ -256,33 +252,6 @@ def test_ddp_strategy_set_timeout(mock_init_process_group):
     )
 
 
-class BoringFairScaleOptimizerModel(BoringModel):
-    def configure_optimizers(self):
-        base_optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
-        return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults)
-
-
-@RunIf(min_cuda_gpus=2, fairscale=True)
-@pytest.mark.parametrize("strategy", (pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn"))
-def test_ddp_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy):
-    """Test to ensure that checkpoint is saved correctly when using fairscale optimizer."""
-    model = BoringFairScaleOptimizerModel()
-    trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1)
-
-    trainer.fit(model)
-
-    checkpoint_path = os.path.join(tmpdir, "model.pt")
-    # need to broadcast because tmpdir is different on each process
-    checkpoint_path = trainer.strategy.broadcast(checkpoint_path)
-    trainer.save_checkpoint(checkpoint_path)
-    trainer.strategy.barrier()  # ensure the checkpoint is saved before load
-    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
-
-    # Assert model parameters are identical after loading
-    for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()):
-        assert torch.equal(trained_param.to("cpu"), loaded_param)
-
-
 class BoringZeroRedundancyOptimizerModel(BoringModel):
     def configure_optimizers(self):
         return ZeroRedundancyOptimizer(self.layer.parameters(), optimizer_class=torch.optim.Adam, lr=0.1)
diff --git a/tests/tests_pytorch/strategies/test_registry.py b/tests/tests_pytorch/strategies/test_registry.py
index 8536e0b8b3438..39e10a05fc328 100644
--- a/tests/tests_pytorch/strategies/test_registry.py
+++ b/tests/tests_pytorch/strategies/test_registry.py
@@ -74,7 +74,8 @@ def test_fsdp_strategy_registry(tmpdir):
     assert strategy in StrategyRegistry
     assert StrategyRegistry[strategy]["strategy"] == DDPFullyShardedStrategy
 
-    trainer = Trainer(strategy=strategy)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(strategy=strategy)
 
     assert isinstance(trainer.strategy, DDPFullyShardedStrategy)
 
@@ -117,7 +118,11 @@ def test_fsdp_strategy_registry(tmpdir):
     ],
 )
 def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params):
-    trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name)
+    if "sharded" in strategy_name:
+        with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+            trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name)
+    else:
+        trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name)
     assert isinstance(trainer.strategy, strategy)
     assert strategy_name in StrategyRegistry
     assert StrategyRegistry[strategy_name]["init_params"] == expected_init_params
diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py
index b8db7d1c786a8..29fd4607c521b 100644
--- a/tests/tests_pytorch/strategies/test_sharded_strategy.py
+++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py
@@ -9,7 +9,7 @@
 from torch import Tensor
 
 from pytorch_lightning import LightningModule, Trainer
-from pytorch_lightning.demos.boring_classes import BoringModel, ManualOptimBoringModel
+from pytorch_lightning.demos.boring_classes import BoringModel
 from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE
 from pytorch_lightning.plugins import MixedPrecisionPlugin
 from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy
@@ -58,15 +58,16 @@ def _is_equal(self, a, b):
 def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_val, tmpdir):
     """Ensure that clip gradients is only called if the value is greater than 0."""
     model = BoringModel()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        strategy="ddp_sharded",
-        accelerator="gpu",
-        devices=1,
-        precision=16,
-        fast_dev_run=True,
-        gradient_clip_val=clip_val,
-    )
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(
+            default_root_dir=tmpdir,
+            strategy="ddp_sharded",
+            accelerator="gpu",
+            devices=1,
+            precision=16,
+            fast_dev_run=True,
+            gradient_clip_val=clip_val,
+        )
     trainer.fit(model)
     if clip_val > 0:
         mock_oss_clip_grad_norm.assert_called()
@@ -80,7 +81,8 @@ def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_v
 )
 def test_sharded_ddp_choice(strategy, expected):
     """Test to ensure that strategy is correctly chosen."""
-    trainer = Trainer(fast_dev_run=True, strategy=strategy)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(fast_dev_run=True, strategy=strategy)
     assert isinstance(trainer.strategy, expected)
 
 
@@ -90,7 +92,8 @@ def test_sharded_ddp_choice(strategy, expected):
 )
 def test_ddp_choice_sharded_amp(strategy, expected):
     """Test to ensure that plugin native amp plugin is correctly chosen when using sharded."""
-    trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=1, precision=16, strategy=strategy)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=1, precision=16, strategy=strategy)
     assert isinstance(trainer.strategy, expected)
     assert isinstance(trainer.precision_plugin, MixedPrecisionPlugin)
 
@@ -99,7 +102,8 @@ def test_ddp_choice_sharded_amp(strategy, expected):
 def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir):
     """Test to ensure that checkpoint is saved correctly."""
     model = BoringModel()
-    trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True)
 
     trainer.fit(model)
 
@@ -116,7 +120,8 @@ def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir):
 def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using multiple GPUs."""
     model = BoringModel()
-    trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True)
 
     trainer.fit(model)
 
@@ -133,7 +138,8 @@ def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir):
 def test_ddp_sharded_strategy_finetune(tmpdir):
     """Test to ensure that we can save and restart training (simulate fine-tuning)"""
     model = BoringModel()
-    trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True)
     trainer.fit(model)
 
     checkpoint_path = os.path.join(tmpdir, "model.pt")
@@ -148,7 +154,8 @@ def test_ddp_sharded_strategy_finetune(tmpdir):
 def test_ddp_sharded_strategy_fit_ckpt_path(tmpdir):
     """Test to ensure that resuming from checkpoint works."""
     model = BoringModel()
-    trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True)
 
     trainer.fit(model)
 
@@ -166,7 +173,8 @@ def test_ddp_sharded_strategy_fit_ckpt_path(tmpdir):
 def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir):
     """Test to ensure that resuming from checkpoint works when going from GPUs- > CPU."""
     model = BoringModel()
-    trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=1, fast_dev_run=True)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=1, fast_dev_run=True)
 
     trainer.fit(model)
 
@@ -175,7 +183,8 @@ def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir):
 
     model = BoringModel()
 
-    trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True)
 
     trainer.fit(model, ckpt_path=checkpoint_path)
 
@@ -191,34 +200,19 @@ def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir):
 def test_ddp_sharded_strategy_test_multigpu(trainer_kwargs):
     """Test to ensure we can use validate and test without fit."""
     model = BoringModel()
-    trainer = Trainer(
-        strategy="ddp_sharded_spawn",
-        fast_dev_run=True,
-        enable_progress_bar=False,
-        enable_model_summary=False,
-        **trainer_kwargs,
-    )
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(
+            strategy="ddp_sharded_spawn",
+            fast_dev_run=True,
+            enable_progress_bar=False,
+            enable_model_summary=False,
+            **trainer_kwargs,
+        )
 
     trainer.validate(model)
     trainer.test(model)
 
 
-@RunIf(min_cuda_gpus=2, standalone=True, fairscale=True)
-@pytest.mark.parametrize("strategy", ("ddp_sharded", "ddp_sharded_spawn"))
-def test_ddp_sharded_strategy_manual_optimization(tmpdir, strategy):
-    model = ManualOptimBoringModel()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        strategy=strategy,
-        fast_dev_run=2,
-        accelerator="gpu",
-        devices=2,
-        enable_progress_bar=False,
-        enable_model_summary=False,
-    )
-    trainer.fit(model)
-
-
 class BoringModelSharded(BoringModel):
     def on_train_start(self) -> None:
         """Check if trainer module is wrapped as ShardedDataParallel during training stage."""
@@ -243,7 +237,8 @@ def on_predict_start(self) -> None:
 @RunIf(fairscale=True)
 def test_configure_ddp(tmpdir):
     """Tests with ddp sharded strategy."""
-    trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=True)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=True)
 
     model = BoringModelSharded()
 
@@ -258,7 +253,8 @@ def test_configure_ddp(tmpdir):
 @pytest.mark.parametrize("cls", [DDPShardedStrategy, DDPSpawnShardedStrategy])
 def test_custom_kwargs_sharded(_, cls):
     """Tests to ensure that if custom kwargs are passed, they are set correctly."""
-    strategy = cls(reduce_fp16=True)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        strategy = cls(reduce_fp16=True)
     strategy._lightning_module = Mock(spec=LightningModule)
     strategy._lightning_module.trainer = Mock()
     strategy.parallel_devices = [Mock()]
@@ -277,7 +273,8 @@ def test_custom_kwargs_sharded(_, cls):
 @pytest.mark.parametrize("num_nodes", [1, 2])
 def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_size, num_nodes):
     """Tests to ensure that ``reduce_buffer_size`` is correctly set based on user kwargs."""
-    strategy = DDPShardedStrategy(**params)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        strategy = DDPShardedStrategy(**params)
     strategy.num_nodes = num_nodes
     strategy._lightning_module = Mock(spec=LightningModule)
     strategy._lightning_module.trainer = Mock()
@@ -297,7 +294,8 @@ def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_siz
 
 @RunIf(fairscale=True)
 def test_block_backward_sync():
-    strategy = DDPShardedStrategy()
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        strategy = DDPShardedStrategy()
     model = mock.MagicMock(spec=ShardedDataParallel)
     with mock.patch.object(strategy, "_model", model):
         with strategy.block_backward_sync():
@@ -315,7 +313,8 @@ def test_block_backward_sync():
     ],
 )
 def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
-    trainer = Trainer(strategy=strategy_name)
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(strategy=strategy_name)
     assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs
 
 
@@ -325,38 +324,18 @@ def configure_optimizers(self):
         return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults)
 
 
-@RunIf(min_cuda_gpus=2, fairscale=True)
-@pytest.mark.parametrize("strategy", (pytest.param("ddp_sharded", marks=RunIf(standalone=True)), "ddp_sharded_spawn"))
-def test_ddp_sharded_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy):
-    """Test to ensure that checkpoint is saved correctly when using fairscale optimizers."""
-    model = BoringFairScaleOptimizerModel()
-    trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1)
-
-    trainer.fit(model)
-
-    checkpoint_path = os.path.join(tmpdir, "model.pt")
-    # need to broadcast because tmpdir is different on each process
-    checkpoint_path = trainer.strategy.broadcast(checkpoint_path)
-    trainer.save_checkpoint(checkpoint_path)
-    trainer.strategy.barrier()  # ensure the checkpoint is saved before load
-    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
-
-    # Assert model parameters are identical after loading
-    for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()):
-        assert torch.equal(trained_param.to("cpu"), loaded_param)
-
-
 @RunIf(min_cuda_gpus=2, fairscale=True)
 def test_ddp_sharded_strategy_fit_ckpt_path_downsize_gpus(tmpdir):
     model = ModelWithAdamOptimizer()
-    trainer = Trainer(
-        strategy="ddp_sharded_spawn",
-        max_epochs=1,
-        limit_train_batches=1,
-        limit_val_batches=0,
-        accelerator="gpu",
-        devices=2,
-    )
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(
+            strategy="ddp_sharded_spawn",
+            max_epochs=1,
+            limit_train_batches=1,
+            limit_val_batches=0,
+            accelerator="gpu",
+            devices=2,
+        )
     trainer.fit(model)
 
     checkpoint_path = trainer.checkpoint_callback.best_model_path
@@ -365,12 +344,13 @@ def test_ddp_sharded_strategy_fit_ckpt_path_downsize_gpus(tmpdir):
     old_optimizer_states = deepcopy(ckpt["optimizer_states"])
 
     model = CheckModelRestore(old_model_state_dict, old_optimizer_states)
-    trainer = Trainer(
-        strategy="ddp_sharded_spawn",
-        max_epochs=2,
-        limit_train_batches=1,
-        limit_val_batches=0,
-        accelerator="gpu",
-        devices=1,
-    )
+    with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+        trainer = Trainer(
+            strategy="ddp_sharded_spawn",
+            max_epochs=2,
+            limit_train_batches=1,
+            limit_val_batches=0,
+            accelerator="gpu",
+            devices=1,
+        )
     trainer.fit(model, ckpt_path=checkpoint_path)
diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
index c0da8086a8b84..067cc811bd3e9 100644
--- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
@@ -241,7 +241,9 @@ def test_interactive_incompatible_backend_error(cuda_count_2, monkeypatch):
     with pytest.raises(MisconfigurationException, match=r"strategy='ddp_spawn'\)`.*is not compatible"):
         Trainer(strategy="ddp_spawn", accelerator="gpu", devices=2)
 
-    with pytest.raises(MisconfigurationException, match=r"strategy='ddp_sharded_spawn'\)`.*is not compatible"):
+    with pytest.raises(
+        MisconfigurationException, match=r"strategy='ddp_sharded_spawn'\)`.*is not compatible"
+    ), pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
         Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=2)
 
     with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"):
@@ -282,7 +284,13 @@ def test_interactive_compatible_strategy_ddp_fork(monkeypatch):
 )
 @pytest.mark.parametrize("devices", [1, 2])
 def test_accelerator_choice_multi_node_gpu(cuda_count_2, tmpdir, strategy, strategy_class, devices):
-    trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices)
+    if "sharded" in strategy:
+        with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+            trainer = Trainer(
+                default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices
+            )
+    else:
+        trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices)
     assert isinstance(trainer.strategy, strategy_class)
 
 
@@ -386,10 +394,12 @@ def test_exception_invalid_strategy():
 )
 @pytest.mark.parametrize("accelerator", ["mps", "auto", "gpu", None, MPSAccelerator()])
 def test_invalid_ddp_strategy_with_mps(accelerator, strategy, strategy_class, mps_count_1, cuda_count_0):
     with pytest.raises(ValueError, match="strategies from the DDP family are not supported"):
         Trainer(accelerator=accelerator, strategy=strategy)
 
-    with pytest.raises(ValueError, match="strategies from the DDP family are not supported"):
+    with pytest.raises(ValueError, match="strategies from the DDP family are not supported"), pytest.deprecated_call(
+        match="FairScale has been deprecated in v1.9.0"
+    ):
         Trainer(accelerator="mps", strategy=strategy_class())
 
 
@@ -428,7 +442,11 @@ def test_strategy_choice_cpu_instance(strategy_class):
     ],
 )
 def test_strategy_choice_gpu_str(strategy, strategy_class):
-    trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2)
+    if "sharded" in strategy:
+        with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+            trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2)
+    else:
+        trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2)
     assert isinstance(trainer.strategy, strategy_class)
 
 
diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py
index edace5429a531..231ed0a415bc7 100644
--- a/tests/tests_pytorch/trainer/test_trainer.py
+++ b/tests/tests_pytorch/trainer/test_trainer.py
@@ -2100,13 +2100,6 @@ def training_step(self, batch, batch_idx):
             CUDAAccelerator,
             2,
         ),
-        (
-            {"strategy": DDPShardedStrategy(), "accelerator": "cuda", "devices": 2},
-            DDPShardedStrategy,
-            "ddp_sharded",
-            CUDAAccelerator,
-            2,
-        ),
         (
             {"strategy": "ddp_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2},
             DDPSpawnStrategy,
@@ -2141,7 +2134,12 @@ def test_trainer_config_strategy(monkeypatch, trainer_kwargs, strategy_cls, stra
     if trainer_kwargs.get("accelerator") == "cuda":
         mock_cuda_count(monkeypatch, trainer_kwargs["devices"])
 
-    trainer = Trainer(**trainer_kwargs)
+    strategy = trainer_kwargs.get("strategy")
+    if (isinstance(strategy, str) and "sharded" in strategy) or isinstance(strategy, DDPShardedStrategy):
+        with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"):
+            trainer = Trainer(**trainer_kwargs)
+    else:
+        trainer = Trainer(**trainer_kwargs)
 
     assert isinstance(trainer.strategy, strategy_cls)
     assert strategy_cls.strategy_name == strategy_name