+
+
+.. Add callout items below this line
+
+.. displayitem::
+ :header: Prepare your code (Optional)
+ :description: Prepare your code to run on any hardware
+ :col_css: col-md-4
+ :button_link: accelerator_prepare.html
+ :height: 150
+ :tag: basic
+
+.. displayitem::
+ :header: Basic
+ :description: Learn the basics of single and multi-GPU training.
+ :col_css: col-md-4
+ :button_link: gpu_basic.html
+ :height: 150
+ :tag: basic
+
+.. displayitem::
+ :header: Intermediate
+ :description: Learn about different distributed strategies, torchelastic and how to optimize communication layers.
+ :col_css: col-md-4
+ :button_link: gpu_intermediate.html
+ :height: 150
+ :tag: intermediate
+
+.. displayitem::
+ :header: Advanced
+ :description: Train 1 trillion+ parameter models with these techniques.
+ :col_css: col-md-4
+ :button_link: gpu_advanced.html
+ :height: 150
+ :tag: advanced
+
+.. displayitem::
+ :header: Expert
+ :description: Develop new strategies for training and deploying larger and larger models.
+ :col_css: col-md-4
+ :button_link: gpu_expert.html
+ :height: 150
+ :tag: expert
+
+.. displayitem::
+ :header: FAQ
+ :description: Frequently asked questions about GPU training.
+ :col_css: col-md-4
+ :button_link: gpu_faq.html
+ :height: 150
-.. code-block:: python
-
- #!/usr/bin/env python
- # setup.py
-
- from setuptools import setup, find_packages
-
- setup(
- name="src",
- version="0.0.1",
- description="Describe Your Cool Project",
- author="",
- author_email="",
- url="https://github.com/YourSeed", # REPLACE WITH YOUR OWN GITHUB PROJECT LINK
- install_requires=["pytorch-lightning"],
- packages=find_packages(),
- )
-
-2. Set up your project like so:
-
-.. code-block:: bash
-
- /project
- /src
- some_file.py
- /or_a_folder
- setup.py
-
-3. Install as a root-level package
-
-.. code-block:: bash
-
- cd /project
- pip install -e .
-
-You can then call your scripts from anywhere:
-
-.. code-block:: bash
-
- cd /project/src
- python some_file.py --accelerator 'gpu' --devices 8 --strategy 'ddp'
-
-
-Horovod
-^^^^^^^
-`Horovod `_ allows the same training script to be used for single-GPU,
-multi-GPU, and multi-node training.
-
-Like Distributed Data Parallel, every process in Horovod operates on a single GPU with a fixed
-subset of the data. Gradients are averaged across all GPUs in parallel during the backward pass,
-then synchronously applied before beginning the next step.
-
-The number of worker processes is configured by a driver application (`horovodrun` or `mpirun`). In
-the training script, Horovod will detect the number of workers from the environment, and automatically
-scale the learning rate to compensate for the increased total batch size.
-
-Horovod can be configured in the training script to run with any number of GPUs / processes as follows:
-
-.. code-block:: python
-
- # train Horovod on GPU (number of GPUs / machines provided on command-line)
- trainer = Trainer(strategy="horovod", accelerator="gpu", devices=1)
-
- # train Horovod on CPU (number of processes / machines provided on command-line)
- trainer = Trainer(strategy="horovod")
-
-When starting the training job, the driver application will then be used to specify the total
-number of worker processes:
-
-.. code-block:: bash
-
- # run training with 4 GPUs on a single machine
- horovodrun -np 4 python train.py
-
- # run training with 8 GPUs on two machines (4 GPUs each)
- horovodrun -np 8 -H hostname1:4,hostname2:4 python train.py
-
-See the official `Horovod documentation `_ for details
-on installation and performance tuning.
-
-
-Bagua
-^^^^^
-`Bagua `_ is a deep learning training acceleration framework which supports
-multiple advanced distributed training algorithms including:
-
-- `Gradient AllReduce `_ for centralized synchronous communication, where gradients are averaged among all workers.
-- `Decentralized SGD `_ for decentralized synchronous communication, where each worker exchanges data with one or a few specific workers.
-- `ByteGrad `_ and `QAdam `_ for low precision communication, where data is compressed into low precision before communication.
-- `Asynchronous Model Average `_ for asynchronous communication, where workers are not required to be synchronized in the same iteration in a lock-step style.
-
-By default, Bagua uses the *Gradient AllReduce* algorithm, which is also the algorithm implemented in Distributed Data Parallel and Horovod,
-but Bagua can usually produce higher training throughput due to its backend being written in Rust.
-
-.. code-block:: python
-
- # train on 4 GPUs (using Bagua mode)
- trainer = Trainer(strategy="bagua", accelerator="gpu", devices=4)
-
-
-By specifying the ``algorithm`` in the ``BaguaStrategy``, you can select more advanced training algorithms featured by Bagua:
-
-
-.. code-block:: python
-
- # train on 4 GPUs, using Bagua Gradient AllReduce algorithm
- trainer = Trainer(
- strategy=BaguaStrategy(algorithm="gradient_allreduce"),
- accelerator="gpu",
- devices=4,
- )
-
- # train on 4 GPUs, using Bagua ByteGrad algorithm
- trainer = Trainer(
- strategy=BaguaStrategy(algorithm="bytegrad"),
- accelerator="gpu",
- devices=4,
- )
-
- # train on 4 GPUs, using Bagua Decentralized SGD
- trainer = Trainer(
- strategy=BaguaStrategy(algorithm="decentralized"),
- accelerator="gpu",
- devices=4,
- )
-
- # train on 4 GPUs, using Bagua Low Precision Decentralized SGD
- trainer = Trainer(
- strategy=BaguaStrategy(algorithm="low_precision_decentralized"),
- accelerator="gpu",
- devices=4,
- )
-
- # train on 4 GPUs, using Asynchronous Model Average algorithm, with a synchronization interval of 100ms
- trainer = Trainer(
- strategy=BaguaStrategy(algorithm="async", sync_interval_ms=100),
- accelerator="gpu",
- devices=4,
- )
-
-To use *QAdam*, we need to initialize
-`QAdamOptimizer `_ first:
-
-.. code-block:: python
-
- import pytorch_lightning as pl
- from pytorch_lightning.strategies import BaguaStrategy
- from bagua.torch_api.algorithms.q_adam import QAdamOptimizer
-
-
- class MyModel(pl.LightningModule):
- ...
-
- def configure_optimizers(self):
- # initialize QAdam Optimizer
- return QAdamOptimizer(self.parameters(), lr=0.05, warmup_steps=100)
-
-
- model = MyModel()
- trainer = Trainer(
- accelerator="gpu",
- devices=4,
- strategy=BaguaStrategy(algorithm="qadam"),
- )
- trainer.fit(model)
-
-Bagua relies on its own `launcher `_ to schedule jobs.
-Below, find examples using ``bagua.distributed.launch``, which follows the ``torch.distributed.launch`` API:
-
-.. code-block:: bash
-
- # start training with 8 GPUs on a single node
- python -m bagua.distributed.launch --nproc_per_node=8 train.py
-
-If the ssh service is available with passwordless login on each node, you can launch the distributed job on a
-single node with ``baguarun``, which has a similar syntax to ``mpirun``. When starting the job, ``baguarun`` will
-automatically spawn new processes on each of the training nodes given by the ``--host_list`` option, where each node
-is described as an IP address followed by an SSH port.
-
-.. code-block:: bash
-
- # Run on node1 (or node2) to start training on two nodes (node1 and node2), 8 GPUs per node
- baguarun --host_list hostname1:ssh_port1,hostname2:ssh_port2 --nproc_per_node=8 --master_port=port1 train.py
-
-
-.. note:: You can also start training in the same way as Distributed Data Parallel. However, system optimizations like
- `Bagua-Net `_ and
- `Performance autotuning `_ can only be enabled through the Bagua
- launcher. It is worth noting that with ``Bagua-Net``, Distributed Data Parallel can also achieve
- better performance without modifying the training script.
-
-
-See `Bagua Tutorials `_ for more details on installation and advanced features.
-
-
-DP/DDP2 caveats
-^^^^^^^^^^^^^^^
-In DP and DDP2, each GPU within a machine sees a portion of a batch.
-DP and DDP2 roughly do the following:
-
-.. testcode::
-
- def distributed_forward(batch, model):
- batch = torch.Tensor(32, 8)
- gpu_0_batch = batch[:8]
- gpu_1_batch = batch[8:16]
- gpu_2_batch = batch[16:24]
- gpu_3_batch = batch[24:]
-
- y_0 = model_copy_gpu_0(gpu_0_batch)
- y_1 = model_copy_gpu_1(gpu_1_batch)
- y_2 = model_copy_gpu_2(gpu_2_batch)
- y_3 = model_copy_gpu_3(gpu_3_batch)
-
- return [y_0, y_1, y_2, y_3]
-
-So, when Lightning calls any of `training_step`, `validation_step`, or `test_step`,
-you will only be operating on one of those pieces.
-
-.. testcode::
-
- # the batch here is a portion of the FULL batch
- def training_step(self, batch, batch_idx):
- y_0 = batch
-
-For most metrics, this doesn't really matter. However, if you want
-to add something to your computational graph (like softmax)
-using all batch parts, you can use the `training_step_end` step.
-
-.. testcode::
-
-    def training_step_end(self, outputs):
-        # only use when on dp
-        outputs = torch.cat(outputs, dim=1)
-        softmax = torch.softmax(outputs, dim=1)
-        out = softmax.mean()
-        return out
-
-In pseudocode, the full sequence is:
-
-.. code-block:: python
-
- # get data
- batch = next(dataloader)
-
- # copy model and data to each gpu
- batch_splits = split_batch(batch, num_gpus)
- models = copy_model_to_gpus(model)
-
- # in parallel, operate on each batch chunk
- all_results = []
- for gpu_num in gpus:
- batch_split = batch_splits[gpu_num]
- gpu_model = models[gpu_num]
- out = gpu_model(batch_split)
- all_results.append(out)
-
- # use the full batch for something like softmax
- full_out = model.training_step_end(all_results)
-
-To illustrate why this is needed, let's look at DataParallel:
-
-.. testcode::
-
-    def training_step(self, batch, batch_idx):
-        x, y = batch
-        y_hat = self(x)
-
-        # on dp or ddp2, computing the softmax here would be wrong
-        # because `batch` is actually a piece of the full batch
-        return y_hat
-
-
-    def training_step_end(self, step_output):
-        # step_output has the outputs of each part of the batch
-
-        # do the softmax here, on the full batch
-        outputs = torch.cat(step_output, dim=1)
-        softmax = torch.softmax(outputs, dim=1)
-        out = softmax.mean()
-
-        return out
-
-If `training_step_end` is defined, it will be called regardless of the accelerator or strategy (TPU, DP, DDP, etc.),
-which means your code will behave the same no matter the backend.
-
-The validation and test steps have equivalent hooks (`validation_step_end`, `test_step_end`) when using DP.
-
-.. testcode::
-
- def validation_step_end(self, step_output):
- ...
-
-
- def test_step_end(self, step_output):
- ...
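-
-As a minimal sketch (mirroring the `training_step_end` example above, and assuming ``step_output``
-holds the per-GPU batch parts), the validation hook could aggregate like this; ``test_step_end``
-follows the same pattern:
-
-.. code-block:: python
-
-    def validation_step_end(self, step_output):
-        # combine the per-GPU parts, then compute the metric on the full batch
-        outputs = torch.cat(step_output, dim=1)
-        probs = torch.softmax(outputs, dim=1)
-        self.log("val_mean_prob", probs.mean())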
-
-
-Distributed and 16-bit precision
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Due to an issue between Apex and DataParallel (on the PyTorch and NVIDIA side), Lightning does
-not allow 16-bit precision with DP training. We tried to get this to work, but it's an issue on their end.
-
-Below are the possible configurations we support.
-
-+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
-| 1 GPU | 1+ GPUs | DP | DDP | 16-bit | command |
-+=======+=========+=====+=====+========+=======================================================================+
-| Y | | | | | `Trainer(accelerator="gpu", devices=1)` |
-+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
-| Y | | | | Y | `Trainer(accelerator="gpu", devices=1, precision=16)` |
-+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
-| | Y | Y | | | `Trainer(accelerator="gpu", devices=k, strategy='dp')` |
-+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
-| | Y | | Y | | `Trainer(accelerator="gpu", devices=k, strategy='ddp')` |
-+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
-| | Y | | Y | Y | `Trainer(accelerator="gpu", devices=k, strategy='ddp', precision=16)` |
-+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
-
-
-Implement Your Own Distributed (DDP) training
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-If you need your own way to init PyTorch DDP you can override :meth:`pytorch_lightning.strategies.ddp.DDPStrategy.init_dist_connection`.
-
-If you also need to use your own DDP implementation, override :meth:`pytorch_lightning.strategies.ddp.DDPStrategy.configure_ddp`.
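-
-As a minimal sketch (the ``print`` call and the bare ``DistributedDataParallel`` wrapping are
-placeholders for your own logic, and the hook signatures are kept generic on purpose):
-
-.. code-block:: python
-
-    import torch
-    from pytorch_lightning import Trainer
-    from pytorch_lightning.strategies import DDPStrategy
-
-
-    class CustomDDPStrategy(DDPStrategy):
-        def init_dist_connection(self, *args, **kwargs):
-            # set up the process group your own way; here we simply
-            # fall back to the default initialization
-            print("initializing distributed connection")
-            super().init_dist_connection(*args, **kwargs)
-
-        def configure_ddp(self):
-            # wrap the model with your own DistributedDataParallel variant
-            self.model = torch.nn.parallel.DistributedDataParallel(self.model)
-
-
-    trainer = Trainer(strategy=CustomDDPStrategy(), accelerator="gpu", devices=8)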
-
-
-Batch size
-----------
-When using distributed training, make sure to adjust your learning rate according to your effective
-batch size.
-
-Let's say you have a batch size of 7 in your dataloader.
-
-.. testcode::
-
- class LitModel(LightningModule):
- def train_dataloader(self):
- return DataLoader(..., batch_size=7)
-
-In DDP, DDP_SPAWN, Deepspeed, DDP_SHARDED, or Horovod, your effective batch size will be 7 * devices * num_nodes.
-
-.. code-block:: python
-
- # effective batch size = 7 * 8
- Trainer(accelerator="gpu", devices=8, strategy="ddp")
- Trainer(accelerator="gpu", devices=8, strategy="ddp_spawn")
- Trainer(accelerator="gpu", devices=8, strategy="ddp_sharded")
- Trainer(accelerator="gpu", devices=8, strategy="horovod")
-
- # effective batch size = 7 * 8 * 10
- Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp")
- Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp_spawn")
- Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp_sharded")
- Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="horovod")
-
-In DDP2 or DP, your effective batch size will be 7 * num_nodes.
-The reason is that the full batch is visible to all GPUs on the node when using DDP2.
-
-.. code-block:: python
-
- # effective batch size = 7
- Trainer(accelerator="gpu", devices=8, strategy="ddp2")
- Trainer(accelerator="gpu", devices=8, strategy="dp")
-
- # effective batch size = 7 * 10
- Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp2")
- Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="dp")
-
-
-.. note:: Huge batch sizes are actually really bad for convergence. Check out:
- `Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour `_
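-
-A common heuristic from the paper above is the linear scaling rule: grow the learning rate with the number
-of processes (i.e. with the effective batch size). A minimal sketch, where the base learning rate of 0.1 and
-the linear rule itself are illustrative assumptions you should tune for your model:
-
-.. code-block:: python
-
-    import torch
-    from pytorch_lightning import LightningModule, Trainer
-
-
-    class LitModel(LightningModule):
-        def __init__(self, base_lr=0.1, num_devices=8, num_nodes=10):
-            super().__init__()
-            self.layer = torch.nn.Linear(32, 2)
-            # linear scaling rule: scale the base LR by the number of processes
-            self.scaled_lr = base_lr * num_devices * num_nodes
-
-        def configure_optimizers(self):
-            return torch.optim.SGD(self.parameters(), lr=self.scaled_lr)
-
-
-    model = LitModel(base_lr=0.1, num_devices=8, num_nodes=10)
-    trainer = Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp")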
-
-----------
-
-Torch Distributed Elastic
--------------------------
-Lightning supports the use of Torch Distributed Elastic to enable fault-tolerant and elastic distributed job scheduling. To use it, specify the 'ddp' or 'ddp2' backend and the number of GPUs you want to use in the trainer.
-
-.. code-block:: python
-
- Trainer(accelerator="gpu", devices=8, strategy="ddp")
-
-To launch a fault-tolerant job, run the following on all nodes.
-
-.. code-block:: bash
-
-    python -m torch.distributed.run \
-        --nnodes=NUM_NODES \
-        --nproc_per_node=TRAINERS_PER_NODE \
-        --rdzv_id=JOB_ID \
-        --rdzv_backend=c10d \
-        --rdzv_endpoint=HOST_NODE_ADDR \
-        YOUR_LIGHTNING_TRAINING_SCRIPT.py (--arg1 ... train script args...)
-
-To launch an elastic job, run the following on at least ``MIN_SIZE`` nodes and at most ``MAX_SIZE`` nodes.
-
-.. code-block:: bash
-
-    python -m torch.distributed.run \
-        --nnodes=MIN_SIZE:MAX_SIZE \
-        --nproc_per_node=TRAINERS_PER_NODE \
-        --rdzv_id=JOB_ID \
-        --rdzv_backend=c10d \
-        --rdzv_endpoint=HOST_NODE_ADDR \
-        YOUR_LIGHTNING_TRAINING_SCRIPT.py (--arg1 ... train script args...)
-
-See the official `Torch Distributed Elastic documentation `_ for details
-on installation and more use cases.
-
-----------
-
-Jupyter Notebooks
------------------
-Unfortunately, none of the `ddp_*` strategies are supported in Jupyter notebooks. Please use `dp` for multiple GPUs. This is a known
-Jupyter issue. If you feel like taking a stab at adding this support, feel free to submit a PR!
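-
-For example, inside a notebook cell (a minimal sketch; the device count is arbitrary):
-
-.. code-block:: python
-
-    from pytorch_lightning import Trainer
-
-    # `dp` works interactively because it runs in a single process
-    trainer = Trainer(accelerator="gpu", devices=2, strategy="dp")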
-
-----------
-
-Pickle Errors
---------------
-Multi-GPU training sometimes requires your model to be pickled. If you run into an issue with pickling,
-try the following to figure out where it breaks:
-
-.. code-block:: python
-
- import pickle
-
- model = YourModel()
- pickle.dumps(model)
+.. raw:: html
-However, if you use `ddp`, there is no pickling requirement and you should be fine. If you use `ddp_spawn`, the
-pickling requirement remains. This is a limitation of Python.
+
+
diff --git a/docs/source/accelerators/gpu_advanced.rst b/docs/source/accelerators/gpu_advanced.rst
new file mode 100644
index 00000000000000..eadeb03edd7ce9
--- /dev/null
+++ b/docs/source/accelerators/gpu_advanced.rst
@@ -0,0 +1,16 @@
+:orphan:
+
+.. _gpu_advanced:
+
+GPU training (Advanced)
+=======================
+**Audience:** Users looking to scale massive models (i.e., 1 trillion+ parameters).
+
+----
+
+For experts pushing the state-of-the-art in model development, Lightning offers various techniques to enable training models at the trillion+ parameter scale.
+
+----
+
+..
+ .. include:: ../advanced/model_parallel.rst
diff --git a/docs/source/accelerators/gpu_basic.rst b/docs/source/accelerators/gpu_basic.rst
new file mode 100644
index 00000000000000..43be718180aa97
--- /dev/null
+++ b/docs/source/accelerators/gpu_basic.rst
@@ -0,0 +1,97 @@
+:orphan:
+
+.. _gpu_basic:
+
+GPU training (Basic)
+====================
+**Audience:** Users looking to save money and run large models faster using single or multiple GPUs.
+
+----
+
+What is a GPU?
+--------------
+A Graphics Processing Unit (GPU) is a specialized hardware accelerator designed to speed up mathematical computations used in gaming and deep learning.
+
+----
+
+Train on 1 GPU
+--------------
+
+Make sure you're running on a machine with at least one GPU. There's no need to specify any NVIDIA flags
+as Lightning will do it for you.
+
+.. testcode::
+ :skipif: torch.cuda.device_count() < 1
+
+ trainer = Trainer(accelerator="gpu", devices=1)
+
+----------------
+
+
+.. _multi_gpu:
+
+Train on multiple GPUs
+----------------------
+
+To use multiple GPUs, set the number of devices in the Trainer or the indices of the GPUs to use.
+
+.. code::
+
+ trainer = Trainer(accelerator="gpu", devices=4)
+
+Choosing GPU devices
+^^^^^^^^^^^^^^^^^^^^
+
+You can select the GPU devices using ranges, a list of indices, or a string containing
+a comma-separated list of GPU ids:
+
+.. testsetup::
+
+ k = 1
+
+.. testcode::
+ :skipif: torch.cuda.device_count() < 2
+
+ # DEFAULT (int) specifies how many GPUs to use per node
+ Trainer(accelerator="gpu", devices=k)
+
+ # Above is equivalent to
+ Trainer(accelerator="gpu", devices=list(range(k)))
+
+ # Specify which GPUs to use (don't use when running on cluster)
+ Trainer(accelerator="gpu", devices=[0, 1])
+
+ # Equivalent using a string
+ Trainer(accelerator="gpu", devices="0, 1")
+
+ # To use all available GPUs put -1 or '-1'
+ # equivalent to list(range(torch.cuda.device_count()))
+ Trainer(accelerator="gpu", devices=-1)
+
+The table below lists examples of possible input formats and how they are interpreted by Lightning.
+
++------------------+-----------+---------------------+---------------------------------+
+| `devices` | Type | Parsed | Meaning |
++==================+===========+=====================+=================================+
+| 3 | int | [0, 1, 2] | first 3 GPUs |
++------------------+-----------+---------------------+---------------------------------+
+| -1 | int | [0, 1, 2, ...] | all available GPUs |
++------------------+-----------+---------------------+---------------------------------+
+| [0] | list | [0] | GPU 0 |
++------------------+-----------+---------------------+---------------------------------+
+| [1, 3] | list | [1, 3] | GPUs 1 and 3 |
++------------------+-----------+---------------------+---------------------------------+
+| "3" | str | [0, 1, 2] | first 3 GPUs |
++------------------+-----------+---------------------+---------------------------------+
+| "1, 3" | str | [1, 3] | GPUs 1 and 3 |
++------------------+-----------+---------------------+---------------------------------+
+| "-1" | str | [0, 1, 2, ...] | all available GPUs |
++------------------+-----------+---------------------+---------------------------------+
+
+.. note::
+
+ When specifying the number of ``devices`` as an integer (``devices=k``), setting the trainer flag
+ ``auto_select_gpus=True`` will automatically help you find ``k`` GPUs that are not
+ occupied by other processes. This is especially useful when GPUs are configured
+ to be in "exclusive mode", such that only one process at a time can access them.
+ For more details see the :doc:`trainer guide <../common/trainer>`.
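+
+For example, a minimal sketch that asks Lightning to pick two free GPUs for you (the device count is arbitrary):
+
+.. code-block:: python
+
+    from pytorch_lightning import Trainer
+
+    # find 2 GPUs that are not occupied by other processes and train on them
+    trainer = Trainer(accelerator="gpu", devices=2, auto_select_gpus=True)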
diff --git a/docs/source/accelerators/gpu_expert.rst b/docs/source/accelerators/gpu_expert.rst
new file mode 100644
index 00000000000000..947850b13f65fe
--- /dev/null
+++ b/docs/source/accelerators/gpu_expert.rst
@@ -0,0 +1,23 @@
+:orphan:
+
+.. _gpu_expert:
+
+GPU training (Expert)
+=====================
+**Audience:** Experts creating new scaling techniques such as DeepSpeed or FSDP.
+
+----
+
+Lightning enables experts who research new ways of optimizing distributed training and inference to implement their own strategies and plug them into Lightning.
+
+For example, Lightning worked closely with the Microsoft team to develop a DeepSpeed integration, and with the Facebook (Meta) team to develop an FSDP integration.
+
+
+----
+
+.. include:: ../extensions/strategy.rst
+
+
+----
+
+.. include:: ../advanced/strategy_registry.rst
diff --git a/docs/source/accelerators/gpu_faq.rst b/docs/source/accelerators/gpu_faq.rst
new file mode 100644
index 00000000000000..c697b2ca7b3549
--- /dev/null
+++ b/docs/source/accelerators/gpu_faq.rst
@@ -0,0 +1,97 @@
+:orphan:
+
+.. _gpu_faq:
+
+GPU training (FAQ)
+==================
+
+******************************************************************
+How should I adjust the learning rate when using multiple devices?
+******************************************************************
+
+When using distributed training, make sure to adjust your learning rate according to your effective
+batch size.
+
+Let's say you have a batch size of 7 in your dataloader.
+
+.. testcode::
+
+ class LitModel(LightningModule):
+ def train_dataloader(self):
+ return DataLoader(..., batch_size=7)
+
+In DDP, DDP_SPAWN, Deepspeed, DDP_SHARDED, or Horovod, your effective batch size will be 7 * devices * num_nodes.
+
+.. code-block:: python
+
+ # effective batch size = 7 * 8
+ Trainer(accelerator="gpu", devices=8, strategy="ddp")
+ Trainer(accelerator="gpu", devices=8, strategy="ddp_spawn")
+ Trainer(accelerator="gpu", devices=8, strategy="ddp_sharded")
+ Trainer(accelerator="gpu", devices=8, strategy="horovod")
+
+ # effective batch size = 7 * 8 * 10
+ Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp")
+ Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp_spawn")
+ Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp_sharded")
+ Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="horovod")
+
+In DDP2 or DP, your effective batch size will be 7 * num_nodes.
+The reason is that the full batch is visible to all GPUs on the node when using DDP2.
+
+.. code-block:: python
+
+ # effective batch size = 7
+ Trainer(accelerator="gpu", devices=8, strategy="ddp2")
+ Trainer(accelerator="gpu", devices=8, strategy="dp")
+
+ # effective batch size = 7 * 10
+ Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="ddp2")
+ Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy="dp")
+
+
+.. note:: Huge batch sizes are actually really bad for convergence. Check out:
+ `Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour