From 424e78d5b35e5f90ad5700a76a920ec23b3cd944 Mon Sep 17 00:00:00 2001 From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com> Date: Thu, 25 Aug 2022 14:15:01 +0530 Subject: [PATCH 01/30] Add document to showcase scaleout on hpu (#14357) Co-authored-by: Rohit Gupta Co-authored-by: Akihiro Nitta --- .../source-pytorch/accelerators/hpu_basic.rst | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/docs/source-pytorch/accelerators/hpu_basic.rst b/docs/source-pytorch/accelerators/hpu_basic.rst index a6c20414a7a02..2f075c94001f6 100644 --- a/docs/source-pytorch/accelerators/hpu_basic.rst +++ b/docs/source-pytorch/accelerators/hpu_basic.rst @@ -47,6 +47,40 @@ It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy` ---- +Scale-out on Gaudis +------------------- + +To train a Lightning model using multiple HPU nodes, set the ``num_nodes`` parameter with the available nodes in the ``Trainer`` class. + +.. code-block:: python + + trainer = Trainer(accelerator="hpu", devices=8, strategy="hpu_parallel", num_nodes=2) + +In addition to this, the following environment variables need to be set to establish communication across nodes. Check out the documentation on :doc:`Cluster Environment <../clouds/cluster>` for more details. + +- *MASTER_PORT* - required; has to be a free port on machine with NODE_RANK 0 +- *MASTER_ADDR* - required (except for NODE_RANK 0); address of NODE_RANK 0 node +- *WORLD_SIZE* - required; how many workers are in the cluster +- *NODE_RANK* - required; id of the node in the cluster + +The trainer needs to be instantiated on every node participating in the training. + +On Node 1: + +.. code-block:: bash + + MASTER_ADDR= MASTER_PORT= NODE_RANK=0 WORLD_SIZE=16 + python -m some_model_trainer.py (--arg1 ... train script args...) + +On Node 2: + +.. code-block:: bash + + MASTER_ADDR= MASTER_PORT= NODE_RANK=1 WORLD_SIZE=16 + python -m some_model_trainer.py (--arg1 ... train script args...) + +---- + Select Gaudis automatically --------------------------- From 1ae14ca7542e83db47888fd0d68324449014dfc0 Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Thu, 25 Aug 2022 19:30:06 +0200 Subject: [PATCH 02/30] [CI] fix horovod tests (#14382) --- .azure/gpu-tests.yml | 11 +++++++---- dockers/base-conda/Dockerfile | 14 ++++++++------ dockers/base-cuda/Dockerfile | 23 +++++++---------------- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f19c5bafc7814..2da30c0dd66ab 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES @@ -72,12 +72,15 @@ jobs: set -e python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" + TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0" - pip install -e .[strategies] - pip install -U deepspeed # TODO: remove when docker images are upgraded - pip install --requirement requirements/pytorch/devel.txt + pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html pip list env: PACKAGE_NAME: pytorch diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index d6bfeee90d561..9bb75e34b8ff6 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -34,6 +34,10 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ + NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ + CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ + MAX_ALLOWED_NCCL=2.11.4 && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ apt-get install -y --no-install-recommends \ build-essential \ cmake \ @@ -42,17 +46,15 @@ RUN \ curl \ unzip \ ca-certificates \ - libopenmpi-dev - -RUN \ + libopenmpi-dev \ + libnccl2=$TO_INSTALL_NCCL \ + libnccl-dev=$TO_INSTALL_NCCL && \ # Install conda and python. # NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ - rm ~/miniconda.sh - -RUN \ + rm ~/miniconda.sh && \ # Cleaning apt-get autoremove -y && \ apt-get clean && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index be613f3b6415f..08692ff00ab78 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -37,7 +37,11 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ - apt-get install -y --no-install-recommends \ + NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ + CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ + MAX_ALLOWED_NCCL=2.11.4 && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ + apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ cmake \ @@ -50,8 +54,8 @@ RUN \ libopenmpi-dev \ openmpi-bin \ ssh \ - && \ - + libnccl2=$TO_INSTALL_NCCL \ + libnccl-dev=$TO_INSTALL_NCCL && \ # Install python add-apt-repository ppa:deadsnakes/ppa && \ apt-get install -y \ @@ -59,10 +63,8 @@ RUN \ python${PYTHON_VERSION}-distutils \ python${PYTHON_VERSION}-dev \ && \ - update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \ - # Cleaning apt-get autoremove -y && \ apt-get clean && \ @@ -78,7 +80,6 @@ RUN \ wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ python${PYTHON_VERSION} get-pip.py && \ rm get-pip.py && \ - pip install -q fire && \ # Disable cache \ CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ @@ -91,16 +92,6 @@ RUN \ pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \ rm assistant.py -RUN \ - apt-get purge -y cmake && \ - wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \ - tar -zxvf cmake-3.20.2.tar.gz && \ - cd cmake-3.20.2 && \ - ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \ - make && \ - make install && \ - cmake --version - ENV \ HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ HOROVOD_GPU_OPERATIONS=NCCL \ From 807435885ea265580fee9f4e69c063eace46def2 Mon Sep 17 00:00:00 2001 From: Tanmoy Date: Fri, 26 Aug 2022 00:27:48 +0530 Subject: [PATCH 03/30] Fix `LightningDataModule` hparams parsing (#12806) Co-authored-by: Akihiro Nitta Co-authored-by: Jirka Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/utilities/parsing.py | 13 ++-- .../tuner/test_scale_batch_size.py | 69 ++++++++++++------- tests/tests_pytorch/utilities/test_parsing.py | 22 ++++-- 4 files changed, 71 insertions(+), 36 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 07c34bbc0e579..642cb28d4db4c 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) + + ## [1.7.2] - 2022-08-17 ### Added diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py index 073423ab60773..22dfb538828ab 100644 --- a/src/pytorch_lightning/utilities/parsing.py +++ b/src/pytorch_lightning/utilities/parsing.py @@ -321,14 +321,17 @@ def _lightning_get_all_attr_holders(model: "pl.LightningModule", attribute: str) holders.append(model) # Check if attribute in model.hparams, either namespace or dict - if hasattr(model, "hparams"): - if attribute in model.hparams: - holders.append(model.hparams) + if hasattr(model, "hparams") and attribute in model.hparams: + holders.append(model.hparams) trainer = model._trainer # Check if the attribute in datamodule (datamodule gets registered in Trainer) - if trainer is not None and trainer.datamodule is not None and hasattr(trainer.datamodule, attribute): - holders.append(trainer.datamodule) + if trainer is not None and trainer.datamodule is not None: + if hasattr(trainer.datamodule, attribute): + holders.append(trainer.datamodule) + + if hasattr(trainer.datamodule, "hparams") and attribute in trainer.datamodule.hparams: + holders.append(trainer.datamodule.hparams) return holders diff --git a/tests/tests_pytorch/tuner/test_scale_batch_size.py b/tests/tests_pytorch/tuner/test_scale_batch_size.py index d2fc8a61e0107..ce7c3613f5012 100644 --- a/tests/tests_pytorch/tuner/test_scale_batch_size.py +++ b/tests/tests_pytorch/tuner/test_scale_batch_size.py @@ -29,8 +29,8 @@ class BatchSizeDataModule(BoringDataModule): - def __init__(self, batch_size): - super().__init__() + def __init__(self, data_dir, batch_size): + super().__init__(data_dir) if batch_size is not None: self.batch_size = batch_size @@ -58,7 +58,7 @@ def test_scale_batch_size_method_with_model_or_datamodule(tmpdir, model_bs, dm_b tuner = Tuner(trainer) model = BatchSizeModel(model_bs) - datamodule = BatchSizeDataModule(dm_bs) if dm_bs != -1 else None + datamodule = BatchSizeDataModule(tmpdir, dm_bs) if dm_bs != -1 else None new_batch_size = tuner.scale_batch_size(model, mode="binsearch", init_val=4, max_trials=2, datamodule=datamodule) assert new_batch_size == 16 @@ -140,47 +140,64 @@ def test_auto_scale_batch_size_trainer_arg(tmpdir, scale_arg): assert not os.path.exists(tmpdir / "scale_batch_size_temp_model.ckpt") -@RunIf(min_cuda_gpus=1) @pytest.mark.parametrize("use_hparams", [True, False]) def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams): - """Test that new batch size gets written to the correct hyperparameter attribute.""" + """Test that new batch size gets written to the correct hyperparameter attribute for model.""" tutils.reset_seed() hparams = {"batch_size": 2} - before_batch_size = hparams.get("batch_size") + before_batch_size = hparams["batch_size"] - class HparamsBatchSizeModel(BatchSizeModel): + class HparamsBatchSizeModel(BoringModel): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__() self.save_hyperparameters() - def dataloader(self, *args, **kwargs): - # artificially set batch_size so we can get a dataloader - # remove it immediately after, because we want only self.hparams.batch_size - setattr(self, "batch_size", before_batch_size) - dataloader = super().dataloader(*args, **kwargs) - del self.batch_size - return 
dataloader + def train_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) + + def val_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) + + model_class = HparamsBatchSizeModel if use_hparams else BatchSizeModel + model = model_class(**hparams) + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True) + trainer.tune(model, scale_batch_size_kwargs={"steps_per_trial": 2, "max_trials": 4}) + after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size + assert before_batch_size != after_batch_size + assert after_batch_size <= len(trainer.train_dataloader.dataset) + + +@pytest.mark.parametrize("use_hparams", [True, False]) +def test_auto_scale_batch_size_set_datamodule_attribute(tmpdir, use_hparams): + """Test that new batch size gets written to the correct hyperparameter attribute for datamodule.""" + tutils.reset_seed() + + hparams = {"batch_size": 2} + before_batch_size = hparams["batch_size"] class HparamsBatchSizeDataModule(BoringDataModule): def __init__(self, data_dir, batch_size): super().__init__(data_dir) - self.batch_size = batch_size + self.save_hyperparameters() def train_dataloader(self): - return DataLoader(self.random_train, batch_size=self.batch_size) + return DataLoader(self.random_train, batch_size=self.hparams.batch_size) - datamodule_fit = HparamsBatchSizeDataModule(data_dir=tmpdir, batch_size=before_batch_size) - model_class = HparamsBatchSizeModel if use_hparams else BatchSizeModel - model = model_class(**hparams) + def val_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True, accelerator="gpu", devices=1) - trainer.tune(model, datamodule_fit) - after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size - assert trainer.datamodule == datamodule_fit - assert before_batch_size != after_batch_size + datamodule_class = HparamsBatchSizeDataModule if use_hparams else BatchSizeDataModule + datamodule = datamodule_class(data_dir=tmpdir, batch_size=before_batch_size) + model = BatchSizeModel(**hparams) + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True) + trainer.tune(model, datamodule=datamodule, scale_batch_size_kwargs={"steps_per_trial": 2, "max_trials": 4}) + after_batch_size = datamodule.hparams.batch_size if use_hparams else datamodule.batch_size + assert trainer.datamodule == datamodule + assert before_batch_size < after_batch_size assert after_batch_size <= len(trainer.train_dataloader.dataset) - assert datamodule_fit.batch_size == after_batch_size def test_auto_scale_batch_size_duplicate_attribute_warning(tmpdir): diff --git a/tests/tests_pytorch/utilities/test_parsing.py b/tests/tests_pytorch/utilities/test_parsing.py index e918c9df2ac32..98b00a374d778 100644 --- a/tests/tests_pytorch/utilities/test_parsing.py +++ b/tests/tests_pytorch/utilities/test_parsing.py @@ -64,8 +64,8 @@ class TestModel4(LightningModule): # fail case batch_size = 1 model4 = TestModel4() - trainer = Trainer() + model4.trainer = trainer datamodule = LightningDataModule() datamodule.batch_size = 8 trainer.datamodule = datamodule @@ -87,12 +87,21 @@ class TestModel7(LightningModule): # test for datamodule w/ hparams w/ attribut model7 = TestModel7() model7.trainer = trainer - return model1, model2, model3, model4, model5, model6, model7 + class 
TestDataModule8(LightningDataModule): # test for hparams dict + hparams = TestHparamsDict2 + + model8 = TestModel1() + trainer = Trainer() + model8.trainer = trainer + datamodule = TestDataModule8() + trainer.datamodule = datamodule + + return model1, model2, model3, model4, model5, model6, model7, model8 def test_lightning_hasattr(): """Test that the lightning_hasattr works in all cases.""" - model1, model2, model3, model4, model5, model6, model7 = models = model_cases() + model1, model2, model3, model4, model5, model6, model7, model8 = models = model_cases() assert lightning_hasattr(model1, "learning_rate"), "lightning_hasattr failed to find namespace variable" assert lightning_hasattr(model2, "learning_rate"), "lightning_hasattr failed to find hparams namespace variable" assert lightning_hasattr(model3, "learning_rate"), "lightning_hasattr failed to find hparams dict variable" @@ -104,6 +113,7 @@ def test_lightning_hasattr(): assert lightning_hasattr( model7, "batch_size" ), "lightning_hasattr failed to find batch_size in hparams w/ datamodule present" + assert lightning_hasattr(model8, "batch_size") for m in models: assert not lightning_hasattr(m, "this_attr_not_exist") @@ -116,10 +126,11 @@ def test_lightning_getattr(): value = lightning_getattr(m, "learning_rate") assert value == i, "attribute not correctly extracted" - model5, model6, model7 = models[4:] + model5, model6, model7, model8 = models[4:] assert lightning_getattr(model5, "batch_size") == 8, "batch_size not correctly extracted" assert lightning_getattr(model6, "batch_size") == 8, "batch_size not correctly extracted" assert lightning_getattr(model7, "batch_size") == 8, "batch_size not correctly extracted" + assert lightning_getattr(model8, "batch_size") == 2, "batch_size not correctly extracted" for m in models: with pytest.raises( @@ -136,13 +147,14 @@ def test_lightning_setattr(tmpdir): lightning_setattr(m, "learning_rate", 10) assert lightning_getattr(m, "learning_rate") == 10, "attribute not correctly set" - model5, model6, model7 = models[4:] + model5, model6, model7, model8 = models[4:] lightning_setattr(model5, "batch_size", 128) lightning_setattr(model6, "batch_size", 128) lightning_setattr(model7, "batch_size", 128) assert lightning_getattr(model5, "batch_size") == 128, "batch_size not correctly set" assert lightning_getattr(model6, "batch_size") == 128, "batch_size not correctly set" assert lightning_getattr(model7, "batch_size") == 128, "batch_size not correctly set" + assert lightning_getattr(model8, "batch_size") == 128, "batch_size not correctly set" for m in models: with pytest.raises( From 33a5ed98794943b7eb6c7fcfa078b184c9d4d736 Mon Sep 17 00:00:00 2001 From: Anner Date: Fri, 26 Aug 2022 06:26:00 +0100 Subject: [PATCH 04/30] Add torch.cuda rng state to seed save/load (#14384) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 4 +++- src/pytorch_lightning/utilities/seed.py | 15 ++++++++++++--- tests/tests_pytorch/utilities/test_seed.py | 20 +++++++++++++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 642cb28d4db4c..ac7e68d177fbe 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -27,7 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Replaced the unwrapping logic in strategies with direct access to unwrapped `LightningModule` ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) -- Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([14023](https://github.com/Lightning-AI/lightning/pull/14023)) +- Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([#14023](https://github.com/Lightning-AI/lightning/pull/14023)) + +- Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py index 8fce6a1debfcf..925337c7845ae 100644 --- a/src/pytorch_lightning/utilities/seed.py +++ b/src/pytorch_lightning/utilities/seed.py @@ -121,13 +121,22 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: def _collect_rng_states() -> Dict[str, Any]: - """Collect the global random state of :mod:`torch`, :mod:`numpy` and Python.""" - return {"torch": torch.get_rng_state(), "numpy": np.random.get_state(), "python": python_get_rng_state()} + """Collect the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python.""" + return { + "torch": torch.get_rng_state(), + "torch.cuda": torch.cuda.get_rng_state_all(), + "numpy": np.random.get_state(), + "python": python_get_rng_state(), + } def _set_rng_states(rng_state_dict: Dict[str, Any]) -> None: - """Set the global random state of :mod:`torch`, :mod:`numpy` and Python in the current process.""" + """Set the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python in the current + process.""" torch.set_rng_state(rng_state_dict["torch"]) + # torch.cuda rng_state is only included since v1.8. + if "torch.cuda" in rng_state_dict: + torch.cuda.set_rng_state_all(rng_state_dict["torch.cuda"]) np.random.set_state(rng_state_dict["numpy"]) version, state, gauss = rng_state_dict["python"] python_set_rng_state((version, tuple(state), gauss)) diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py index 6908badf1a037..c8df824e93b41 100644 --- a/tests/tests_pytorch/utilities/test_seed.py +++ b/tests/tests_pytorch/utilities/test_seed.py @@ -9,7 +9,7 @@ import torch import pytorch_lightning.utilities.seed as seed_utils -from pytorch_lightning.utilities.seed import isolate_rng +from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states, isolate_rng @mock.patch.dict(os.environ, {}, clear=True) @@ -87,6 +87,13 @@ def test_isolate_rng(): generated = [torch.rand(2) for _ in range(3)] assert torch.equal(torch.rand(2), generated[0]) + # torch.cuda + if torch.cuda.is_available(): + torch.cuda.FloatTensor(1).normal_() + with isolate_rng(): + generated = [torch.cuda.FloatTensor(2).normal_() for _ in range(3)] + assert torch.equal(torch.cuda.FloatTensor(2).normal_(), generated[0]) + # numpy np.random.rand(1) with isolate_rng(): @@ -100,6 +107,17 @@ def test_isolate_rng(): assert random.random() == generated[0] +def test_backward_compatibility_rng_states_dict(): + """Test that an older rng_states_dict without the "torch.cuda" key does not crash. + + This test is only relevant when torch.cuda is available. 
+ """ + states = _collect_rng_states() + assert "torch.cuda" in states + states.pop("torch.cuda") + _set_rng_states(states) + + @mock.patch("pytorch_lightning.utilities.seed.log.info") @pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}]) def test_seed_everything_log_info(log_mock: MagicMock, env_vars: Mapping[str, str]): From a01e016fff3c2c9b3d88b3cbace077b7f06bfdbe Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 26 Aug 2022 08:47:37 +0200 Subject: [PATCH 05/30] Remove mps config for test (#14379) * Remove mps config for test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- tests/tests_pytorch/utilities/test_fetching.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/tests_pytorch/utilities/test_fetching.py b/tests/tests_pytorch/utilities/test_fetching.py index 2d5e3954c7061..fa0551b5a07e1 100644 --- a/tests/tests_pytorch/utilities/test_fetching.py +++ b/tests/tests_pytorch/utilities/test_fetching.py @@ -192,9 +192,7 @@ def test_dataloader(self): @pytest.mark.flaky(reruns=3) -@pytest.mark.parametrize( - "accelerator", [pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)), pytest.param("mps", marks=RunIf(mps=True))] -) +@pytest.mark.parametrize("accelerator", [pytest.param("cuda", marks=RunIf(min_cuda_gpus=1))]) def test_trainer_num_prefetch_batches(tmpdir, accelerator): model = RecommenderModel() From 6a999f123c609eb91e65d1ab9a3652ebdd3f71ce Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Fri, 26 Aug 2022 08:27:33 +0100 Subject: [PATCH 06/30] Fix mypy errors attributed to `pytorch_lightning.demos.boring_classes` (#14201) Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: otaj --- pyproject.toml | 1 - src/pytorch_lightning/demos/boring_classes.py | 77 +++++++++++-------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 14e49ffb24b8e..60c031dfe920b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,6 @@ warn_no_return = "False" module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.core.datamodule", - "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", diff --git a/src/pytorch_lightning/demos/boring_classes.py b/src/pytorch_lightning/demos/boring_classes.py index 4fd8fd139e5f4..f7be5390466d8 100644 --- a/src/pytorch_lightning/demos/boring_classes.py +++ b/src/pytorch_lightning/demos/boring_classes.py @@ -11,14 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional +from typing import cast, Dict, Iterator, List, Optional, Tuple, Union import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader, Dataset, IterableDataset, Subset from pytorch_lightning import LightningDataModule, LightningModule +from pytorch_lightning.core.optimizer import LightningOptimizer +from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT class RandomDictDataset(Dataset): @@ -26,7 +31,7 @@ def __init__(self, size: int, length: int): self.len = length self.data = torch.randn(length, size) - def __getitem__(self, index): + def __getitem__(self, index: int) -> Dict[str, Tensor]: a = self.data[index] b = a + 2 return {"a": a, "b": b} @@ -40,7 +45,7 @@ def __init__(self, size: int, length: int): self.len = length self.data = torch.randn(length, size) - def __getitem__(self, index): + def __getitem__(self, index: int) -> Tensor: return self.data[index] def __len__(self) -> int: @@ -52,7 +57,7 @@ def __init__(self, size: int, count: int): self.count = count self.size = size - def __iter__(self): + def __iter__(self) -> Iterator[Tensor]: for _ in range(self.count): yield torch.randn(self.size) @@ -62,16 +67,16 @@ def __init__(self, size: int, count: int): self.count = count self.size = size - def __iter__(self): + def __iter__(self) -> Iterator[Tensor]: for _ in range(len(self)): yield torch.randn(self.size) - def __len__(self): + def __len__(self) -> int: return self.count class BoringModel(LightningModule): - def __init__(self): + def __init__(self) -> None: """Testing PL Module. Use as follows: @@ -90,60 +95,63 @@ def training_step(...): super().__init__() self.layer = torch.nn.Linear(32, 2) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: # type: ignore[override] return self.layer(x) - def loss(self, batch, preds): + def loss(self, batch: Tensor, preds: Tensor) -> Tensor: # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls return torch.nn.functional.mse_loss(preds, torch.ones_like(preds)) - def step(self, x): + def step(self, x: Tensor) -> Tensor: x = self(x) out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) return out - def training_step(self, batch, batch_idx): + def training_step(self, batch: Tensor, batch_idx: int) -> STEP_OUTPUT: # type: ignore[override] output = self(batch) loss = self.loss(batch, output) return {"loss": loss} - def training_step_end(self, training_step_outputs): + def training_step_end(self, training_step_outputs: STEP_OUTPUT) -> STEP_OUTPUT: return training_step_outputs - def training_epoch_end(self, outputs) -> None: + def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: + outputs = cast(List[Dict[str, Tensor]], outputs) torch.stack([x["loss"] for x in outputs]).mean() - def validation_step(self, batch, batch_idx): + def validation_step(self, batch: Tensor, batch_idx: int) -> Optional[STEP_OUTPUT]: # type: ignore[override] output = self(batch) loss = self.loss(batch, output) return {"x": loss} - def validation_epoch_end(self, outputs) -> None: + def validation_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: + outputs = cast(List[Dict[str, Tensor]], outputs) torch.stack([x["x"] for x in outputs]).mean() - def test_step(self, batch, batch_idx): + def test_step(self, batch: Tensor, batch_idx: int) -> Optional[STEP_OUTPUT]: # type: ignore[override] output = 
self(batch) loss = self.loss(batch, output) return {"y": loss} - def test_epoch_end(self, outputs) -> None: + def test_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: + outputs = cast(List[Dict[str, Tensor]], outputs) torch.stack([x["y"] for x in outputs]).mean() - def configure_optimizers(self): + def configure_optimizers(self) -> Tuple[List[torch.optim.Optimizer], List[_LRScheduler]]: optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) return [optimizer], [lr_scheduler] - def train_dataloader(self): + def train_dataloader(self) -> DataLoader: return DataLoader(RandomDataset(32, 64)) - def val_dataloader(self): + def val_dataloader(self) -> DataLoader: return DataLoader(RandomDataset(32, 64)) - def test_dataloader(self): + def test_dataloader(self) -> DataLoader: return DataLoader(RandomDataset(32, 64)) - def predict_dataloader(self): + def predict_dataloader(self) -> DataLoader: return DataLoader(RandomDataset(32, 64)) @@ -155,7 +163,7 @@ def __init__(self, data_dir: str = "./"): self.checkpoint_state: Optional[str] = None self.random_full = RandomDataset(32, 64 * 4) - def setup(self, stage: Optional[str] = None): + def setup(self, stage: Optional[str] = None) -> None: if stage == "fit" or stage is None: self.random_train = Subset(self.random_full, indices=range(64)) @@ -168,26 +176,27 @@ def setup(self, stage: Optional[str] = None): if stage == "predict" or stage is None: self.random_predict = Subset(self.random_full, indices=range(64 * 3, 64 * 4)) - def train_dataloader(self): + def train_dataloader(self) -> DataLoader: return DataLoader(self.random_train) - def val_dataloader(self): + def val_dataloader(self) -> DataLoader: return DataLoader(self.random_val) - def test_dataloader(self): + def test_dataloader(self) -> DataLoader: return DataLoader(self.random_test) - def predict_dataloader(self): + def predict_dataloader(self) -> DataLoader: return DataLoader(self.random_predict) class ManualOptimBoringModel(BoringModel): - def __init__(self): + def __init__(self) -> None: super().__init__() self.automatic_optimization = False - def training_step(self, batch, batch_idx): + def training_step(self, batch: Tensor, batch_idx: int) -> STEP_OUTPUT: # type: ignore[override] opt = self.optimizers() + assert isinstance(opt, (Optimizer, LightningOptimizer)) output = self(batch) loss = self.loss(batch, output) opt.zero_grad() @@ -202,21 +211,21 @@ def __init__(self, out_dim: int = 10, learning_rate: float = 0.02): self.l1 = torch.nn.Linear(32, out_dim) self.learning_rate = learning_rate - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: # type: ignore[override] return torch.relu(self.l1(x.view(x.size(0), -1))) - def training_step(self, batch, batch_nb): + def training_step(self, batch: Tensor, batch_nb: int) -> STEP_OUTPUT: # type: ignore[override] x = batch x = self(x) loss = x.sum() return loss - def configure_optimizers(self): + def configure_optimizers(self) -> torch.optim.Optimizer: return torch.optim.Adam(self.parameters(), lr=self.learning_rate) class Net(nn.Module): - def __init__(self): + def __init__(self) -> None: super().__init__() self.conv1 = nn.Conv2d(1, 32, 3, 1) self.conv2 = nn.Conv2d(32, 64, 3, 1) @@ -225,7 +234,7 @@ def __init__(self): self.fc1 = nn.Linear(9216, 128) self.fc2 = nn.Linear(128, 10) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: x = self.conv1(x) x = F.relu(x) x = self.conv2(x) From e67842dcbab5a98a0c77a181bddcbe9ef18bf887 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 09:58:21 +0200 Subject: [PATCH 07/30] Support sharded optimizer state dumping outside of sharded strategies (#14208) --- src/pytorch_lightning/CHANGELOG.md | 3 ++ src/pytorch_lightning/strategies/sharded.py | 17 +----- .../strategies/sharded_spawn.py | 16 +----- src/pytorch_lightning/strategies/strategy.py | 10 ++++ .../strategies/test_ddp_strategy.py | 53 +++++++++++++++++++ .../strategies/test_sharded_strategy.py | 33 ++++++++++-- 6 files changed, 97 insertions(+), 35 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ac7e68d177fbe..32303d6babb5d 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for passing extra init-parameters to the `LightningDataModule.from_datasets` ([#14185](https://github.com/Lightning-AI/lightning/issues/14185)) +- Added support for saving sharded optimizer state dict outside of `DDPShardedStrategy` ([#14208](https://github.com/PyTorchLightning/pytorch-lightning/pull/14208)) + + ### Changed diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py index ce1e4cd96b961..3b77bc6ceeb70 100644 --- a/src/pytorch_lightning/strategies/sharded.py +++ b/src/pytorch_lightning/strategies/sharded.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Dict, Generator, List, Optional, Tuple, Union +from typing import Dict, Generator, List, Tuple, Union from torch import Tensor from torch.nn import Module @@ -27,7 +27,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.rank_zero import rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel @@ -120,20 +119,6 @@ def _reinit_optimizers_with_oss(self, optimizers: List[Union[Optimizer, Lightnin del optimizer return optimizers - def optimizer_state(self, optimizer: "OSS") -> Optional[dict]: - if isinstance(optimizer, LightningOptimizer): - optimizer = optimizer._optimizer - optimizer.consolidate_state_dict() - return self._optim_state_dict(optimizer) - - @rank_zero_only - def _optim_state_dict(self, optimizer): - """ - Retrieves state dict only on rank 0, which contains the entire optimizer state after calling - :meth:`consolidate_state_dict`. - """ - return optimizer.state_dict() - def pre_backward(self, closure_loss: Tensor) -> None: pass diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py index f19aae7302eea..01ccb75677544 100644 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ b/src/pytorch_lightning/strategies/sharded_spawn.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Tuple +from typing import Dict, Generator, List, Tuple from torch import Tensor from torch.nn import Module @@ -25,7 +25,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.rank_zero import rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel @@ -85,11 +84,6 @@ def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: return self._reinit_optimizers_with_oss(optimizers) - def optimizer_state(self, optimizer: "OSS") -> Dict[str, Any]: - if isinstance(optimizer, OSS): - optimizer.consolidate_state_dict() - return self._optim_state_dict(optimizer) - @contextmanager def block_backward_sync(self) -> Generator: """Blocks syncing gradients behaviour on backwards pass. @@ -103,14 +97,6 @@ def block_backward_sync(self) -> Generator: else: yield None - @rank_zero_only - def _optim_state_dict(self, optimizer: Optimizer) -> Dict[str, Any]: - """ - Retrieves state dict only on rank 0, which contains the entire optimizer state after calling - :meth:`consolidate_state_dict`. - """ - return optimizer.state_dict() - def pre_backward(self, closure_loss: Tensor) -> None: pass diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index c09e7eae8c586..0abc5fe516273 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -170,6 +170,16 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]: Allows for syncing/collating optimizer state from processes in custom plugins. 
""" + if isinstance(optimizer, LightningOptimizer): + optimizer = optimizer._optimizer + + if hasattr(optimizer, "consolidate_state_dict"): + # there are optimizers like Fairscale's OSS or PyTorch's ZeroRedundancyOptimizer that shard their + # states, and to avoid OOM we consolidate the full state on rank 0 only + optimizer.consolidate_state_dict() + return optimizer.state_dict() if self.is_global_zero else {} + + # for optimizers that are not sharded, we return the state dict on all ranks return optimizer.state_dict() def backward( diff --git a/tests/tests_pytorch/strategies/test_ddp_strategy.py b/tests/tests_pytorch/strategies/test_ddp_strategy.py index 8d2d965f1d4c6..318505a984216 100644 --- a/tests/tests_pytorch/strategies/test_ddp_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_strategy.py @@ -24,8 +24,14 @@ from pytorch_lightning.plugins.environments import ClusterEnvironment, LightningEnvironment from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn +from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _TORCH_GREATER_EQUAL_1_10 from tests_pytorch.helpers.runif import RunIf +if _FAIRSCALE_AVAILABLE: + from fairscale.optim import OSS +if _TORCH_GREATER_EQUAL_1_10: + from torch.distributed.optim import ZeroRedundancyOptimizer + class BoringModelGPU(BoringModel): def on_train_start(self) -> None: @@ -252,3 +258,50 @@ def test_ddp_strategy_set_timeout(mock_init_process_group): mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta ) + + +class BoringFairScaleOptimizerModel(BoringModel): + def configure_optimizers(self): + base_optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) + + +@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) +@pytest.mark.parametrize("strategy", (pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn")) +def test_ddp_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy): + """Test to ensure that checkpoint is saved correctly when using faircale optimizer.""" + model = BoringFairScaleOptimizerModel() + trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) + + trainer.fit(model) + + checkpoint_path = os.path.join(tmpdir, "model.pt") + trainer.save_checkpoint(checkpoint_path) + saved_model = BoringModel.load_from_checkpoint(checkpoint_path) + + # Assert model parameters are identical after loading + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) + + +class BoringZeroRedundancyOptimizerModel(BoringModel): + def configure_optimizers(self): + return ZeroRedundancyOptimizer(self.layer.parameters(), optimizer_class=torch.optim.Adam, lr=0.1) + + +@RunIf(min_cuda_gpus=2, skip_windows=True, min_torch="1.10") +@pytest.mark.parametrize("strategy", (pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn")) +def test_ddp_strategy_checkpoint_zero_redundancy_optimizer(tmpdir, strategy): + """Test to ensure that checkpoint is saved correctly when using zero redundancy optimizer.""" + model = BoringZeroRedundancyOptimizerModel() + trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) + + trainer.fit(model) + + checkpoint_path = os.path.join(tmpdir, "model.pt") + trainer.save_checkpoint(checkpoint_path) + saved_model = 
BoringModel.load_from_checkpoint(checkpoint_path) + + # Assert model parameters are identical after loading + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py index a0abfb3f73ec0..acefecbf4d2a7 100644 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py @@ -14,6 +14,7 @@ if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel + from fairscale.optim import OSS @pytest.mark.parametrize("clip_val", [0, 10]) @@ -70,8 +71,8 @@ def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir): saved_model = BoringModel.load_from_checkpoint(checkpoint_path) # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(ddp_param.to("cpu"), shard_param) + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) @RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) @@ -87,8 +88,8 @@ def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir): saved_model = BoringModel.load_from_checkpoint(checkpoint_path) # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(ddp_param.to("cpu"), shard_param) + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) @RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) @@ -314,3 +315,27 @@ def test_block_backward_sync(): def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): trainer = Trainer(strategy=strategy_name) assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs + + +class BoringFairScaleOptimizerModel(BoringModel): + def configure_optimizers(self): + base_optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) + + +@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) +@pytest.mark.parametrize("strategy", (pytest.param("ddp_sharded", marks=RunIf(standalone=True)), "ddp_sharded_spawn")) +def test_ddp_sharded_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy): + """Test to ensure that checkpoint is saved correctly when using fairscale optimizers.""" + model = BoringFairScaleOptimizerModel() + trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) + + trainer.fit(model) + + checkpoint_path = os.path.join(tmpdir, "model.pt") + trainer.save_checkpoint(checkpoint_path) + saved_model = BoringModel.load_from_checkpoint(checkpoint_path) + + # Assert model parameters are identical after loading + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) From c418828d4150ab6fa8670b43fdda2e7abe5f237b Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Fri, 26 Aug 2022 10:40:41 +0200 Subject: [PATCH 08/30] Remove appropriate line from `pyproject.toml` as a followup to #13929 (#14397) followup to #13929 --- pyproject.toml | 1 - src/pytorch_lightning/demos/mnist_datamodule.py | 3 ++- 2 
files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 60c031dfe920b..18ae22feeb963 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,6 @@ warn_no_return = "False" module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.core.datamodule", - "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.strategies.sharded", diff --git a/src/pytorch_lightning/demos/mnist_datamodule.py b/src/pytorch_lightning/demos/mnist_datamodule.py index 0f5eea99f3e96..6466b78250b3b 100644 --- a/src/pytorch_lightning/demos/mnist_datamodule.py +++ b/src/pytorch_lightning/demos/mnist_datamodule.py @@ -16,7 +16,7 @@ import random import time import urllib -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, Optional, Sized, Tuple, Union from urllib.error import HTTPError from warnings import warn @@ -199,6 +199,7 @@ def setup(self, stage: Optional[str] = None) -> None: """Split the train and valid dataset.""" extra = dict(transform=self.default_transforms) if self.default_transforms else {} dataset: Dataset = MNIST(self.data_dir, train=True, download=False, **extra) + assert isinstance(dataset, Sized) train_length = len(dataset) self.dataset_train, self.dataset_val = random_split(dataset, [train_length - self.val_split, self.val_split]) From 70deac2cd4f23426b3fc4586720b8b0cbdbcb214 Mon Sep 17 00:00:00 2001 From: Christian Schell Date: Fri, 26 Aug 2022 10:42:00 +0200 Subject: [PATCH 09/30] Reset epoch progress with batch size scaler (#13846) Co-authored-by: Christian Schell Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../tuner/batch_size_scaling.py | 19 +++++++++++++++++-- .../tuner/test_scale_batch_size.py | 10 +++++++--- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 32303d6babb5d..8dcd45f58b876 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -87,6 +87,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846) + + - Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) diff --git a/src/pytorch_lightning/tuner/batch_size_scaling.py b/src/pytorch_lightning/tuner/batch_size_scaling.py index 316fc5a2197da..a1f8a2de4b9d8 100644 --- a/src/pytorch_lightning/tuner/batch_size_scaling.py +++ b/src/pytorch_lightning/tuner/batch_size_scaling.py @@ -128,7 +128,10 @@ def _run_power_scaling( """Batch scaling mode where the size is doubled at each iteration until an OOM error is encountered.""" for _ in range(max_trials): garbage_collection_cuda() - trainer.fit_loop.global_step = 0 # reset after each try + + # reset after each try + _reset_progress(trainer) + try: # Try fit trainer.tuner._run(model) @@ -166,7 +169,10 @@ def _run_binsearch_scaling( count = 0 while True: garbage_collection_cuda() - trainer.fit_loop.global_step = 0 # reset after each try + + # reset after each try + _reset_progress(trainer) + try: # Try fit trainer.tuner._run(model) @@ -249,3 +255,12 @@ def _adjust_batch_size( def _is_valid_batch_size(batch_size: int, dataloader: DataLoader, trainer: "pl.Trainer"): module = trainer.lightning_module or trainer.datamodule return not has_len_all_ranks(dataloader, trainer.strategy, module) or batch_size <= len(dataloader) + + +def _reset_progress(trainer: "pl.Trainer") -> None: + if trainer.lightning_module.automatic_optimization: + trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.reset() + else: + trainer.fit_loop.epoch_loop.batch_loop.manual_loop.optim_step_progress.reset() + + trainer.fit_loop.epoch_progress.reset() diff --git a/tests/tests_pytorch/tuner/test_scale_batch_size.py b/tests/tests_pytorch/tuner/test_scale_batch_size.py index ce7c3613f5012..e703b37491d26 100644 --- a/tests/tests_pytorch/tuner/test_scale_batch_size.py +++ b/tests/tests_pytorch/tuner/test_scale_batch_size.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os from copy import deepcopy +from unittest.mock import patch import pytest import torch @@ -308,10 +309,13 @@ def __init__(self): def test_dataloader_reset_with_scale_batch_size(tmpdir, scale_method): """Test that train and val dataloaders are reset at every update in scale batch size.""" model = BatchSizeModel(batch_size=16) - scale_batch_size_kwargs = {"max_trials": 5, "init_val": 4, "mode": scale_method} + max_trials = 5 + scale_batch_size_kwargs = {"max_trials": max_trials, "steps_per_trial": 2, "init_val": 4, "mode": scale_method} - trainer = Trainer(max_epochs=2, auto_scale_batch_size=True) - new_batch_size = trainer.tune(model, scale_batch_size_kwargs=scale_batch_size_kwargs)["scale_batch_size"] + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True) + with patch.object(model, "on_train_epoch_end") as advance_mocked: + new_batch_size = trainer.tune(model, scale_batch_size_kwargs=scale_batch_size_kwargs)["scale_batch_size"] + assert advance_mocked.call_count == max_trials assert trainer.train_dataloader.loaders.batch_size == new_batch_size assert trainer.val_dataloaders[0].batch_size == new_batch_size From 6d00f31f0cac7b650b3ff5e03cc5ecab40e71fc3 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 26 Aug 2022 14:31:48 +0530 Subject: [PATCH 10/30] Add auto wrapping for `DDPFullyShardedNativeStrategy` (#14252) --- .../advanced/model_parallel.rst | 29 +++-- src/pytorch_lightning/CHANGELOG.md | 3 + .../precision/fsdp_native_native_amp.py | 5 +- .../plugins/precision/sharded_native_amp.py | 6 +- .../strategies/fully_sharded_native.py | 55 ++++++++++ .../test_ddp_fully_sharded_native.py | 101 ++++++++++++++---- 6 files changed, 166 insertions(+), 33 deletions(-) diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst index d1db58b3f1869..50ae2cd2827d0 100644 --- a/docs/source-pytorch/advanced/model_parallel.rst +++ b/docs/source-pytorch/advanced/model_parallel.rst @@ -212,14 +212,31 @@ PyTorch Fully Sharded Training ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PyTorch has it's own version of `FSDP `_ which is upstreamed from their `fairscale `__ project. -It was introduced in their `v1.11.0 release `_. The API is pretty similar to that of FairScale. +It was introduced in their `v1.11.0 release `_ but it is recommended to use it with PyTorch v1.12 or more and that's what +Lightning supports. The API is pretty similar to that of FairScale. -.. note:: - Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``. - This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``. - This is a limitation of Fully Sharded Training that will be resolved in the future. -To activate parameter sharding, you must wrap your model using the``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly. +Auto Wrapping +""""""""""""" +Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The +simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't +have to ``wrap`` layers manually as in the case of manual wrapping. + +.. 
code-block:: python + + model = BoringModel() + trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16) + trainer.fit(model) + + +Read more `here `__. + + +Manual Wrapping +""""""""""""""" + +Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate +parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly. When not using Fully Sharded these wrap functions are a no-op. This means once the changes have been made, there is no need to remove the changes for other strategies. diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 8dcd45f58b876..6ca49228cb140 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290)) +- Added support for auto wrapping for `DDPFullyShardedNativeStrategy` ([#14252](https://github.com/Lightning-AI/lightning/issues/14252)) + + - Added support for passing extra init-parameters to the `LightningDataModule.from_datasets` ([#14185](https://github.com/Lightning-AI/lightning/issues/14185)) diff --git a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py index 2201db94586a2..f91144124ae35 100644 --- a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py @@ -25,14 +25,13 @@ from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler else: MixedPrecision = None # type: ignore[misc,assignment] + ShardedGradScaler = None # type: ignore[misc,assignment] class FullyShardedNativeNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): """Native AMP for Fully Sharded Native Training.""" - def __init__( - self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None - ) -> None: + def __init__(self, precision: Union[str, int], device: str, scaler: Optional[ShardedGradScaler] = None) -> None: if not _TORCH_GREATER_EQUAL_1_12: raise MisconfigurationException( "`FullyShardedNativeNativeMixedPrecisionPlugin` is supported from PyTorch v1.12.0 onwards." diff --git a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py index 570e25bd85caa..15c23e18ed6bc 100644 --- a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -13,8 +13,6 @@ # limitations under the License. 
from typing import Optional, Union -import torch - from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE @@ -29,9 +27,7 @@ class ShardedNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): """Native AMP for Sharded Training.""" - def __init__( - self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None - ) -> None: + def __init__(self, precision: Union[str, int], device: str, scaler: Optional[ShardedGradScaler] = None) -> None: if not _FAIRSCALE_AVAILABLE: raise MisconfigurationException( "You have asked for sharded AMP but you have not installed it." diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 4dbf36e4c2861..456ddd36f93e7 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -20,6 +20,7 @@ from torch.distributed.distributed_c10d import _get_default_group, ProcessGroup import pytorch_lightning as pl +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -38,9 +39,11 @@ from pytorch_lightning.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 +from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info from pytorch_lightning.utilities.seed import reset_seed +from pytorch_lightning.utilities.types import STEP_OUTPUT if _TORCH_GREATER_EQUAL_1_12: from torch.distributed.fsdp.fully_sharded_data_parallel import ( @@ -51,6 +54,7 @@ ) from torch.distributed.fsdp.wrap import enable_wrap else: + FullyShardedDataParallel = None # type: ignore[misc,assignment] MixedPrecision = None # type: ignore[misc,assignment] BackwardPrefetch = None # type: ignore[misc,assignment] CPUOffload = None # type: ignore[misc,assignment] @@ -201,6 +205,28 @@ def _configure_launcher(self) -> None: self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes) self._rank_0_will_call_children_scripts = True + def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: + """Wraps the model into a + :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel` module.""" + # If model is already wrapped, we need to avoid sending the `auto_wrap_policy` + assert self.lightning_module is not None + if ( + any(isinstance(mod, FullyShardedDataParallel) for mod in self.lightning_module.modules()) + and "auto_wrap_policy" in self.kwargs + ): + del self.kwargs["auto_wrap_policy"] + + log.detail(f"setting up FSDP model with device id: {self.root_device.index}, kwargs: {self.kwargs}") + return FullyShardedDataParallel( + module=model, + process_group=self.process_group, + cpu_offload=self.cpu_offload, + backward_prefetch=self.backward_prefetch, + mixed_precision=self.mixed_precision_config, + 
device_id=self.root_device.index, + **self.kwargs, + ) + def setup(self, trainer: "pl.Trainer") -> None: assert self.accelerator is not None self.accelerator.setup(trainer) @@ -215,9 +241,20 @@ def setup(self, trainer: "pl.Trainer") -> None: assert self.lightning_module is not None self.lightning_module._device = self.root_device + assert isinstance(self.model, pl.LightningModule) + self.model = _LightningModuleWrapperBase(self.model) + if is_overridden("configure_sharded_model", self.lightning_module): + rank_zero_info( + "You have overridden `LightningModule.configure_sharded_model` hook. It will assume that all the layers" + " are already wrapped for sharding and won't wrap the entire model using `FullyShardedDataParallel`." + ) + else: + self.model = self._setup_model(self.model) self.barrier() + self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device) + self.setup_precision_plugin() def model_to_device(self) -> None: @@ -273,6 +310,24 @@ def reduce( tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op) return tensor + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + # we don't need precision context since casting is done by FSDP + # read `mixed_precision` docstring here: https://pytorch.org/docs/stable/fsdp.html + assert self.model is not None + return self.model(*args, **kwargs) + + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + assert self.model is not None + return self.model(*args, **kwargs) + + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + assert self.model is not None + return self.model(*args, **kwargs) + + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.model is not None + return self.model(*args, **kwargs) + def _determine_device_ids(self) -> List[int]: return [self.root_device.index] diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index ede201da1f68f..be8bced2cbf5f 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -11,7 +11,6 @@ from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 -from pytorch_lightning.utilities.types import STEP_OUTPUT from tests_pytorch.helpers.runif import RunIf if _TORCH_GREATER_EQUAL_1_12: @@ -19,6 +18,15 @@ from torch.distributed.fsdp.wrap import wrap +def custom_auto_wrap_policy( + module, + recurse, + unwrapped_params: int, + min_num_params: int = int(1e8), +) -> bool: + return unwrapped_params >= 2 + + @RunIf(min_torch="1.12") def test_invalid_on_cpu(tmpdir): """Test to ensure that we raise Misconfiguration for Native FSDP on CPU.""" @@ -78,38 +86,73 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: def configure_optimizers(self): return torch.optim.SGD(self.layer.parameters(), lr=0.1) - def on_train_batch_end(self, outputs: STEP_OUTPUT, batch: Any, batch_idx: int) -> None: + def on_train_batch_end(self, outputs, batch, batch_idx) -> None: self._assert_layer_fsdp_instance() - def on_test_batch_end( - self, outputs: Optional[STEP_OUTPUT], batch: Any, batch_idx: int, dataloader_idx: int - ) -> None: + def on_test_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: self._assert_layer_fsdp_instance() - 
def on_validation_batch_end( - self, outputs: Optional[STEP_OUTPUT], batch: Any, batch_idx: int, dataloader_idx: int - ) -> None: + def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: self._assert_layer_fsdp_instance() - def on_predict_batch_end(self, outputs: Optional[Any], batch: Any, batch_idx: int, dataloader_idx: int) -> None: + def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: self._assert_layer_fsdp_instance() def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.layer, FullyShardedDataParallel) assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) - assert isinstance(self.layer.module[0], FullyShardedDataParallel) - assert isinstance(self.layer.module[2], FullyShardedDataParallel) # root should not be resharding assert self.layer.reshard_after_forward is False - # Assert that the nested layers are set reshard_after_forward to True - assert self.layer.module[0].reshard_after_forward is True - assert self.layer.module[2].reshard_after_forward is True precision = torch.float16 if self.precision == 16 else torch.bfloat16 assert self.layer.mixed_precision.param_dtype == precision assert self.layer.mixed_precision.reduce_dtype == precision assert self.layer.mixed_precision.buffer_dtype == precision + for layer_num in [0, 2]: + assert isinstance(self.layer.module[layer_num], FullyShardedDataParallel) + # Assert that the nested layers are set reshard_after_forward to True + assert self.layer.module[layer_num].reshard_after_forward is True + + assert self.layer[layer_num].mixed_precision.param_dtype == precision + assert self.layer[layer_num].mixed_precision.reduce_dtype == precision + assert self.layer[layer_num].mixed_precision.buffer_dtype == precision + + +class TestFSDPModelAutoWrapped(BoringModel): + def __init__(self): + super().__init__() + self.layer = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2)) + + def configure_optimizers(self): + return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1) + + def on_train_batch_end(self, outputs, batch, batch_idx) -> None: + self._assert_layer_fsdp_instance() + + def on_test_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: + self._assert_layer_fsdp_instance() + + def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: + self._assert_layer_fsdp_instance() + + def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: + self._assert_layer_fsdp_instance() + + def _assert_layer_fsdp_instance(self) -> None: + assert isinstance(self.layer, torch.nn.Sequential) + assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) + + precision = torch.float16 if self.precision == 16 else torch.bfloat16 + for layer_num in [0, 2]: + assert isinstance(self.layer[layer_num], FullyShardedDataParallel) + # Assert that the nested layers are set reshard_after_forward to True + assert self.layer[layer_num].reshard_after_forward + + assert self.layer[layer_num].mixed_precision.param_dtype == precision + assert self.layer[layer_num].mixed_precision.reduce_dtype == precision + assert self.layer[layer_num].mixed_precision.buffer_dtype == precision + @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12") def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): @@ -140,18 +183,32 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir, precision): 
@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12") -def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir): +@pytest.mark.parametrize( + "model, strategy", + [ + (TestFSDPModel(), "fsdp_native"), + (TestFSDPModelAutoWrapped(), DDPFullyShardedNativeStrategy), + ], +) +def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir, model, strategy): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" - model = TestFSDPModel() ck = ModelCheckpoint(save_last=True) + + if not isinstance(strategy, str): + strategy = strategy(auto_wrap_policy=custom_auto_wrap_policy) + trainer = Trainer( default_root_dir=tmpdir, accelerator="gpu", devices=2, - strategy="fsdp_native", + strategy=strategy, precision=16, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + limit_predict_batches=2, callbacks=[ck], ) _run_multiple_stages(trainer, model) @@ -164,14 +221,20 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): trainer.save_checkpoint(model_path, weights_only=True) - _assert_save_equality(trainer, model_path, cls=TestFSDPModel) + _assert_save_equality(trainer, model_path, cls=model.__class__) # Test entry point trainer.test(model) # model is wrapped, will not call `configure_sharded_model` - # provide model path, will create a new unwrapped model and load and then call configure_shared_model to wrap + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap trainer.test(ckpt_path=model_path) + # Predict entry point + trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + trainer.predict(ckpt_path=model_path) + def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel): # Use FullySharded to get the state dict for the sake of comparison From 70fe0ed0413e9aa2daecbbf031d788e8cd310ba8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 26 Aug 2022 11:07:35 +0200 Subject: [PATCH 11/30] CI: Azure CPU pool (#14367) * Azure CPU pool * Fix empty env LAI vars Co-authored-by: manskx --- .azure/app-cloud-e2e.yml | 39 +++++++++++++++------------- src/lightning_app/testing/testing.py | 2 +- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index c7262c2bc5c61..aed2cd07b987b 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -24,8 +24,10 @@ variables: jobs: - job: App_cloud_e2e_testing - pool: - vmImage: 'ubuntu-latest' + pool: azure-cpus + container: + image: mcr.microsoft.com/playwright/python:v1.25.2-focal + options: "--shm-size=2g" timeoutInMinutes: "30" cancelTimeoutInMinutes: "2" strategy: @@ -56,6 +58,7 @@ jobs: clean: all steps: - bash: | + whoami python --version pip --version displayName: 'Info' @@ -80,10 +83,10 @@ jobs: - bash: | python -m pip install playwright - python -m playwright install --with-deps + python -m playwright install # --with-deps displayName: 'Install Playwright system dependencies' - - bash: pip install -e . + - bash: pip install -e . 
--find-links https://download.pytorch.org/whl/cpu/torch_stable.html displayName: 'Install lightning' - bash: | @@ -110,12 +113,12 @@ jobs: TEST_APP_NAME: $(name) HAR_LOCATION: './artifacts/hars' SLOW_MO: '50' - LAI_USER: $(LAI_USER) - LAI_PASS: $(LAI_PASS) - LIGHTNING_USER_ID: $(LIGHTNING_USER_ID) - LIGHTNING_API_KEY: $(LIGHTNING_API_KEY) + # LAI_USER: $(LAI_USER) + # LAI_PASS: $(LAI_PASS) + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) - LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) displayName: 'Run the tests' - publish: '$(Build.ArtifactStagingDirectory)/videos' @@ -125,16 +128,16 @@ jobs: - bash: | time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" env: - LAI_USER: $(LAI_USER) - LAI_PASS: $(LAI_PASS) - LIGHTNING_USER_ID: $(LIGHTNING_USER_ID) - LIGHTNING_API_KEY: $(LIGHTNING_API_KEY) + # LAI_USER: $(LAI_USER) + # LAI_PASS: $(LAI_PASS) + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) - LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) PR_NUMBER: $(local_id) TEST_APP_NAME: $(name) - GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning - GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning - GRID_URL: $(LIGHTNING_CLOUD_URL) - _GRID_USERNAME: $(LIGHTNING_USERNAME) + # GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning + # GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning + # GRID_URL: $(LIGHTNING_CLOUD_URL) + # _GRID_USERNAME: $(LIGHTNING_USERNAME) displayName: 'Clean Previous Apps' diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 884c02a0521c1..1ccc8ba1ff63c 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -224,7 +224,7 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str context = browser.new_context( # Eventually this will need to be deleted http_credentials=HttpCredentials( - {"username": os.getenv("LAI_USER").strip(), "password": os.getenv("LAI_PASS")} + {"username": os.getenv("LAI_USER", "").strip(), "password": os.getenv("LAI_PASS", "")} ), record_video_dir=os.path.join(Config.video_location, TEST_APP_NAME), record_har_path=Config.har_location, From 2112e0e47287cbcf883871f79cfdd5f98c73eb11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 12:24:01 +0200 Subject: [PATCH 12/30] Remove unused utility function `_parse_devices` (#14386) --- src/pytorch_lightning/utilities/apply_func.py | 2 +- .../utilities/device_parser.py | 17 ----------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index 9f85ea52661c7..15e9962e40bff 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -27,7 +27,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY -from pytorch_lightning.utilities.warnings import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation if _TORCHTEXT_LEGACY: if _compare_version("torchtext", operator.ge, "0.9.0"): diff --git 
a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index c76933e489db7..c30692d0ab01a 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -18,7 +18,6 @@ import torch.cuda from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _DEVICE @@ -51,22 +50,6 @@ def determine_root_gpu_device(gpus: List[_DEVICE]) -> Optional[_DEVICE]: return root_gpu -def _parse_devices( - gpus: Optional[Union[List[int], str, int]], - auto_select_gpus: bool, - tpu_cores: Optional[Union[List[int], str, int]], - include_cuda: bool = False, - include_mps: bool = False, -) -> Tuple[Optional[List[int]], Optional[Union[List[int], int]]]: - if auto_select_gpus and isinstance(gpus, int): - gpus = pick_multiple_gpus(gpus) - - # TODO (@seannaren, @kaushikb11): Include IPU parsing logic here - gpu_ids = parse_gpu_ids(gpus, include_cuda=include_cuda, include_mps=include_mps) - tpu_cores = parse_tpu_cores(tpu_cores) - return gpu_ids, tpu_cores - - def parse_gpu_ids( gpus: Optional[Union[int, str, List[int]]], include_cuda: bool = False, From 1cfebc0eae358f79d8f37a6839c061752808c8c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Aug 2022 13:18:11 +0200 Subject: [PATCH 13/30] Add back standalone task azure CI step (#14366) --- .azure/gpu-tests.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 2da30c0dd66ab..3ed4601d1b8e5 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -123,6 +123,15 @@ jobs: timeoutInMinutes: "35" condition: eq(variables['continue'], '1') + - bash: bash run_standalone_tasks.sh + workingDirectory: tests/tests_pytorch + env: + PL_USE_MOCKED_MNIST: "1" + PL_RUN_CUDA_TESTS: "1" + displayName: 'Testing: PyTorch standalone tasks' + timeoutInMinutes: "10" + condition: eq(variables['continue'], '1') + - bash: | python -m coverage report python -m coverage xml From 3201f7ff8b1657e9fc4865026a19052581b27411 Mon Sep 17 00:00:00 2001 From: Nishant Bhansali Date: Fri, 26 Aug 2022 16:50:55 +0530 Subject: [PATCH 14/30] Update docs for `train_bn` in `Basefinetuning.filter_params` (#13938) --- src/pytorch_lightning/callbacks/finetuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/callbacks/finetuning.py b/src/pytorch_lightning/callbacks/finetuning.py index ad45ff8b0591b..d2afdd20bd9e9 100644 --- a/src/pytorch_lightning/callbacks/finetuning.py +++ b/src/pytorch_lightning/callbacks/finetuning.py @@ -141,7 +141,7 @@ def filter_params( Args: modules: A given module or an iterable of modules - train_bn: Whether to train BatchNorm module + train_bn: Whether not to train the BatchNorm module requires_grad: Whether to create a generator for trainable or non-trainable parameters. 
Returns: Generator From e2221a0b3e951f20034e36d66b76a4a375f5f9ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 15:11:24 +0200 Subject: [PATCH 15/30] Raise an error when resuming training with Apex (#14341) --- src/pytorch_lightning/CHANGELOG.md | 5 +++++ .../plugins/precision/apex_amp.py | 9 +++++++- tests/tests_pytorch/models/test_amp.py | 21 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6ca49228cb140..1776b917c761a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -35,10 +35,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([#14023](https://github.com/Lightning-AI/lightning/pull/14023)) + +- When resuming training with Apex enabled, the `Trainer` will now raise an error ([#14341](https://github.com/Lightning-AI/lightning/pull/14341)) + + - Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) + ### Deprecated - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py b/src/pytorch_lightning/plugins/precision/apex_amp.py index cfc13630768db..d85dceb53a069 100644 --- a/src/pytorch_lightning/plugins/precision/apex_amp.py +++ b/src/pytorch_lightning/plugins/precision/apex_amp.py @@ -41,6 +41,7 @@ def __init__(self, amp_level: str = "O2") -> None: super().__init__() self.amp_level = amp_level self._connected = False + self._state_dict_loaded = False def main_params(self, optimizer: Optimizer) -> _PARAMETERS: return amp.master_params(optimizer) @@ -83,6 +84,11 @@ def optimizer_step( closure: Callable[[], Any], **kwargs: Any, ) -> Any: + if self._state_dict_loaded: + raise RuntimeError( + "Resuming training with APEX is currently not supported. Set `amp_backend=None` for example or use a" + " different precision plugin." + ) if isinstance(optimizer, LBFGS): raise MisconfigurationException( f"apex AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})." 
@@ -99,4 +105,5 @@ def state_dict(self) -> Dict[str, Any]: return amp.state_dict() def load_state_dict(self, state_dict: Dict[str, Any]) -> None: - amp.load_state_dict(state_dict) + self._state_dict_loaded = True + return super().load_state_dict(state_dict) diff --git a/tests/tests_pytorch/models/test_amp.py b/tests/tests_pytorch/models/test_amp.py index 786de99f59714..822d2e0401619 100644 --- a/tests/tests_pytorch/models/test_amp.py +++ b/tests/tests_pytorch/models/test_amp.py @@ -211,3 +211,24 @@ def configure_optimizers(self): assert isinstance(trainer.lr_scheduler_configs[0].scheduler.optimizer, optim.Adam) assert isinstance(trainer.lr_scheduler_configs[1].scheduler.optimizer, optim.SGD) + + +@RunIf(min_cuda_gpus=1, amp_apex=True) +def test_amp_with_apex_reload(tmpdir): + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=1, + limit_test_batches=1, + precision=16, + amp_backend="apex", + accelerator="gpu", + devices=1, + ) + trainer.fit(model) + trainer.fit_loop.max_steps = 2 + + with pytest.raises(RuntimeError, match="Resuming training with APEX is currently not supported."): + trainer.fit(model, ckpt_path=trainer.checkpoint_callback.best_model_path) + + trainer.test(model, ckpt_path="best") From 94e567e6f0dbe3e06feebc8fdf4bdc807182fa96 Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Fri, 26 Aug 2022 09:28:27 -0400 Subject: [PATCH 16/30] Fix mypy errors attributed to `pytorch_lightning.trainer.connectors.data_connector.py` (#13806) Co-authored-by: rohitgr7 Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- pyproject.toml | 1 - src/pytorch_lightning/core/datamodule.py | 3 +- src/pytorch_lightning/core/module.py | 3 +- .../trainer/configuration_validator.py | 6 ++- .../trainer/connectors/data_connector.py | 50 ++++++++++--------- src/pytorch_lightning/trainer/trainer.py | 2 +- .../trainer/flags/test_overfit_batches.py | 2 +- 7 files changed, 37 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 18ae22feeb963..cb5274d5774d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,6 @@ module = [ "pytorch_lightning.profilers.pytorch", "pytorch_lightning.strategies.sharded", "pytorch_lightning.trainer.callback_hook", - "pytorch_lightning.trainer.connectors.data_connector", "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", "pytorch_lightning.tuner.batch_size_scaling", diff --git a/src/pytorch_lightning/core/datamodule.py b/src/pytorch_lightning/core/datamodule.py index 4edde3fe6a3ae..e730aabd8c1f9 100644 --- a/src/pytorch_lightning/core/datamodule.py +++ b/src/pytorch_lightning/core/datamodule.py @@ -18,6 +18,7 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset +import pytorch_lightning as pl from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.saving import _load_from_checkpoint @@ -62,7 +63,7 @@ def teardown(self): def __init__(self) -> None: super().__init__() # Pointer to the trainer object - self.trainer = None + self.trainer: Optional["pl.Trainer"] = None @classmethod def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs) -> ArgumentParser: diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 0926cc52ecd57..a479beadc7931 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -105,7 +105,7 @@ def __init__(self, 
*args: Any, **kwargs: Any) -> None: self._use_amp: bool = False # the precision used - self.precision: int = 32 + self.precision: Union[int, str] = 32 # optionally can be set by user self._example_input_array = None @@ -294,6 +294,7 @@ def loggers(self) -> List[Logger]: def _call_batch_hook(self, hook_name: str, *args: Any) -> Any: if self._trainer: datahook_selector = self._trainer._data_connector._datahook_selector + assert datahook_selector is not None obj = datahook_selector.get_instance(hook_name) if isinstance(obj, self.__class__): trainer_method = self._trainer._call_lightning_module_hook diff --git a/src/pytorch_lightning/trainer/configuration_validator.py b/src/pytorch_lightning/trainer/configuration_validator.py index 74c477b245708..9d277f5ac45c9 100644 --- a/src/pytorch_lightning/trainer/configuration_validator.py +++ b/src/pytorch_lightning/trainer/configuration_validator.py @@ -46,7 +46,7 @@ def verify_loop_configurations(trainer: "pl.Trainer") -> None: elif trainer.state.fn == TrainerFn.PREDICTING: __verify_eval_loop_configuration(trainer, model, "predict") - __verify_batch_transfer_support(trainer, model) + __verify_batch_transfer_support(trainer) _check_deprecated_callback_hooks(trainer) # TODO: Delete _check_on_hpc_hooks in v1.8 _check_on_hpc_hooks(model) @@ -149,10 +149,12 @@ def __verify_eval_loop_configuration(trainer: "pl.Trainer", model: "pl.Lightning raise MisconfigurationException(f"No `{step_name}()` method defined to run `Trainer.{trainer_method}`.") -def __verify_batch_transfer_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: +def __verify_batch_transfer_support(trainer: "pl.Trainer") -> None: """Raise Misconfiguration exception since these hooks are not supported in DP mode.""" batch_transfer_hooks = ("transfer_batch_to_device", "on_after_batch_transfer") datahook_selector = trainer._data_connector._datahook_selector + assert datahook_selector is not None + for hook in batch_transfer_hooks: # TODO: Remove this blocker once batch transfer to device is integrated in Lightning for DP mode. 
if isinstance(trainer.strategy, DataParallelStrategy) and ( diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index dae4e4a3527df..d8665843945f7 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -14,7 +14,7 @@ import multiprocessing import os from dataclasses import dataclass, field -from typing import Any, Collection, List, Optional, Tuple, Union +from typing import Any, Iterable, List, Optional, Tuple, Union from weakref import proxy from torch.utils.data import BatchSampler, DataLoader, Sampler, SequentialSampler @@ -55,7 +55,7 @@ def __init__(self, trainer: "pl.Trainer", multiple_trainloader_mode: str = "max_ self._test_dataloader_source = _DataLoaderSource(None, "") self._predict_dataloader_source = _DataLoaderSource(None, "") - self._datahook_selector = _DataHookSelector(None, None) + self._datahook_selector: Optional[_DataHookSelector] = None @property def _should_reload_train_dl(self) -> bool: @@ -230,7 +230,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None: category=PossibleUserWarning, ) - def _requires_distributed_sampler(self, dataloader) -> bool: + def _requires_distributed_sampler(self, dataloader: DataLoader) -> bool: return ( self.trainer._accelerator_connector.replace_sampler_ddp and self.trainer._accelerator_connector.is_distributed @@ -292,14 +292,18 @@ def _prepare_dataloader( return dataloader - def _resolve_sampler(self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None) -> Sampler: + def _resolve_sampler( + self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None + ) -> Union[Sampler, Iterable]: if self._requires_distributed_sampler(dataloader): + distributed_sampler_kwargs = self.trainer.distributed_sampler_kwargs + assert distributed_sampler_kwargs is not None sampler = self._get_distributed_sampler( dataloader, shuffle, mode=mode, overfit_batches=self.trainer.overfit_batches, - **self.trainer.distributed_sampler_kwargs, + **distributed_sampler_kwargs, ) # update docs too once this is resolved @@ -357,7 +361,7 @@ def _reset_eval_dataloader( dataloaders = self._resolve_overfit_batches(dataloaders, mode) if not isinstance(dataloaders, list): - dataloaders = [dataloaders] + dataloaders = [dataloaders] # type: ignore[assignment] if any(dl is None for dl in dataloaders): rank_zero_warn("One of given dataloaders is None and it will be skipped.") @@ -426,7 +430,7 @@ def _reset_eval_dataloader( return loader_num_batches, dataloaders - def _request_dataloader(self, stage: RunningStage) -> Union[DataLoader, List[DataLoader]]: + def _request_dataloader(self, stage: RunningStage) -> TRAIN_DATALOADERS: """Requests a dataloader from the given model by calling dataloader hooks corresponding to the given stage. 
Returns: @@ -447,10 +451,12 @@ def _request_dataloader(self, stage: RunningStage) -> Union[DataLoader, List[Dat return dataloader @staticmethod - def _resolve_overfit_batches(dataloaders: Collection[DataLoader], mode: RunningStage) -> Collection[DataLoader]: + def _resolve_overfit_batches( + dataloaders: Union[TRAIN_DATALOADERS, EVAL_DATALOADERS], mode: RunningStage + ) -> Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]: all_have_sequential_sampler = True - def resolve_has_no_sequential_sampler(dataloader: DataLoader): + def resolve_has_no_sequential_sampler(dataloader: DataLoader) -> None: nonlocal all_have_sequential_sampler all_have_sequential_sampler = all_have_sequential_sampler & isinstance( dataloader.sampler, SequentialSampler @@ -460,19 +466,23 @@ def resolve_has_no_sequential_sampler(dataloader: DataLoader): if not all_have_sequential_sampler: rank_zero_warn( - "You requested to overfit but enabled training dataloader shuffling." + f"You requested to overfit but enabled {mode.dataloader_prefix} dataloader shuffling." f" We are turning off the {mode.dataloader_prefix} dataloader shuffling for you." ) def replace_sampler(dataloader: DataLoader) -> DataLoader: - return _update_dataloader(dataloader, sampler=SequentialSampler(dataloader.dataset), mode=mode) + return _update_dataloader( + dataloader, + sampler=SequentialSampler(dataloader.dataset), # type: ignore[arg-type] + mode=mode, + ) dataloaders = apply_to_collection(dataloaders, DataLoader, replace_sampler) return dataloaders @staticmethod - def _check_eval_shuffling(dataloader, mode): + def _check_eval_shuffling(dataloader: DataLoader, mode: RunningStage) -> None: # limit this warning only for samplers assigned automatically when shuffle is set if _is_dataloader_shuffled(dataloader): rank_zero_warn( @@ -506,18 +516,14 @@ def dataloader(self) -> Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]: If the source is a module, the method with the corresponding :attr:`name` gets called. """ - from pytorch_lightning import LightningDataModule, LightningModule # prevent cyclic import - - if not self.name: - return self.instance - - if isinstance(self.instance, LightningModule): + if isinstance(self.instance, pl.LightningModule): return self.instance.trainer._call_lightning_module_hook(self.name, pl_module=self.instance) - if isinstance(self.instance, LightningDataModule): + if isinstance(self.instance, pl.LightningDataModule): method = getattr(self.instance, self.name) return method() + assert self.instance is not None return self.instance def is_defined(self) -> bool: @@ -532,9 +538,7 @@ def is_module(self) -> bool: It does not check whether ``*_dataloader`` methods are actually overridden. """ - from pytorch_lightning import LightningDataModule, LightningModule # prevent cyclic import - - return isinstance(self.instance, (LightningModule, LightningDataModule)) + return isinstance(self.instance, (pl.LightningModule, pl.LightningDataModule)) @dataclass @@ -553,7 +557,7 @@ class _DataHookSelector: model: "pl.LightningModule" datamodule: Optional["pl.LightningDataModule"] - _valid_hooks: Tuple[str] = field( + _valid_hooks: Tuple[str, ...] 
= field( default=("on_before_batch_transfer", "transfer_batch_to_device", "on_after_batch_transfer") ) diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 8bee0ac6dfb7f..d5bcec7db85d6 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2234,7 +2234,7 @@ def is_global_zero(self) -> bool: return self.strategy.is_global_zero @property - def distributed_sampler_kwargs(self) -> Optional[dict]: + def distributed_sampler_kwargs(self) -> Optional[Dict[str, Any]]: if isinstance(self.strategy, ParallelStrategy): return self.strategy.distributed_sampler_kwargs diff --git a/tests/tests_pytorch/trainer/flags/test_overfit_batches.py b/tests/tests_pytorch/trainer/flags/test_overfit_batches.py index da3e154349e1b..dc73e76cc391b 100644 --- a/tests/tests_pytorch/trainer/flags/test_overfit_batches.py +++ b/tests/tests_pytorch/trainer/flags/test_overfit_batches.py @@ -66,7 +66,7 @@ def val_dataloader(self): model = TestModel() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, overfit_batches=2) - with pytest.warns(UserWarning, match="requested to overfit but enabled training dataloader shuffling"): + with pytest.warns(UserWarning, match="requested to overfit but enabled train dataloader shuffling"): trainer.fit(model) assert isinstance(trainer.train_dataloader.loaders.sampler, SequentialSampler) From 0102d0d4d48e18c364008137491e7d9697022745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Fri, 26 Aug 2022 17:19:08 +0200 Subject: [PATCH 17/30] Fix restoring trainer after `lr_find()` (#14113) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 9 ++++++--- src/pytorch_lightning/tuner/lr_finder.py | 2 ++ tests/tests_pytorch/helpers/utils.py | 5 +++++ tests/tests_pytorch/tuner/test_lr_finder.py | 8 +++++--- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 1776b917c761a..78b32a389f418 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -95,6 +95,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) + + +- Fixed restoring the trainer after using `lr_find()` so that the correct LR schedule is used for the actual training ([#14113](https://github.com/Lightning-AI/lightning/pull/14113)) + + - Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846) @@ -130,9 +136,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) -- Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) - - ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/tuner/lr_finder.py b/src/pytorch_lightning/tuner/lr_finder.py index 186dfb5ea7416..45286612231c2 100644 --- a/src/pytorch_lightning/tuner/lr_finder.py +++ b/src/pytorch_lightning/tuner/lr_finder.py @@ -278,6 +278,7 @@ def __lr_finder_dump_params(trainer: "pl.Trainer") -> Dict[str, Any]: "callbacks": trainer.callbacks, "logger": trainer.logger, "max_steps": trainer.fit_loop.max_steps, + "setup_optimizers": trainer.strategy.setup_optimizers, } @@ -297,6 +298,7 @@ def __lr_finder_restore_params(trainer: "pl.Trainer", params: Dict[str, Any]) -> trainer.callbacks = params["callbacks"] trainer.logger = params["logger"] trainer.fit_loop.max_steps = params["max_steps"] + trainer.strategy.setup_optimizers = params["setup_optimizers"] # type: ignore[assignment] class _LRCallback(Callback): diff --git a/tests/tests_pytorch/helpers/utils.py b/tests/tests_pytorch/helpers/utils.py index a9efd7f178f2b..54503bf75b0f7 100644 --- a/tests/tests_pytorch/helpers/utils.py +++ b/tests/tests_pytorch/helpers/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import functools import os import re from contextlib import contextmanager @@ -78,6 +79,10 @@ def init_checkpoint_callback(logger): return checkpoint +def getattr_recursive(obj, attr): + return functools.reduce(getattr, [obj] + attr.split(".")) + + @contextmanager def no_warning_call(expected_warning: Type[Warning] = UserWarning, match: Optional[str] = None): with pytest.warns(None) as record: diff --git a/tests/tests_pytorch/tuner/test_lr_finder.py b/tests/tests_pytorch/tuner/test_lr_finder.py index 9be115d2f8fda..cf0776d755083 100644 --- a/tests/tests_pytorch/tuner/test_lr_finder.py +++ b/tests/tests_pytorch/tuner/test_lr_finder.py @@ -24,7 +24,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.simple_models import ClassificationModel -from tests_pytorch.helpers.utils import no_warning_call +from tests_pytorch.helpers.utils import getattr_recursive, no_warning_call def test_error_on_more_than_1_optimizer(tmpdir): @@ -88,13 +88,15 @@ def test_trainer_reset_correctly(tmpdir): "logger", "global_step", "max_steps", + "fit_loop.max_steps", + "strategy.setup_optimizers", ] - expected = {ca: getattr(trainer, ca) for ca in changed_attributes} + expected = {ca: getattr_recursive(trainer, ca) for ca in changed_attributes} with no_warning_call(UserWarning, match="Please add the following callbacks"): trainer.tuner.lr_find(model, num_training=5) - actual = {ca: getattr(trainer, ca) for ca in changed_attributes} + actual = {ca: getattr_recursive(trainer, ca) for ca in changed_attributes} assert actual == expected assert model.trainer == trainer From fafd2546782c8d2c249c9d7546f74b416e803af7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 17:41:38 +0200 Subject: [PATCH 18/30] Fix device parser logic to avoid creating CUDA context (#14319) * 
let environment disable forking * add helper function and error messages * tests * changelog Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../strategies/launchers/multiprocessing.py | 9 +++++++++ .../trainer/connectors/accelerator_connector.py | 5 +++++ src/pytorch_lightning/utilities/device_parser.py | 5 +++-- .../strategies/launchers/test_multiprocessing.py | 10 ++++++++++ .../trainer/connectors/test_accelerator_connector.py | 9 +++++++++ 6 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 78b32a389f418..6c2e54197fa7d 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for saving sharded optimizer state dict outside of `DDPShardedStrategy` ([#14208](https://github.com/PyTorchLightning/pytorch-lightning/pull/14208)) +- Added an environment variable `PL_DISABLE_FORK` that can be used to disable all forking in the Trainer ([#14319](https://github.com/Lightning-AI/lightning/issues/14319)) + + ### Changed diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 2617e5fe27b10..cee3ed5ee7b2a 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -66,6 +66,10 @@ def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork", "f f"The start method '{self._start_method}' is not available on this platform. Available methods are:" f" {', '.join(mp.get_all_start_methods())}" ) + if start_method in ("fork", "forkserver") and _is_forking_disabled(): + raise ValueError( + "Forking is disabled in this environment by `PL_DISABLE_FORKING=1`. Choose a different start method." + ) @property def is_interactive_compatible(self) -> bool: @@ -281,3 +285,8 @@ def restore(self) -> None: torch.use_deterministic_algorithms(self.use_deterministic_algorithms) torch.backends.cudnn.benchmark = self.cudnn_benchmark _set_rng_states(self.rng_states) + + +def _is_forking_disabled() -> bool: + """Returns whether forking is disabled through the environment variable ``PL_DISABLE_FORK``.""" + return bool(int(os.environ.get("PL_DISABLE_FORK", "0"))) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 54636f6d617ac..e9183f7f52cf3 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -73,6 +73,7 @@ TPUSpawnStrategy, ) from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES +from pytorch_lightning.strategies.launchers.multiprocessing import _is_forking_disabled from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import ( _StrategyType, @@ -642,6 +643,10 @@ def _check_strategy_and_fallback(self) -> None: f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this" f" platform. We recommed `Trainer(strategy='ddp_spawn')` instead." ) + if strategy_flag in _DDP_FORK_ALIASES and _is_forking_disabled(): + raise ValueError( + "Forking is disabled in this environment by `PL_DISABLE_FORKING=1`. 
Choose a different strategy." + ) if strategy_flag: self._strategy_flag = strategy_flag diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index c30692d0ab01a..1b9f43137943f 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -18,6 +18,7 @@ import torch.cuda from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.strategies.launchers.multiprocessing import _is_forking_disabled from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _DEVICE @@ -323,7 +324,7 @@ def num_cuda_devices() -> int: Unlike :func:`torch.cuda.device_count`, this function will do its best not to create a CUDA context for fork support, if the platform allows it. """ - if "fork" not in torch.multiprocessing.get_all_start_methods(): + if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): return torch.cuda.device_count() with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.device_count) @@ -335,7 +336,7 @@ def is_cuda_available() -> bool: Unlike :func:`torch.cuda.is_available`, this function will do its best not to create a CUDA context for fork support, if the platform allows it. """ - if "fork" not in torch.multiprocessing.get_all_start_methods(): + if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): return torch.cuda.is_available() with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.is_available) diff --git a/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py b/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py index ad3e891ad607f..4859ddba60574 100644 --- a/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py +++ b/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os from unittest import mock from unittest.mock import ANY, Mock @@ -18,6 +19,7 @@ import torch from pytorch_lightning.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher +from tests_pytorch.helpers.runif import RunIf @mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp.get_all_start_methods", return_value=[]) @@ -26,6 +28,14 @@ def test_multiprocessing_launcher_forking_on_unsupported_platform(_): _MultiProcessingLauncher(strategy=Mock(), start_method="fork") +@RunIf(skip_windows=True) +@pytest.mark.parametrize("start_method", ["fork", "forkserver"]) +@mock.patch.dict(os.environ, {"PL_DISABLE_FORK": "1"}, clear=True) +def test_multiprocessing_launcher_disabled_forking(start_method): + with pytest.raises(ValueError, match="Forking is disabled in this environment"): + _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) + + @pytest.mark.parametrize("start_method", ["spawn", "fork"]) @mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp") def test_multiprocessing_launcher_start_method(mp_mock, start_method): diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 1296d962eea2c..434607cab78a2 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -810,3 +810,12 @@ def test_accelerator_specific_checkpoint_io(*_): def test_ddp_fork_on_unsupported_platform(_, strategy): with pytest.raises(ValueError, match="process forking is not supported on this platform"): Trainer(strategy=strategy) + + +@RunIf(skip_windows=True) +@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES) +@mock.patch.dict(os.environ, {"PL_DISABLE_FORK": "1"}, clear=True) +def test_strategy_choice_ddp_spawn_in_interactive_when_fork_disabled(strategy): + """Test there is an error when forking is disabled via the environment variable and the user requests fork.""" + with pytest.raises(ValueError, match="Forking is disabled in this environment"): + Trainer(devices=2, strategy=strategy) From ed84d04bcf959b6ca8c47e53a0145f6f568920c2 Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Fri, 26 Aug 2022 12:26:26 -0400 Subject: [PATCH 19/30] Fix mypy errors attributed to `pytorch_lightning.core.datamodule` (#13693) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: rohitgr7 Co-authored-by: otaj --- pyproject.toml | 1 - src/pytorch_lightning/core/datamodule.py | 66 +++++++++++++------ src/pytorch_lightning/core/saving.py | 13 ++-- src/pytorch_lightning/utilities/argparse.py | 39 +++++------ src/pytorch_lightning/utilities/types.py | 2 + .../tests_pytorch/utilities/test_argparse.py | 2 +- 6 files changed, 74 insertions(+), 49 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cb5274d5774d9..399e1e83efb3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,6 @@ warn_no_return = "False" # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",' module = [ "pytorch_lightning.callbacks.progress.rich_progress", - "pytorch_lightning.core.datamodule", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.strategies.sharded", diff --git 
a/src/pytorch_lightning/core/datamodule.py b/src/pytorch_lightning/core/datamodule.py index e730aabd8c1f9..b7a25badf420f 100644 --- a/src/pytorch_lightning/core/datamodule.py +++ b/src/pytorch_lightning/core/datamodule.py @@ -22,8 +22,13 @@ from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.saving import _load_from_checkpoint -from pytorch_lightning.utilities.argparse import add_argparse_args, from_argparse_args, get_init_arguments_and_types -from pytorch_lightning.utilities.types import _PATH +from pytorch_lightning.utilities.argparse import ( + add_argparse_args, + from_argparse_args, + get_init_arguments_and_types, + parse_argparser, +) +from pytorch_lightning.utilities.types import _ADD_ARGPARSE_RETURN, _PATH, EVAL_DATALOADERS, TRAIN_DATALOADERS class LightningDataModule(CheckpointHooks, DataHooks, HyperparametersMixin): @@ -55,7 +60,7 @@ def teardown(self): # called on every process in DDP """ - name: str = ... + name: Optional[str] = None CHECKPOINT_HYPER_PARAMS_KEY = "datamodule_hyper_parameters" CHECKPOINT_HYPER_PARAMS_NAME = "datamodule_hparams_name" CHECKPOINT_HYPER_PARAMS_TYPE = "datamodule_hparams_type" @@ -66,7 +71,7 @@ def __init__(self) -> None: self.trainer: Optional["pl.Trainer"] = None @classmethod - def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs) -> ArgumentParser: + def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs: Any) -> _ADD_ARGPARSE_RETURN: """Extends existing argparse by default `LightningDataModule` attributes. Example:: @@ -77,7 +82,9 @@ def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs) -> ArgumentP return add_argparse_args(cls, parent_parser, **kwargs) @classmethod - def from_argparse_args(cls, args: Union[Namespace, ArgumentParser], **kwargs): + def from_argparse_args( + cls, args: Union[Namespace, ArgumentParser], **kwargs: Any + ) -> Union["pl.LightningDataModule", "pl.Trainer"]: """Create an instance from CLI arguments. Args: @@ -92,6 +99,10 @@ def from_argparse_args(cls, args: Union[Namespace, ArgumentParser], **kwargs): """ return from_argparse_args(cls, args, **kwargs) + @classmethod + def parse_argparser(cls, arg_parser: Union[ArgumentParser, Namespace]) -> Namespace: + return parse_argparser(cls, arg_parser) + @classmethod def get_init_arguments_and_types(cls) -> List[Tuple[str, Tuple, Any]]: r"""Scans the DataModule signature and returns argument names, types and default values. @@ -102,6 +113,15 @@ def get_init_arguments_and_types(cls) -> List[Tuple[str, Tuple, Any]]: """ return get_init_arguments_and_types(cls) + @classmethod + def get_deprecated_arg_names(cls) -> List: + """Returns a list with deprecated DataModule arguments.""" + depr_arg_names: List[str] = [] + for name, val in cls.__dict__.items(): + if name.startswith("DEPRECATED") and isinstance(val, (tuple, list)): + depr_arg_names.extend(val) + return depr_arg_names + @classmethod def from_datasets( cls, @@ -112,7 +132,7 @@ def from_datasets( batch_size: int = 1, num_workers: int = 0, **datamodule_kwargs: Any, - ): + ) -> "LightningDataModule": r""" Create an instance from torch.utils.data.Dataset. 
@@ -133,24 +153,32 @@ def dataloader(ds: Dataset, shuffle: bool = False) -> DataLoader: shuffle &= not isinstance(ds, IterableDataset) return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True) - def train_dataloader(): + def train_dataloader() -> TRAIN_DATALOADERS: + assert train_dataset + if isinstance(train_dataset, Mapping): return {key: dataloader(ds, shuffle=True) for key, ds in train_dataset.items()} if isinstance(train_dataset, Sequence): return [dataloader(ds, shuffle=True) for ds in train_dataset] return dataloader(train_dataset, shuffle=True) - def val_dataloader(): + def val_dataloader() -> EVAL_DATALOADERS: + assert val_dataset + if isinstance(val_dataset, Sequence): return [dataloader(ds) for ds in val_dataset] return dataloader(val_dataset) - def test_dataloader(): + def test_dataloader() -> EVAL_DATALOADERS: + assert test_dataset + if isinstance(test_dataset, Sequence): return [dataloader(ds) for ds in test_dataset] return dataloader(test_dataset) - def predict_dataloader(): + def predict_dataloader() -> EVAL_DATALOADERS: + assert predict_dataset + if isinstance(predict_dataset, Sequence): return [dataloader(ds) for ds in predict_dataset] return dataloader(predict_dataset) @@ -161,19 +189,19 @@ def predict_dataloader(): if accepts_kwargs: special_kwargs = candidate_kwargs else: - accepted_params = set(accepted_params) - accepted_params.discard("self") - special_kwargs = {k: v for k, v in candidate_kwargs.items() if k in accepted_params} + accepted_param_names = set(accepted_params) + accepted_param_names.discard("self") + special_kwargs = {k: v for k, v in candidate_kwargs.items() if k in accepted_param_names} datamodule = cls(**datamodule_kwargs, **special_kwargs) if train_dataset is not None: - datamodule.train_dataloader = train_dataloader + datamodule.train_dataloader = train_dataloader # type: ignore[assignment] if val_dataset is not None: - datamodule.val_dataloader = val_dataloader + datamodule.val_dataloader = val_dataloader # type: ignore[assignment] if test_dataset is not None: - datamodule.test_dataloader = test_dataloader + datamodule.test_dataloader = test_dataloader # type: ignore[assignment] if predict_dataset is not None: - datamodule.predict_dataloader = predict_dataloader + datamodule.predict_dataloader = predict_dataloader # type: ignore[assignment] return datamodule def state_dict(self) -> Dict[str, Any]: @@ -197,8 +225,8 @@ def load_from_checkpoint( cls, checkpoint_path: Union[_PATH, IO], hparams_file: Optional[_PATH] = None, - **kwargs, - ): + **kwargs: Any, + ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: r""" Primary way of loading a datamodule from a checkpoint. When Lightning saves a checkpoint it stores the arguments passed to ``__init__`` in the checkpoint under ``"datamodule_hyper_parameters"``. 
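For context on the ``LightningDataModule.from_datasets`` classmethod touched in the hunk above, here is a minimal usage sketch. It is not part of the patch; the dataset names and sizes are illustrative, and it assumes plain in-memory ``TensorDataset`` inputs.

.. code-block:: python

    import torch
    from torch.utils.data import TensorDataset

    from pytorch_lightning import LightningDataModule

    # two small in-memory datasets stand in for real data
    train_ds = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))
    val_ds = TensorDataset(torch.randn(16, 32), torch.randint(0, 2, (16,)))

    # builds a datamodule whose train/val dataloader hooks are the closures defined above
    dm = LightningDataModule.from_datasets(train_ds, val_dataset=val_ds, batch_size=8, num_workers=0)

    # the returned loader shuffles map-style train datasets (shuffle is dropped for IterableDataset)
    train_loader = dm.train_dataloader()
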
diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index ffdc0988a1a6e..2df7a661c2bd6 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -171,10 +171,10 @@ def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None: def _load_from_checkpoint( cls: Union[Type["ModelIO"], Type["pl.LightningModule"], Type["pl.LightningDataModule"]], - checkpoint_path: Union[str, IO], + checkpoint_path: Union[_PATH, IO], map_location: _MAP_LOCATION_TYPE = None, - hparams_file: Optional[str] = None, - strict: bool = True, + hparams_file: Optional[_PATH] = None, + strict: Optional[bool] = None, **kwargs: Any, ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: if map_location is None: @@ -183,7 +183,7 @@ def _load_from_checkpoint( checkpoint = pl_load(checkpoint_path, map_location=map_location) if hparams_file is not None: - extension = hparams_file.split(".")[-1] + extension = str(hparams_file).split(".")[-1] if extension.lower() == "csv": hparams = load_hparams_from_tags_csv(hparams_file) elif extension.lower() in ("yml", "yaml"): @@ -201,8 +201,6 @@ def _load_from_checkpoint( if issubclass(cls, pl.LightningDataModule): return _load_state(cls, checkpoint, **kwargs) - # allow cls to be evaluated as subclassed LightningModule or, - # as LightningModule for internal tests if issubclass(cls, pl.LightningModule): return _load_state(cls, checkpoint, strict=strict, **kwargs) @@ -210,7 +208,7 @@ def _load_from_checkpoint( def _load_state( cls: Union[Type["pl.LightningModule"], Type["pl.LightningDataModule"]], checkpoint: Dict[str, Any], - strict: bool = True, + strict: Optional[bool] = None, **cls_kwargs_new: Any, ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: cls_spec = inspect.getfullargspec(cls.__init__) @@ -257,6 +255,7 @@ def _load_state( return obj # load the state_dict on the model automatically + assert strict is not None keys = obj.load_state_dict(checkpoint["state_dict"], strict=strict) if not strict: diff --git a/src/pytorch_lightning/utilities/argparse.py b/src/pytorch_lightning/utilities/argparse.py index 58ced375fcae5..26277db183410 100644 --- a/src/pytorch_lightning/utilities/argparse.py +++ b/src/pytorch_lightning/utilities/argparse.py @@ -15,7 +15,6 @@ import inspect import os -from abc import ABC from argparse import _ArgumentGroup, ArgumentParser, Namespace from ast import literal_eval from contextlib import suppress @@ -24,22 +23,17 @@ import pytorch_lightning as pl from pytorch_lightning.utilities.parsing import str_to_bool, str_to_bool_or_int, str_to_bool_or_str +from pytorch_lightning.utilities.types import _ADD_ARGPARSE_RETURN _T = TypeVar("_T", bound=Callable[..., Any]) - - -class ParseArgparserDataType(ABC): - def __init__(self, *_: Any, **__: Any) -> None: - pass - - @classmethod - def parse_argparser(cls, args: "ArgumentParser") -> Any: - pass +_ARGPARSE_CLS = Union[Type["pl.LightningDataModule"], Type["pl.Trainer"]] def from_argparse_args( - cls: Type[ParseArgparserDataType], args: Union[Namespace, ArgumentParser], **kwargs: Any -) -> ParseArgparserDataType: + cls: _ARGPARSE_CLS, + args: Union[Namespace, ArgumentParser], + **kwargs: Any, +) -> Union["pl.LightningDataModule", "pl.Trainer"]: """Create an instance from CLI arguments. Eventually use variables from OS environment which are defined as ``"PL__"``. 
@@ -72,7 +66,7 @@ def from_argparse_args( return cls(**trainer_kwargs) -def parse_argparser(cls: Type["pl.Trainer"], arg_parser: Union[ArgumentParser, Namespace]) -> Namespace: +def parse_argparser(cls: _ARGPARSE_CLS, arg_parser: Union[ArgumentParser, Namespace]) -> Namespace: """Parse CLI arguments, required for custom bool types.""" args = arg_parser.parse_args() if isinstance(arg_parser, ArgumentParser) else arg_parser @@ -97,7 +91,7 @@ def parse_argparser(cls: Type["pl.Trainer"], arg_parser: Union[ArgumentParser, N return Namespace(**modified_args) -def parse_env_variables(cls: Type["pl.Trainer"], template: str = "PL_%(cls_name)s_%(cls_argument)s") -> Namespace: +def parse_env_variables(cls: _ARGPARSE_CLS, template: str = "PL_%(cls_name)s_%(cls_argument)s") -> Namespace: """Parse environment arguments if they are defined. Examples: @@ -127,7 +121,7 @@ def parse_env_variables(cls: Type["pl.Trainer"], template: str = "PL_%(cls_name) return Namespace(**env_args) -def get_init_arguments_and_types(cls: Any) -> List[Tuple[str, Tuple, Any]]: +def get_init_arguments_and_types(cls: _ARGPARSE_CLS) -> List[Tuple[str, Tuple, Any]]: r"""Scans the class signature and returns argument names, types and default values. Returns: @@ -155,7 +149,7 @@ def get_init_arguments_and_types(cls: Any) -> List[Tuple[str, Tuple, Any]]: return name_type_default -def _get_abbrev_qualified_cls_name(cls: Any) -> str: +def _get_abbrev_qualified_cls_name(cls: _ARGPARSE_CLS) -> str: assert isinstance(cls, type), repr(cls) if cls.__module__.startswith("pytorch_lightning."): # Abbreviate. @@ -165,8 +159,11 @@ def _get_abbrev_qualified_cls_name(cls: Any) -> str: def add_argparse_args( - cls: Type["pl.Trainer"], parent_parser: ArgumentParser, *, use_argument_group: bool = True -) -> Union[_ArgumentGroup, ArgumentParser]: + cls: _ARGPARSE_CLS, + parent_parser: ArgumentParser, + *, + use_argument_group: bool = True, +) -> _ADD_ARGPARSE_RETURN: r"""Extends existing argparse by default attributes for ``cls``. Args: @@ -207,10 +204,10 @@ def add_argparse_args( >>> args = parser.parse_args([]) """ if isinstance(parent_parser, _ArgumentGroup): - raise RuntimeError("Please only pass an ArgumentParser instance.") + raise RuntimeError("Please only pass an `ArgumentParser` instance.") if use_argument_group: group_name = _get_abbrev_qualified_cls_name(cls) - parser: Union[_ArgumentGroup, ArgumentParser] = parent_parser.add_argument_group(group_name) + parser: _ADD_ARGPARSE_RETURN = parent_parser.add_argument_group(group_name) else: parser = ArgumentParser(parents=[parent_parser], add_help=False) @@ -222,7 +219,7 @@ def add_argparse_args( # Get symbols from cls or init function. 
for symbol in (cls, cls.__init__): - args_and_types = get_init_arguments_and_types(symbol) + args_and_types = get_init_arguments_and_types(symbol) # type: ignore[arg-type] args_and_types = [x for x in args_and_types if x[0] not in ignore_arg_names] if len(args_and_types) > 0: break diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index 9f2db6422612f..7ab3d6948854c 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py @@ -16,6 +16,7 @@ - Do not include any `_TYPE` suffix - Types used in public hooks (as those in the `LightningModule` and `Callback`) should be public (no leading `_`) """ +from argparse import _ArgumentGroup, ArgumentParser from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path @@ -50,6 +51,7 @@ EVAL_DATALOADERS = Union[DataLoader, Sequence[DataLoader]] _DEVICE = Union[torch.device, str, int] _MAP_LOCATION_TYPE = Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] +_ADD_ARGPARSE_RETURN = Union[_ArgumentGroup, ArgumentParser] @runtime_checkable diff --git a/tests/tests_pytorch/utilities/test_argparse.py b/tests/tests_pytorch/utilities/test_argparse.py index 1e83d96a0648e..2a88e8db531f9 100644 --- a/tests/tests_pytorch/utilities/test_argparse.py +++ b/tests/tests_pytorch/utilities/test_argparse.py @@ -207,7 +207,7 @@ def test_add_argparse_args(cls, name): def test_negative_add_argparse_args(): - with pytest.raises(RuntimeError, match="Please only pass an ArgumentParser instance."): + with pytest.raises(RuntimeError, match="Please only pass an `ArgumentParser` instance."): parser = ArgumentParser() add_argparse_args(AddArgparseArgsExampleClass, parser.add_argument_group("bad workflow")) From d4bcafad7a64d7c39598fa7e4e33b81a1be31828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Aug 2022 18:56:56 +0200 Subject: [PATCH 20/30] Remove the deprecated loop output format (#14373) --- src/pytorch_lightning/CHANGELOG.md | 6 + .../loops/epoch/training_epoch_loop.py | 39 +----- src/pytorch_lightning/loops/utilities.py | 9 +- .../deprecated_api/test_remove_1-8.py | 50 -------- .../loops/epoch/test_training_epoch_loop.py | 115 +++--------------- tests/tests_pytorch/loops/test_utilities.py | 26 +--- 6 files changed, 27 insertions(+), 218 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6c2e54197fa7d..4b5e3a893429e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -81,6 +81,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed the deprecated `DistributedType` and `DeviceType` enum classes ([#14045](https://github.com/Lightning-AI/lightning/pull/14045)) +- Remove the deprecated `on_train_batch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#14373](https://github.com/PyTorchLightning/pytorch-lightning/pull/14373)) + + +- Remove the deprecated `training_epoch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#14373](https://github.com/PyTorchLightning/pytorch-lightning/pull/14373)) + + - Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index 6b3bc38400803..0fe93ca7104b6 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -22,7 +22,7 @@ from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.loops.batch.training_batch_loop import _OUTPUTS_TYPE as _BATCH_OUTPUTS_TYPE -from pytorch_lightning.loops.utilities import _get_active_optimizers, _is_max_limit_reached, _v1_8_output_format +from pytorch_lightning.loops.utilities import _get_active_optimizers, _is_max_limit_reached from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection from pytorch_lightning.trainer.progress import BatchProgress, SchedulerProgress from pytorch_lightning.trainer.supporters import CombinedLoader @@ -31,7 +31,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.fetching import AbstractDataFetcher, DataLoaderIterDataFetcher from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature from pytorch_lightning.utilities.warnings import WarningCache @@ -342,24 +342,6 @@ def _prepare_outputs_training_batch_end( ) array = np.array(batch_output, dtype=object) - # TODO: remove in v1.8 - if ( - num_optimizers > 1 - and lightning_module.truncated_bptt_steps > 0 - and is_overridden("on_train_batch_end", lightning_module) - and not _v1_8_output_format(lightning_module.on_train_batch_end) - ): - rank_zero_deprecation( - "You are training with multiple optimizers AND truncated backpropagation through time enabled." - " The current format of the `on_train_batch_end(outputs, ...)` is a 2d list with sizes" - " (n_optimizers, tbptt_steps), however, this has been deprecated and will change in version v1.8 to" - " (tbptt_steps, n_optimizers). You can update your code by adding the following parameter to your" - " hook signature: `on_train_batch_end(outputs, ..., new_format=True)`." 
- ) - # (tbptt_steps, n_opt) -> (n_opt, tbptt_steps) - if array.ndim == 1: - array = np.expand_dims(array, 1) - array = array.transpose((1, 0)) # squeeze all single-element dimensions array = array.squeeze() array = array.tolist() @@ -384,23 +366,6 @@ def _prepare_outputs_training_epoch_end( ) array = _recursive_pad(batch_outputs) - # TODO: remove in v1.8 - if ( - num_optimizers > 1 - and lightning_module.truncated_bptt_steps > 0 - and not _v1_8_output_format(lightning_module.on_train_epoch_end) - ): - rank_zero_deprecation( - "You are training with multiple optimizers AND truncated backpropagation through time enabled." - " The current format of the `training_epoch_end(outputs)` is a 3d list with sizes" - " (n_optimizers, n_batches, tbptt_steps), however, this has been deprecated and will change in version" - " v1.8 to (n_batches, tbptt_steps, n_optimizers). You can update your code by adding the following" - " parameter to your hook signature: `training_epoch_end(outputs, new_format=True)`." - ) - # (n_batches, tbptt_steps, n_opt) -> (n_opt, n_batches, tbptt_steps) - if array.ndim == 2: - array = np.expand_dims(array, 2) - array = array.transpose((2, 0, 1)) # squeeze all single-element dimensions array = array.squeeze() array = array.tolist() diff --git a/src/pytorch_lightning/loops/utilities.py b/src/pytorch_lightning/loops/utilities.py index 491af6605c135..9b8ec84ba3661 100644 --- a/src/pytorch_lightning/loops/utilities.py +++ b/src/pytorch_lightning/loops/utilities.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import inspect from collections import OrderedDict from contextlib import contextmanager from functools import lru_cache -from typing import Any, Callable, Generator, List, Optional, Sequence, Tuple +from typing import Any, Generator, List, Optional, Sequence, Tuple import numpy as np import torch @@ -216,12 +215,6 @@ def _reset_progress(loop: Loop) -> None: _reset_progress(v) -# TODO: remove in v1.8 -def _v1_8_output_format(fx: Callable) -> bool: - parameters = inspect.signature(fx).parameters - return "new_format" in parameters and parameters["new_format"].default is True - - def _set_sampler_epoch(dataloader: DataLoader, epoch: int) -> None: """Calls the ``set_epoch`` method on either the sampler or the batch sampler of the given dataloader. 
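The helper removed above was the last piece of the transitional "old format" path. As a rough, hypothetical sketch (plain Python, not part of this patch), the only layout hooks receive from now on when several optimizers and truncated BPTT are combined is ``(n_batches, tbptt_steps, n_optimizers)``, as described in the deleted deprecation message:

.. code-block:: python

    # Illustrative only: nested per-batch / per-TBPTT-split / per-optimizer outputs.
    outputs = [
        [[{"loss": 0.0}, {"loss": 0.1}], [{"loss": 0.2}, {"loss": 0.3}]],  # batch 0
        [[{"loss": 0.4}, {"loss": 0.5}], [{"loss": 0.6}, {"loss": 0.7}]],  # batch 1
    ]

    def training_epoch_end(outputs):
        # stand-in for the `LightningModule.training_epoch_end` hook; no `new_format` flag needed
        # index as outputs[batch_idx][split_idx][optimizer_idx]
        losses = [split[0]["loss"] for batch in outputs for split in batch]
        return sum(losses) / len(losses)

    print(training_epoch_end(outputs))  # 0.3
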
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index a69071fd67610..e1fb0b57654c0 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -38,7 +38,6 @@ from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn -from tests_pytorch.deprecated_api import no_deprecated_call from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator @@ -584,55 +583,6 @@ def test_v1_8_0_weights_save_path(tmpdir): _ = trainer.weights_save_path -def test_deprecated_epoch_outputs_format(tmpdir): - class DeprecationModel(BoringModel): - def __init__(self): - super().__init__() - self.truncated_bptt_steps = 1 - - def training_step(self, batch, batch_idx, optimizer_idx, hiddens): - output = super().training_step(batch, batch_idx) - output["hiddens"] = hiddens - return output - - def tbptt_split_batch(self, batch, split_size): - return [batch, batch] - - def training_epoch_end(self, outputs): - ... - - def on_train_batch_end(self, outputs, batch, batch_idx) -> None: - ... - - def configure_optimizers(self): - return [torch.optim.Adam(self.parameters()), torch.optim.Adam(self.parameters())] - - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) - model = DeprecationModel() - batch_match = r"on_train_batch_end.*will change in version v1.8 to \(tbptt_steps, n_optimizers\)" - with pytest.deprecated_call(match=batch_match): - trainer.fit(model) - - class DeprecationModel2(DeprecationModel): - def on_train_batch_end(self, *args, new_format=True): - ... - - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) - model = DeprecationModel() - epoch_match = r"training_epoch_end.*will change in version v1.8 to \(n_batches, tbptt_steps, n_optimizers\)" - with pytest.deprecated_call(match=epoch_match): - trainer.fit(model) - - class NoDeprecationModel(DeprecationModel2): - def training_epoch_end(self, outputs, new_format=True): - ... - - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) - model = NoDeprecationModel() - with no_deprecated_call(match="will change in version v1.8.*new_format=True"): - trainer.fit(model) - - @pytest.mark.flaky(reruns=3) @pytest.mark.parametrize(["action", "expected"], [("a", [3, 1]), ("b", [2]), ("c", [1])]) def test_simple_profiler_iterable_durations(tmpdir, action: str, expected: list): diff --git a/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py b/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py index d53871116e8b7..d601542075004 100644 --- a/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py +++ b/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest import mock from unittest.mock import patch import pytest @@ -20,7 +19,6 @@ from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loops import TrainingEpochLoop from pytorch_lightning.trainer.trainer import Trainer -from tests_pytorch.deprecated_api import no_deprecated_call _out00 = {"loss": 0.0} _out01 = {"loss": 0.1} @@ -33,43 +31,33 @@ class TestPrepareOutputs: - def prepare_outputs(self, fn, tbptt_splits, new_format, batch_outputs, num_optimizers, automatic_optimization): + def prepare_outputs(self, fn, tbptt_splits, batch_outputs, num_optimizers, automatic_optimization): lightning_module = LightningModule() - lightning_module.on_train_batch_end = lambda *_: None # override to trigger the deprecation message lightning_module.automatic_optimization = automatic_optimization lightning_module.truncated_bptt_steps = tbptt_splits - match = "will change in version v1.8.*new_format=True" - will_warn = tbptt_splits and num_optimizers > 1 and not new_format - ctx_manager = pytest.deprecated_call if will_warn else no_deprecated_call - with ctx_manager(match=match): - with mock.patch( - "pytorch_lightning.loops.epoch.training_epoch_loop._v1_8_output_format", return_value=new_format - ): - return fn( - batch_outputs, - lightning_module=lightning_module, - num_optimizers=num_optimizers, # does not matter for manual optimization - ) + return fn( + batch_outputs, + lightning_module=lightning_module, + num_optimizers=num_optimizers, # does not matter for manual optimization + ) def prepare_outputs_training_epoch_end( - self, tbptt_splits, new_format, batch_outputs, num_optimizers, automatic_optimization=True + self, tbptt_splits, batch_outputs, num_optimizers, automatic_optimization=True ): return self.prepare_outputs( TrainingEpochLoop._prepare_outputs_training_epoch_end, tbptt_splits, - new_format, batch_outputs, num_optimizers, automatic_optimization=automatic_optimization, ) def prepare_outputs_training_batch_end( - self, tbptt_splits, new_format, batch_outputs, num_optimizers, automatic_optimization=True + self, tbptt_splits, batch_outputs, num_optimizers, automatic_optimization=True ): return self.prepare_outputs( TrainingEpochLoop._prepare_outputs_training_batch_end, tbptt_splits, - new_format, batch_outputs, num_optimizers, automatic_optimization=automatic_optimization, @@ -97,53 +85,19 @@ def prepare_outputs_training_batch_end( ), # 1 batch, tbptt with 2 splits (uneven) (1, 2, [[{0: _out00}, {0: _out01}], [{0: _out03}]], [[_out00, _out01], [_out03]]), - ], - ) - @pytest.mark.parametrize("new_format", (False, True)) - def test_prepare_outputs_training_epoch_end_automatic( - self, num_optimizers, tbptt_splits, batch_outputs, expected, new_format - ): - """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook - currently expects in the case of automatic optimization.""" - assert ( - self.prepare_outputs_training_epoch_end(tbptt_splits, new_format, batch_outputs, num_optimizers) == expected - ) - - @pytest.mark.parametrize( - "num_optimizers,tbptt_splits,batch_outputs,expected", - [ - # 3 batches, tbptt with 2 splits, 2 optimizers alternating - ( - 2, - 2, - [[{0: _out00}, {0: _out01}], [{1: _out10}, {1: _out11}], [{0: _out02}, {0: _out03}]], - [[[_out00, _out01], [], [_out02, _out03]], [[], [_out10, _out11], []]], - ) - ], - ) - def test_prepare_outputs_training_epoch_end_automatic_old_format( - self, num_optimizers, tbptt_splits, batch_outputs, expected - ): - assert 
self.prepare_outputs_training_epoch_end(tbptt_splits, False, batch_outputs, num_optimizers) == expected - - @pytest.mark.parametrize( - "num_optimizers,tbptt_splits,batch_outputs,expected", - [ # 3 batches, tbptt with 2 splits, 2 optimizers alternating ( 2, 2, [[{0: _out00}, {0: _out01}], [{1: _out10}, {1: _out11}], [{0: _out02}, {0: _out03}]], [[[_out00], [_out01]], [[_out10], [_out11]], [[_out02], [_out03]]], - ) + ), ], ) - def test_prepare_outputs_training_epoch_end_automatic_new_format( - self, num_optimizers, tbptt_splits, batch_outputs, expected - ): + def test_prepare_outputs_training_epoch_end_automatic(self, num_optimizers, tbptt_splits, batch_outputs, expected): """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook currently expects in the case of automatic optimization.""" - assert self.prepare_outputs_training_epoch_end(tbptt_splits, True, batch_outputs, num_optimizers) == expected + assert self.prepare_outputs_training_epoch_end(tbptt_splits, batch_outputs, num_optimizers) == expected @pytest.mark.parametrize( "batch_outputs,expected", @@ -160,14 +114,10 @@ def test_prepare_outputs_training_epoch_end_automatic_new_format( ([[_out00, _out01], [_out02, _out03], [], [_out10]], [[_out00, _out01], [_out02, _out03], [_out10]]), ], ) - @pytest.mark.parametrize("new_format", (False, True)) - def test_prepare_outputs_training_epoch_end_manual(self, batch_outputs, expected, new_format): + def test_prepare_outputs_training_epoch_end_manual(self, batch_outputs, expected): """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook currently expects in the case of manual optimization.""" - assert ( - self.prepare_outputs_training_epoch_end(0, new_format, batch_outputs, -1, automatic_optimization=False) - == expected - ) + assert self.prepare_outputs_training_epoch_end(0, batch_outputs, -1, automatic_optimization=False) == expected @pytest.mark.parametrize( "num_optimizers,tbptt_splits,batch_end_outputs,expected", @@ -180,47 +130,17 @@ def test_prepare_outputs_training_epoch_end_manual(self, batch_outputs, expected (2, 0, [{0: _out00, 1: _out01}], [_out00, _out01]), # tbptt with 2 splits (1, 2, [{0: _out00}, {0: _out01}], [_out00, _out01]), + # 2 optimizers, tbptt with 2 splits + (2, 2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out01], [_out10, _out11]]), ], ) - @pytest.mark.parametrize("new_format", (False, True)) def test_prepare_outputs_training_batch_end_automatic( - self, num_optimizers, tbptt_splits, batch_end_outputs, expected, new_format - ): - """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook - currently expects in the case of automatic optimization.""" - - assert ( - self.prepare_outputs_training_batch_end(tbptt_splits, new_format, batch_end_outputs, num_optimizers) - == expected - ) - - @pytest.mark.parametrize( - "num_optimizers,tbptt_splits,batch_end_outputs,expected", - # 2 optimizers, tbptt with 2 splits - [(2, 2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out10], [_out01, _out11]])], - ) - def test_prepare_outputs_training_batch_end_automatic_old_format( self, num_optimizers, tbptt_splits, batch_end_outputs, expected ): """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook currently expects in the case of automatic optimization.""" - assert ( - self.prepare_outputs_training_batch_end(tbptt_splits, False, batch_end_outputs, 
num_optimizers) == expected - ) - @pytest.mark.parametrize( - "num_optimizers,tbptt_splits,batch_end_outputs,expected", - # 2 optimizers, tbptt with 2 splits - [(2, 2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out01], [_out10, _out11]])], - ) - def test_prepare_outputs_training_batch_end_automatic_new_format( - self, num_optimizers, tbptt_splits, batch_end_outputs, expected - ): - """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook - currently expects in the case of automatic optimization.""" - assert ( - self.prepare_outputs_training_batch_end(tbptt_splits, True, batch_end_outputs, num_optimizers) == expected - ) + assert self.prepare_outputs_training_batch_end(tbptt_splits, batch_end_outputs, num_optimizers) == expected @pytest.mark.parametrize( "batch_end_outputs,expected", @@ -237,8 +157,7 @@ def test_prepare_outputs_training_batch_end_manual(self, batch_end_outputs, expe """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook currently expects in the case of manual optimization.""" assert ( - self.prepare_outputs_training_batch_end(0, False, batch_end_outputs, -1, automatic_optimization=False) - == expected + self.prepare_outputs_training_batch_end(0, batch_end_outputs, -1, automatic_optimization=False) == expected ) diff --git a/tests/tests_pytorch/loops/test_utilities.py b/tests/tests_pytorch/loops/test_utilities.py index 914c1de8e115b..2bd86d325806d 100644 --- a/tests/tests_pytorch/loops/test_utilities.py +++ b/tests/tests_pytorch/loops/test_utilities.py @@ -16,7 +16,7 @@ import pytest import torch -from pytorch_lightning.loops.utilities import _extract_hiddens, _set_sampler_epoch, _v1_8_output_format +from pytorch_lightning.loops.utilities import _extract_hiddens, _set_sampler_epoch from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -41,30 +41,6 @@ def test_extract_hiddens(): _extract_hiddens(None, 1) -def test_v1_8_output_format(): - # old format - def training_epoch_end(outputs): - ... - - assert not _v1_8_output_format(training_epoch_end) - - def training_epoch_end(outputs, new_format=1): - ... - - assert not _v1_8_output_format(training_epoch_end) - - def training_epoch_end(outputs, new_format=False): - ... - - assert not _v1_8_output_format(training_epoch_end) - - # new format - def training_epoch_end(outputs, new_format=True): - ... 
- - assert _v1_8_output_format(training_epoch_end) - - def test_set_sampler_epoch(): # No samplers dataloader = Mock() From cced33542db5896afc0e8b3e3c9263dba44deb4f Mon Sep 17 00:00:00 2001 From: Jordi Smit Date: Fri, 26 Aug 2022 19:17:20 +0200 Subject: [PATCH 21/30] Disable non blocking to device with MPS (#14368) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * disable non-blocking for mps due to race condition bug * fixed typo * fixed: unknown mps device for non arm systems * Removed unrobust test case * moved _MPS_DEVICES such that we used in apply_func * Resolve circular dependencies * Comment rewording * changed torchElasticEnvironment to a global import * simplified if statement to blocking device type * Added change to CHANGELOG * Update src/pytorch_lightning/utilities/apply_func.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed mypy not detecting casting of device * Moved check into if statement to mainain original behavior Co-authored-by: Carlos Mocholí Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/accelerators/cpu.py | 6 +++--- src/pytorch_lightning/accelerators/hpu.py | 5 +++-- src/pytorch_lightning/accelerators/ipu.py | 2 +- .../plugins/environments/xla_environment.py | 2 +- src/pytorch_lightning/utilities/apply_func.py | 8 ++++++-- src/pytorch_lightning/utilities/device_parser.py | 1 + 7 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4b5e3a893429e..4a746ac91b911 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -104,6 +104,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed incorrect values after transferring data to a MPS device ([#13285](https://github.com/Lightning-AI/lightning/issues/13285)) + + - Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) diff --git a/src/pytorch_lightning/accelerators/cpu.py b/src/pytorch_lightning/accelerators/cpu.py index fea8ee70d17df..d0981e7269305 100644 --- a/src/pytorch_lightning/accelerators/cpu.py +++ b/src/pytorch_lightning/accelerators/cpu.py @@ -16,7 +16,7 @@ import torch from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import device_parser +from pytorch_lightning.utilities.device_parser import parse_cpu_cores from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE from pytorch_lightning.utilities.types import _DEVICE @@ -42,13 +42,13 @@ def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> int: """Accelerator device parsing logic.""" - devices = device_parser.parse_cpu_cores(devices) + devices = parse_cpu_cores(devices) return devices @staticmethod def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" - devices = device_parser.parse_cpu_cores(devices) + devices = parse_cpu_cores(devices) return [torch.device("cpu")] * devices @staticmethod diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py index 8fc242fa55f20..c85e81756c2a9 100644 --- a/src/pytorch_lightning/accelerators/hpu.py +++ b/src/pytorch_lightning/accelerators/hpu.py @@ -17,8 +17,9 @@ import torch from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import _HPU_AVAILABLE, device_parser +from pytorch_lightning.utilities.device_parser import parse_hpus from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _HPU_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_debug if _HPU_AVAILABLE: @@ -61,7 +62,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]: """Accelerator device parsing logic.""" - return device_parser.parse_hpus(devices) + return parse_hpus(devices) @staticmethod def get_parallel_devices(devices: int) -> List[torch.device]: diff --git a/src/pytorch_lightning/accelerators/ipu.py b/src/pytorch_lightning/accelerators/ipu.py index b5110e58028a5..b09fd33c29227 100644 --- a/src/pytorch_lightning/accelerators/ipu.py +++ b/src/pytorch_lightning/accelerators/ipu.py @@ -16,7 +16,7 @@ import torch from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import _IPU_AVAILABLE +from pytorch_lightning.utilities.imports import _IPU_AVAILABLE class IPUAccelerator(Accelerator): diff --git a/src/pytorch_lightning/plugins/environments/xla_environment.py b/src/pytorch_lightning/plugins/environments/xla_environment.py index a78ebeb36a6a4..4072f6f8715f5 100644 --- a/src/pytorch_lightning/plugins/environments/xla_environment.py +++ b/src/pytorch_lightning/plugins/environments/xla_environment.py @@ 
-15,7 +15,7 @@ import os from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.utilities import _TPU_AVAILABLE +from pytorch_lightning.utilities.imports import _TPU_AVAILABLE if _TPU_AVAILABLE: import torch_xla.core.xla_env_vars as xenv diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index 15e9962e40bff..fe8b0b836456f 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -38,7 +38,7 @@ Batch = type(None) -_CPU_DEVICES = ("cpu", torch.device("cpu")) +_BLOCKING_DEVICE_TYPES = ("cpu", "mps") def to_dtype_tensor( @@ -322,6 +322,9 @@ def move_data_to_device(batch: Any, device: Union[str, torch.device]) -> Any: - :class:`torch.device` """ + if isinstance(device, str): + device = torch.device(device) + def batch_to(data: Any) -> Any: # try to move torchtext data first if _TORCHTEXT_LEGACY and isinstance(data, Batch): @@ -342,7 +345,8 @@ def batch_to(data: Any) -> Any: kwargs = {} # Don't issue non-blocking transfers to CPU - if isinstance(data, Tensor) and device not in _CPU_DEVICES: + # Same with MPS due to a race condition bug: https://github.com/pytorch/pytorch/issues/83015 + if isinstance(data, Tensor) and isinstance(device, torch.device) and device.type not in _BLOCKING_DEVICE_TYPES: kwargs["non_blocking"] = True data_output = data.to(device, **kwargs) if data_output is not None: diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index 1b9f43137943f..32f370b5b246e 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -93,6 +93,7 @@ def parse_gpu_ids( gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps) if not gpus: raise MisconfigurationException("GPUs requested but none are available.") + if ( TorchElasticEnvironment.detect() and len(gpus) != 1 From 89506135527d98c6524635ea044b8eb15414d950 Mon Sep 17 00:00:00 2001 From: Tianshu Wang Date: Sat, 27 Aug 2022 01:23:54 +0800 Subject: [PATCH 22/30] save checkpoints and profiler output to the first logger (#14325) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec --- .../common/checkpointing_intermediate.rst | 2 +- src/pytorch_lightning/CHANGELOG.md | 2 ++ .../callbacks/model_checkpoint.py | 14 ++++++-------- src/pytorch_lightning/trainer/trainer.py | 13 ++++++------- src/pytorch_lightning/utilities/logger.py | 8 -------- tests/tests_pytorch/profilers/test_profiler.py | 5 +---- .../trainer/properties/test_log_dir.py | 7 ++++--- tests/tests_pytorch/utilities/test_logger.py | 18 ------------------ 8 files changed, 20 insertions(+), 49 deletions(-) diff --git a/docs/source-pytorch/common/checkpointing_intermediate.rst b/docs/source-pytorch/common/checkpointing_intermediate.rst index f6dc1c955880f..7e17d8abc808f 100644 --- a/docs/source-pytorch/common/checkpointing_intermediate.rst +++ b/docs/source-pytorch/common/checkpointing_intermediate.rst @@ -120,7 +120,7 @@ What Where ===== -- It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`. +- By default, the ``ModelCheckpoint`` will save files into the ``Trainer.log_dir``. 
It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`. | diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4a746ac91b911..f2fb0a2aacf3f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -45,6 +45,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) +- When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([14325](https://github.com/Lightning-AI/lightning/pull/14325)) + ### Deprecated diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index 9b49b9d44bb10..1ad86a0917dac 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -37,7 +37,6 @@ from pytorch_lightning.callbacks import Checkpoint from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.logger import _name, _version from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT from pytorch_lightning.utilities.warnings import WarningCache @@ -591,15 +590,14 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None: if trainer._weights_save_path_internal != trainer.default_root_dir: # the user has changed weights_save_path ckpt_path = os.path.join(trainer._weights_save_path_internal, "checkpoints") - elif trainer.loggers: - if len(trainer.loggers) == 1: - assert trainer.logger is not None - save_dir = trainer.logger.save_dir or trainer.default_root_dir + elif len(trainer.loggers) > 0: + if trainer.loggers[0].save_dir is not None: + save_dir = trainer.loggers[0].save_dir else: save_dir = trainer.default_root_dir - - name = _name(trainer.loggers) - version = _version(trainer.loggers) + save_dir = trainer.loggers[0].save_dir or trainer.default_root_dir + name = trainer.loggers[0].name + version = trainer.loggers[0].version version = version if isinstance(version, str) else f"version_{version}" ckpt_path = os.path.join(save_dir, str(name), version, "checkpoints") diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index d5bcec7db85d6..4c9456e8ead37 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -297,9 +297,8 @@ def __init__( logger: Logger (or iterable collection of loggers) for experiment tracking. A ``True`` value uses the default ``TensorBoardLogger``. ``False`` will disable logging. If multiple loggers are - provided and the `save_dir` property of that logger is not set, local files (checkpoints, - profiler traces, etc.) are saved in ``default_root_dir`` rather than in the ``log_dir`` of any - of the individual loggers. + provided, local files (checkpoints, profiler traces, etc.) are saved in the ``log_dir`` of + the first logger. Default: ``True``. log_every_n_steps: How often to log within steps. 
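To make the documented behaviour concrete, a minimal, hypothetical usage sketch (paths and logger names are made up; not part of the diff):

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger

    # With several loggers, checkpoints and profiler traces now follow the *first* one.
    trainer = Trainer(
        logger=[TensorBoardLogger(save_dir="logs", name="custom_logs"), CSVLogger("logs")],
        max_steps=2,
    )
    print(trainer.log_dir)  # e.g. logs/custom_logs/version_0, derived from the first logger
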
@@ -2210,11 +2209,11 @@ def model(self, model: torch.nn.Module) -> None: @property def log_dir(self) -> Optional[str]: - if len(self.loggers) == 1: - if isinstance(self.logger, TensorBoardLogger): - dirpath = self.logger.log_dir + if len(self.loggers) > 0: + if isinstance(self.loggers[0], TensorBoardLogger): + dirpath = self.loggers[0].log_dir else: - dirpath = self.logger.save_dir + dirpath = self.loggers[0].save_dir else: dirpath = self.default_root_dir diff --git a/src/pytorch_lightning/utilities/logger.py b/src/pytorch_lightning/utilities/logger.py index 24d75e4f41034..76c24db0fe1a0 100644 --- a/src/pytorch_lightning/utilities/logger.py +++ b/src/pytorch_lightning/utilities/logger.py @@ -151,14 +151,6 @@ def _add_prefix( return metrics -def _name(loggers: List[Any], separator: str = "_") -> str: - if len(loggers) == 1: - return loggers[0].name - else: - # Concatenate names together, removing duplicates and preserving order - return separator.join(dict.fromkeys(str(logger.name) for logger in loggers)) - - def _version(loggers: List[Any], separator: str = "_") -> Union[int, str]: if len(loggers) == 1: return loggers[0].version diff --git a/tests/tests_pytorch/profilers/test_profiler.py b/tests/tests_pytorch/profilers/test_profiler.py index 91e1980b45bb6..2e3b868407d7f 100644 --- a/tests/tests_pytorch/profilers/test_profiler.py +++ b/tests/tests_pytorch/profilers/test_profiler.py @@ -465,15 +465,12 @@ def look_for_trace(trace_dir): """Determines if a directory contains a PyTorch trace.""" return any("trace.json" in filename for filename in os.listdir(trace_dir)) - # Sanity check - assert not look_for_trace(tmpdir) - model = BoringModel() loggers = [TensorBoardLogger(save_dir=tmpdir), CSVLogger(tmpdir)] trainer = Trainer(default_root_dir=tmpdir, profiler="pytorch", logger=loggers, limit_train_batches=5, max_epochs=1) assert len(trainer.loggers) == 2 trainer.fit(model) - assert look_for_trace(tmpdir) + assert look_for_trace(tmpdir / "lightning_logs" / "version_0") @RunIf(min_cuda_gpus=1, standalone=True) diff --git a/tests/tests_pytorch/trainer/properties/test_log_dir.py b/tests/tests_pytorch/trainer/properties/test_log_dir.py index e048e50e6f0b8..9e8453b9e7032 100644 --- a/tests/tests_pytorch/trainer/properties/test_log_dir.py +++ b/tests/tests_pytorch/trainer/properties/test_log_dir.py @@ -113,13 +113,14 @@ def test_logdir_multiple_loggers(tmpdir): """Tests that the logdir equals the default_root_dir when trainer has multiple loggers.""" default_root_dir = tmpdir / "default_root_dir" save_dir = tmpdir / "save_dir" - model = TestModel(default_root_dir) + expected = os.path.join(tmpdir, "save_dir", "custom_logs", "version_0") + model = TestModel(expected) trainer = Trainer( default_root_dir=default_root_dir, max_steps=2, logger=[TensorBoardLogger(save_dir=save_dir, name="custom_logs"), CSVLogger(tmpdir)], ) - assert trainer.log_dir == default_root_dir + assert trainer.log_dir == expected trainer.fit(model) - assert trainer.log_dir == default_root_dir + assert trainer.log_dir == expected diff --git a/tests/tests_pytorch/utilities/test_logger.py b/tests/tests_pytorch/utilities/test_logger.py index 6b67272289fc3..031599ec16b20 100644 --- a/tests/tests_pytorch/utilities/test_logger.py +++ b/tests/tests_pytorch/utilities/test_logger.py @@ -22,7 +22,6 @@ _add_prefix, _convert_params, _flatten_dict, - _name, _sanitize_callable_params, _sanitize_params, _version, @@ -177,23 +176,6 @@ def test_add_prefix(): assert metrics["prefix2_prefix-metric2"] == 2 -def test_name(tmpdir): - """Verify names of 
loggers are concatenated properly.""" - logger1 = CSVLogger(tmpdir, name="foo") - logger2 = CSVLogger(tmpdir, name="bar") - logger3 = CSVLogger(tmpdir, name="foo") - logger4 = CSVLogger(tmpdir, name="baz") - loggers = [logger1, logger2, logger3, logger4] - name = _name([]) - assert name == "" - name = _name([logger3]) - assert name == "foo" - name = _name(loggers) - assert name == "foo_bar_baz" - name = _name(loggers, "-") - assert name == "foo-bar-baz" - - def test_version(tmpdir): """Verify versions of loggers are concatenated properly.""" logger1 = CSVLogger(tmpdir, version=0) From 3ba0f56b1819547963262626ed723b1179ee1155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Aug 2022 22:01:51 +0200 Subject: [PATCH 23/30] Remove support for the deprecated torchtext legacy (#14375) --- .azure/hpu-tests.yml | 2 +- dockers/base-conda/Dockerfile | 6 +-- environment.yml | 1 - requirements/pytorch/adjust-versions.py | 16 +++--- requirements/pytorch/check-avail-extras.py | 1 - requirements/pytorch/extra.txt | 1 - src/pytorch_lightning/CHANGELOG.md | 2 + src/pytorch_lightning/core/hooks.py | 1 - src/pytorch_lightning/utilities/__init__.py | 1 - src/pytorch_lightning/utilities/apply_func.py | 34 +----------- src/pytorch_lightning/utilities/data.py | 8 +-- src/pytorch_lightning/utilities/imports.py | 2 - tests/tests_pytorch/accelerators/test_mps.py | 29 ---------- .../deprecated_api/test_remove_1-8.py | 12 ----- tests/tests_pytorch/helpers/imports.py | 16 ------ .../tests_pytorch/helpers/torchtext_utils.py | 54 ------------------- tests/tests_pytorch/models/test_gpu.py | 30 +---------- .../utilities/test_apply_func_torchtext.py | 47 ---------------- 18 files changed, 19 insertions(+), 244 deletions(-) delete mode 100644 tests/tests_pytorch/helpers/imports.py delete mode 100644 tests/tests_pytorch/helpers/torchtext_utils.py delete mode 100644 tests/tests_pytorch/utilities/test_apply_func_torchtext.py diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index bdfada907cac9..b128dbc5433b1 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -45,7 +45,7 @@ jobs: pip --version sudo pip uninstall -y lightning pytorch-lightning pip install fire - python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext + python .actions/assistant.py requirements-prune-pkgs torch,torchvision pip install ".[extra,test]" pip list env: diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 9bb75e34b8ff6..c82c5a4dfa15f 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -78,11 +78,11 @@ RUN \ conda update -n base -c defaults conda && \ CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ conda create -y --name $CONDA_ENV \ - python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \ + python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \ -c nvidia -c pytorch -c pytorch-test && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ - printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ + printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', 
req);\nopen(fname, 'w').write(req)" > prune.py && \ python prune.py && \ rm prune.py && \ cat environment.yml && \ @@ -102,7 +102,7 @@ RUN \ pip list | grep torch && \ python -c "import torch; print(torch.__version__)" && \ pip install -q fire && \ - python assistant.py requirements_prune_pkgs torch,torchvision,torchtext && \ + python assistant.py requirements_prune_pkgs torch,torchvision && \ # Install remaining requirements pip install --no-cache-dir -r requirements/pytorch/base.txt \ -r requirements/pytorch/extra.txt \ diff --git a/environment.yml b/environment.yml index f26e93031770e..7576bfe0e5ff3 100644 --- a/environment.yml +++ b/environment.yml @@ -41,7 +41,6 @@ dependencies: - scikit-learn>=0.20.0 - matplotlib>=3.1.1 - omegaconf>=2.0.5 - - torchtext>=0.10.* # Examples - torchvision>=0.10.* diff --git a/requirements/pytorch/adjust-versions.py b/requirements/pytorch/adjust-versions.py index b5305014acc36..70c94b358fc6e 100644 --- a/requirements/pytorch/adjust-versions.py +++ b/requirements/pytorch/adjust-versions.py @@ -5,14 +5,14 @@ # IMPORTANT: this list needs to be sorted in reverse VERSIONS = [ - dict(torch="1.12.1", torchvision="0.13.1", torchtext="0.13.1"), # stable - dict(torch="1.12.0", torchvision="0.13.0", torchtext="0.13.0"), - dict(torch="1.11.0", torchvision="0.12.0", torchtext="0.12.0"), - dict(torch="1.10.2", torchvision="0.11.3", torchtext="0.11.2"), - dict(torch="1.10.1", torchvision="0.11.2", torchtext="0.11.1"), - dict(torch="1.10.0", torchvision="0.11.1", torchtext="0.11.0"), - dict(torch="1.9.1", torchvision="0.10.1", torchtext="0.10.1"), - dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"), + dict(torch="1.12.1", torchvision="0.13.1"), # stable + dict(torch="1.12.0", torchvision="0.13.0"), + dict(torch="1.11.0", torchvision="0.12.0"), + dict(torch="1.10.2", torchvision="0.11.3"), + dict(torch="1.10.1", torchvision="0.11.2"), + dict(torch="1.10.0", torchvision="0.11.1"), + dict(torch="1.9.1", torchvision="0.10.1"), + dict(torch="1.9.0", torchvision="0.10.0"), ] diff --git a/requirements/pytorch/check-avail-extras.py b/requirements/pytorch/check-avail-extras.py index 228a41038c3c1..9477267683d19 100644 --- a/requirements/pytorch/check-avail-extras.py +++ b/requirements/pytorch/check-avail-extras.py @@ -4,4 +4,3 @@ import matplotlib # noqa: F401 import omegaconf # noqa: F401 import rich # noqa: F401 -import torchtext # noqa: F401 diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 20b6c1b8dbc12..f211f5654adec 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -3,7 +3,6 @@ # extended list of package dependencies to reach full functionality matplotlib>3.1, <3.5.3 -torchtext>=0.10.*, <0.14.0 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 jsonargparse[signatures]>=4.12.0, <=4.12.0 diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index f2fb0a2aacf3f..777ba96af3d5d 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -92,6 +92,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) +- Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) + ### Fixed - Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) diff --git a/src/pytorch_lightning/core/hooks.py b/src/pytorch_lightning/core/hooks.py index 1e02d0b07cef9..4da53903eccca 100644 --- a/src/pytorch_lightning/core/hooks.py +++ b/src/pytorch_lightning/core/hooks.py @@ -624,7 +624,6 @@ def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_ - :class:`list` - :class:`dict` - :class:`tuple` - - :class:`torchtext.data.batch.Batch` For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...). diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index c849ba0a05d68..b8d5801734a25 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -43,7 +43,6 @@ _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_12, _TORCH_QUANTIZE_AVAILABLE, - _TORCHTEXT_AVAILABLE, _TORCHVISION_AVAILABLE, _TPU_AVAILABLE, _XLA_AVAILABLE, diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index fe8b0b836456f..ae3d81eab46a5 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -14,10 +14,9 @@ """Utilities used for collections.""" import dataclasses -import operator from abc import ABC from collections import defaultdict, OrderedDict -from copy import copy, deepcopy +from copy import deepcopy from functools import partial from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union @@ -26,17 +25,6 @@ from torch import Tensor from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - -if _TORCHTEXT_LEGACY: - if _compare_version("torchtext", operator.ge, "0.9.0"): - from torchtext.legacy.data import Batch - else: - from torchtext.data import Batch -else: - Batch = type(None) - _BLOCKING_DEVICE_TYPES = ("cpu", "mps") @@ -326,23 +314,6 @@ def move_data_to_device(batch: Any, device: Union[str, torch.device]) -> Any: device = torch.device(device) def batch_to(data: Any) -> Any: - # try to move torchtext data first - if _TORCHTEXT_LEGACY and isinstance(data, Batch): - # TODO: also remove the torchtext dependency with Lightning 1.8 - rank_zero_deprecation( - "The `torchtext.legacy.Batch` object is deprecated and Lightning will remove support for it in v1.8." 
- " We recommend you to migrate away from Batch by following the TorchText README:" - " https://github.com/pytorch/text#bc-breaking-legacy" - ) - # Shallow copy because each Batch has a reference to Dataset which contains all examples - device_data = copy(data) - for field, field_value in data.dataset.fields.items(): - if field_value is None: - continue - device_field = move_data_to_device(getattr(data, field), device) - setattr(device_data, field, device_field) - return device_data - kwargs = {} # Don't issue non-blocking transfers to CPU # Same with MPS due to a race condition bug: https://github.com/pytorch/pytorch/issues/83015 @@ -354,8 +325,7 @@ def batch_to(data: Any) -> Any: # user wrongly implemented the `TransferableDataType` and forgot to return `self`. return data - dtype = (TransferableDataType, Batch) if _TORCHTEXT_LEGACY else TransferableDataType - return apply_to_collection(batch, dtype=dtype, function=batch_to) + return apply_to_collection(batch, dtype=TransferableDataType, function=batch_to) def convert_to_tensors(data: Any, device: Union[str, torch.device]) -> Any: diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index b0c9307cec8e1..c795cfc47d3a5 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -124,9 +124,7 @@ def has_len(dataloader: Union[DataLoader, Iterable]) -> bool: f"`{dataloader.__class__.__name__}` returned 0 length. Please make sure this was your intention." ) has_len = True - except TypeError: - has_len = False - except NotImplementedError: # e.g. raised by torchtext if a batch_size_fn is used + except (TypeError, NotImplementedError): has_len = False if has_len and has_iterable_dataset(dataloader): @@ -170,9 +168,7 @@ def has_len_all_ranks( else: has_len = True - except TypeError: - has_len = False - except NotImplementedError: # e.g. 
raised by torchtext if a batch_size_fn is used + except (TypeError, NotImplementedError): has_len = False if has_len and has_iterable_dataset(dataloader): diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index ba437ad332dfa..dfc88104ac25c 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -147,8 +147,6 @@ def __repr__(self) -> str: _PSUTIL_AVAILABLE = _package_available("psutil") _RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) -_TORCHTEXT_AVAILABLE = _package_available("torchtext") -_TORCHTEXT_LEGACY: bool = _TORCHTEXT_AVAILABLE and _compare_version("torchtext", operator.lt, "0.11.0") _TORCHVISION_AVAILABLE = _package_available("torchvision") _XLA_AVAILABLE: bool = _package_available("torch_xla") diff --git a/tests/tests_pytorch/accelerators/test_mps.py b/tests/tests_pytorch/accelerators/test_mps.py index 01e13e937b4d0..2375b167f7708 100644 --- a/tests/tests_pytorch/accelerators/test_mps.py +++ b/tests/tests_pytorch/accelerators/test_mps.py @@ -21,8 +21,6 @@ from pytorch_lightning import Trainer from pytorch_lightning.accelerators import MPSAccelerator from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY -from tests_pytorch.helpers.imports import Batch, Dataset, Example, Field, LabelField from tests_pytorch.helpers.runif import RunIf @@ -135,30 +133,3 @@ def to(self, *args, **kwargs): batch = trainer.strategy.batch_to_device(CustomBatchType(), torch.device("mps")) assert batch.a.type() == "torch.mps.FloatTensor" - - # torchtext.data.Batch - if not _TORCHTEXT_LEGACY: - return - - samples = [ - {"text": "PyTorch Lightning is awesome!", "label": 0}, - {"text": "Please make it work with torchtext", "label": 1}, - ] - - text_field = Field() - label_field = LabelField() - fields = {"text": ("text", text_field), "label": ("label", label_field)} - - examples = [Example.fromdict(sample, fields) for sample in samples] - dataset = Dataset(examples=examples, fields=fields.values()) - # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first - text_field.build_vocab(dataset) - label_field.build_vocab(dataset) - - batch = Batch(data=examples, dataset=dataset) - - with pytest.deprecated_call(match="The `torchtext.legacy.Batch` object is deprecated"): - batch = trainer.strategy.batch_to_device(batch, torch.device("mps")) - - assert batch.text.type() == "torch.mps.LongTensor" - assert batch.label.type() == "torch.mps.LongTensor" diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index e1fb0b57654c0..8af7dc6ba90f5 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -35,20 +35,8 @@ from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import device_parser -from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn from tests_pytorch.helpers.runif import RunIf -from 
tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator - - -@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.") -def test_v1_8_0_deprecated_torchtext_batch(): - - with pytest.deprecated_call(match="is deprecated and Lightning will remove support for it in v1.8"): - data_iterator, _ = get_dummy_torchtext_data_iterator(num_samples=3, batch_size=3) - batch = next(iter(data_iterator)) - _ = move_data_to_device(batch=batch, device=torch.device("cpu")) def test_v1_8_0_on_init_start_end(tmpdir): diff --git a/tests/tests_pytorch/helpers/imports.py b/tests/tests_pytorch/helpers/imports.py deleted file mode 100644 index 3fddb615b7ab1..0000000000000 --- a/tests/tests_pytorch/helpers/imports.py +++ /dev/null @@ -1,16 +0,0 @@ -import operator - -from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY - -if _TORCHTEXT_LEGACY: - if _compare_version("torchtext", operator.ge, "0.9.0"): - from torchtext.legacy.data import Batch, Dataset, Example, Field, Iterator, LabelField - else: - from torchtext.data import Batch, Dataset, Example, Field, Iterator, LabelField -else: - Batch = type(None) - Dataset = type(None) - Example = type(None) - Field = type(None) - Iterator = type(None) - LabelField = type(None) diff --git a/tests/tests_pytorch/helpers/torchtext_utils.py b/tests/tests_pytorch/helpers/torchtext_utils.py deleted file mode 100644 index 3c18005636bf1..0000000000000 --- a/tests/tests_pytorch/helpers/torchtext_utils.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import random -import string - -from tests_pytorch.helpers.imports import Dataset, Example, Field, Iterator - - -def _generate_random_string(length: int = 10): - return "".join(random.choices(string.ascii_letters, k=length)) - - -def get_dummy_torchtext_data_iterator(num_samples: int, batch_size: int, include_lengths: bool = False): - text_field = Field( - sequential=True, - pad_first=False, # nosec - init_token="", - eos_token="", # nosec - include_lengths=include_lengths, - ) # nosec - - dataset = Dataset( - [ - Example.fromdict({"text": _generate_random_string()}, {"text": ("text", text_field)}) - for _ in range(num_samples) - ], - {"text": text_field}, - ) - text_field.build_vocab(dataset) - - iterator = Iterator( - dataset, - batch_size=batch_size, - sort_key=None, - device=None, - batch_size_fn=None, - train=True, - repeat=False, - shuffle=None, - sort=None, - sort_within_batch=None, - ) - return iterator, text_field diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index 1a2d72a12118e..0a030e74273b1 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -28,9 +28,8 @@ from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY +from pytorch_lightning.utilities.imports import _compare_version from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.imports import Batch, Dataset, Example, Field, LabelField from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.simple_models import ClassificationModel @@ -269,33 +268,6 @@ def to(self, *args, **kwargs): batch = trainer.strategy.batch_to_device(CustomBatchType(), torch.device("cuda:0")) assert batch.a.type() == "torch.cuda.FloatTensor" - # torchtext.data.Batch - if not _TORCHTEXT_LEGACY: - return - - samples = [ - {"text": "PyTorch Lightning is awesome!", "label": 0}, - {"text": "Please make it work with torchtext", "label": 1}, - ] - - text_field = Field() - label_field = LabelField() - fields = {"text": ("text", text_field), "label": ("label", label_field)} - - examples = [Example.fromdict(sample, fields) for sample in samples] - dataset = Dataset(examples=examples, fields=fields.values()) - # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first - text_field.build_vocab(dataset) - label_field.build_vocab(dataset) - - batch = Batch(data=examples, dataset=dataset) - - with pytest.deprecated_call(match="The `torchtext.legacy.Batch` object is deprecated"): - batch = trainer.strategy.batch_to_device(batch, torch.device("cuda:0")) - - assert batch.text.type() == "torch.cuda.LongTensor" - assert batch.label.type() == "torch.cuda.LongTensor" - @RunIf(min_cuda_gpus=1) def test_non_blocking(): diff --git a/tests/tests_pytorch/utilities/test_apply_func_torchtext.py b/tests/tests_pytorch/utilities/test_apply_func_torchtext.py deleted file mode 100644 index 6c72959b54587..0000000000000 --- a/tests/tests_pytorch/utilities/test_apply_func_torchtext.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import pytest -import torch - -from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY -from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator - - -@pytest.mark.parametrize("include_lengths", [False, True]) -@pytest.mark.parametrize("device", [torch.device("cuda", 0)]) -@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.") -@RunIf(min_cuda_gpus=1) -def test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, device): - data_iterator, _ = get_dummy_torchtext_data_iterator(num_samples=3, batch_size=3, include_lengths=include_lengths) - data_iter = iter(data_iterator) - batch = next(data_iter) - - with pytest.deprecated_call(match="The `torchtext.legacy.Batch` object is deprecated"): - batch_on_device = move_data_to_device(batch, device) - - if include_lengths: - # tensor with data - assert batch_on_device.text[0].device == device - # tensor with length of data - assert batch_on_device.text[1].device == device - else: - assert batch_on_device.text.device == device - - -@pytest.mark.parametrize("include_lengths", [False, True]) -@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.") -def test_batch_move_data_to_device_torchtext_include_lengths_cpu(include_lengths): - test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, torch.device("cpu")) From 1ad452f95a7c408f5402b271278e9b30b0d7421b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Aug 2022 20:40:53 +0000 Subject: [PATCH 24/30] Update mlflow requirement from <1.28.0,>=1.0.0 to >=1.0.0,<1.29.0 in /requirements (#14311) Update mlflow requirement in /requirements Updates the requirements on [mlflow](https://github.com/mlflow/mlflow) to permit the latest version. - [Release notes](https://github.com/mlflow/mlflow/releases) - [Changelog](https://github.com/mlflow/mlflow/blob/master/CHANGELOG.md) - [Commits](https://github.com/mlflow/mlflow/compare/1.0.0...v1.28.0) --- updated-dependencies: - dependency-name: mlflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/loggers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index df83a077f8457..905823451973b 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -5,6 +5,6 @@ neptune-client>=0.10.0, <0.16.4 comet-ml>=3.1.12, <3.31.8 -mlflow>=1.0.0, <1.28.0 +mlflow>=1.0.0, <1.29.0 test_tube>=0.7.5, <=0.7.5 wandb>=0.10.22, <0.13.2 From 714137d4d65e8687455d671106d5080207f64734 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Aug 2022 20:41:36 +0000 Subject: [PATCH 25/30] Bump ravsamhq/notify-slack-action from 1 to 2 (#14290) Bumps [ravsamhq/notify-slack-action](https://github.com/ravsamhq/notify-slack-action) from 1 to 2. - [Release notes](https://github.com/ravsamhq/notify-slack-action/releases) - [Commits](https://github.com/ravsamhq/notify-slack-action/compare/v1...v2) --- updated-dependencies: - dependency-name: ravsamhq/notify-slack-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-pytorch-dockers.yml | 10 +++++----- .github/workflows/events-nightly.yml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 6cb28885e79ef..73f303c6cbc44 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -75,7 +75,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} timeout-minutes: 60 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} @@ -117,7 +117,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 95 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} @@ -155,7 +155,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 95 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} @@ -199,7 +199,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }} timeout-minutes: 10 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} @@ -235,7 +235,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }} timeout-minutes: 10 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} diff --git 
a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 36907bf955999..13d3895bf365d 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -48,7 +48,7 @@ jobs: # report failure to Slack - name: Slack notification if: failure() && github.event_name == 'schedule' - uses: ravsamhq/notify-slack-action@v1 + uses: ravsamhq/notify-slack-action@v2 with: status: ${{ job.status }} token: ${{ secrets.GITHUB_TOKEN }} From 250c06e406a821b381393ea9e47152e6c8e2784c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 22:59:32 +0200 Subject: [PATCH 26/30] Remove deprecated HPC model hooks (#14315) Co-authored-by: rohitgr7 Co-authored-by: Jirka Borovec --- .../common/lightning_module.rst | 12 ----- src/pytorch_lightning/CHANGELOG.md | 4 ++ src/pytorch_lightning/core/saving.py | 26 ---------- .../trainer/configuration_validator.py | 17 ------ .../connectors/checkpoint_connector.py | 13 ----- .../deprecated_api/test_remove_1-8.py | 23 -------- .../connectors/test_checkpoint_connector.py | 52 ------------------- .../trainer/logging_/test_logger_connector.py | 6 +-- 8 files changed, 7 insertions(+), 146 deletions(-) diff --git a/docs/source-pytorch/common/lightning_module.rst b/docs/source-pytorch/common/lightning_module.rst index e0fc10097d002..1c3f423a001f8 100644 --- a/docs/source-pytorch/common/lightning_module.rst +++ b/docs/source-pytorch/common/lightning_module.rst @@ -1345,18 +1345,6 @@ load_from_checkpoint .. automethod:: pytorch_lightning.core.module.LightningModule.load_from_checkpoint :noindex: -on_hpc_save -~~~~~~~~~~~ - -.. automethod:: pytorch_lightning.core.module.LightningModule.on_hpc_save - :noindex: - -on_hpc_load -~~~~~~~~~~~ - -.. automethod:: pytorch_lightning.core.module.LightningModule.on_hpc_load - :noindex: - on_train_start ~~~~~~~~~~~~~~ diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 777ba96af3d5d..af45f1dc589d7 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -92,8 +92,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) +- Removed the deprecated `LightningModule.{on_hpc_load,on_hpc_save}` hooks in favor of the general purpose hooks `LightningModule.{on_load_checkpoint,on_save_checkpoint}` ([#14315](https://github.com/Lightning-AI/lightning/pull/14315)) + + - Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) + ### Fixed - Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index 2df7a661c2bd6..380338f5c0312 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -142,32 +142,6 @@ def load_from_checkpoint( **kwargs, ) - # ------------------------- - # OPTIONAL HOOKS - # ------------------------- - def on_hpc_save(self, checkpoint: Dict[str, Any]) -> None: - """Hook to do whatever you need right before Slurm manager saves the model. - - Args: - checkpoint: A dictionary in which you can save variables to save in a checkpoint. - Contents need to be pickleable. - - .. 
deprecated:: v1.6 - This method is deprecated in v1.6 and will be removed in v1.8. - Please use ``LightningModule.on_save_checkpoint`` instead. - """ - - def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None: - """Hook to do whatever you need right before Slurm manager loads the model. - - Args: - checkpoint: A dictionary with variables from the checkpoint. - - .. deprecated:: v1.6 - This method is deprecated in v1.6 and will be removed in v1.8. - Please use ``LightningModule.on_load_checkpoint`` instead. - """ - def _load_from_checkpoint( cls: Union[Type["ModelIO"], Type["pl.LightningModule"], Type["pl.LightningDataModule"]], diff --git a/src/pytorch_lightning/trainer/configuration_validator.py b/src/pytorch_lightning/trainer/configuration_validator.py index 9d277f5ac45c9..8bf68e1bedd62 100644 --- a/src/pytorch_lightning/trainer/configuration_validator.py +++ b/src/pytorch_lightning/trainer/configuration_validator.py @@ -48,8 +48,6 @@ def verify_loop_configurations(trainer: "pl.Trainer") -> None: __verify_batch_transfer_support(trainer) _check_deprecated_callback_hooks(trainer) - # TODO: Delete _check_on_hpc_hooks in v1.8 - _check_on_hpc_hooks(model) # TODO: Delete on_epoch_start/on_epoch_end hooks in v1.8 _check_on_epoch_start_end(model) # TODO: Delete CheckpointHooks off PrecisionPlugin in v1.8 @@ -209,21 +207,6 @@ def __check_training_step_requires_dataloader_iter(model: "pl.LightningModule") ) -# TODO: Delete _check_on_hpc_hooks in v1.8 -def _check_on_hpc_hooks(model: "pl.LightningModule") -> None: - if is_overridden("on_hpc_save", model): - rank_zero_deprecation( - "Method `LightningModule.on_hpc_save` is deprecated in v1.6 and" - " will be removed in v1.8. Please use `LightningModule.on_save_checkpoint` instead." - ) - - if is_overridden("on_hpc_load", model): - rank_zero_deprecation( - "Method `LightningModule.on_hpc_load` is deprecated in v1.6 and" - " will be removed in v1.8. Please use `LightningModule.on_load_checkpoint` instead." - ) - - # TODO: Remove on_epoch_start/on_epoch_end hooks in v1.8 def _check_on_epoch_start_end(model: "pl.LightningModule") -> None: hooks = ( diff --git a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py index e1dccd11a0d0a..f0b35e054d6d5 100644 --- a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -23,7 +23,6 @@ from torchmetrics import Metric import pytorch_lightning as pl -from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE @@ -168,16 +167,9 @@ def restore_model(self) -> None: if not self._loaded_checkpoint: return - model = self.trainer.lightning_module - # hook: give user access to checkpoint if needed. self.trainer._call_lightning_module_hook("on_load_checkpoint", self._loaded_checkpoint) - # TODO: remove this in v1.8. - # call hpc specific hook - if self._hpc_resume_path is not None: - model.on_hpc_load(self._loaded_checkpoint) - # restore model state_dict self.trainer.strategy.load_model_state_dict(self._loaded_checkpoint) @@ -438,11 +430,6 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: if datamodule is not None: self.trainer._call_lightning_datamodule_hook("on_save_checkpoint", checkpoint) - # TODO: remove this in v1.8. 
- environment = self.trainer._accelerator_connector.cluster_environment - if isinstance(environment, SLURMEnvironment) and environment.auto_requeue: - model.on_hpc_save(checkpoint) - return checkpoint def save_checkpoint( diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 8af7dc6ba90f5..07223b88b7710 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -84,29 +84,6 @@ def test_v1_8_0_deprecated_warning_positional_category(): rank_zero_warn("foo", FutureWarning) -def test_v1_8_0_deprecated_on_hpc_hooks(tmpdir): - class TestModelSave(BoringModel): - def on_hpc_save(self): - print("on_hpc_save override") - - class TestModelLoad(BoringModel): - def on_hpc_load(self): - print("on_hpc_load override") - - save_model = TestModelSave() - load_model = TestModelLoad() - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, fast_dev_run=True) - - with pytest.deprecated_call( - match=r"Method `LightningModule.on_hpc_save` is deprecated in v1.6 and will be removed in v1.8." - ): - trainer.fit(save_model) - with pytest.deprecated_call( - match=r"Method `LightningModule.on_hpc_load` is deprecated in v1.6 and will be removed in v1.8." - ): - trainer.fit(load_model) - - def test_v1_8_0_deprecated_run_stage(): trainer = Trainer() trainer._run_stage = Mock() diff --git a/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py b/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py index 02e3221f4dfd4..4762e769a2379 100644 --- a/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py @@ -12,68 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from unittest import mock from unittest.mock import Mock -import pytest import torch from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.trainer.states import TrainerFn -# TODO: remove HPCHookedModel in v1.8 -class HPCHookedModel(BoringModel): - def __init__(self): - super().__init__() - self.hpc_save_called = 0 - self.hpc_load_called = 0 - - def on_hpc_save(self, checkpoint): - assert "state_dict" in checkpoint - self.hpc_save_called += 1 - - def on_hpc_load(self, checkpoint): - assert "state_dict" in checkpoint - self.hpc_load_called += 1 - - -# TODO: remove test_hpc_hook_calls in v1.8 -@mock.patch( - "pytorch_lightning.trainer.connectors.accelerator_connector.AcceleratorConnector._is_slurm_managing_tasks", - return_value=True, -) -def test_hpc_hook_calls(mock_slurm_env, tmpdir): - model = HPCHookedModel() - trainer = Trainer(default_root_dir=tmpdir, max_steps=1, enable_checkpointing=False, logger=False) - environment = trainer._accelerator_connector.cluster_environment - assert isinstance(environment, SLURMEnvironment) - assert environment.auto_requeue - with pytest.deprecated_call( - match=r"Method `LightningModule.on_hpc_save` is deprecated in v1.6 and will be removed in v1.8." 
- ): - trainer.fit(model) - - # simulate snapshot on slurm - hpc_save_path = trainer._checkpoint_connector.hpc_save_path(tmpdir) - trainer.save_checkpoint(hpc_save_path) - assert model.hpc_save_called == 1 - assert model.hpc_load_called == 0 - - # new training run, restore from hpc checkpoint file automatically - assert set(os.listdir(tmpdir)) == {"hpc_ckpt_1.ckpt"} - trainer = Trainer(default_root_dir=tmpdir, max_steps=1, enable_checkpointing=False, logger=False) - with pytest.deprecated_call( - match=r"Method `LightningModule.on_hpc_save` is deprecated in v1.6 and will be removed in v1.8." - ): - trainer.fit(model) - assert model.hpc_save_called == 1 - assert model.hpc_load_called == 1 - - def test_preloaded_checkpoint_lifecycle(tmpdir): """Tests that the preloaded checkpoint contents gets cleared from memory when it is not required anymore.""" model = BoringModel() diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py index 1c346ac1e8bc5..266710abf9434 100644 --- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py +++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py @@ -250,7 +250,7 @@ def test_fx_validator_integration(tmpdir): limit_predict_batches=1, callbacks=callback, ) - with pytest.deprecated_call(match="is deprecated in"): + with pytest.deprecated_call(match="was deprecated in"): trainer.fit(model) not_supported.update( @@ -263,7 +263,7 @@ def test_fx_validator_integration(tmpdir): "on_test_end": "You can't", } ) - with pytest.deprecated_call(match="is deprecated in"): + with pytest.deprecated_call(match="was deprecated in"): trainer.test(model, verbose=False) not_supported.update({k: "result collection is not registered yet" for k in not_supported}) @@ -280,7 +280,7 @@ def test_fx_validator_integration(tmpdir): "on_predict_end": "result collection is not registered yet", } ) - with pytest.deprecated_call(match="is deprecated in"): + with pytest.deprecated_call(match="was deprecated in"): trainer.predict(model) From 0e30e4a5a01bc3cf2f6d36f70e16e0e36a339cf2 Mon Sep 17 00:00:00 2001 From: Mansy Date: Sat, 27 Aug 2022 01:55:22 +0200 Subject: [PATCH 27/30] [App][CI] Fix psutil requirement CI (#14413) --- requirements/app/test.txt | 1 + tests/tests_app/cli/test_cli.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/requirements/app/test.txt b/requirements/app/test.txt index ab5ef8f1e85ac..c6a2a61b2c29a 100644 --- a/requirements/app/test.txt +++ b/requirements/app/test.txt @@ -8,3 +8,4 @@ playwright==1.22.0 httpx trio pympler +psutil diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 1edc9d384cf30..6daa7be5b8e07 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -169,3 +169,6 @@ def test_cli_logout(exists: mock.MagicMock, unlink: mock.MagicMock, creds: bool) unlink.assert_called_once_with() else: unlink.assert_not_called() + + +# TODO: test for the other commands From f3574176e2a0f61c68093f39639dc110d5936e90 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sat, 27 Aug 2022 17:42:24 +0530 Subject: [PATCH 28/30] Change `trainer.should_stop` to not stop in between an epoch and run until `min_steps/min_epochs` only (#13890) --- docs/source-pytorch/common/trainer.rst | 60 +++++++++++++++++++ src/pytorch_lightning/CHANGELOG.md | 3 + .../loops/epoch/training_epoch_loop.py | 16 ++++- src/pytorch_lightning/loops/fit_loop.py | 24 ++++---- src/pytorch_lightning/utilities/warnings.py | 6 ++ 
.../callbacks/test_early_stopping.py | 20 ++++--- .../loops/epoch/test_training_epoch_loop.py | 34 +++++++++++ .../tests_pytorch/loops/test_training_loop.py | 33 +++++++++- tests/tests_pytorch/trainer/test_trainer.py | 7 ++- 9 files changed, 176 insertions(+), 27 deletions(-) diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index fe76ebea481aa..53148c9fab583 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -1745,3 +1745,63 @@ execution within that function, and the status of the Trainer. trainer.state.status # stage in ("train", "sanity_check", "validate", "test", "predict", "tune") trainer.state.stage + +should_stop +*********** + +If you want to terminate the training during ``.fit``, you can set ``trainer.should_stop=True`` to terminate the training +as soon as possible. Note that, it will respect the arguments ``min_steps`` and ``min_epochs`` to check whether to stop. If these +arguments are set and the ``current_epoch`` or ``global_step`` don't meet these minimum conditions, training will continue until +both conditions are met. If any of these arguments is not set, it won't be considered for the final decision. + + +.. code-block:: python + + # setting `trainer.should_stop` at any point of training will terminate it + class LitModel(LightningModule): + def training_step(self, *args, **kwargs): + self.trainer.should_stop = True + + + trainer = Trainer() + model = LitModel() + trainer.fit(model) + +.. code-block:: python + + # setting `trainer.should_stop` will stop training only after at least 5 epochs have run + class LitModel(LightningModule): + def training_step(self, *args, **kwargs): + if self.current_epoch == 2: + self.trainer.should_stop = True + + + trainer = Trainer(min_epochs=5, max_epochs=100) + model = LitModel() + trainer.fit(model) + +.. code-block:: python + + # setting `trainer.should_stop` will stop training only after at least 5 steps have run + class LitModel(LightningModule): + def training_step(self, *args, **kwargs): + if self.global_step == 2: + self.trainer.should_stop = True + + + trainer = Trainer(min_steps=5, max_epochs=100) + model = LitModel() + trainer.fit(model) + +.. code-block:: python + + # setting `trainer.should_stop` at any until both min_steps and min_epochs are satisfied + class LitModel(LightningModule): + def training_step(self, *args, **kwargs): + if self.global_step == 7: + self.trainer.should_stop = True + + + trainer = Trainer(min_steps=5, min_epochs=5, max_epochs=100) + model = LitModel() + trainer.fit(model) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index af45f1dc589d7..35775201e1098 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -45,6 +45,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) +- Changed `trainer.should_stop` to not stop in between an epoch and run until `min_steps/min_epochs` only ([#13890](https://github.com/Lightning-AI/lightning/pull/13890)) + + - When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([14325](https://github.com/Lightning-AI/lightning/pull/14325)) diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index 0fe93ca7104b6..6be4956b9bfd5 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -102,7 +102,21 @@ def _is_validation_done(self) -> bool: @property def done(self) -> bool: """Evaluates when to leave the loop.""" - return (self._is_training_done and self._is_validation_done) or self.trainer.should_stop + if self._is_training_done and self._is_validation_done: + return True + + if self.trainer.should_stop: + # early stopping + min_epochs = self.trainer.fit_loop.min_epochs + should_stop_early = self.trainer.fit_loop._should_stop_early + if not should_stop_early: + self._warning_cache.info( + f"Trainer was signaled to stop but the required `min_epochs={min_epochs!r}` or" + f" `min_steps={self.min_steps!r}` has not been met. Training will continue..." + ) + return should_stop_early + + return False def connect( # type: ignore[override] self, diff --git a/src/pytorch_lightning/loops/fit_loop.py b/src/pytorch_lightning/loops/fit_loop.py index 21273e70a5365..e6dcc581e247d 100644 --- a/src/pytorch_lightning/loops/fit_loop.py +++ b/src/pytorch_lightning/loops/fit_loop.py @@ -146,6 +146,12 @@ def _results(self) -> _ResultCollection: return self.epoch_loop.val_loop._results raise RuntimeError("`FitLoop._results` property isn't defined. Accessed outside of scope") + @property + def _should_stop_early(self) -> bool: + met_min_epochs = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True + met_min_steps = self.epoch_loop.global_step >= self.min_steps if self.min_steps else True + return met_min_epochs and met_min_steps + @property def done(self) -> bool: """Evaluates when to leave the loop.""" @@ -169,20 +175,10 @@ def done(self) -> bool: rank_zero_info(f"`Trainer.fit` stopped: `max_epochs={self.max_epochs!r}` reached.") return True - if self.trainer.should_stop: - # early stopping - met_min_epochs = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True - met_min_steps = self.epoch_loop.global_step >= self.min_steps if self.min_steps else True - if met_min_epochs and met_min_steps: - self.trainer.should_stop = True - rank_zero_debug("`Trainer.fit` stopped: `trainer.should_stop` was set.") - return True - else: - rank_zero_info( - f"Trainer was signaled to stop but the required `min_epochs={self.min_epochs!r}` or" - f" `min_steps={self.min_steps!r}` has not been met. Training will continue..." 
- ) - self.trainer.should_stop = False + if self.trainer.should_stop and self._should_stop_early: + rank_zero_debug("`Trainer.fit` stopped: `trainer.should_stop` was set.") + return True + return False @property diff --git a/src/pytorch_lightning/utilities/warnings.py b/src/pytorch_lightning/utilities/warnings.py index 61d3a2b8e24e3..61883dad51144 100644 --- a/src/pytorch_lightning/utilities/warnings.py +++ b/src/pytorch_lightning/utilities/warnings.py @@ -18,6 +18,7 @@ from pytorch_lightning.utilities.rank_zero import LightningDeprecationWarning as NewLightningDeprecationWarning from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation as new_rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_info as new_rank_zero_info from pytorch_lightning.utilities.rank_zero import rank_zero_warn as new_rank_zero_warn # enable our warnings @@ -39,6 +40,11 @@ def deprecation(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: self.add(message) new_rank_zero_deprecation(message, stacklevel=stacklevel, **kwargs) + def info(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: + if message not in self: + self.add(message) + new_rank_zero_info(message, stacklevel=stacklevel, **kwargs) + def rank_zero_warn(*args: Any, **kwargs: Any) -> Any: new_rank_zero_deprecation( diff --git a/tests/tests_pytorch/callbacks/test_early_stopping.py b/tests/tests_pytorch/callbacks/test_early_stopping.py index 8c17b2ad421e5..458df2ea23b3f 100644 --- a/tests/tests_pytorch/callbacks/test_early_stopping.py +++ b/tests/tests_pytorch/callbacks/test_early_stopping.py @@ -265,25 +265,28 @@ def validation_epoch_end(self, outputs): assert early_stopping.stopped_epoch == expected_stop_epoch -@pytest.mark.parametrize("limit_train_batches", (3, 5)) @pytest.mark.parametrize( - ["min_epochs", "min_steps"], + "limit_train_batches,min_epochs,min_steps,stop_step", [ # IF `min_steps` was set to a higher value than the `trainer.global_step` when `early_stopping` is being # triggered, THEN the trainer should continue until reaching `trainer.global_step == min_steps` and stop - (0, 10), + (3, 0, 10, 10), + (5, 0, 10, 10), # IF `min_epochs` resulted in a higher number of steps than the `trainer.global_step` when `early_stopping` is # being triggered, THEN the trainer should continue until reaching # `trainer.global_step` == `min_epochs * len(train_dataloader)` - (2, 0), + (3, 2, 0, 6), + (5, 2, 0, 10), # IF both `min_epochs` and `min_steps` are provided and higher than the `trainer.global_step` when # `early_stopping` is being triggered, THEN the highest between `min_epochs * len(train_dataloader)` and # `min_steps` would be reached - (1, 10), - (3, 10), + (3, 1, 10, 10), + (5, 1, 10, 10), + (3, 3, 10, 10), + (5, 3, 10, 15), ], ) -def test_min_epochs_min_steps_global_step(tmpdir, limit_train_batches, min_epochs, min_steps): +def test_min_epochs_min_steps_global_step(tmpdir, limit_train_batches, min_epochs, min_steps, stop_step): if min_steps: assert limit_train_batches < min_steps @@ -317,8 +320,7 @@ def training_step(self, batch, batch_idx): # epochs continue until min steps are reached assert trainer.current_epoch == expected_epochs # steps continue until min steps are reached AND the epoch is exhausted - # stopping mid-epoch is not supported - assert trainer.global_step == limit_train_batches * expected_epochs + assert trainer.global_step == stop_step def test_early_stopping_mode_options(): diff --git a/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py 
b/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py index d601542075004..3aaced7dc1bc1 100644 --- a/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py +++ b/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging from unittest.mock import patch import pytest @@ -184,3 +185,36 @@ def test_no_val_on_train_epoch_loop_restart(tmpdir): ) as advance_mocked: trainer.fit(model, ckpt_path=ckpt_path) assert advance_mocked.call_count == 1 + + +@pytest.mark.parametrize( + "min_epochs, min_steps, current_epoch, global_step, early_stop, epoch_loop_done, raise_info_msg", + [ + (None, None, 1, 4, True, True, False), + (None, None, 1, 10, True, True, False), + (4, None, 1, 4, False, False, True), + (4, 2, 1, 4, False, False, True), + (4, None, 1, 10, False, True, False), + (4, 3, 1, 3, False, False, True), + (4, 10, 1, 10, False, True, False), + (None, 4, 1, 4, True, True, False), + ], +) +def test_should_stop_early_stopping_conditions_not_met( + caplog, min_epochs, min_steps, current_epoch, global_step, early_stop, epoch_loop_done, raise_info_msg +): + """Test that checks that info message is logged when users sets `should_stop` but min conditions are not + met.""" + trainer = Trainer(min_epochs=min_epochs, min_steps=min_steps, limit_val_batches=0) + trainer.num_training_batches = 10 + trainer.should_stop = True + trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.optimizer.step.total.completed = global_step + trainer.fit_loop.epoch_loop.batch_progress.current.ready = global_step + trainer.fit_loop.epoch_progress.current.completed = current_epoch - 1 + + message = f"min_epochs={min_epochs}` or `min_steps={min_steps}` has not been met. 
Training will continue" + with caplog.at_level(logging.INFO, logger="pytorch_lightning.loops"): + assert trainer.fit_loop.epoch_loop.done is epoch_loop_done + + assert (message in caplog.text) is raise_info_msg + assert trainer.fit_loop._should_stop_early is early_stop diff --git a/tests/tests_pytorch/loops/test_training_loop.py b/tests/tests_pytorch/loops/test_training_loop.py index bba0b9b7c5428..dc52a08f068ae 100644 --- a/tests/tests_pytorch/loops/test_training_loop.py +++ b/tests/tests_pytorch/loops/test_training_loop.py @@ -180,7 +180,6 @@ def test_fit_loop_done_log_messages(caplog): fit_loop.epoch_loop.min_steps = 100 assert not fit_loop.done - assert "was signaled to stop but" in caplog.text def test_warning_valid_train_step_end(tmpdir): @@ -198,3 +197,35 @@ def training_step_end(self, outputs): trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) trainer.fit(model) + + +@pytest.mark.parametrize( + "min_epochs, min_steps, current_epoch, early_stop, fit_loop_done, raise_debug_msg", + [ + (4, None, 100, True, True, False), + (4, None, 3, False, False, False), + (4, 10, 3, False, False, False), + (None, 10, 4, True, True, True), + (4, None, 4, True, True, True), + (4, 10, 4, True, True, True), + ], +) +def test_should_stop_early_stopping_conditions_met( + caplog, min_epochs, min_steps, current_epoch, early_stop, fit_loop_done, raise_debug_msg +): + """Test that checks that debug message is logged when users sets `should_stop` and min conditions are met.""" + trainer = Trainer(min_epochs=min_epochs, min_steps=min_steps, limit_val_batches=0, max_epochs=100) + trainer.num_training_batches = 10 + trainer.should_stop = True + trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.optimizer.step.total.completed = ( + current_epoch * trainer.num_training_batches + ) + trainer.fit_loop.epoch_loop.batch_progress.current.ready = 10 + trainer.fit_loop.epoch_progress.current.processed = current_epoch + + message = "`Trainer.fit` stopped: `trainer.should_stop` was set." + with caplog.at_level(level=logging.DEBUG, logger="pytorch_lightning.utilities.rank_zero"): + assert trainer.fit_loop.done is fit_loop_done + + assert (message in caplog.text) is raise_debug_msg + assert trainer.fit_loop._should_stop_early is early_stop diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 9506acee425d0..6febe1c7787cd 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -622,7 +622,10 @@ def training_step(self, batch, batch_idx): output["loss"] = output["loss"] * 0.0 # force minimal loss to trigger early stopping self.log("loss", output["loss"]) self.training_step_invoked += 1 - assert not self.trainer.should_stop + if self.current_epoch < 2: + assert not self.trainer.should_stop + else: + assert self.trainer.should_stop return output model = TestModel() @@ -641,7 +644,7 @@ def training_step(self, batch, batch_idx): message = f"min_epochs={min_epochs}` or `min_steps=None` has not been met. 
Training will continue" num_messages = sum(1 for record in caplog.records if message in record.message) - assert num_messages == min_epochs - 2 + assert num_messages == 1 assert model.training_step_invoked == min_epochs * 2 From af688dee69655aece60b17d7879cbbcaa396e8f1 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sun, 28 Aug 2022 01:14:54 +0530 Subject: [PATCH 29/30] Update changelog after v1.7.3 release (#14398) --- src/pytorch_lightning/CHANGELOG.md | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 35775201e1098..d6c233e4a17ca 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -103,17 +103,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) - - -- Fixed an `AttributeError` when accessing `LightningModule.logger` and the Trainer has multiple loggers ([#14234](https://github.com/Lightning-AI/lightning/pull/14234)) - - -- Added back support for `log`ging in the `configure_gradient_clipping` hook after unintended removal in v1.7.2 ([#14298](https://github.com/Lightning-AI/lightning/issues/14298)) - +- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) -- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846)) - Fixed incorrect values after transferring data to a MPS device ([#13285](https://github.com/Lightning-AI/lightning/issues/13285)) @@ -130,6 +123,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) +## [1.7.3] - 2022-08-25 + +### Fixed + +- Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) +- Fixed an `AttributeError` when accessing `LightningModule.logger` and the Trainer has multiple loggers ([#14234](https://github.com/Lightning-AI/lightning/pull/14234)) +- Added back support for `log`ging in the `configure_gradient_clipping` hook after unintended removal in v1.7.2 ([#14298](https://github.com/Lightning-AI/lightning/issues/14298)) +- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) + + ## [1.7.2] - 2022-08-17 ### Added From 03f2f324457cfcb1a492db46cdeeaa992687cc4b Mon Sep 17 00:00:00 2001 From: JongMok Lee Date: Sun, 28 Aug 2022 07:07:36 +0900 Subject: [PATCH 30/30] Fix mypy errors in `pytorch_lightning/strategies/sharded.py` (#14184) Co-authored-by: otaj --- pyproject.toml | 1 - src/pytorch_lightning/strategies/sharded.py | 19 ++++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 399e1e83efb3f..1f704e7aa20ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", - "pytorch_lightning.strategies.sharded", "pytorch_lightning.trainer.callback_hook", "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py index 3b77bc6ceeb70..6bf8e47022c45 100644 --- a/src/pytorch_lightning/strategies/sharded.py +++ b/src/pytorch_lightning/strategies/sharded.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from contextlib import contextmanager -from typing import Dict, Generator, List, Tuple, Union +from typing import Dict, Generator, List, Tuple from torch import Tensor from torch.nn import Module @@ -20,7 +20,7 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.enums import PrecisionType @@ -51,10 +51,11 @@ def connect(self, model: "pl.LightningModule") -> None: def setup(self, trainer: "pl.Trainer") -> None: # share ddp pids to all processes - self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts) + self._rank_0_will_call_children_scripts: bool = self.broadcast(self._rank_0_will_call_children_scripts) if self._should_run_deadlock_detection(): self._share_information_to_prevent_deadlock() + assert self.accelerator is not None self.accelerator.setup(trainer) # move the model to the correct device @@ -64,6 +65,7 @@ def setup(self, trainer: "pl.Trainer") -> None: trainer_fn = trainer.state.fn if trainer_fn == TrainerFn.FITTING: if self._layer_sync: + assert self.model is not None self.model = self._layer_sync.apply(self.model) self.setup_precision_plugin() @@ -73,7 +75,9 @@ def setup(self, trainer: "pl.Trainer") -> None: def configure_ddp(self) -> None: self._set_ddp_kwargs() - self.setup_optimizers(self.model.trainer) + assert self.lightning_module is not None + self.setup_optimizers(self.lightning_module.trainer) + assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) self.model, self.optimizers = self._setup_model_and_optimizers( model=_LightningModuleWrapperBase(self.model), optimizers=self.optimizers, @@ -97,12 +101,13 @@ def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer] return model, optimizers def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: - if self.model is not None and self.model.trainer.state.fn != TrainerFn.FITTING: + assert self.lightning_module is not None + if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING: return optimizers return self._reinit_optimizers_with_oss(optimizers) - def _reinit_optimizers_with_oss(self, optimizers: List[Union[Optimizer, LightningOptimizer]]) -> List["OSS"]: + def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS"]: for x, optimizer in enumerate(optimizers): if isinstance(optimizer, LightningOptimizer): optimizer = optimizer._optimizer @@ -135,7 +140,7 @@ def block_backward_sync(self) -> Generator: else: yield None - def post_training_step(self): + def post_training_step(self) -> None: pass @classmethod
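For reference on the hook removal in PATCH 26/30 above: the deprecation messages and the new CHANGELOG entry point users at the general-purpose ``on_save_checkpoint``/``on_load_checkpoint`` hooks. A minimal migration sketch is shown below; only the hook names and the ``checkpoint`` dict semantics come from the patch, while ``BoringModel`` and the ``extra_state`` key are illustrative choices, not part of any commit in this series.

.. code-block:: python

    from typing import Any, Dict

    from pytorch_lightning.demos.boring_classes import BoringModel


    class MigratedModel(BoringModel):
        # was `on_hpc_save`: stash extra state in the checkpoint dict (contents must be picklable)
        def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
            checkpoint["extra_state"] = {"step_of_interest": 0}  # hypothetical key

        # was `on_hpc_load`: read the stashed state back when the checkpoint is restored
        def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
            self.extra_state = checkpoint.get("extra_state")

After PATCH 26/30, overriding ``on_hpc_save``/``on_hpc_load`` is neither called by the checkpoint connector nor flagged by the configuration validator, so any remaining overrides are silently ignored.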