From 424e78d5b35e5f90ad5700a76a920ec23b3cd944 Mon Sep 17 00:00:00 2001 From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com> Date: Thu, 25 Aug 2022 14:15:01 +0530 Subject: [PATCH 01/30] Add document to showcase scaleout on hpu (#14357) Co-authored-by: Rohit Gupta Co-authored-by: Akihiro Nitta --- .../source-pytorch/accelerators/hpu_basic.rst | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/docs/source-pytorch/accelerators/hpu_basic.rst b/docs/source-pytorch/accelerators/hpu_basic.rst index a6c20414a7a02..2f075c94001f6 100644 --- a/docs/source-pytorch/accelerators/hpu_basic.rst +++ b/docs/source-pytorch/accelerators/hpu_basic.rst @@ -47,6 +47,40 @@ It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy` ---- +Scale-out on Gaudis +------------------- + +To train a Lightning model using multiple HPU nodes, set the ``num_nodes`` parameter with the available nodes in the ``Trainer`` class. + +.. code-block:: python + + trainer = Trainer(accelerator="hpu", devices=8, strategy="hpu_parallel", num_nodes=2) + +In addition to this, the following environment variables need to be set to establish communication across nodes. Check out the documentation on :doc:`Cluster Environment <../clouds/cluster>` for more details. + +- *MASTER_PORT* - required; has to be a free port on machine with NODE_RANK 0 +- *MASTER_ADDR* - required (except for NODE_RANK 0); address of NODE_RANK 0 node +- *WORLD_SIZE* - required; how many workers are in the cluster +- *NODE_RANK* - required; id of the node in the cluster + +The trainer needs to be instantiated on every node participating in the training. + +On Node 1: + +.. code-block:: bash + + MASTER_ADDR= MASTER_PORT= NODE_RANK=0 WORLD_SIZE=16 + python -m some_model_trainer.py (--arg1 ... train script args...) + +On Node 2: + +.. code-block:: bash + + MASTER_ADDR= MASTER_PORT= NODE_RANK=1 WORLD_SIZE=16 + python -m some_model_trainer.py (--arg1 ... train script args...) + +---- + Select Gaudis automatically --------------------------- From 1ae14ca7542e83db47888fd0d68324449014dfc0 Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Thu, 25 Aug 2022 19:30:06 +0200 Subject: [PATCH 02/30] [CI] fix horovod tests (#14382) --- .azure/gpu-tests.yml | 11 +++++++---- dockers/base-conda/Dockerfile | 14 ++++++++------ dockers/base-cuda/Dockerfile | 23 +++++++---------------- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f19c5bafc7814..2da30c0dd66ab 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES @@ -72,12 +72,15 @@ jobs: set -e python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" + TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0" - pip install -e .[strategies] - pip install -U deepspeed # TODO: remove when docker images are upgraded - pip install --requirement requirements/pytorch/devel.txt + pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html pip list env: PACKAGE_NAME: pytorch diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index d6bfeee90d561..9bb75e34b8ff6 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -34,6 +34,10 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ + NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ + CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ + MAX_ALLOWED_NCCL=2.11.4 && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ apt-get install -y --no-install-recommends \ build-essential \ cmake \ @@ -42,17 +46,15 @@ RUN \ curl \ unzip \ ca-certificates \ - libopenmpi-dev - -RUN \ + libopenmpi-dev \ + libnccl2=$TO_INSTALL_NCCL \ + libnccl-dev=$TO_INSTALL_NCCL && \ # Install conda and python. # NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ - rm ~/miniconda.sh - -RUN \ + rm ~/miniconda.sh && \ # Cleaning apt-get autoremove -y && \ apt-get clean && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index be613f3b6415f..08692ff00ab78 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -37,7 +37,11 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ - apt-get install -y --no-install-recommends \ + NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ + CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ + MAX_ALLOWED_NCCL=2.11.4 && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ + apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ cmake \ @@ -50,8 +54,8 @@ RUN \ libopenmpi-dev \ openmpi-bin \ ssh \ - && \ - + libnccl2=$TO_INSTALL_NCCL \ + libnccl-dev=$TO_INSTALL_NCCL && \ # Install python add-apt-repository ppa:deadsnakes/ppa && \ apt-get install -y \ @@ -59,10 +63,8 @@ RUN \ python${PYTHON_VERSION}-distutils \ python${PYTHON_VERSION}-dev \ && \ - update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \ - # Cleaning apt-get autoremove -y && \ apt-get clean && \ @@ -78,7 +80,6 @@ RUN \ wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ python${PYTHON_VERSION} get-pip.py && \ rm get-pip.py && \ - pip install -q fire && \ # Disable cache \ CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ @@ -91,16 +92,6 @@ RUN \ pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \ rm assistant.py -RUN \ - apt-get purge -y cmake && \ - wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \ - tar -zxvf cmake-3.20.2.tar.gz && \ - cd cmake-3.20.2 && \ - ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \ - make && \ - make install && \ - cmake --version - ENV \ HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ HOROVOD_GPU_OPERATIONS=NCCL \ From 807435885ea265580fee9f4e69c063eace46def2 Mon Sep 17 00:00:00 2001 From: Tanmoy Date: Fri, 26 Aug 2022 00:27:48 +0530 Subject: [PATCH 03/30] Fix `LightningDataModule` hparams parsing (#12806) Co-authored-by: Akihiro Nitta Co-authored-by: Jirka Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/utilities/parsing.py | 13 ++-- .../tuner/test_scale_batch_size.py | 69 ++++++++++++------- tests/tests_pytorch/utilities/test_parsing.py | 22 ++++-- 4 files changed, 71 insertions(+), 36 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 07c34bbc0e579..642cb28d4db4c 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) + + ## [1.7.2] - 2022-08-17 ### Added diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py index 073423ab60773..22dfb538828ab 100644 --- a/src/pytorch_lightning/utilities/parsing.py +++ b/src/pytorch_lightning/utilities/parsing.py @@ -321,14 +321,17 @@ def _lightning_get_all_attr_holders(model: "pl.LightningModule", attribute: str) holders.append(model) # Check if attribute in model.hparams, either namespace or dict - if hasattr(model, "hparams"): - if attribute in model.hparams: - holders.append(model.hparams) + if hasattr(model, "hparams") and attribute in model.hparams: + holders.append(model.hparams) trainer = model._trainer # Check if the attribute in datamodule (datamodule gets registered in Trainer) - if trainer is not None and trainer.datamodule is not None and hasattr(trainer.datamodule, attribute): - holders.append(trainer.datamodule) + if trainer is not None and trainer.datamodule is not None: + if hasattr(trainer.datamodule, attribute): + holders.append(trainer.datamodule) + + if hasattr(trainer.datamodule, "hparams") and attribute in trainer.datamodule.hparams: + holders.append(trainer.datamodule.hparams) return holders diff --git a/tests/tests_pytorch/tuner/test_scale_batch_size.py b/tests/tests_pytorch/tuner/test_scale_batch_size.py index d2fc8a61e0107..ce7c3613f5012 100644 --- a/tests/tests_pytorch/tuner/test_scale_batch_size.py +++ b/tests/tests_pytorch/tuner/test_scale_batch_size.py @@ -29,8 +29,8 @@ class BatchSizeDataModule(BoringDataModule): - def __init__(self, batch_size): - super().__init__() + def __init__(self, data_dir, batch_size): + super().__init__(data_dir) if batch_size is not None: self.batch_size = batch_size @@ -58,7 +58,7 @@ def test_scale_batch_size_method_with_model_or_datamodule(tmpdir, model_bs, dm_b tuner = Tuner(trainer) model = BatchSizeModel(model_bs) - datamodule = BatchSizeDataModule(dm_bs) if dm_bs != -1 else None + datamodule = BatchSizeDataModule(tmpdir, dm_bs) if dm_bs != -1 else None new_batch_size = tuner.scale_batch_size(model, mode="binsearch", init_val=4, max_trials=2, datamodule=datamodule) assert new_batch_size == 16 @@ -140,47 +140,64 @@ def test_auto_scale_batch_size_trainer_arg(tmpdir, scale_arg): assert not os.path.exists(tmpdir / "scale_batch_size_temp_model.ckpt") -@RunIf(min_cuda_gpus=1) @pytest.mark.parametrize("use_hparams", [True, False]) def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams): - """Test that new batch size gets written to the correct hyperparameter attribute.""" + """Test that new batch size gets written to the correct hyperparameter attribute for model.""" tutils.reset_seed() hparams = {"batch_size": 2} - before_batch_size = hparams.get("batch_size") + before_batch_size = hparams["batch_size"] - class HparamsBatchSizeModel(BatchSizeModel): + class HparamsBatchSizeModel(BoringModel): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__() self.save_hyperparameters() - def dataloader(self, *args, **kwargs): - # artificially set batch_size so we can get a dataloader - # remove it immediately after, because we want only self.hparams.batch_size - setattr(self, "batch_size", before_batch_size) - dataloader = super().dataloader(*args, **kwargs) - del self.batch_size - return 
dataloader + def train_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) + + def val_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) + + model_class = HparamsBatchSizeModel if use_hparams else BatchSizeModel + model = model_class(**hparams) + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True) + trainer.tune(model, scale_batch_size_kwargs={"steps_per_trial": 2, "max_trials": 4}) + after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size + assert before_batch_size != after_batch_size + assert after_batch_size <= len(trainer.train_dataloader.dataset) + + +@pytest.mark.parametrize("use_hparams", [True, False]) +def test_auto_scale_batch_size_set_datamodule_attribute(tmpdir, use_hparams): + """Test that new batch size gets written to the correct hyperparameter attribute for datamodule.""" + tutils.reset_seed() + + hparams = {"batch_size": 2} + before_batch_size = hparams["batch_size"] class HparamsBatchSizeDataModule(BoringDataModule): def __init__(self, data_dir, batch_size): super().__init__(data_dir) - self.batch_size = batch_size + self.save_hyperparameters() def train_dataloader(self): - return DataLoader(self.random_train, batch_size=self.batch_size) + return DataLoader(self.random_train, batch_size=self.hparams.batch_size) - datamodule_fit = HparamsBatchSizeDataModule(data_dir=tmpdir, batch_size=before_batch_size) - model_class = HparamsBatchSizeModel if use_hparams else BatchSizeModel - model = model_class(**hparams) + def val_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True, accelerator="gpu", devices=1) - trainer.tune(model, datamodule_fit) - after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size - assert trainer.datamodule == datamodule_fit - assert before_batch_size != after_batch_size + datamodule_class = HparamsBatchSizeDataModule if use_hparams else BatchSizeDataModule + datamodule = datamodule_class(data_dir=tmpdir, batch_size=before_batch_size) + model = BatchSizeModel(**hparams) + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True) + trainer.tune(model, datamodule=datamodule, scale_batch_size_kwargs={"steps_per_trial": 2, "max_trials": 4}) + after_batch_size = datamodule.hparams.batch_size if use_hparams else datamodule.batch_size + assert trainer.datamodule == datamodule + assert before_batch_size < after_batch_size assert after_batch_size <= len(trainer.train_dataloader.dataset) - assert datamodule_fit.batch_size == after_batch_size def test_auto_scale_batch_size_duplicate_attribute_warning(tmpdir): diff --git a/tests/tests_pytorch/utilities/test_parsing.py b/tests/tests_pytorch/utilities/test_parsing.py index e918c9df2ac32..98b00a374d778 100644 --- a/tests/tests_pytorch/utilities/test_parsing.py +++ b/tests/tests_pytorch/utilities/test_parsing.py @@ -64,8 +64,8 @@ class TestModel4(LightningModule): # fail case batch_size = 1 model4 = TestModel4() - trainer = Trainer() + model4.trainer = trainer datamodule = LightningDataModule() datamodule.batch_size = 8 trainer.datamodule = datamodule @@ -87,12 +87,21 @@ class TestModel7(LightningModule): # test for datamodule w/ hparams w/ attribut model7 = TestModel7() model7.trainer = trainer - return model1, model2, model3, model4, model5, model6, model7 + class 
TestDataModule8(LightningDataModule): # test for hparams dict + hparams = TestHparamsDict2 + + model8 = TestModel1() + trainer = Trainer() + model8.trainer = trainer + datamodule = TestDataModule8() + trainer.datamodule = datamodule + + return model1, model2, model3, model4, model5, model6, model7, model8 def test_lightning_hasattr(): """Test that the lightning_hasattr works in all cases.""" - model1, model2, model3, model4, model5, model6, model7 = models = model_cases() + model1, model2, model3, model4, model5, model6, model7, model8 = models = model_cases() assert lightning_hasattr(model1, "learning_rate"), "lightning_hasattr failed to find namespace variable" assert lightning_hasattr(model2, "learning_rate"), "lightning_hasattr failed to find hparams namespace variable" assert lightning_hasattr(model3, "learning_rate"), "lightning_hasattr failed to find hparams dict variable" @@ -104,6 +113,7 @@ def test_lightning_hasattr(): assert lightning_hasattr( model7, "batch_size" ), "lightning_hasattr failed to find batch_size in hparams w/ datamodule present" + assert lightning_hasattr(model8, "batch_size") for m in models: assert not lightning_hasattr(m, "this_attr_not_exist") @@ -116,10 +126,11 @@ def test_lightning_getattr(): value = lightning_getattr(m, "learning_rate") assert value == i, "attribute not correctly extracted" - model5, model6, model7 = models[4:] + model5, model6, model7, model8 = models[4:] assert lightning_getattr(model5, "batch_size") == 8, "batch_size not correctly extracted" assert lightning_getattr(model6, "batch_size") == 8, "batch_size not correctly extracted" assert lightning_getattr(model7, "batch_size") == 8, "batch_size not correctly extracted" + assert lightning_getattr(model8, "batch_size") == 2, "batch_size not correctly extracted" for m in models: with pytest.raises( @@ -136,13 +147,14 @@ def test_lightning_setattr(tmpdir): lightning_setattr(m, "learning_rate", 10) assert lightning_getattr(m, "learning_rate") == 10, "attribute not correctly set" - model5, model6, model7 = models[4:] + model5, model6, model7, model8 = models[4:] lightning_setattr(model5, "batch_size", 128) lightning_setattr(model6, "batch_size", 128) lightning_setattr(model7, "batch_size", 128) assert lightning_getattr(model5, "batch_size") == 128, "batch_size not correctly set" assert lightning_getattr(model6, "batch_size") == 128, "batch_size not correctly set" assert lightning_getattr(model7, "batch_size") == 128, "batch_size not correctly set" + assert lightning_getattr(model8, "batch_size") == 128, "batch_size not correctly set" for m in models: with pytest.raises( From 33a5ed98794943b7eb6c7fcfa078b184c9d4d736 Mon Sep 17 00:00:00 2001 From: Anner Date: Fri, 26 Aug 2022 06:26:00 +0100 Subject: [PATCH 04/30] Add torch.cuda rng state to seed save/load (#14384) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 4 +++- src/pytorch_lightning/utilities/seed.py | 15 ++++++++++++--- tests/tests_pytorch/utilities/test_seed.py | 20 +++++++++++++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 642cb28d4db4c..ac7e68d177fbe 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -27,7 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Replaced the unwrapping logic in strategies with direct access to unwrapped `LightningModule` ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) -- Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([14023](https://github.com/Lightning-AI/lightning/pull/14023)) +- Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([#14023](https://github.com/Lightning-AI/lightning/pull/14023)) + +- Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py index 8fce6a1debfcf..925337c7845ae 100644 --- a/src/pytorch_lightning/utilities/seed.py +++ b/src/pytorch_lightning/utilities/seed.py @@ -121,13 +121,22 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: def _collect_rng_states() -> Dict[str, Any]: - """Collect the global random state of :mod:`torch`, :mod:`numpy` and Python.""" - return {"torch": torch.get_rng_state(), "numpy": np.random.get_state(), "python": python_get_rng_state()} + """Collect the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python.""" + return { + "torch": torch.get_rng_state(), + "torch.cuda": torch.cuda.get_rng_state_all(), + "numpy": np.random.get_state(), + "python": python_get_rng_state(), + } def _set_rng_states(rng_state_dict: Dict[str, Any]) -> None: - """Set the global random state of :mod:`torch`, :mod:`numpy` and Python in the current process.""" + """Set the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python in the current + process.""" torch.set_rng_state(rng_state_dict["torch"]) + # torch.cuda rng_state is only included since v1.8. + if "torch.cuda" in rng_state_dict: + torch.cuda.set_rng_state_all(rng_state_dict["torch.cuda"]) np.random.set_state(rng_state_dict["numpy"]) version, state, gauss = rng_state_dict["python"] python_set_rng_state((version, tuple(state), gauss)) diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py index 6908badf1a037..c8df824e93b41 100644 --- a/tests/tests_pytorch/utilities/test_seed.py +++ b/tests/tests_pytorch/utilities/test_seed.py @@ -9,7 +9,7 @@ import torch import pytorch_lightning.utilities.seed as seed_utils -from pytorch_lightning.utilities.seed import isolate_rng +from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states, isolate_rng @mock.patch.dict(os.environ, {}, clear=True) @@ -87,6 +87,13 @@ def test_isolate_rng(): generated = [torch.rand(2) for _ in range(3)] assert torch.equal(torch.rand(2), generated[0]) + # torch.cuda + if torch.cuda.is_available(): + torch.cuda.FloatTensor(1).normal_() + with isolate_rng(): + generated = [torch.cuda.FloatTensor(2).normal_() for _ in range(3)] + assert torch.equal(torch.cuda.FloatTensor(2).normal_(), generated[0]) + # numpy np.random.rand(1) with isolate_rng(): @@ -100,6 +107,17 @@ def test_isolate_rng(): assert random.random() == generated[0] +def test_backward_compatibility_rng_states_dict(): + """Test that an older rng_states_dict without the "torch.cuda" key does not crash. + + This test is only relevant when torch.cuda is available. 
+ """ + states = _collect_rng_states() + assert "torch.cuda" in states + states.pop("torch.cuda") + _set_rng_states(states) + + @mock.patch("pytorch_lightning.utilities.seed.log.info") @pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}]) def test_seed_everything_log_info(log_mock: MagicMock, env_vars: Mapping[str, str]): From a01e016fff3c2c9b3d88b3cbace077b7f06bfdbe Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 26 Aug 2022 08:47:37 +0200 Subject: [PATCH 05/30] Remove mps config for test (#14379) * Remove mps config for test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- tests/tests_pytorch/utilities/test_fetching.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/tests_pytorch/utilities/test_fetching.py b/tests/tests_pytorch/utilities/test_fetching.py index 2d5e3954c7061..fa0551b5a07e1 100644 --- a/tests/tests_pytorch/utilities/test_fetching.py +++ b/tests/tests_pytorch/utilities/test_fetching.py @@ -192,9 +192,7 @@ def test_dataloader(self): @pytest.mark.flaky(reruns=3) -@pytest.mark.parametrize( - "accelerator", [pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)), pytest.param("mps", marks=RunIf(mps=True))] -) +@pytest.mark.parametrize("accelerator", [pytest.param("cuda", marks=RunIf(min_cuda_gpus=1))]) def test_trainer_num_prefetch_batches(tmpdir, accelerator): model = RecommenderModel() From 6a999f123c609eb91e65d1ab9a3652ebdd3f71ce Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Fri, 26 Aug 2022 08:27:33 +0100 Subject: [PATCH 06/30] Fix mypy errors attributed to `pytorch_lightning.demos.boring_classes` (#14201) Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: otaj --- pyproject.toml | 1 - src/pytorch_lightning/demos/boring_classes.py | 77 +++++++++++-------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 14e49ffb24b8e..60c031dfe920b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,6 @@ warn_no_return = "False" module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.core.datamodule", - "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", diff --git a/src/pytorch_lightning/demos/boring_classes.py b/src/pytorch_lightning/demos/boring_classes.py index 4fd8fd139e5f4..f7be5390466d8 100644 --- a/src/pytorch_lightning/demos/boring_classes.py +++ b/src/pytorch_lightning/demos/boring_classes.py @@ -11,14 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional +from typing import cast, Dict, Iterator, List, Optional, Tuple, Union import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader, Dataset, IterableDataset, Subset from pytorch_lightning import LightningDataModule, LightningModule +from pytorch_lightning.core.optimizer import LightningOptimizer +from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT class RandomDictDataset(Dataset): @@ -26,7 +31,7 @@ def __init__(self, size: int, length: int): self.len = length self.data = torch.randn(length, size) - def __getitem__(self, index): + def __getitem__(self, index: int) -> Dict[str, Tensor]: a = self.data[index] b = a + 2 return {"a": a, "b": b} @@ -40,7 +45,7 @@ def __init__(self, size: int, length: int): self.len = length self.data = torch.randn(length, size) - def __getitem__(self, index): + def __getitem__(self, index: int) -> Tensor: return self.data[index] def __len__(self) -> int: @@ -52,7 +57,7 @@ def __init__(self, size: int, count: int): self.count = count self.size = size - def __iter__(self): + def __iter__(self) -> Iterator[Tensor]: for _ in range(self.count): yield torch.randn(self.size) @@ -62,16 +67,16 @@ def __init__(self, size: int, count: int): self.count = count self.size = size - def __iter__(self): + def __iter__(self) -> Iterator[Tensor]: for _ in range(len(self)): yield torch.randn(self.size) - def __len__(self): + def __len__(self) -> int: return self.count class BoringModel(LightningModule): - def __init__(self): + def __init__(self) -> None: """Testing PL Module. Use as follows: @@ -90,60 +95,63 @@ def training_step(...): super().__init__() self.layer = torch.nn.Linear(32, 2) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: # type: ignore[override] return self.layer(x) - def loss(self, batch, preds): + def loss(self, batch: Tensor, preds: Tensor) -> Tensor: # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls return torch.nn.functional.mse_loss(preds, torch.ones_like(preds)) - def step(self, x): + def step(self, x: Tensor) -> Tensor: x = self(x) out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) return out - def training_step(self, batch, batch_idx): + def training_step(self, batch: Tensor, batch_idx: int) -> STEP_OUTPUT: # type: ignore[override] output = self(batch) loss = self.loss(batch, output) return {"loss": loss} - def training_step_end(self, training_step_outputs): + def training_step_end(self, training_step_outputs: STEP_OUTPUT) -> STEP_OUTPUT: return training_step_outputs - def training_epoch_end(self, outputs) -> None: + def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: + outputs = cast(List[Dict[str, Tensor]], outputs) torch.stack([x["loss"] for x in outputs]).mean() - def validation_step(self, batch, batch_idx): + def validation_step(self, batch: Tensor, batch_idx: int) -> Optional[STEP_OUTPUT]: # type: ignore[override] output = self(batch) loss = self.loss(batch, output) return {"x": loss} - def validation_epoch_end(self, outputs) -> None: + def validation_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: + outputs = cast(List[Dict[str, Tensor]], outputs) torch.stack([x["x"] for x in outputs]).mean() - def test_step(self, batch, batch_idx): + def test_step(self, batch: Tensor, batch_idx: int) -> Optional[STEP_OUTPUT]: # type: ignore[override] output = 
self(batch) loss = self.loss(batch, output) return {"y": loss} - def test_epoch_end(self, outputs) -> None: + def test_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: + outputs = cast(List[Dict[str, Tensor]], outputs) torch.stack([x["y"] for x in outputs]).mean() - def configure_optimizers(self): + def configure_optimizers(self) -> Tuple[List[torch.optim.Optimizer], List[_LRScheduler]]: optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) return [optimizer], [lr_scheduler] - def train_dataloader(self): + def train_dataloader(self) -> DataLoader: return DataLoader(RandomDataset(32, 64)) - def val_dataloader(self): + def val_dataloader(self) -> DataLoader: return DataLoader(RandomDataset(32, 64)) - def test_dataloader(self): + def test_dataloader(self) -> DataLoader: return DataLoader(RandomDataset(32, 64)) - def predict_dataloader(self): + def predict_dataloader(self) -> DataLoader: return DataLoader(RandomDataset(32, 64)) @@ -155,7 +163,7 @@ def __init__(self, data_dir: str = "./"): self.checkpoint_state: Optional[str] = None self.random_full = RandomDataset(32, 64 * 4) - def setup(self, stage: Optional[str] = None): + def setup(self, stage: Optional[str] = None) -> None: if stage == "fit" or stage is None: self.random_train = Subset(self.random_full, indices=range(64)) @@ -168,26 +176,27 @@ def setup(self, stage: Optional[str] = None): if stage == "predict" or stage is None: self.random_predict = Subset(self.random_full, indices=range(64 * 3, 64 * 4)) - def train_dataloader(self): + def train_dataloader(self) -> DataLoader: return DataLoader(self.random_train) - def val_dataloader(self): + def val_dataloader(self) -> DataLoader: return DataLoader(self.random_val) - def test_dataloader(self): + def test_dataloader(self) -> DataLoader: return DataLoader(self.random_test) - def predict_dataloader(self): + def predict_dataloader(self) -> DataLoader: return DataLoader(self.random_predict) class ManualOptimBoringModel(BoringModel): - def __init__(self): + def __init__(self) -> None: super().__init__() self.automatic_optimization = False - def training_step(self, batch, batch_idx): + def training_step(self, batch: Tensor, batch_idx: int) -> STEP_OUTPUT: # type: ignore[override] opt = self.optimizers() + assert isinstance(opt, (Optimizer, LightningOptimizer)) output = self(batch) loss = self.loss(batch, output) opt.zero_grad() @@ -202,21 +211,21 @@ def __init__(self, out_dim: int = 10, learning_rate: float = 0.02): self.l1 = torch.nn.Linear(32, out_dim) self.learning_rate = learning_rate - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: # type: ignore[override] return torch.relu(self.l1(x.view(x.size(0), -1))) - def training_step(self, batch, batch_nb): + def training_step(self, batch: Tensor, batch_nb: int) -> STEP_OUTPUT: # type: ignore[override] x = batch x = self(x) loss = x.sum() return loss - def configure_optimizers(self): + def configure_optimizers(self) -> torch.optim.Optimizer: return torch.optim.Adam(self.parameters(), lr=self.learning_rate) class Net(nn.Module): - def __init__(self): + def __init__(self) -> None: super().__init__() self.conv1 = nn.Conv2d(1, 32, 3, 1) self.conv2 = nn.Conv2d(32, 64, 3, 1) @@ -225,7 +234,7 @@ def __init__(self): self.fc1 = nn.Linear(9216, 128) self.fc2 = nn.Linear(128, 10) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: x = self.conv1(x) x = F.relu(x) x = self.conv2(x) From e67842dcbab5a98a0c77a181bddcbe9ef18bf887 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 09:58:21 +0200 Subject: [PATCH 07/30] Support sharded optimizer state dumping outside of sharded strategies (#14208) --- src/pytorch_lightning/CHANGELOG.md | 3 ++ src/pytorch_lightning/strategies/sharded.py | 17 +----- .../strategies/sharded_spawn.py | 16 +----- src/pytorch_lightning/strategies/strategy.py | 10 ++++ .../strategies/test_ddp_strategy.py | 53 +++++++++++++++++++ .../strategies/test_sharded_strategy.py | 33 ++++++++++-- 6 files changed, 97 insertions(+), 35 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ac7e68d177fbe..32303d6babb5d 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for passing extra init-parameters to the `LightningDataModule.from_datasets` ([#14185](https://github.com/Lightning-AI/lightning/issues/14185)) +- Added support for saving sharded optimizer state dict outside of `DDPShardedStrategy` ([#14208](https://github.com/PyTorchLightning/pytorch-lightning/pull/14208)) + + ### Changed diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py index ce1e4cd96b961..3b77bc6ceeb70 100644 --- a/src/pytorch_lightning/strategies/sharded.py +++ b/src/pytorch_lightning/strategies/sharded.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Dict, Generator, List, Optional, Tuple, Union +from typing import Dict, Generator, List, Tuple, Union from torch import Tensor from torch.nn import Module @@ -27,7 +27,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.rank_zero import rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel @@ -120,20 +119,6 @@ def _reinit_optimizers_with_oss(self, optimizers: List[Union[Optimizer, Lightnin del optimizer return optimizers - def optimizer_state(self, optimizer: "OSS") -> Optional[dict]: - if isinstance(optimizer, LightningOptimizer): - optimizer = optimizer._optimizer - optimizer.consolidate_state_dict() - return self._optim_state_dict(optimizer) - - @rank_zero_only - def _optim_state_dict(self, optimizer): - """ - Retrieves state dict only on rank 0, which contains the entire optimizer state after calling - :meth:`consolidate_state_dict`. - """ - return optimizer.state_dict() - def pre_backward(self, closure_loss: Tensor) -> None: pass diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py index f19aae7302eea..01ccb75677544 100644 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ b/src/pytorch_lightning/strategies/sharded_spawn.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Tuple +from typing import Dict, Generator, List, Tuple from torch import Tensor from torch.nn import Module @@ -25,7 +25,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.rank_zero import rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel @@ -85,11 +84,6 @@ def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: return self._reinit_optimizers_with_oss(optimizers) - def optimizer_state(self, optimizer: "OSS") -> Dict[str, Any]: - if isinstance(optimizer, OSS): - optimizer.consolidate_state_dict() - return self._optim_state_dict(optimizer) - @contextmanager def block_backward_sync(self) -> Generator: """Blocks syncing gradients behaviour on backwards pass. @@ -103,14 +97,6 @@ def block_backward_sync(self) -> Generator: else: yield None - @rank_zero_only - def _optim_state_dict(self, optimizer: Optimizer) -> Dict[str, Any]: - """ - Retrieves state dict only on rank 0, which contains the entire optimizer state after calling - :meth:`consolidate_state_dict`. - """ - return optimizer.state_dict() - def pre_backward(self, closure_loss: Tensor) -> None: pass diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index c09e7eae8c586..0abc5fe516273 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -170,6 +170,16 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]: Allows for syncing/collating optimizer state from processes in custom plugins. 
""" + if isinstance(optimizer, LightningOptimizer): + optimizer = optimizer._optimizer + + if hasattr(optimizer, "consolidate_state_dict"): + # there are optimizers like Fairscale's OSS or PyTorch's ZeroRedundancyOptimizer that shard their + # states, and to avoid OOM we consolidate the full state on rank 0 only + optimizer.consolidate_state_dict() + return optimizer.state_dict() if self.is_global_zero else {} + + # for optimizers that are not sharded, we return the state dict on all ranks return optimizer.state_dict() def backward( diff --git a/tests/tests_pytorch/strategies/test_ddp_strategy.py b/tests/tests_pytorch/strategies/test_ddp_strategy.py index 8d2d965f1d4c6..318505a984216 100644 --- a/tests/tests_pytorch/strategies/test_ddp_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_strategy.py @@ -24,8 +24,14 @@ from pytorch_lightning.plugins.environments import ClusterEnvironment, LightningEnvironment from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn +from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _TORCH_GREATER_EQUAL_1_10 from tests_pytorch.helpers.runif import RunIf +if _FAIRSCALE_AVAILABLE: + from fairscale.optim import OSS +if _TORCH_GREATER_EQUAL_1_10: + from torch.distributed.optim import ZeroRedundancyOptimizer + class BoringModelGPU(BoringModel): def on_train_start(self) -> None: @@ -252,3 +258,50 @@ def test_ddp_strategy_set_timeout(mock_init_process_group): mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta ) + + +class BoringFairScaleOptimizerModel(BoringModel): + def configure_optimizers(self): + base_optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) + + +@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) +@pytest.mark.parametrize("strategy", (pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn")) +def test_ddp_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy): + """Test to ensure that checkpoint is saved correctly when using faircale optimizer.""" + model = BoringFairScaleOptimizerModel() + trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) + + trainer.fit(model) + + checkpoint_path = os.path.join(tmpdir, "model.pt") + trainer.save_checkpoint(checkpoint_path) + saved_model = BoringModel.load_from_checkpoint(checkpoint_path) + + # Assert model parameters are identical after loading + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) + + +class BoringZeroRedundancyOptimizerModel(BoringModel): + def configure_optimizers(self): + return ZeroRedundancyOptimizer(self.layer.parameters(), optimizer_class=torch.optim.Adam, lr=0.1) + + +@RunIf(min_cuda_gpus=2, skip_windows=True, min_torch="1.10") +@pytest.mark.parametrize("strategy", (pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn")) +def test_ddp_strategy_checkpoint_zero_redundancy_optimizer(tmpdir, strategy): + """Test to ensure that checkpoint is saved correctly when using zero redundancy optimizer.""" + model = BoringZeroRedundancyOptimizerModel() + trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) + + trainer.fit(model) + + checkpoint_path = os.path.join(tmpdir, "model.pt") + trainer.save_checkpoint(checkpoint_path) + saved_model = 
BoringModel.load_from_checkpoint(checkpoint_path) + + # Assert model parameters are identical after loading + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py index a0abfb3f73ec0..acefecbf4d2a7 100644 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py @@ -14,6 +14,7 @@ if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel + from fairscale.optim import OSS @pytest.mark.parametrize("clip_val", [0, 10]) @@ -70,8 +71,8 @@ def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir): saved_model = BoringModel.load_from_checkpoint(checkpoint_path) # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(ddp_param.to("cpu"), shard_param) + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) @RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) @@ -87,8 +88,8 @@ def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir): saved_model = BoringModel.load_from_checkpoint(checkpoint_path) # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(ddp_param.to("cpu"), shard_param) + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) @RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) @@ -314,3 +315,27 @@ def test_block_backward_sync(): def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): trainer = Trainer(strategy=strategy_name) assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs + + +class BoringFairScaleOptimizerModel(BoringModel): + def configure_optimizers(self): + base_optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) + + +@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) +@pytest.mark.parametrize("strategy", (pytest.param("ddp_sharded", marks=RunIf(standalone=True)), "ddp_sharded_spawn")) +def test_ddp_sharded_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy): + """Test to ensure that checkpoint is saved correctly when using fairscale optimizers.""" + model = BoringFairScaleOptimizerModel() + trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) + + trainer.fit(model) + + checkpoint_path = os.path.join(tmpdir, "model.pt") + trainer.save_checkpoint(checkpoint_path) + saved_model = BoringModel.load_from_checkpoint(checkpoint_path) + + # Assert model parameters are identical after loading + for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): + assert torch.equal(trained_param.to("cpu"), loaded_param) From c418828d4150ab6fa8670b43fdda2e7abe5f237b Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Fri, 26 Aug 2022 10:40:41 +0200 Subject: [PATCH 08/30] Remove appropriate line from `pyproject.toml` as a followup to #13929 (#14397) followup to #13929 --- pyproject.toml | 1 - src/pytorch_lightning/demos/mnist_datamodule.py | 3 ++- 2 
files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 60c031dfe920b..18ae22feeb963 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,6 @@ warn_no_return = "False" module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.core.datamodule", - "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.strategies.sharded", diff --git a/src/pytorch_lightning/demos/mnist_datamodule.py b/src/pytorch_lightning/demos/mnist_datamodule.py index 0f5eea99f3e96..6466b78250b3b 100644 --- a/src/pytorch_lightning/demos/mnist_datamodule.py +++ b/src/pytorch_lightning/demos/mnist_datamodule.py @@ -16,7 +16,7 @@ import random import time import urllib -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, Optional, Sized, Tuple, Union from urllib.error import HTTPError from warnings import warn @@ -199,6 +199,7 @@ def setup(self, stage: Optional[str] = None) -> None: """Split the train and valid dataset.""" extra = dict(transform=self.default_transforms) if self.default_transforms else {} dataset: Dataset = MNIST(self.data_dir, train=True, download=False, **extra) + assert isinstance(dataset, Sized) train_length = len(dataset) self.dataset_train, self.dataset_val = random_split(dataset, [train_length - self.val_split, self.val_split]) From 70deac2cd4f23426b3fc4586720b8b0cbdbcb214 Mon Sep 17 00:00:00 2001 From: Christian Schell Date: Fri, 26 Aug 2022 10:42:00 +0200 Subject: [PATCH 09/30] Reset epoch progress with batch size scaler (#13846) Co-authored-by: Christian Schell Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../tuner/batch_size_scaling.py | 19 +++++++++++++++++-- .../tuner/test_scale_batch_size.py | 10 +++++++--- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 32303d6babb5d..8dcd45f58b876 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -87,6 +87,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846) + + - Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) diff --git a/src/pytorch_lightning/tuner/batch_size_scaling.py b/src/pytorch_lightning/tuner/batch_size_scaling.py index 316fc5a2197da..a1f8a2de4b9d8 100644 --- a/src/pytorch_lightning/tuner/batch_size_scaling.py +++ b/src/pytorch_lightning/tuner/batch_size_scaling.py @@ -128,7 +128,10 @@ def _run_power_scaling( """Batch scaling mode where the size is doubled at each iteration until an OOM error is encountered.""" for _ in range(max_trials): garbage_collection_cuda() - trainer.fit_loop.global_step = 0 # reset after each try + + # reset after each try + _reset_progress(trainer) + try: # Try fit trainer.tuner._run(model) @@ -166,7 +169,10 @@ def _run_binsearch_scaling( count = 0 while True: garbage_collection_cuda() - trainer.fit_loop.global_step = 0 # reset after each try + + # reset after each try + _reset_progress(trainer) + try: # Try fit trainer.tuner._run(model) @@ -249,3 +255,12 @@ def _adjust_batch_size( def _is_valid_batch_size(batch_size: int, dataloader: DataLoader, trainer: "pl.Trainer"): module = trainer.lightning_module or trainer.datamodule return not has_len_all_ranks(dataloader, trainer.strategy, module) or batch_size <= len(dataloader) + + +def _reset_progress(trainer: "pl.Trainer") -> None: + if trainer.lightning_module.automatic_optimization: + trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.reset() + else: + trainer.fit_loop.epoch_loop.batch_loop.manual_loop.optim_step_progress.reset() + + trainer.fit_loop.epoch_progress.reset() diff --git a/tests/tests_pytorch/tuner/test_scale_batch_size.py b/tests/tests_pytorch/tuner/test_scale_batch_size.py index ce7c3613f5012..e703b37491d26 100644 --- a/tests/tests_pytorch/tuner/test_scale_batch_size.py +++ b/tests/tests_pytorch/tuner/test_scale_batch_size.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os from copy import deepcopy +from unittest.mock import patch import pytest import torch @@ -308,10 +309,13 @@ def __init__(self): def test_dataloader_reset_with_scale_batch_size(tmpdir, scale_method): """Test that train and val dataloaders are reset at every update in scale batch size.""" model = BatchSizeModel(batch_size=16) - scale_batch_size_kwargs = {"max_trials": 5, "init_val": 4, "mode": scale_method} + max_trials = 5 + scale_batch_size_kwargs = {"max_trials": max_trials, "steps_per_trial": 2, "init_val": 4, "mode": scale_method} - trainer = Trainer(max_epochs=2, auto_scale_batch_size=True) - new_batch_size = trainer.tune(model, scale_batch_size_kwargs=scale_batch_size_kwargs)["scale_batch_size"] + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True) + with patch.object(model, "on_train_epoch_end") as advance_mocked: + new_batch_size = trainer.tune(model, scale_batch_size_kwargs=scale_batch_size_kwargs)["scale_batch_size"] + assert advance_mocked.call_count == max_trials assert trainer.train_dataloader.loaders.batch_size == new_batch_size assert trainer.val_dataloaders[0].batch_size == new_batch_size From 6d00f31f0cac7b650b3ff5e03cc5ecab40e71fc3 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 26 Aug 2022 14:31:48 +0530 Subject: [PATCH 10/30] Add auto wrapping for `DDPFullyShardedNativeStrategy` (#14252) --- .../advanced/model_parallel.rst | 29 +++-- src/pytorch_lightning/CHANGELOG.md | 3 + .../precision/fsdp_native_native_amp.py | 5 +- .../plugins/precision/sharded_native_amp.py | 6 +- .../strategies/fully_sharded_native.py | 55 ++++++++++ .../test_ddp_fully_sharded_native.py | 101 ++++++++++++++---- 6 files changed, 166 insertions(+), 33 deletions(-) diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst index d1db58b3f1869..50ae2cd2827d0 100644 --- a/docs/source-pytorch/advanced/model_parallel.rst +++ b/docs/source-pytorch/advanced/model_parallel.rst @@ -212,14 +212,31 @@ PyTorch Fully Sharded Training ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PyTorch has it's own version of `FSDP `_ which is upstreamed from their `fairscale `__ project. -It was introduced in their `v1.11.0 release `_. The API is pretty similar to that of FairScale. +It was introduced in their `v1.11.0 release `_ but it is recommended to use it with PyTorch v1.12 or more and that's what +Lightning supports. The API is pretty similar to that of FairScale. -.. note:: - Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``. - This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``. - This is a limitation of Fully Sharded Training that will be resolved in the future. -To activate parameter sharding, you must wrap your model using the``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly. +Auto Wrapping +""""""""""""" +Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The +simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't +have to ``wrap`` layers manually as in the case of manual wrapping. + +.. 
code-block:: python + + model = BoringModel() + trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16) + trainer.fit(model) + + +Read more `here `__. + + +Manual Wrapping +""""""""""""""" + +Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate +parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly. When not using Fully Sharded these wrap functions are a no-op. This means once the changes have been made, there is no need to remove the changes for other strategies. diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 8dcd45f58b876..6ca49228cb140 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290)) +- Added support for auto wrapping for `DDPFullyShardedNativeStrategy` ([#14252](https://github.com/Lightning-AI/lightning/issues/14252)) + + - Added support for passing extra init-parameters to the `LightningDataModule.from_datasets` ([#14185](https://github.com/Lightning-AI/lightning/issues/14185)) diff --git a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py index 2201db94586a2..f91144124ae35 100644 --- a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py @@ -25,14 +25,13 @@ from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler else: MixedPrecision = None # type: ignore[misc,assignment] + ShardedGradScaler = None # type: ignore[misc,assignment] class FullyShardedNativeNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): """Native AMP for Fully Sharded Native Training.""" - def __init__( - self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None - ) -> None: + def __init__(self, precision: Union[str, int], device: str, scaler: Optional[ShardedGradScaler] = None) -> None: if not _TORCH_GREATER_EQUAL_1_12: raise MisconfigurationException( "`FullyShardedNativeNativeMixedPrecisionPlugin` is supported from PyTorch v1.12.0 onwards." diff --git a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py index 570e25bd85caa..15c23e18ed6bc 100644 --- a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -13,8 +13,6 @@ # limitations under the License. 
from typing import Optional, Union -import torch - from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE @@ -29,9 +27,7 @@ class ShardedNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): """Native AMP for Sharded Training.""" - def __init__( - self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None - ) -> None: + def __init__(self, precision: Union[str, int], device: str, scaler: Optional[ShardedGradScaler] = None) -> None: if not _FAIRSCALE_AVAILABLE: raise MisconfigurationException( "You have asked for sharded AMP but you have not installed it." diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 4dbf36e4c2861..456ddd36f93e7 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -20,6 +20,7 @@ from torch.distributed.distributed_c10d import _get_default_group, ProcessGroup import pytorch_lightning as pl +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -38,9 +39,11 @@ from pytorch_lightning.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 +from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info from pytorch_lightning.utilities.seed import reset_seed +from pytorch_lightning.utilities.types import STEP_OUTPUT if _TORCH_GREATER_EQUAL_1_12: from torch.distributed.fsdp.fully_sharded_data_parallel import ( @@ -51,6 +54,7 @@ ) from torch.distributed.fsdp.wrap import enable_wrap else: + FullyShardedDataParallel = None # type: ignore[misc,assignment] MixedPrecision = None # type: ignore[misc,assignment] BackwardPrefetch = None # type: ignore[misc,assignment] CPUOffload = None # type: ignore[misc,assignment] @@ -201,6 +205,28 @@ def _configure_launcher(self) -> None: self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes) self._rank_0_will_call_children_scripts = True + def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: + """Wraps the model into a + :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel` module.""" + # If model is already wrapped, we need to avoid sending the `auto_wrap_policy` + assert self.lightning_module is not None + if ( + any(isinstance(mod, FullyShardedDataParallel) for mod in self.lightning_module.modules()) + and "auto_wrap_policy" in self.kwargs + ): + del self.kwargs["auto_wrap_policy"] + + log.detail(f"setting up FSDP model with device id: {self.root_device.index}, kwargs: {self.kwargs}") + return FullyShardedDataParallel( + module=model, + process_group=self.process_group, + cpu_offload=self.cpu_offload, + backward_prefetch=self.backward_prefetch, + mixed_precision=self.mixed_precision_config, + 
device_id=self.root_device.index, + **self.kwargs, + ) + def setup(self, trainer: "pl.Trainer") -> None: assert self.accelerator is not None self.accelerator.setup(trainer) @@ -215,9 +241,20 @@ def setup(self, trainer: "pl.Trainer") -> None: assert self.lightning_module is not None self.lightning_module._device = self.root_device + assert isinstance(self.model, pl.LightningModule) + self.model = _LightningModuleWrapperBase(self.model) + if is_overridden("configure_sharded_model", self.lightning_module): + rank_zero_info( + "You have overridden `LightningModule.configure_sharded_model` hook. It will assume that all the layers" + " are already wrapped for sharding and won't wrap the entire model using `FullyShardedDataParallel`." + ) + else: + self.model = self._setup_model(self.model) self.barrier() + self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device) + self.setup_precision_plugin() def model_to_device(self) -> None: @@ -273,6 +310,24 @@ def reduce( tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op) return tensor + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + # we don't need precision context since casting is done by FSDP + # read `mixed_precision` docstring here: https://pytorch.org/docs/stable/fsdp.html + assert self.model is not None + return self.model(*args, **kwargs) + + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + assert self.model is not None + return self.model(*args, **kwargs) + + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + assert self.model is not None + return self.model(*args, **kwargs) + + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.model is not None + return self.model(*args, **kwargs) + def _determine_device_ids(self) -> List[int]: return [self.root_device.index] diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index ede201da1f68f..be8bced2cbf5f 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -11,7 +11,6 @@ from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 -from pytorch_lightning.utilities.types import STEP_OUTPUT from tests_pytorch.helpers.runif import RunIf if _TORCH_GREATER_EQUAL_1_12: @@ -19,6 +18,15 @@ from torch.distributed.fsdp.wrap import wrap +def custom_auto_wrap_policy( + module, + recurse, + unwrapped_params: int, + min_num_params: int = int(1e8), +) -> bool: + return unwrapped_params >= 2 + + @RunIf(min_torch="1.12") def test_invalid_on_cpu(tmpdir): """Test to ensure that we raise Misconfiguration for Native FSDP on CPU.""" @@ -78,38 +86,73 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: def configure_optimizers(self): return torch.optim.SGD(self.layer.parameters(), lr=0.1) - def on_train_batch_end(self, outputs: STEP_OUTPUT, batch: Any, batch_idx: int) -> None: + def on_train_batch_end(self, outputs, batch, batch_idx) -> None: self._assert_layer_fsdp_instance() - def on_test_batch_end( - self, outputs: Optional[STEP_OUTPUT], batch: Any, batch_idx: int, dataloader_idx: int - ) -> None: + def on_test_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: self._assert_layer_fsdp_instance() - 
def on_validation_batch_end( - self, outputs: Optional[STEP_OUTPUT], batch: Any, batch_idx: int, dataloader_idx: int - ) -> None: + def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: self._assert_layer_fsdp_instance() - def on_predict_batch_end(self, outputs: Optional[Any], batch: Any, batch_idx: int, dataloader_idx: int) -> None: + def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: self._assert_layer_fsdp_instance() def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.layer, FullyShardedDataParallel) assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) - assert isinstance(self.layer.module[0], FullyShardedDataParallel) - assert isinstance(self.layer.module[2], FullyShardedDataParallel) # root should not be resharding assert self.layer.reshard_after_forward is False - # Assert that the nested layers are set reshard_after_forward to True - assert self.layer.module[0].reshard_after_forward is True - assert self.layer.module[2].reshard_after_forward is True precision = torch.float16 if self.precision == 16 else torch.bfloat16 assert self.layer.mixed_precision.param_dtype == precision assert self.layer.mixed_precision.reduce_dtype == precision assert self.layer.mixed_precision.buffer_dtype == precision + for layer_num in [0, 2]: + assert isinstance(self.layer.module[layer_num], FullyShardedDataParallel) + # Assert that the nested layers are set reshard_after_forward to True + assert self.layer.module[layer_num].reshard_after_forward is True + + assert self.layer[layer_num].mixed_precision.param_dtype == precision + assert self.layer[layer_num].mixed_precision.reduce_dtype == precision + assert self.layer[layer_num].mixed_precision.buffer_dtype == precision + + +class TestFSDPModelAutoWrapped(BoringModel): + def __init__(self): + super().__init__() + self.layer = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2)) + + def configure_optimizers(self): + return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1) + + def on_train_batch_end(self, outputs, batch, batch_idx) -> None: + self._assert_layer_fsdp_instance() + + def on_test_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: + self._assert_layer_fsdp_instance() + + def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: + self._assert_layer_fsdp_instance() + + def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None: + self._assert_layer_fsdp_instance() + + def _assert_layer_fsdp_instance(self) -> None: + assert isinstance(self.layer, torch.nn.Sequential) + assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) + + precision = torch.float16 if self.precision == 16 else torch.bfloat16 + for layer_num in [0, 2]: + assert isinstance(self.layer[layer_num], FullyShardedDataParallel) + # Assert that the nested layers are set reshard_after_forward to True + assert self.layer[layer_num].reshard_after_forward + + assert self.layer[layer_num].mixed_precision.param_dtype == precision + assert self.layer[layer_num].mixed_precision.reduce_dtype == precision + assert self.layer[layer_num].mixed_precision.buffer_dtype == precision + @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12") def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): @@ -140,18 +183,32 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir, precision): 
@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12") -def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir): +@pytest.mark.parametrize( + "model, strategy", + [ + (TestFSDPModel(), "fsdp_native"), + (TestFSDPModelAutoWrapped(), DDPFullyShardedNativeStrategy), + ], +) +def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir, model, strategy): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" - model = TestFSDPModel() ck = ModelCheckpoint(save_last=True) + + if not isinstance(strategy, str): + strategy = strategy(auto_wrap_policy=custom_auto_wrap_policy) + trainer = Trainer( default_root_dir=tmpdir, accelerator="gpu", devices=2, - strategy="fsdp_native", + strategy=strategy, precision=16, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + limit_predict_batches=2, callbacks=[ck], ) _run_multiple_stages(trainer, model) @@ -164,14 +221,20 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): trainer.save_checkpoint(model_path, weights_only=True) - _assert_save_equality(trainer, model_path, cls=TestFSDPModel) + _assert_save_equality(trainer, model_path, cls=model.__class__) # Test entry point trainer.test(model) # model is wrapped, will not call `configure_sharded_model` - # provide model path, will create a new unwrapped model and load and then call configure_shared_model to wrap + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap trainer.test(ckpt_path=model_path) + # Predict entry point + trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + trainer.predict(ckpt_path=model_path) + def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel): # Use FullySharded to get the state dict for the sake of comparison From 70fe0ed0413e9aa2daecbbf031d788e8cd310ba8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 26 Aug 2022 11:07:35 +0200 Subject: [PATCH 11/30] CI: Azure CPU pool (#14367) * Azure CPU pool * Fix empty env LAI vars Co-authored-by: manskx --- .azure/app-cloud-e2e.yml | 39 +++++++++++++++------------- src/lightning_app/testing/testing.py | 2 +- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index c7262c2bc5c61..aed2cd07b987b 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -24,8 +24,10 @@ variables: jobs: - job: App_cloud_e2e_testing - pool: - vmImage: 'ubuntu-latest' + pool: azure-cpus + container: + image: mcr.microsoft.com/playwright/python:v1.25.2-focal + options: "--shm-size=2g" timeoutInMinutes: "30" cancelTimeoutInMinutes: "2" strategy: @@ -56,6 +58,7 @@ jobs: clean: all steps: - bash: | + whoami python --version pip --version displayName: 'Info' @@ -80,10 +83,10 @@ jobs: - bash: | python -m pip install playwright - python -m playwright install --with-deps + python -m playwright install # --with-deps displayName: 'Install Playwright system dependencies' - - bash: pip install -e . + - bash: pip install -e . 
--find-links https://download.pytorch.org/whl/cpu/torch_stable.html displayName: 'Install lightning' - bash: | @@ -110,12 +113,12 @@ jobs: TEST_APP_NAME: $(name) HAR_LOCATION: './artifacts/hars' SLOW_MO: '50' - LAI_USER: $(LAI_USER) - LAI_PASS: $(LAI_PASS) - LIGHTNING_USER_ID: $(LIGHTNING_USER_ID) - LIGHTNING_API_KEY: $(LIGHTNING_API_KEY) + # LAI_USER: $(LAI_USER) + # LAI_PASS: $(LAI_PASS) + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) - LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) displayName: 'Run the tests' - publish: '$(Build.ArtifactStagingDirectory)/videos' @@ -125,16 +128,16 @@ jobs: - bash: | time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" env: - LAI_USER: $(LAI_USER) - LAI_PASS: $(LAI_PASS) - LIGHTNING_USER_ID: $(LIGHTNING_USER_ID) - LIGHTNING_API_KEY: $(LIGHTNING_API_KEY) + # LAI_USER: $(LAI_USER) + # LAI_PASS: $(LAI_PASS) + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) - LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) PR_NUMBER: $(local_id) TEST_APP_NAME: $(name) - GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning - GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning - GRID_URL: $(LIGHTNING_CLOUD_URL) - _GRID_USERNAME: $(LIGHTNING_USERNAME) + # GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning + # GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning + # GRID_URL: $(LIGHTNING_CLOUD_URL) + # _GRID_USERNAME: $(LIGHTNING_USERNAME) displayName: 'Clean Previous Apps' diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 884c02a0521c1..1ccc8ba1ff63c 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -224,7 +224,7 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str context = browser.new_context( # Eventually this will need to be deleted http_credentials=HttpCredentials( - {"username": os.getenv("LAI_USER").strip(), "password": os.getenv("LAI_PASS")} + {"username": os.getenv("LAI_USER", "").strip(), "password": os.getenv("LAI_PASS", "")} ), record_video_dir=os.path.join(Config.video_location, TEST_APP_NAME), record_har_path=Config.har_location, From 2112e0e47287cbcf883871f79cfdd5f98c73eb11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 12:24:01 +0200 Subject: [PATCH 12/30] Remove unused utility function `_parse_devices` (#14386) --- src/pytorch_lightning/utilities/apply_func.py | 2 +- .../utilities/device_parser.py | 17 ----------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index 9f85ea52661c7..15e9962e40bff 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -27,7 +27,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY -from pytorch_lightning.utilities.warnings import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation if _TORCHTEXT_LEGACY: if _compare_version("torchtext", operator.ge, "0.9.0"): diff --git 
a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index c76933e489db7..c30692d0ab01a 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -18,7 +18,6 @@ import torch.cuda from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _DEVICE @@ -51,22 +50,6 @@ def determine_root_gpu_device(gpus: List[_DEVICE]) -> Optional[_DEVICE]: return root_gpu -def _parse_devices( - gpus: Optional[Union[List[int], str, int]], - auto_select_gpus: bool, - tpu_cores: Optional[Union[List[int], str, int]], - include_cuda: bool = False, - include_mps: bool = False, -) -> Tuple[Optional[List[int]], Optional[Union[List[int], int]]]: - if auto_select_gpus and isinstance(gpus, int): - gpus = pick_multiple_gpus(gpus) - - # TODO (@seannaren, @kaushikb11): Include IPU parsing logic here - gpu_ids = parse_gpu_ids(gpus, include_cuda=include_cuda, include_mps=include_mps) - tpu_cores = parse_tpu_cores(tpu_cores) - return gpu_ids, tpu_cores - - def parse_gpu_ids( gpus: Optional[Union[int, str, List[int]]], include_cuda: bool = False, From 1cfebc0eae358f79d8f37a6839c061752808c8c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Aug 2022 13:18:11 +0200 Subject: [PATCH 13/30] Add back standalone task azure CI step (#14366) --- .azure/gpu-tests.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 2da30c0dd66ab..3ed4601d1b8e5 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -123,6 +123,15 @@ jobs: timeoutInMinutes: "35" condition: eq(variables['continue'], '1') + - bash: bash run_standalone_tasks.sh + workingDirectory: tests/tests_pytorch + env: + PL_USE_MOCKED_MNIST: "1" + PL_RUN_CUDA_TESTS: "1" + displayName: 'Testing: PyTorch standalone tasks' + timeoutInMinutes: "10" + condition: eq(variables['continue'], '1') + - bash: | python -m coverage report python -m coverage xml From 3201f7ff8b1657e9fc4865026a19052581b27411 Mon Sep 17 00:00:00 2001 From: Nishant Bhansali Date: Fri, 26 Aug 2022 16:50:55 +0530 Subject: [PATCH 14/30] Update docs for `train_bn` in `Basefinetuning.filter_params` (#13938) --- src/pytorch_lightning/callbacks/finetuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/callbacks/finetuning.py b/src/pytorch_lightning/callbacks/finetuning.py index ad45ff8b0591b..d2afdd20bd9e9 100644 --- a/src/pytorch_lightning/callbacks/finetuning.py +++ b/src/pytorch_lightning/callbacks/finetuning.py @@ -141,7 +141,7 @@ def filter_params( Args: modules: A given module or an iterable of modules - train_bn: Whether to train BatchNorm module + train_bn: Whether not to train the BatchNorm module requires_grad: Whether to create a generator for trainable or non-trainable parameters. 
Returns: Generator From e2221a0b3e951f20034e36d66b76a4a375f5f9ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 15:11:24 +0200 Subject: [PATCH 15/30] Raise an error when resuming training with Apex (#14341) --- src/pytorch_lightning/CHANGELOG.md | 5 +++++ .../plugins/precision/apex_amp.py | 9 +++++++- tests/tests_pytorch/models/test_amp.py | 21 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6ca49228cb140..1776b917c761a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -35,10 +35,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([#14023](https://github.com/Lightning-AI/lightning/pull/14023)) + +- When resuming training with Apex enabled, the `Trainer` will now raise an error ([#14341](https://github.com/Lightning-AI/lightning/pull/14341)) + + - Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) + ### Deprecated - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py b/src/pytorch_lightning/plugins/precision/apex_amp.py index cfc13630768db..d85dceb53a069 100644 --- a/src/pytorch_lightning/plugins/precision/apex_amp.py +++ b/src/pytorch_lightning/plugins/precision/apex_amp.py @@ -41,6 +41,7 @@ def __init__(self, amp_level: str = "O2") -> None: super().__init__() self.amp_level = amp_level self._connected = False + self._state_dict_loaded = False def main_params(self, optimizer: Optimizer) -> _PARAMETERS: return amp.master_params(optimizer) @@ -83,6 +84,11 @@ def optimizer_step( closure: Callable[[], Any], **kwargs: Any, ) -> Any: + if self._state_dict_loaded: + raise RuntimeError( + "Resuming training with APEX is currently not supported. Set `amp_backend=None` for example or use a" + " different precision plugin." + ) if isinstance(optimizer, LBFGS): raise MisconfigurationException( f"apex AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})." 
@@ -99,4 +105,5 @@ def state_dict(self) -> Dict[str, Any]: return amp.state_dict() def load_state_dict(self, state_dict: Dict[str, Any]) -> None: - amp.load_state_dict(state_dict) + self._state_dict_loaded = True + return super().load_state_dict(state_dict) diff --git a/tests/tests_pytorch/models/test_amp.py b/tests/tests_pytorch/models/test_amp.py index 786de99f59714..822d2e0401619 100644 --- a/tests/tests_pytorch/models/test_amp.py +++ b/tests/tests_pytorch/models/test_amp.py @@ -211,3 +211,24 @@ def configure_optimizers(self): assert isinstance(trainer.lr_scheduler_configs[0].scheduler.optimizer, optim.Adam) assert isinstance(trainer.lr_scheduler_configs[1].scheduler.optimizer, optim.SGD) + + +@RunIf(min_cuda_gpus=1, amp_apex=True) +def test_amp_with_apex_reload(tmpdir): + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=1, + limit_test_batches=1, + precision=16, + amp_backend="apex", + accelerator="gpu", + devices=1, + ) + trainer.fit(model) + trainer.fit_loop.max_steps = 2 + + with pytest.raises(RuntimeError, match="Resuming training with APEX is currently not supported."): + trainer.fit(model, ckpt_path=trainer.checkpoint_callback.best_model_path) + + trainer.test(model, ckpt_path="best") From 94e567e6f0dbe3e06feebc8fdf4bdc807182fa96 Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Fri, 26 Aug 2022 09:28:27 -0400 Subject: [PATCH 16/30] Fix mypy errors attributed to `pytorch_lightning.trainer.connectors.data_connector.py` (#13806) Co-authored-by: rohitgr7 Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- pyproject.toml | 1 - src/pytorch_lightning/core/datamodule.py | 3 +- src/pytorch_lightning/core/module.py | 3 +- .../trainer/configuration_validator.py | 6 ++- .../trainer/connectors/data_connector.py | 50 ++++++++++--------- src/pytorch_lightning/trainer/trainer.py | 2 +- .../trainer/flags/test_overfit_batches.py | 2 +- 7 files changed, 37 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 18ae22feeb963..cb5274d5774d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,6 @@ module = [ "pytorch_lightning.profilers.pytorch", "pytorch_lightning.strategies.sharded", "pytorch_lightning.trainer.callback_hook", - "pytorch_lightning.trainer.connectors.data_connector", "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", "pytorch_lightning.tuner.batch_size_scaling", diff --git a/src/pytorch_lightning/core/datamodule.py b/src/pytorch_lightning/core/datamodule.py index 4edde3fe6a3ae..e730aabd8c1f9 100644 --- a/src/pytorch_lightning/core/datamodule.py +++ b/src/pytorch_lightning/core/datamodule.py @@ -18,6 +18,7 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset +import pytorch_lightning as pl from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.saving import _load_from_checkpoint @@ -62,7 +63,7 @@ def teardown(self): def __init__(self) -> None: super().__init__() # Pointer to the trainer object - self.trainer = None + self.trainer: Optional["pl.Trainer"] = None @classmethod def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs) -> ArgumentParser: diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 0926cc52ecd57..a479beadc7931 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -105,7 +105,7 @@ def __init__(self, 
*args: Any, **kwargs: Any) -> None: self._use_amp: bool = False # the precision used - self.precision: int = 32 + self.precision: Union[int, str] = 32 # optionally can be set by user self._example_input_array = None @@ -294,6 +294,7 @@ def loggers(self) -> List[Logger]: def _call_batch_hook(self, hook_name: str, *args: Any) -> Any: if self._trainer: datahook_selector = self._trainer._data_connector._datahook_selector + assert datahook_selector is not None obj = datahook_selector.get_instance(hook_name) if isinstance(obj, self.__class__): trainer_method = self._trainer._call_lightning_module_hook diff --git a/src/pytorch_lightning/trainer/configuration_validator.py b/src/pytorch_lightning/trainer/configuration_validator.py index 74c477b245708..9d277f5ac45c9 100644 --- a/src/pytorch_lightning/trainer/configuration_validator.py +++ b/src/pytorch_lightning/trainer/configuration_validator.py @@ -46,7 +46,7 @@ def verify_loop_configurations(trainer: "pl.Trainer") -> None: elif trainer.state.fn == TrainerFn.PREDICTING: __verify_eval_loop_configuration(trainer, model, "predict") - __verify_batch_transfer_support(trainer, model) + __verify_batch_transfer_support(trainer) _check_deprecated_callback_hooks(trainer) # TODO: Delete _check_on_hpc_hooks in v1.8 _check_on_hpc_hooks(model) @@ -149,10 +149,12 @@ def __verify_eval_loop_configuration(trainer: "pl.Trainer", model: "pl.Lightning raise MisconfigurationException(f"No `{step_name}()` method defined to run `Trainer.{trainer_method}`.") -def __verify_batch_transfer_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: +def __verify_batch_transfer_support(trainer: "pl.Trainer") -> None: """Raise Misconfiguration exception since these hooks are not supported in DP mode.""" batch_transfer_hooks = ("transfer_batch_to_device", "on_after_batch_transfer") datahook_selector = trainer._data_connector._datahook_selector + assert datahook_selector is not None + for hook in batch_transfer_hooks: # TODO: Remove this blocker once batch transfer to device is integrated in Lightning for DP mode. 
if isinstance(trainer.strategy, DataParallelStrategy) and ( diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index dae4e4a3527df..d8665843945f7 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -14,7 +14,7 @@ import multiprocessing import os from dataclasses import dataclass, field -from typing import Any, Collection, List, Optional, Tuple, Union +from typing import Any, Iterable, List, Optional, Tuple, Union from weakref import proxy from torch.utils.data import BatchSampler, DataLoader, Sampler, SequentialSampler @@ -55,7 +55,7 @@ def __init__(self, trainer: "pl.Trainer", multiple_trainloader_mode: str = "max_ self._test_dataloader_source = _DataLoaderSource(None, "") self._predict_dataloader_source = _DataLoaderSource(None, "") - self._datahook_selector = _DataHookSelector(None, None) + self._datahook_selector: Optional[_DataHookSelector] = None @property def _should_reload_train_dl(self) -> bool: @@ -230,7 +230,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None: category=PossibleUserWarning, ) - def _requires_distributed_sampler(self, dataloader) -> bool: + def _requires_distributed_sampler(self, dataloader: DataLoader) -> bool: return ( self.trainer._accelerator_connector.replace_sampler_ddp and self.trainer._accelerator_connector.is_distributed @@ -292,14 +292,18 @@ def _prepare_dataloader( return dataloader - def _resolve_sampler(self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None) -> Sampler: + def _resolve_sampler( + self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None + ) -> Union[Sampler, Iterable]: if self._requires_distributed_sampler(dataloader): + distributed_sampler_kwargs = self.trainer.distributed_sampler_kwargs + assert distributed_sampler_kwargs is not None sampler = self._get_distributed_sampler( dataloader, shuffle, mode=mode, overfit_batches=self.trainer.overfit_batches, - **self.trainer.distributed_sampler_kwargs, + **distributed_sampler_kwargs, ) # update docs too once this is resolved @@ -357,7 +361,7 @@ def _reset_eval_dataloader( dataloaders = self._resolve_overfit_batches(dataloaders, mode) if not isinstance(dataloaders, list): - dataloaders = [dataloaders] + dataloaders = [dataloaders] # type: ignore[assignment] if any(dl is None for dl in dataloaders): rank_zero_warn("One of given dataloaders is None and it will be skipped.") @@ -426,7 +430,7 @@ def _reset_eval_dataloader( return loader_num_batches, dataloaders - def _request_dataloader(self, stage: RunningStage) -> Union[DataLoader, List[DataLoader]]: + def _request_dataloader(self, stage: RunningStage) -> TRAIN_DATALOADERS: """Requests a dataloader from the given model by calling dataloader hooks corresponding to the given stage. 
Returns: @@ -447,10 +451,12 @@ def _request_dataloader(self, stage: RunningStage) -> Union[DataLoader, List[Dat return dataloader @staticmethod - def _resolve_overfit_batches(dataloaders: Collection[DataLoader], mode: RunningStage) -> Collection[DataLoader]: + def _resolve_overfit_batches( + dataloaders: Union[TRAIN_DATALOADERS, EVAL_DATALOADERS], mode: RunningStage + ) -> Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]: all_have_sequential_sampler = True - def resolve_has_no_sequential_sampler(dataloader: DataLoader): + def resolve_has_no_sequential_sampler(dataloader: DataLoader) -> None: nonlocal all_have_sequential_sampler all_have_sequential_sampler = all_have_sequential_sampler & isinstance( dataloader.sampler, SequentialSampler @@ -460,19 +466,23 @@ def resolve_has_no_sequential_sampler(dataloader: DataLoader): if not all_have_sequential_sampler: rank_zero_warn( - "You requested to overfit but enabled training dataloader shuffling." + f"You requested to overfit but enabled {mode.dataloader_prefix} dataloader shuffling." f" We are turning off the {mode.dataloader_prefix} dataloader shuffling for you." ) def replace_sampler(dataloader: DataLoader) -> DataLoader: - return _update_dataloader(dataloader, sampler=SequentialSampler(dataloader.dataset), mode=mode) + return _update_dataloader( + dataloader, + sampler=SequentialSampler(dataloader.dataset), # type: ignore[arg-type] + mode=mode, + ) dataloaders = apply_to_collection(dataloaders, DataLoader, replace_sampler) return dataloaders @staticmethod - def _check_eval_shuffling(dataloader, mode): + def _check_eval_shuffling(dataloader: DataLoader, mode: RunningStage) -> None: # limit this warning only for samplers assigned automatically when shuffle is set if _is_dataloader_shuffled(dataloader): rank_zero_warn( @@ -506,18 +516,14 @@ def dataloader(self) -> Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]: If the source is a module, the method with the corresponding :attr:`name` gets called. """ - from pytorch_lightning import LightningDataModule, LightningModule # prevent cyclic import - - if not self.name: - return self.instance - - if isinstance(self.instance, LightningModule): + if isinstance(self.instance, pl.LightningModule): return self.instance.trainer._call_lightning_module_hook(self.name, pl_module=self.instance) - if isinstance(self.instance, LightningDataModule): + if isinstance(self.instance, pl.LightningDataModule): method = getattr(self.instance, self.name) return method() + assert self.instance is not None return self.instance def is_defined(self) -> bool: @@ -532,9 +538,7 @@ def is_module(self) -> bool: It does not check whether ``*_dataloader`` methods are actually overridden. """ - from pytorch_lightning import LightningDataModule, LightningModule # prevent cyclic import - - return isinstance(self.instance, (LightningModule, LightningDataModule)) + return isinstance(self.instance, (pl.LightningModule, pl.LightningDataModule)) @dataclass @@ -553,7 +557,7 @@ class _DataHookSelector: model: "pl.LightningModule" datamodule: Optional["pl.LightningDataModule"] - _valid_hooks: Tuple[str] = field( + _valid_hooks: Tuple[str, ...] 
= field( default=("on_before_batch_transfer", "transfer_batch_to_device", "on_after_batch_transfer") ) diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 8bee0ac6dfb7f..d5bcec7db85d6 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2234,7 +2234,7 @@ def is_global_zero(self) -> bool: return self.strategy.is_global_zero @property - def distributed_sampler_kwargs(self) -> Optional[dict]: + def distributed_sampler_kwargs(self) -> Optional[Dict[str, Any]]: if isinstance(self.strategy, ParallelStrategy): return self.strategy.distributed_sampler_kwargs diff --git a/tests/tests_pytorch/trainer/flags/test_overfit_batches.py b/tests/tests_pytorch/trainer/flags/test_overfit_batches.py index da3e154349e1b..dc73e76cc391b 100644 --- a/tests/tests_pytorch/trainer/flags/test_overfit_batches.py +++ b/tests/tests_pytorch/trainer/flags/test_overfit_batches.py @@ -66,7 +66,7 @@ def val_dataloader(self): model = TestModel() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, overfit_batches=2) - with pytest.warns(UserWarning, match="requested to overfit but enabled training dataloader shuffling"): + with pytest.warns(UserWarning, match="requested to overfit but enabled train dataloader shuffling"): trainer.fit(model) assert isinstance(trainer.train_dataloader.loaders.sampler, SequentialSampler) From 0102d0d4d48e18c364008137491e7d9697022745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Fri, 26 Aug 2022 17:19:08 +0200 Subject: [PATCH 17/30] Fix restoring trainer after `lr_find()` (#14113) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 9 ++++++--- src/pytorch_lightning/tuner/lr_finder.py | 2 ++ tests/tests_pytorch/helpers/utils.py | 5 +++++ tests/tests_pytorch/tuner/test_lr_finder.py | 8 +++++--- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 1776b917c761a..78b32a389f418 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -95,6 +95,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) + + +- Fixed restoring the trainer after using `lr_find()` so that the correct LR schedule is used for the actual training ([#14113](https://github.com/Lightning-AI/lightning/pull/14113)) + + - Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846) @@ -130,9 +136,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) -- Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) - - ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/tuner/lr_finder.py b/src/pytorch_lightning/tuner/lr_finder.py index 186dfb5ea7416..45286612231c2 100644 --- a/src/pytorch_lightning/tuner/lr_finder.py +++ b/src/pytorch_lightning/tuner/lr_finder.py @@ -278,6 +278,7 @@ def __lr_finder_dump_params(trainer: "pl.Trainer") -> Dict[str, Any]: "callbacks": trainer.callbacks, "logger": trainer.logger, "max_steps": trainer.fit_loop.max_steps, + "setup_optimizers": trainer.strategy.setup_optimizers, } @@ -297,6 +298,7 @@ def __lr_finder_restore_params(trainer: "pl.Trainer", params: Dict[str, Any]) -> trainer.callbacks = params["callbacks"] trainer.logger = params["logger"] trainer.fit_loop.max_steps = params["max_steps"] + trainer.strategy.setup_optimizers = params["setup_optimizers"] # type: ignore[assignment] class _LRCallback(Callback): diff --git a/tests/tests_pytorch/helpers/utils.py b/tests/tests_pytorch/helpers/utils.py index a9efd7f178f2b..54503bf75b0f7 100644 --- a/tests/tests_pytorch/helpers/utils.py +++ b/tests/tests_pytorch/helpers/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import functools import os import re from contextlib import contextmanager @@ -78,6 +79,10 @@ def init_checkpoint_callback(logger): return checkpoint +def getattr_recursive(obj, attr): + return functools.reduce(getattr, [obj] + attr.split(".")) + + @contextmanager def no_warning_call(expected_warning: Type[Warning] = UserWarning, match: Optional[str] = None): with pytest.warns(None) as record: diff --git a/tests/tests_pytorch/tuner/test_lr_finder.py b/tests/tests_pytorch/tuner/test_lr_finder.py index 9be115d2f8fda..cf0776d755083 100644 --- a/tests/tests_pytorch/tuner/test_lr_finder.py +++ b/tests/tests_pytorch/tuner/test_lr_finder.py @@ -24,7 +24,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.simple_models import ClassificationModel -from tests_pytorch.helpers.utils import no_warning_call +from tests_pytorch.helpers.utils import getattr_recursive, no_warning_call def test_error_on_more_than_1_optimizer(tmpdir): @@ -88,13 +88,15 @@ def test_trainer_reset_correctly(tmpdir): "logger", "global_step", "max_steps", + "fit_loop.max_steps", + "strategy.setup_optimizers", ] - expected = {ca: getattr(trainer, ca) for ca in changed_attributes} + expected = {ca: getattr_recursive(trainer, ca) for ca in changed_attributes} with no_warning_call(UserWarning, match="Please add the following callbacks"): trainer.tuner.lr_find(model, num_training=5) - actual = {ca: getattr(trainer, ca) for ca in changed_attributes} + actual = {ca: getattr_recursive(trainer, ca) for ca in changed_attributes} assert actual == expected assert model.trainer == trainer From fafd2546782c8d2c249c9d7546f74b416e803af7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 17:41:38 +0200 Subject: [PATCH 18/30] Fix device parser logic to avoid creating CUDA context (#14319) * 
let environment disable forking * add helper function and error messages * tests * changelog Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../strategies/launchers/multiprocessing.py | 9 +++++++++ .../trainer/connectors/accelerator_connector.py | 5 +++++ src/pytorch_lightning/utilities/device_parser.py | 5 +++-- .../strategies/launchers/test_multiprocessing.py | 10 ++++++++++ .../trainer/connectors/test_accelerator_connector.py | 9 +++++++++ 6 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 78b32a389f418..6c2e54197fa7d 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for saving sharded optimizer state dict outside of `DDPShardedStrategy` ([#14208](https://github.com/PyTorchLightning/pytorch-lightning/pull/14208)) +- Added an environment variable `PL_DISABLE_FORK` that can be used to disable all forking in the Trainer ([#14319](https://github.com/Lightning-AI/lightning/issues/14319)) + + ### Changed diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 2617e5fe27b10..cee3ed5ee7b2a 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -66,6 +66,10 @@ def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork", "f f"The start method '{self._start_method}' is not available on this platform. Available methods are:" f" {', '.join(mp.get_all_start_methods())}" ) + if start_method in ("fork", "forkserver") and _is_forking_disabled(): + raise ValueError( + "Forking is disabled in this environment by `PL_DISABLE_FORKING=1`. Choose a different start method." + ) @property def is_interactive_compatible(self) -> bool: @@ -281,3 +285,8 @@ def restore(self) -> None: torch.use_deterministic_algorithms(self.use_deterministic_algorithms) torch.backends.cudnn.benchmark = self.cudnn_benchmark _set_rng_states(self.rng_states) + + +def _is_forking_disabled() -> bool: + """Returns whether forking is disabled through the environment variable ``PL_DISABLE_FORK``.""" + return bool(int(os.environ.get("PL_DISABLE_FORK", "0"))) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 54636f6d617ac..e9183f7f52cf3 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -73,6 +73,7 @@ TPUSpawnStrategy, ) from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES +from pytorch_lightning.strategies.launchers.multiprocessing import _is_forking_disabled from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import ( _StrategyType, @@ -642,6 +643,10 @@ def _check_strategy_and_fallback(self) -> None: f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this" f" platform. We recommed `Trainer(strategy='ddp_spawn')` instead." ) + if strategy_flag in _DDP_FORK_ALIASES and _is_forking_disabled(): + raise ValueError( + "Forking is disabled in this environment by `PL_DISABLE_FORKING=1`. 
Choose a different strategy." + ) if strategy_flag: self._strategy_flag = strategy_flag diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index c30692d0ab01a..1b9f43137943f 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -18,6 +18,7 @@ import torch.cuda from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.strategies.launchers.multiprocessing import _is_forking_disabled from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _DEVICE @@ -323,7 +324,7 @@ def num_cuda_devices() -> int: Unlike :func:`torch.cuda.device_count`, this function will do its best not to create a CUDA context for fork support, if the platform allows it. """ - if "fork" not in torch.multiprocessing.get_all_start_methods(): + if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): return torch.cuda.device_count() with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.device_count) @@ -335,7 +336,7 @@ def is_cuda_available() -> bool: Unlike :func:`torch.cuda.is_available`, this function will do its best not to create a CUDA context for fork support, if the platform allows it. """ - if "fork" not in torch.multiprocessing.get_all_start_methods(): + if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): return torch.cuda.is_available() with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.is_available) diff --git a/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py b/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py index ad3e891ad607f..4859ddba60574 100644 --- a/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py +++ b/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os from unittest import mock from unittest.mock import ANY, Mock @@ -18,6 +19,7 @@ import torch from pytorch_lightning.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher +from tests_pytorch.helpers.runif import RunIf @mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp.get_all_start_methods", return_value=[]) @@ -26,6 +28,14 @@ def test_multiprocessing_launcher_forking_on_unsupported_platform(_): _MultiProcessingLauncher(strategy=Mock(), start_method="fork") +@RunIf(skip_windows=True) +@pytest.mark.parametrize("start_method", ["fork", "forkserver"]) +@mock.patch.dict(os.environ, {"PL_DISABLE_FORK": "1"}, clear=True) +def test_multiprocessing_launcher_disabled_forking(start_method): + with pytest.raises(ValueError, match="Forking is disabled in this environment"): + _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) + + @pytest.mark.parametrize("start_method", ["spawn", "fork"]) @mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp") def test_multiprocessing_launcher_start_method(mp_mock, start_method): diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 1296d962eea2c..434607cab78a2 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -810,3 +810,12 @@ def test_accelerator_specific_checkpoint_io(*_): def test_ddp_fork_on_unsupported_platform(_, strategy): with pytest.raises(ValueError, match="process forking is not supported on this platform"): Trainer(strategy=strategy) + + +@RunIf(skip_windows=True) +@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES) +@mock.patch.dict(os.environ, {"PL_DISABLE_FORK": "1"}, clear=True) +def test_strategy_choice_ddp_spawn_in_interactive_when_fork_disabled(strategy): + """Test there is an error when forking is disabled via the environment variable and the user requests fork.""" + with pytest.raises(ValueError, match="Forking is disabled in this environment"): + Trainer(devices=2, strategy=strategy) From ed84d04bcf959b6ca8c47e53a0145f6f568920c2 Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Fri, 26 Aug 2022 12:26:26 -0400 Subject: [PATCH 19/30] Fix mypy errors attributed to `pytorch_lightning.core.datamodule` (#13693) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: rohitgr7 Co-authored-by: otaj --- pyproject.toml | 1 - src/pytorch_lightning/core/datamodule.py | 66 +++++++++++++------ src/pytorch_lightning/core/saving.py | 13 ++-- src/pytorch_lightning/utilities/argparse.py | 39 +++++------ src/pytorch_lightning/utilities/types.py | 2 + .../tests_pytorch/utilities/test_argparse.py | 2 +- 6 files changed, 74 insertions(+), 49 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cb5274d5774d9..399e1e83efb3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,6 @@ warn_no_return = "False" # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",' module = [ "pytorch_lightning.callbacks.progress.rich_progress", - "pytorch_lightning.core.datamodule", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.strategies.sharded", diff --git 
a/src/pytorch_lightning/core/datamodule.py b/src/pytorch_lightning/core/datamodule.py index e730aabd8c1f9..b7a25badf420f 100644 --- a/src/pytorch_lightning/core/datamodule.py +++ b/src/pytorch_lightning/core/datamodule.py @@ -22,8 +22,13 @@ from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.saving import _load_from_checkpoint -from pytorch_lightning.utilities.argparse import add_argparse_args, from_argparse_args, get_init_arguments_and_types -from pytorch_lightning.utilities.types import _PATH +from pytorch_lightning.utilities.argparse import ( + add_argparse_args, + from_argparse_args, + get_init_arguments_and_types, + parse_argparser, +) +from pytorch_lightning.utilities.types import _ADD_ARGPARSE_RETURN, _PATH, EVAL_DATALOADERS, TRAIN_DATALOADERS class LightningDataModule(CheckpointHooks, DataHooks, HyperparametersMixin): @@ -55,7 +60,7 @@ def teardown(self): # called on every process in DDP """ - name: str = ... + name: Optional[str] = None CHECKPOINT_HYPER_PARAMS_KEY = "datamodule_hyper_parameters" CHECKPOINT_HYPER_PARAMS_NAME = "datamodule_hparams_name" CHECKPOINT_HYPER_PARAMS_TYPE = "datamodule_hparams_type" @@ -66,7 +71,7 @@ def __init__(self) -> None: self.trainer: Optional["pl.Trainer"] = None @classmethod - def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs) -> ArgumentParser: + def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs: Any) -> _ADD_ARGPARSE_RETURN: """Extends existing argparse by default `LightningDataModule` attributes. Example:: @@ -77,7 +82,9 @@ def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs) -> ArgumentP return add_argparse_args(cls, parent_parser, **kwargs) @classmethod - def from_argparse_args(cls, args: Union[Namespace, ArgumentParser], **kwargs): + def from_argparse_args( + cls, args: Union[Namespace, ArgumentParser], **kwargs: Any + ) -> Union["pl.LightningDataModule", "pl.Trainer"]: """Create an instance from CLI arguments. Args: @@ -92,6 +99,10 @@ def from_argparse_args(cls, args: Union[Namespace, ArgumentParser], **kwargs): """ return from_argparse_args(cls, args, **kwargs) + @classmethod + def parse_argparser(cls, arg_parser: Union[ArgumentParser, Namespace]) -> Namespace: + return parse_argparser(cls, arg_parser) + @classmethod def get_init_arguments_and_types(cls) -> List[Tuple[str, Tuple, Any]]: r"""Scans the DataModule signature and returns argument names, types and default values. @@ -102,6 +113,15 @@ def get_init_arguments_and_types(cls) -> List[Tuple[str, Tuple, Any]]: """ return get_init_arguments_and_types(cls) + @classmethod + def get_deprecated_arg_names(cls) -> List: + """Returns a list with deprecated DataModule arguments.""" + depr_arg_names: List[str] = [] + for name, val in cls.__dict__.items(): + if name.startswith("DEPRECATED") and isinstance(val, (tuple, list)): + depr_arg_names.extend(val) + return depr_arg_names + @classmethod def from_datasets( cls, @@ -112,7 +132,7 @@ def from_datasets( batch_size: int = 1, num_workers: int = 0, **datamodule_kwargs: Any, - ): + ) -> "LightningDataModule": r""" Create an instance from torch.utils.data.Dataset. 
@@ -133,24 +153,32 @@ def dataloader(ds: Dataset, shuffle: bool = False) -> DataLoader: shuffle &= not isinstance(ds, IterableDataset) return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True) - def train_dataloader(): + def train_dataloader() -> TRAIN_DATALOADERS: + assert train_dataset + if isinstance(train_dataset, Mapping): return {key: dataloader(ds, shuffle=True) for key, ds in train_dataset.items()} if isinstance(train_dataset, Sequence): return [dataloader(ds, shuffle=True) for ds in train_dataset] return dataloader(train_dataset, shuffle=True) - def val_dataloader(): + def val_dataloader() -> EVAL_DATALOADERS: + assert val_dataset + if isinstance(val_dataset, Sequence): return [dataloader(ds) for ds in val_dataset] return dataloader(val_dataset) - def test_dataloader(): + def test_dataloader() -> EVAL_DATALOADERS: + assert test_dataset + if isinstance(test_dataset, Sequence): return [dataloader(ds) for ds in test_dataset] return dataloader(test_dataset) - def predict_dataloader(): + def predict_dataloader() -> EVAL_DATALOADERS: + assert predict_dataset + if isinstance(predict_dataset, Sequence): return [dataloader(ds) for ds in predict_dataset] return dataloader(predict_dataset) @@ -161,19 +189,19 @@ def predict_dataloader(): if accepts_kwargs: special_kwargs = candidate_kwargs else: - accepted_params = set(accepted_params) - accepted_params.discard("self") - special_kwargs = {k: v for k, v in candidate_kwargs.items() if k in accepted_params} + accepted_param_names = set(accepted_params) + accepted_param_names.discard("self") + special_kwargs = {k: v for k, v in candidate_kwargs.items() if k in accepted_param_names} datamodule = cls(**datamodule_kwargs, **special_kwargs) if train_dataset is not None: - datamodule.train_dataloader = train_dataloader + datamodule.train_dataloader = train_dataloader # type: ignore[assignment] if val_dataset is not None: - datamodule.val_dataloader = val_dataloader + datamodule.val_dataloader = val_dataloader # type: ignore[assignment] if test_dataset is not None: - datamodule.test_dataloader = test_dataloader + datamodule.test_dataloader = test_dataloader # type: ignore[assignment] if predict_dataset is not None: - datamodule.predict_dataloader = predict_dataloader + datamodule.predict_dataloader = predict_dataloader # type: ignore[assignment] return datamodule def state_dict(self) -> Dict[str, Any]: @@ -197,8 +225,8 @@ def load_from_checkpoint( cls, checkpoint_path: Union[_PATH, IO], hparams_file: Optional[_PATH] = None, - **kwargs, - ): + **kwargs: Any, + ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: r""" Primary way of loading a datamodule from a checkpoint. When Lightning saves a checkpoint it stores the arguments passed to ``__init__`` in the checkpoint under ``"datamodule_hyper_parameters"``. 
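For context on the ``LightningDataModule.from_datasets`` classmethod touched in the hunk above, here is a minimal usage sketch. It is not part of the patch; the dataset names and sizes are illustrative, and it assumes plain in-memory ``TensorDataset`` inputs.

.. code-block:: python

    import torch
    from torch.utils.data import TensorDataset

    from pytorch_lightning import LightningDataModule

    # two small in-memory datasets stand in for real data
    train_ds = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))
    val_ds = TensorDataset(torch.randn(16, 32), torch.randint(0, 2, (16,)))

    # builds a datamodule whose train/val dataloader hooks are the closures defined above
    dm = LightningDataModule.from_datasets(train_ds, val_dataset=val_ds, batch_size=8, num_workers=0)

    # the returned loader shuffles map-style train datasets (shuffle is dropped for IterableDataset)
    train_loader = dm.train_dataloader()
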
diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index ffdc0988a1a6e..2df7a661c2bd6 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -171,10 +171,10 @@ def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None: def _load_from_checkpoint( cls: Union[Type["ModelIO"], Type["pl.LightningModule"], Type["pl.LightningDataModule"]], - checkpoint_path: Union[str, IO], + checkpoint_path: Union[_PATH, IO], map_location: _MAP_LOCATION_TYPE = None, - hparams_file: Optional[str] = None, - strict: bool = True, + hparams_file: Optional[_PATH] = None, + strict: Optional[bool] = None, **kwargs: Any, ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: if map_location is None: @@ -183,7 +183,7 @@ def _load_from_checkpoint( checkpoint = pl_load(checkpoint_path, map_location=map_location) if hparams_file is not None: - extension = hparams_file.split(".")[-1] + extension = str(hparams_file).split(".")[-1] if extension.lower() == "csv": hparams = load_hparams_from_tags_csv(hparams_file) elif extension.lower() in ("yml", "yaml"): @@ -201,8 +201,6 @@ def _load_from_checkpoint( if issubclass(cls, pl.LightningDataModule): return _load_state(cls, checkpoint, **kwargs) - # allow cls to be evaluated as subclassed LightningModule or, - # as LightningModule for internal tests if issubclass(cls, pl.LightningModule): return _load_state(cls, checkpoint, strict=strict, **kwargs) @@ -210,7 +208,7 @@ def _load_from_checkpoint( def _load_state( cls: Union[Type["pl.LightningModule"], Type["pl.LightningDataModule"]], checkpoint: Dict[str, Any], - strict: bool = True, + strict: Optional[bool] = None, **cls_kwargs_new: Any, ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: cls_spec = inspect.getfullargspec(cls.__init__) @@ -257,6 +255,7 @@ def _load_state( return obj # load the state_dict on the model automatically + assert strict is not None keys = obj.load_state_dict(checkpoint["state_dict"], strict=strict) if not strict: diff --git a/src/pytorch_lightning/utilities/argparse.py b/src/pytorch_lightning/utilities/argparse.py index 58ced375fcae5..26277db183410 100644 --- a/src/pytorch_lightning/utilities/argparse.py +++ b/src/pytorch_lightning/utilities/argparse.py @@ -15,7 +15,6 @@ import inspect import os -from abc import ABC from argparse import _ArgumentGroup, ArgumentParser, Namespace from ast import literal_eval from contextlib import suppress @@ -24,22 +23,17 @@ import pytorch_lightning as pl from pytorch_lightning.utilities.parsing import str_to_bool, str_to_bool_or_int, str_to_bool_or_str +from pytorch_lightning.utilities.types import _ADD_ARGPARSE_RETURN _T = TypeVar("_T", bound=Callable[..., Any]) - - -class ParseArgparserDataType(ABC): - def __init__(self, *_: Any, **__: Any) -> None: - pass - - @classmethod - def parse_argparser(cls, args: "ArgumentParser") -> Any: - pass +_ARGPARSE_CLS = Union[Type["pl.LightningDataModule"], Type["pl.Trainer"]] def from_argparse_args( - cls: Type[ParseArgparserDataType], args: Union[Namespace, ArgumentParser], **kwargs: Any -) -> ParseArgparserDataType: + cls: _ARGPARSE_CLS, + args: Union[Namespace, ArgumentParser], + **kwargs: Any, +) -> Union["pl.LightningDataModule", "pl.Trainer"]: """Create an instance from CLI arguments. Eventually use variables from OS environment which are defined as ``"PL__"``. 
@@ -72,7 +66,7 @@ def from_argparse_args( return cls(**trainer_kwargs) -def parse_argparser(cls: Type["pl.Trainer"], arg_parser: Union[ArgumentParser, Namespace]) -> Namespace: +def parse_argparser(cls: _ARGPARSE_CLS, arg_parser: Union[ArgumentParser, Namespace]) -> Namespace: """Parse CLI arguments, required for custom bool types.""" args = arg_parser.parse_args() if isinstance(arg_parser, ArgumentParser) else arg_parser @@ -97,7 +91,7 @@ def parse_argparser(cls: Type["pl.Trainer"], arg_parser: Union[ArgumentParser, N return Namespace(**modified_args) -def parse_env_variables(cls: Type["pl.Trainer"], template: str = "PL_%(cls_name)s_%(cls_argument)s") -> Namespace: +def parse_env_variables(cls: _ARGPARSE_CLS, template: str = "PL_%(cls_name)s_%(cls_argument)s") -> Namespace: """Parse environment arguments if they are defined. Examples: @@ -127,7 +121,7 @@ def parse_env_variables(cls: Type["pl.Trainer"], template: str = "PL_%(cls_name) return Namespace(**env_args) -def get_init_arguments_and_types(cls: Any) -> List[Tuple[str, Tuple, Any]]: +def get_init_arguments_and_types(cls: _ARGPARSE_CLS) -> List[Tuple[str, Tuple, Any]]: r"""Scans the class signature and returns argument names, types and default values. Returns: @@ -155,7 +149,7 @@ def get_init_arguments_and_types(cls: Any) -> List[Tuple[str, Tuple, Any]]: return name_type_default -def _get_abbrev_qualified_cls_name(cls: Any) -> str: +def _get_abbrev_qualified_cls_name(cls: _ARGPARSE_CLS) -> str: assert isinstance(cls, type), repr(cls) if cls.__module__.startswith("pytorch_lightning."): # Abbreviate. @@ -165,8 +159,11 @@ def _get_abbrev_qualified_cls_name(cls: Any) -> str: def add_argparse_args( - cls: Type["pl.Trainer"], parent_parser: ArgumentParser, *, use_argument_group: bool = True -) -> Union[_ArgumentGroup, ArgumentParser]: + cls: _ARGPARSE_CLS, + parent_parser: ArgumentParser, + *, + use_argument_group: bool = True, +) -> _ADD_ARGPARSE_RETURN: r"""Extends existing argparse by default attributes for ``cls``. Args: @@ -207,10 +204,10 @@ def add_argparse_args( >>> args = parser.parse_args([]) """ if isinstance(parent_parser, _ArgumentGroup): - raise RuntimeError("Please only pass an ArgumentParser instance.") + raise RuntimeError("Please only pass an `ArgumentParser` instance.") if use_argument_group: group_name = _get_abbrev_qualified_cls_name(cls) - parser: Union[_ArgumentGroup, ArgumentParser] = parent_parser.add_argument_group(group_name) + parser: _ADD_ARGPARSE_RETURN = parent_parser.add_argument_group(group_name) else: parser = ArgumentParser(parents=[parent_parser], add_help=False) @@ -222,7 +219,7 @@ def add_argparse_args( # Get symbols from cls or init function. 
for symbol in (cls, cls.__init__): - args_and_types = get_init_arguments_and_types(symbol) + args_and_types = get_init_arguments_and_types(symbol) # type: ignore[arg-type] args_and_types = [x for x in args_and_types if x[0] not in ignore_arg_names] if len(args_and_types) > 0: break diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index 9f2db6422612f..7ab3d6948854c 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py @@ -16,6 +16,7 @@ - Do not include any `_TYPE` suffix - Types used in public hooks (as those in the `LightningModule` and `Callback`) should be public (no leading `_`) """ +from argparse import _ArgumentGroup, ArgumentParser from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path @@ -50,6 +51,7 @@ EVAL_DATALOADERS = Union[DataLoader, Sequence[DataLoader]] _DEVICE = Union[torch.device, str, int] _MAP_LOCATION_TYPE = Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] +_ADD_ARGPARSE_RETURN = Union[_ArgumentGroup, ArgumentParser] @runtime_checkable diff --git a/tests/tests_pytorch/utilities/test_argparse.py b/tests/tests_pytorch/utilities/test_argparse.py index 1e83d96a0648e..2a88e8db531f9 100644 --- a/tests/tests_pytorch/utilities/test_argparse.py +++ b/tests/tests_pytorch/utilities/test_argparse.py @@ -207,7 +207,7 @@ def test_add_argparse_args(cls, name): def test_negative_add_argparse_args(): - with pytest.raises(RuntimeError, match="Please only pass an ArgumentParser instance."): + with pytest.raises(RuntimeError, match="Please only pass an `ArgumentParser` instance."): parser = ArgumentParser() add_argparse_args(AddArgparseArgsExampleClass, parser.add_argument_group("bad workflow")) From d4bcafad7a64d7c39598fa7e4e33b81a1be31828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Aug 2022 18:56:56 +0200 Subject: [PATCH 20/30] Remove the deprecated loop output format (#14373) --- src/pytorch_lightning/CHANGELOG.md | 6 + .../loops/epoch/training_epoch_loop.py | 39 +----- src/pytorch_lightning/loops/utilities.py | 9 +- .../deprecated_api/test_remove_1-8.py | 50 -------- .../loops/epoch/test_training_epoch_loop.py | 115 +++--------------- tests/tests_pytorch/loops/test_utilities.py | 26 +--- 6 files changed, 27 insertions(+), 218 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6c2e54197fa7d..4b5e3a893429e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -81,6 +81,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed the deprecated `DistributedType` and `DeviceType` enum classes ([#14045](https://github.com/Lightning-AI/lightning/pull/14045)) +- Remove the deprecated `on_train_batch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#14373](https://github.com/PyTorchLightning/pytorch-lightning/pull/14373)) + + +- Remove the deprecated `training_epoch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#14373](https://github.com/PyTorchLightning/pytorch-lightning/pull/14373)) + + - Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index 6b3bc38400803..0fe93ca7104b6 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -22,7 +22,7 @@ from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.loops.batch.training_batch_loop import _OUTPUTS_TYPE as _BATCH_OUTPUTS_TYPE -from pytorch_lightning.loops.utilities import _get_active_optimizers, _is_max_limit_reached, _v1_8_output_format +from pytorch_lightning.loops.utilities import _get_active_optimizers, _is_max_limit_reached from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection from pytorch_lightning.trainer.progress import BatchProgress, SchedulerProgress from pytorch_lightning.trainer.supporters import CombinedLoader @@ -31,7 +31,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.fetching import AbstractDataFetcher, DataLoaderIterDataFetcher from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature from pytorch_lightning.utilities.warnings import WarningCache @@ -342,24 +342,6 @@ def _prepare_outputs_training_batch_end( ) array = np.array(batch_output, dtype=object) - # TODO: remove in v1.8 - if ( - num_optimizers > 1 - and lightning_module.truncated_bptt_steps > 0 - and is_overridden("on_train_batch_end", lightning_module) - and not _v1_8_output_format(lightning_module.on_train_batch_end) - ): - rank_zero_deprecation( - "You are training with multiple optimizers AND truncated backpropagation through time enabled." - " The current format of the `on_train_batch_end(outputs, ...)` is a 2d list with sizes" - " (n_optimizers, tbptt_steps), however, this has been deprecated and will change in version v1.8 to" - " (tbptt_steps, n_optimizers). You can update your code by adding the following parameter to your" - " hook signature: `on_train_batch_end(outputs, ..., new_format=True)`." 
- ) - # (tbptt_steps, n_opt) -> (n_opt, tbptt_steps) - if array.ndim == 1: - array = np.expand_dims(array, 1) - array = array.transpose((1, 0)) # squeeze all single-element dimensions array = array.squeeze() array = array.tolist() @@ -384,23 +366,6 @@ def _prepare_outputs_training_epoch_end( ) array = _recursive_pad(batch_outputs) - # TODO: remove in v1.8 - if ( - num_optimizers > 1 - and lightning_module.truncated_bptt_steps > 0 - and not _v1_8_output_format(lightning_module.on_train_epoch_end) - ): - rank_zero_deprecation( - "You are training with multiple optimizers AND truncated backpropagation through time enabled." - " The current format of the `training_epoch_end(outputs)` is a 3d list with sizes" - " (n_optimizers, n_batches, tbptt_steps), however, this has been deprecated and will change in version" - " v1.8 to (n_batches, tbptt_steps, n_optimizers). You can update your code by adding the following" - " parameter to your hook signature: `training_epoch_end(outputs, new_format=True)`." - ) - # (n_batches, tbptt_steps, n_opt) -> (n_opt, n_batches, tbptt_steps) - if array.ndim == 2: - array = np.expand_dims(array, 2) - array = array.transpose((2, 0, 1)) # squeeze all single-element dimensions array = array.squeeze() array = array.tolist() diff --git a/src/pytorch_lightning/loops/utilities.py b/src/pytorch_lightning/loops/utilities.py index 491af6605c135..9b8ec84ba3661 100644 --- a/src/pytorch_lightning/loops/utilities.py +++ b/src/pytorch_lightning/loops/utilities.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import inspect from collections import OrderedDict from contextlib import contextmanager from functools import lru_cache -from typing import Any, Callable, Generator, List, Optional, Sequence, Tuple +from typing import Any, Generator, List, Optional, Sequence, Tuple import numpy as np import torch @@ -216,12 +215,6 @@ def _reset_progress(loop: Loop) -> None: _reset_progress(v) -# TODO: remove in v1.8 -def _v1_8_output_format(fx: Callable) -> bool: - parameters = inspect.signature(fx).parameters - return "new_format" in parameters and parameters["new_format"].default is True - - def _set_sampler_epoch(dataloader: DataLoader, epoch: int) -> None: """Calls the ``set_epoch`` method on either the sampler or the batch sampler of the given dataloader. 
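The helper removed above was the last piece of the transitional "old format" path. As a rough, hypothetical sketch (plain Python, not part of this patch), the only layout hooks receive from now on when several optimizers and truncated BPTT are combined is ``(n_batches, tbptt_steps, n_optimizers)``, as described in the deleted deprecation message:

.. code-block:: python

    # Illustrative only: nested per-batch / per-TBPTT-split / per-optimizer outputs.
    outputs = [
        [[{"loss": 0.0}, {"loss": 0.1}], [{"loss": 0.2}, {"loss": 0.3}]],  # batch 0
        [[{"loss": 0.4}, {"loss": 0.5}], [{"loss": 0.6}, {"loss": 0.7}]],  # batch 1
    ]

    def training_epoch_end(outputs):
        # stand-in for the `LightningModule.training_epoch_end` hook; no `new_format` flag needed
        # index as outputs[batch_idx][split_idx][optimizer_idx]
        losses = [split[0]["loss"] for batch in outputs for split in batch]
        return sum(losses) / len(losses)

    print(training_epoch_end(outputs))  # 0.3
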
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index a69071fd67610..e1fb0b57654c0 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -38,7 +38,6 @@ from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn -from tests_pytorch.deprecated_api import no_deprecated_call from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator @@ -584,55 +583,6 @@ def test_v1_8_0_weights_save_path(tmpdir): _ = trainer.weights_save_path -def test_deprecated_epoch_outputs_format(tmpdir): - class DeprecationModel(BoringModel): - def __init__(self): - super().__init__() - self.truncated_bptt_steps = 1 - - def training_step(self, batch, batch_idx, optimizer_idx, hiddens): - output = super().training_step(batch, batch_idx) - output["hiddens"] = hiddens - return output - - def tbptt_split_batch(self, batch, split_size): - return [batch, batch] - - def training_epoch_end(self, outputs): - ... - - def on_train_batch_end(self, outputs, batch, batch_idx) -> None: - ... - - def configure_optimizers(self): - return [torch.optim.Adam(self.parameters()), torch.optim.Adam(self.parameters())] - - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) - model = DeprecationModel() - batch_match = r"on_train_batch_end.*will change in version v1.8 to \(tbptt_steps, n_optimizers\)" - with pytest.deprecated_call(match=batch_match): - trainer.fit(model) - - class DeprecationModel2(DeprecationModel): - def on_train_batch_end(self, *args, new_format=True): - ... - - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) - model = DeprecationModel() - epoch_match = r"training_epoch_end.*will change in version v1.8 to \(n_batches, tbptt_steps, n_optimizers\)" - with pytest.deprecated_call(match=epoch_match): - trainer.fit(model) - - class NoDeprecationModel(DeprecationModel2): - def training_epoch_end(self, outputs, new_format=True): - ... - - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) - model = NoDeprecationModel() - with no_deprecated_call(match="will change in version v1.8.*new_format=True"): - trainer.fit(model) - - @pytest.mark.flaky(reruns=3) @pytest.mark.parametrize(["action", "expected"], [("a", [3, 1]), ("b", [2]), ("c", [1])]) def test_simple_profiler_iterable_durations(tmpdir, action: str, expected: list): diff --git a/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py b/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py index d53871116e8b7..d601542075004 100644 --- a/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py +++ b/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest import mock from unittest.mock import patch import pytest @@ -20,7 +19,6 @@ from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loops import TrainingEpochLoop from pytorch_lightning.trainer.trainer import Trainer -from tests_pytorch.deprecated_api import no_deprecated_call _out00 = {"loss": 0.0} _out01 = {"loss": 0.1} @@ -33,43 +31,33 @@ class TestPrepareOutputs: - def prepare_outputs(self, fn, tbptt_splits, new_format, batch_outputs, num_optimizers, automatic_optimization): + def prepare_outputs(self, fn, tbptt_splits, batch_outputs, num_optimizers, automatic_optimization): lightning_module = LightningModule() - lightning_module.on_train_batch_end = lambda *_: None # override to trigger the deprecation message lightning_module.automatic_optimization = automatic_optimization lightning_module.truncated_bptt_steps = tbptt_splits - match = "will change in version v1.8.*new_format=True" - will_warn = tbptt_splits and num_optimizers > 1 and not new_format - ctx_manager = pytest.deprecated_call if will_warn else no_deprecated_call - with ctx_manager(match=match): - with mock.patch( - "pytorch_lightning.loops.epoch.training_epoch_loop._v1_8_output_format", return_value=new_format - ): - return fn( - batch_outputs, - lightning_module=lightning_module, - num_optimizers=num_optimizers, # does not matter for manual optimization - ) + return fn( + batch_outputs, + lightning_module=lightning_module, + num_optimizers=num_optimizers, # does not matter for manual optimization + ) def prepare_outputs_training_epoch_end( - self, tbptt_splits, new_format, batch_outputs, num_optimizers, automatic_optimization=True + self, tbptt_splits, batch_outputs, num_optimizers, automatic_optimization=True ): return self.prepare_outputs( TrainingEpochLoop._prepare_outputs_training_epoch_end, tbptt_splits, - new_format, batch_outputs, num_optimizers, automatic_optimization=automatic_optimization, ) def prepare_outputs_training_batch_end( - self, tbptt_splits, new_format, batch_outputs, num_optimizers, automatic_optimization=True + self, tbptt_splits, batch_outputs, num_optimizers, automatic_optimization=True ): return self.prepare_outputs( TrainingEpochLoop._prepare_outputs_training_batch_end, tbptt_splits, - new_format, batch_outputs, num_optimizers, automatic_optimization=automatic_optimization, @@ -97,53 +85,19 @@ def prepare_outputs_training_batch_end( ), # 1 batch, tbptt with 2 splits (uneven) (1, 2, [[{0: _out00}, {0: _out01}], [{0: _out03}]], [[_out00, _out01], [_out03]]), - ], - ) - @pytest.mark.parametrize("new_format", (False, True)) - def test_prepare_outputs_training_epoch_end_automatic( - self, num_optimizers, tbptt_splits, batch_outputs, expected, new_format - ): - """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook - currently expects in the case of automatic optimization.""" - assert ( - self.prepare_outputs_training_epoch_end(tbptt_splits, new_format, batch_outputs, num_optimizers) == expected - ) - - @pytest.mark.parametrize( - "num_optimizers,tbptt_splits,batch_outputs,expected", - [ - # 3 batches, tbptt with 2 splits, 2 optimizers alternating - ( - 2, - 2, - [[{0: _out00}, {0: _out01}], [{1: _out10}, {1: _out11}], [{0: _out02}, {0: _out03}]], - [[[_out00, _out01], [], [_out02, _out03]], [[], [_out10, _out11], []]], - ) - ], - ) - def test_prepare_outputs_training_epoch_end_automatic_old_format( - self, num_optimizers, tbptt_splits, batch_outputs, expected - ): - assert 
self.prepare_outputs_training_epoch_end(tbptt_splits, False, batch_outputs, num_optimizers) == expected - - @pytest.mark.parametrize( - "num_optimizers,tbptt_splits,batch_outputs,expected", - [ # 3 batches, tbptt with 2 splits, 2 optimizers alternating ( 2, 2, [[{0: _out00}, {0: _out01}], [{1: _out10}, {1: _out11}], [{0: _out02}, {0: _out03}]], [[[_out00], [_out01]], [[_out10], [_out11]], [[_out02], [_out03]]], - ) + ), ], ) - def test_prepare_outputs_training_epoch_end_automatic_new_format( - self, num_optimizers, tbptt_splits, batch_outputs, expected - ): + def test_prepare_outputs_training_epoch_end_automatic(self, num_optimizers, tbptt_splits, batch_outputs, expected): """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook currently expects in the case of automatic optimization.""" - assert self.prepare_outputs_training_epoch_end(tbptt_splits, True, batch_outputs, num_optimizers) == expected + assert self.prepare_outputs_training_epoch_end(tbptt_splits, batch_outputs, num_optimizers) == expected @pytest.mark.parametrize( "batch_outputs,expected", @@ -160,14 +114,10 @@ def test_prepare_outputs_training_epoch_end_automatic_new_format( ([[_out00, _out01], [_out02, _out03], [], [_out10]], [[_out00, _out01], [_out02, _out03], [_out10]]), ], ) - @pytest.mark.parametrize("new_format", (False, True)) - def test_prepare_outputs_training_epoch_end_manual(self, batch_outputs, expected, new_format): + def test_prepare_outputs_training_epoch_end_manual(self, batch_outputs, expected): """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook currently expects in the case of manual optimization.""" - assert ( - self.prepare_outputs_training_epoch_end(0, new_format, batch_outputs, -1, automatic_optimization=False) - == expected - ) + assert self.prepare_outputs_training_epoch_end(0, batch_outputs, -1, automatic_optimization=False) == expected @pytest.mark.parametrize( "num_optimizers,tbptt_splits,batch_end_outputs,expected", @@ -180,47 +130,17 @@ def test_prepare_outputs_training_epoch_end_manual(self, batch_outputs, expected (2, 0, [{0: _out00, 1: _out01}], [_out00, _out01]), # tbptt with 2 splits (1, 2, [{0: _out00}, {0: _out01}], [_out00, _out01]), + # 2 optimizers, tbptt with 2 splits + (2, 2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out01], [_out10, _out11]]), ], ) - @pytest.mark.parametrize("new_format", (False, True)) def test_prepare_outputs_training_batch_end_automatic( - self, num_optimizers, tbptt_splits, batch_end_outputs, expected, new_format - ): - """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook - currently expects in the case of automatic optimization.""" - - assert ( - self.prepare_outputs_training_batch_end(tbptt_splits, new_format, batch_end_outputs, num_optimizers) - == expected - ) - - @pytest.mark.parametrize( - "num_optimizers,tbptt_splits,batch_end_outputs,expected", - # 2 optimizers, tbptt with 2 splits - [(2, 2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out10], [_out01, _out11]])], - ) - def test_prepare_outputs_training_batch_end_automatic_old_format( self, num_optimizers, tbptt_splits, batch_end_outputs, expected ): """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook currently expects in the case of automatic optimization.""" - assert ( - self.prepare_outputs_training_batch_end(tbptt_splits, False, batch_end_outputs, 
num_optimizers) == expected - ) - @pytest.mark.parametrize( - "num_optimizers,tbptt_splits,batch_end_outputs,expected", - # 2 optimizers, tbptt with 2 splits - [(2, 2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out01], [_out10, _out11]])], - ) - def test_prepare_outputs_training_batch_end_automatic_new_format( - self, num_optimizers, tbptt_splits, batch_end_outputs, expected - ): - """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook - currently expects in the case of automatic optimization.""" - assert ( - self.prepare_outputs_training_batch_end(tbptt_splits, True, batch_end_outputs, num_optimizers) == expected - ) + assert self.prepare_outputs_training_batch_end(tbptt_splits, batch_end_outputs, num_optimizers) == expected @pytest.mark.parametrize( "batch_end_outputs,expected", @@ -237,8 +157,7 @@ def test_prepare_outputs_training_batch_end_manual(self, batch_end_outputs, expe """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook currently expects in the case of manual optimization.""" assert ( - self.prepare_outputs_training_batch_end(0, False, batch_end_outputs, -1, automatic_optimization=False) - == expected + self.prepare_outputs_training_batch_end(0, batch_end_outputs, -1, automatic_optimization=False) == expected ) diff --git a/tests/tests_pytorch/loops/test_utilities.py b/tests/tests_pytorch/loops/test_utilities.py index 914c1de8e115b..2bd86d325806d 100644 --- a/tests/tests_pytorch/loops/test_utilities.py +++ b/tests/tests_pytorch/loops/test_utilities.py @@ -16,7 +16,7 @@ import pytest import torch -from pytorch_lightning.loops.utilities import _extract_hiddens, _set_sampler_epoch, _v1_8_output_format +from pytorch_lightning.loops.utilities import _extract_hiddens, _set_sampler_epoch from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -41,30 +41,6 @@ def test_extract_hiddens(): _extract_hiddens(None, 1) -def test_v1_8_output_format(): - # old format - def training_epoch_end(outputs): - ... - - assert not _v1_8_output_format(training_epoch_end) - - def training_epoch_end(outputs, new_format=1): - ... - - assert not _v1_8_output_format(training_epoch_end) - - def training_epoch_end(outputs, new_format=False): - ... - - assert not _v1_8_output_format(training_epoch_end) - - # new format - def training_epoch_end(outputs, new_format=True): - ... 
- - assert _v1_8_output_format(training_epoch_end) - - def test_set_sampler_epoch(): # No samplers dataloader = Mock() From cced33542db5896afc0e8b3e3c9263dba44deb4f Mon Sep 17 00:00:00 2001 From: Jordi Smit Date: Fri, 26 Aug 2022 19:17:20 +0200 Subject: [PATCH 21/30] Disable non blocking to device with MPS (#14368) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * disable non-blocking for mps due to race condition bug * fixed typo * fixed: unknown mps device for non arm systems * Removed unrobust test case * moved _MPS_DEVICES such that we used in apply_func * Resolve circular dependencies * Comment rewording * changed torchElasticEnvironment to a global import * simplified if statement to blocking device type * Added change to CHANGELOG * Update src/pytorch_lightning/utilities/apply_func.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed mypy not detecting casting of device * Moved check into if statement to mainain original behavior Co-authored-by: Carlos Mocholí Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/accelerators/cpu.py | 6 +++--- src/pytorch_lightning/accelerators/hpu.py | 5 +++-- src/pytorch_lightning/accelerators/ipu.py | 2 +- .../plugins/environments/xla_environment.py | 2 +- src/pytorch_lightning/utilities/apply_func.py | 8 ++++++-- src/pytorch_lightning/utilities/device_parser.py | 1 + 7 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4b5e3a893429e..4a746ac91b911 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -104,6 +104,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed incorrect values after transferring data to a MPS device ([#13285](https://github.com/Lightning-AI/lightning/issues/13285)) + + - Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) diff --git a/src/pytorch_lightning/accelerators/cpu.py b/src/pytorch_lightning/accelerators/cpu.py index fea8ee70d17df..d0981e7269305 100644 --- a/src/pytorch_lightning/accelerators/cpu.py +++ b/src/pytorch_lightning/accelerators/cpu.py @@ -16,7 +16,7 @@ import torch from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import device_parser +from pytorch_lightning.utilities.device_parser import parse_cpu_cores from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE from pytorch_lightning.utilities.types import _DEVICE @@ -42,13 +42,13 @@ def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> int: """Accelerator device parsing logic.""" - devices = device_parser.parse_cpu_cores(devices) + devices = parse_cpu_cores(devices) return devices @staticmethod def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" - devices = device_parser.parse_cpu_cores(devices) + devices = parse_cpu_cores(devices) return [torch.device("cpu")] * devices @staticmethod diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py index 8fc242fa55f20..c85e81756c2a9 100644 --- a/src/pytorch_lightning/accelerators/hpu.py +++ b/src/pytorch_lightning/accelerators/hpu.py @@ -17,8 +17,9 @@ import torch from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import _HPU_AVAILABLE, device_parser +from pytorch_lightning.utilities.device_parser import parse_hpus from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _HPU_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_debug if _HPU_AVAILABLE: @@ -61,7 +62,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]: """Accelerator device parsing logic.""" - return device_parser.parse_hpus(devices) + return parse_hpus(devices) @staticmethod def get_parallel_devices(devices: int) -> List[torch.device]: diff --git a/src/pytorch_lightning/accelerators/ipu.py b/src/pytorch_lightning/accelerators/ipu.py index b5110e58028a5..b09fd33c29227 100644 --- a/src/pytorch_lightning/accelerators/ipu.py +++ b/src/pytorch_lightning/accelerators/ipu.py @@ -16,7 +16,7 @@ import torch from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import _IPU_AVAILABLE +from pytorch_lightning.utilities.imports import _IPU_AVAILABLE class IPUAccelerator(Accelerator): diff --git a/src/pytorch_lightning/plugins/environments/xla_environment.py b/src/pytorch_lightning/plugins/environments/xla_environment.py index a78ebeb36a6a4..4072f6f8715f5 100644 --- a/src/pytorch_lightning/plugins/environments/xla_environment.py +++ b/src/pytorch_lightning/plugins/environments/xla_environment.py @@ 
-15,7 +15,7 @@ import os from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.utilities import _TPU_AVAILABLE +from pytorch_lightning.utilities.imports import _TPU_AVAILABLE if _TPU_AVAILABLE: import torch_xla.core.xla_env_vars as xenv diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index 15e9962e40bff..fe8b0b836456f 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -38,7 +38,7 @@ Batch = type(None) -_CPU_DEVICES = ("cpu", torch.device("cpu")) +_BLOCKING_DEVICE_TYPES = ("cpu", "mps") def to_dtype_tensor( @@ -322,6 +322,9 @@ def move_data_to_device(batch: Any, device: Union[str, torch.device]) -> Any: - :class:`torch.device` """ + if isinstance(device, str): + device = torch.device(device) + def batch_to(data: Any) -> Any: # try to move torchtext data first if _TORCHTEXT_LEGACY and isinstance(data, Batch): @@ -342,7 +345,8 @@ def batch_to(data: Any) -> Any: kwargs = {} # Don't issue non-blocking transfers to CPU - if isinstance(data, Tensor) and device not in _CPU_DEVICES: + # Same with MPS due to a race condition bug: https://github.com/pytorch/pytorch/issues/83015 + if isinstance(data, Tensor) and isinstance(device, torch.device) and device.type not in _BLOCKING_DEVICE_TYPES: kwargs["non_blocking"] = True data_output = data.to(device, **kwargs) if data_output is not None: diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index 1b9f43137943f..32f370b5b246e 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -93,6 +93,7 @@ def parse_gpu_ids( gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps) if not gpus: raise MisconfigurationException("GPUs requested but none are available.") + if ( TorchElasticEnvironment.detect() and len(gpus) != 1 From 89506135527d98c6524635ea044b8eb15414d950 Mon Sep 17 00:00:00 2001 From: Tianshu Wang Date: Sat, 27 Aug 2022 01:23:54 +0800 Subject: [PATCH 22/30] save checkpoints and profiler output to the first logger (#14325) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec --- .../common/checkpointing_intermediate.rst | 2 +- src/pytorch_lightning/CHANGELOG.md | 2 ++ .../callbacks/model_checkpoint.py | 14 ++++++-------- src/pytorch_lightning/trainer/trainer.py | 13 ++++++------- src/pytorch_lightning/utilities/logger.py | 8 -------- tests/tests_pytorch/profilers/test_profiler.py | 5 +---- .../trainer/properties/test_log_dir.py | 7 ++++--- tests/tests_pytorch/utilities/test_logger.py | 18 ------------------ 8 files changed, 20 insertions(+), 49 deletions(-) diff --git a/docs/source-pytorch/common/checkpointing_intermediate.rst b/docs/source-pytorch/common/checkpointing_intermediate.rst index f6dc1c955880f..7e17d8abc808f 100644 --- a/docs/source-pytorch/common/checkpointing_intermediate.rst +++ b/docs/source-pytorch/common/checkpointing_intermediate.rst @@ -120,7 +120,7 @@ What Where ===== -- It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`. +- By default, the ``ModelCheckpoint`` will save files into the ``Trainer.log_dir``. 
It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`. | diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4a746ac91b911..f2fb0a2aacf3f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -45,6 +45,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) +- When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([14325](https://github.com/Lightning-AI/lightning/pull/14325)) + ### Deprecated diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index 9b49b9d44bb10..1ad86a0917dac 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -37,7 +37,6 @@ from pytorch_lightning.callbacks import Checkpoint from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.logger import _name, _version from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT from pytorch_lightning.utilities.warnings import WarningCache @@ -591,15 +590,14 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None: if trainer._weights_save_path_internal != trainer.default_root_dir: # the user has changed weights_save_path ckpt_path = os.path.join(trainer._weights_save_path_internal, "checkpoints") - elif trainer.loggers: - if len(trainer.loggers) == 1: - assert trainer.logger is not None - save_dir = trainer.logger.save_dir or trainer.default_root_dir + elif len(trainer.loggers) > 0: + if trainer.loggers[0].save_dir is not None: + save_dir = trainer.loggers[0].save_dir else: save_dir = trainer.default_root_dir - - name = _name(trainer.loggers) - version = _version(trainer.loggers) + save_dir = trainer.loggers[0].save_dir or trainer.default_root_dir + name = trainer.loggers[0].name + version = trainer.loggers[0].version version = version if isinstance(version, str) else f"version_{version}" ckpt_path = os.path.join(save_dir, str(name), version, "checkpoints") diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index d5bcec7db85d6..4c9456e8ead37 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -297,9 +297,8 @@ def __init__( logger: Logger (or iterable collection of loggers) for experiment tracking. A ``True`` value uses the default ``TensorBoardLogger``. ``False`` will disable logging. If multiple loggers are - provided and the `save_dir` property of that logger is not set, local files (checkpoints, - profiler traces, etc.) are saved in ``default_root_dir`` rather than in the ``log_dir`` of any - of the individual loggers. + provided, local files (checkpoints, profiler traces, etc.) are saved in the ``log_dir`` of + the first logger. Default: ``True``. log_every_n_steps: How often to log within steps. 
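To make the documented behaviour concrete, a minimal, hypothetical usage sketch (paths and logger names are made up; not part of the diff):

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger

    # With several loggers, checkpoints and profiler traces now follow the *first* one.
    trainer = Trainer(
        logger=[TensorBoardLogger(save_dir="logs", name="custom_logs"), CSVLogger("logs")],
        max_steps=2,
    )
    print(trainer.log_dir)  # e.g. logs/custom_logs/version_0, derived from the first logger
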
@@ -2210,11 +2209,11 @@ def model(self, model: torch.nn.Module) -> None: @property def log_dir(self) -> Optional[str]: - if len(self.loggers) == 1: - if isinstance(self.logger, TensorBoardLogger): - dirpath = self.logger.log_dir + if len(self.loggers) > 0: + if isinstance(self.loggers[0], TensorBoardLogger): + dirpath = self.loggers[0].log_dir else: - dirpath = self.logger.save_dir + dirpath = self.loggers[0].save_dir else: dirpath = self.default_root_dir diff --git a/src/pytorch_lightning/utilities/logger.py b/src/pytorch_lightning/utilities/logger.py index 24d75e4f41034..76c24db0fe1a0 100644 --- a/src/pytorch_lightning/utilities/logger.py +++ b/src/pytorch_lightning/utilities/logger.py @@ -151,14 +151,6 @@ def _add_prefix( return metrics -def _name(loggers: List[Any], separator: str = "_") -> str: - if len(loggers) == 1: - return loggers[0].name - else: - # Concatenate names together, removing duplicates and preserving order - return separator.join(dict.fromkeys(str(logger.name) for logger in loggers)) - - def _version(loggers: List[Any], separator: str = "_") -> Union[int, str]: if len(loggers) == 1: return loggers[0].version diff --git a/tests/tests_pytorch/profilers/test_profiler.py b/tests/tests_pytorch/profilers/test_profiler.py index 91e1980b45bb6..2e3b868407d7f 100644 --- a/tests/tests_pytorch/profilers/test_profiler.py +++ b/tests/tests_pytorch/profilers/test_profiler.py @@ -465,15 +465,12 @@ def look_for_trace(trace_dir): """Determines if a directory contains a PyTorch trace.""" return any("trace.json" in filename for filename in os.listdir(trace_dir)) - # Sanity check - assert not look_for_trace(tmpdir) - model = BoringModel() loggers = [TensorBoardLogger(save_dir=tmpdir), CSVLogger(tmpdir)] trainer = Trainer(default_root_dir=tmpdir, profiler="pytorch", logger=loggers, limit_train_batches=5, max_epochs=1) assert len(trainer.loggers) == 2 trainer.fit(model) - assert look_for_trace(tmpdir) + assert look_for_trace(tmpdir / "lightning_logs" / "version_0") @RunIf(min_cuda_gpus=1, standalone=True) diff --git a/tests/tests_pytorch/trainer/properties/test_log_dir.py b/tests/tests_pytorch/trainer/properties/test_log_dir.py index e048e50e6f0b8..9e8453b9e7032 100644 --- a/tests/tests_pytorch/trainer/properties/test_log_dir.py +++ b/tests/tests_pytorch/trainer/properties/test_log_dir.py @@ -113,13 +113,14 @@ def test_logdir_multiple_loggers(tmpdir): """Tests that the logdir equals the default_root_dir when trainer has multiple loggers.""" default_root_dir = tmpdir / "default_root_dir" save_dir = tmpdir / "save_dir" - model = TestModel(default_root_dir) + expected = os.path.join(tmpdir, "save_dir", "custom_logs", "version_0") + model = TestModel(expected) trainer = Trainer( default_root_dir=default_root_dir, max_steps=2, logger=[TensorBoardLogger(save_dir=save_dir, name="custom_logs"), CSVLogger(tmpdir)], ) - assert trainer.log_dir == default_root_dir + assert trainer.log_dir == expected trainer.fit(model) - assert trainer.log_dir == default_root_dir + assert trainer.log_dir == expected diff --git a/tests/tests_pytorch/utilities/test_logger.py b/tests/tests_pytorch/utilities/test_logger.py index 6b67272289fc3..031599ec16b20 100644 --- a/tests/tests_pytorch/utilities/test_logger.py +++ b/tests/tests_pytorch/utilities/test_logger.py @@ -22,7 +22,6 @@ _add_prefix, _convert_params, _flatten_dict, - _name, _sanitize_callable_params, _sanitize_params, _version, @@ -177,23 +176,6 @@ def test_add_prefix(): assert metrics["prefix2_prefix-metric2"] == 2 -def test_name(tmpdir): - """Verify names of 
loggers are concatenated properly.""" - logger1 = CSVLogger(tmpdir, name="foo") - logger2 = CSVLogger(tmpdir, name="bar") - logger3 = CSVLogger(tmpdir, name="foo") - logger4 = CSVLogger(tmpdir, name="baz") - loggers = [logger1, logger2, logger3, logger4] - name = _name([]) - assert name == "" - name = _name([logger3]) - assert name == "foo" - name = _name(loggers) - assert name == "foo_bar_baz" - name = _name(loggers, "-") - assert name == "foo-bar-baz" - - def test_version(tmpdir): """Verify versions of loggers are concatenated properly.""" logger1 = CSVLogger(tmpdir, version=0) From 3ba0f56b1819547963262626ed723b1179ee1155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Aug 2022 22:01:51 +0200 Subject: [PATCH 23/30] Remove support for the deprecated torchtext legacy (#14375) --- .azure/hpu-tests.yml | 2 +- dockers/base-conda/Dockerfile | 6 +-- environment.yml | 1 - requirements/pytorch/adjust-versions.py | 16 +++--- requirements/pytorch/check-avail-extras.py | 1 - requirements/pytorch/extra.txt | 1 - src/pytorch_lightning/CHANGELOG.md | 2 + src/pytorch_lightning/core/hooks.py | 1 - src/pytorch_lightning/utilities/__init__.py | 1 - src/pytorch_lightning/utilities/apply_func.py | 34 +----------- src/pytorch_lightning/utilities/data.py | 8 +-- src/pytorch_lightning/utilities/imports.py | 2 - tests/tests_pytorch/accelerators/test_mps.py | 29 ---------- .../deprecated_api/test_remove_1-8.py | 12 ----- tests/tests_pytorch/helpers/imports.py | 16 ------ .../tests_pytorch/helpers/torchtext_utils.py | 54 ------------------- tests/tests_pytorch/models/test_gpu.py | 30 +---------- .../utilities/test_apply_func_torchtext.py | 47 ---------------- 18 files changed, 19 insertions(+), 244 deletions(-) delete mode 100644 tests/tests_pytorch/helpers/imports.py delete mode 100644 tests/tests_pytorch/helpers/torchtext_utils.py delete mode 100644 tests/tests_pytorch/utilities/test_apply_func_torchtext.py diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index bdfada907cac9..b128dbc5433b1 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -45,7 +45,7 @@ jobs: pip --version sudo pip uninstall -y lightning pytorch-lightning pip install fire - python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext + python .actions/assistant.py requirements-prune-pkgs torch,torchvision pip install ".[extra,test]" pip list env: diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 9bb75e34b8ff6..c82c5a4dfa15f 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -78,11 +78,11 @@ RUN \ conda update -n base -c defaults conda && \ CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ conda create -y --name $CONDA_ENV \ - python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \ + python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \ -c nvidia -c pytorch -c pytorch-test && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ - printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ + printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', 
req);\nopen(fname, 'w').write(req)" > prune.py && \ python prune.py && \ rm prune.py && \ cat environment.yml && \ @@ -102,7 +102,7 @@ RUN \ pip list | grep torch && \ python -c "import torch; print(torch.__version__)" && \ pip install -q fire && \ - python assistant.py requirements_prune_pkgs torch,torchvision,torchtext && \ + python assistant.py requirements_prune_pkgs torch,torchvision && \ # Install remaining requirements pip install --no-cache-dir -r requirements/pytorch/base.txt \ -r requirements/pytorch/extra.txt \ diff --git a/environment.yml b/environment.yml index f26e93031770e..7576bfe0e5ff3 100644 --- a/environment.yml +++ b/environment.yml @@ -41,7 +41,6 @@ dependencies: - scikit-learn>=0.20.0 - matplotlib>=3.1.1 - omegaconf>=2.0.5 - - torchtext>=0.10.* # Examples - torchvision>=0.10.* diff --git a/requirements/pytorch/adjust-versions.py b/requirements/pytorch/adjust-versions.py index b5305014acc36..70c94b358fc6e 100644 --- a/requirements/pytorch/adjust-versions.py +++ b/requirements/pytorch/adjust-versions.py @@ -5,14 +5,14 @@ # IMPORTANT: this list needs to be sorted in reverse VERSIONS = [ - dict(torch="1.12.1", torchvision="0.13.1", torchtext="0.13.1"), # stable - dict(torch="1.12.0", torchvision="0.13.0", torchtext="0.13.0"), - dict(torch="1.11.0", torchvision="0.12.0", torchtext="0.12.0"), - dict(torch="1.10.2", torchvision="0.11.3", torchtext="0.11.2"), - dict(torch="1.10.1", torchvision="0.11.2", torchtext="0.11.1"), - dict(torch="1.10.0", torchvision="0.11.1", torchtext="0.11.0"), - dict(torch="1.9.1", torchvision="0.10.1", torchtext="0.10.1"), - dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"), + dict(torch="1.12.1", torchvision="0.13.1"), # stable + dict(torch="1.12.0", torchvision="0.13.0"), + dict(torch="1.11.0", torchvision="0.12.0"), + dict(torch="1.10.2", torchvision="0.11.3"), + dict(torch="1.10.1", torchvision="0.11.2"), + dict(torch="1.10.0", torchvision="0.11.1"), + dict(torch="1.9.1", torchvision="0.10.1"), + dict(torch="1.9.0", torchvision="0.10.0"), ] diff --git a/requirements/pytorch/check-avail-extras.py b/requirements/pytorch/check-avail-extras.py index 228a41038c3c1..9477267683d19 100644 --- a/requirements/pytorch/check-avail-extras.py +++ b/requirements/pytorch/check-avail-extras.py @@ -4,4 +4,3 @@ import matplotlib # noqa: F401 import omegaconf # noqa: F401 import rich # noqa: F401 -import torchtext # noqa: F401 diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 20b6c1b8dbc12..f211f5654adec 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -3,7 +3,6 @@ # extended list of package dependencies to reach full functionality matplotlib>3.1, <3.5.3 -torchtext>=0.10.*, <0.14.0 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 jsonargparse[signatures]>=4.12.0, <=4.12.0 diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index f2fb0a2aacf3f..777ba96af3d5d 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -92,6 +92,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) +- Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) + ### Fixed - Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) diff --git a/src/pytorch_lightning/core/hooks.py b/src/pytorch_lightning/core/hooks.py index 1e02d0b07cef9..4da53903eccca 100644 --- a/src/pytorch_lightning/core/hooks.py +++ b/src/pytorch_lightning/core/hooks.py @@ -624,7 +624,6 @@ def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_ - :class:`list` - :class:`dict` - :class:`tuple` - - :class:`torchtext.data.batch.Batch` For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...). diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index c849ba0a05d68..b8d5801734a25 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -43,7 +43,6 @@ _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_12, _TORCH_QUANTIZE_AVAILABLE, - _TORCHTEXT_AVAILABLE, _TORCHVISION_AVAILABLE, _TPU_AVAILABLE, _XLA_AVAILABLE, diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index fe8b0b836456f..ae3d81eab46a5 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -14,10 +14,9 @@ """Utilities used for collections.""" import dataclasses -import operator from abc import ABC from collections import defaultdict, OrderedDict -from copy import copy, deepcopy +from copy import deepcopy from functools import partial from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union @@ -26,17 +25,6 @@ from torch import Tensor from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - -if _TORCHTEXT_LEGACY: - if _compare_version("torchtext", operator.ge, "0.9.0"): - from torchtext.legacy.data import Batch - else: - from torchtext.data import Batch -else: - Batch = type(None) - _BLOCKING_DEVICE_TYPES = ("cpu", "mps") @@ -326,23 +314,6 @@ def move_data_to_device(batch: Any, device: Union[str, torch.device]) -> Any: device = torch.device(device) def batch_to(data: Any) -> Any: - # try to move torchtext data first - if _TORCHTEXT_LEGACY and isinstance(data, Batch): - # TODO: also remove the torchtext dependency with Lightning 1.8 - rank_zero_deprecation( - "The `torchtext.legacy.Batch` object is deprecated and Lightning will remove support for it in v1.8." 
- " We recommend you to migrate away from Batch by following the TorchText README:" - " https://github.com/pytorch/text#bc-breaking-legacy" - ) - # Shallow copy because each Batch has a reference to Dataset which contains all examples - device_data = copy(data) - for field, field_value in data.dataset.fields.items(): - if field_value is None: - continue - device_field = move_data_to_device(getattr(data, field), device) - setattr(device_data, field, device_field) - return device_data - kwargs = {} # Don't issue non-blocking transfers to CPU # Same with MPS due to a race condition bug: https://github.com/pytorch/pytorch/issues/83015 @@ -354,8 +325,7 @@ def batch_to(data: Any) -> Any: # user wrongly implemented the `TransferableDataType` and forgot to return `self`. return data - dtype = (TransferableDataType, Batch) if _TORCHTEXT_LEGACY else TransferableDataType - return apply_to_collection(batch, dtype=dtype, function=batch_to) + return apply_to_collection(batch, dtype=TransferableDataType, function=batch_to) def convert_to_tensors(data: Any, device: Union[str, torch.device]) -> Any: diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index b0c9307cec8e1..c795cfc47d3a5 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -124,9 +124,7 @@ def has_len(dataloader: Union[DataLoader, Iterable]) -> bool: f"`{dataloader.__class__.__name__}` returned 0 length. Please make sure this was your intention." ) has_len = True - except TypeError: - has_len = False - except NotImplementedError: # e.g. raised by torchtext if a batch_size_fn is used + except (TypeError, NotImplementedError): has_len = False if has_len and has_iterable_dataset(dataloader): @@ -170,9 +168,7 @@ def has_len_all_ranks( else: has_len = True - except TypeError: - has_len = False - except NotImplementedError: # e.g. 
raised by torchtext if a batch_size_fn is used + except (TypeError, NotImplementedError): has_len = False if has_len and has_iterable_dataset(dataloader): diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index ba437ad332dfa..dfc88104ac25c 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -147,8 +147,6 @@ def __repr__(self) -> str: _PSUTIL_AVAILABLE = _package_available("psutil") _RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) -_TORCHTEXT_AVAILABLE = _package_available("torchtext") -_TORCHTEXT_LEGACY: bool = _TORCHTEXT_AVAILABLE and _compare_version("torchtext", operator.lt, "0.11.0") _TORCHVISION_AVAILABLE = _package_available("torchvision") _XLA_AVAILABLE: bool = _package_available("torch_xla") diff --git a/tests/tests_pytorch/accelerators/test_mps.py b/tests/tests_pytorch/accelerators/test_mps.py index 01e13e937b4d0..2375b167f7708 100644 --- a/tests/tests_pytorch/accelerators/test_mps.py +++ b/tests/tests_pytorch/accelerators/test_mps.py @@ -21,8 +21,6 @@ from pytorch_lightning import Trainer from pytorch_lightning.accelerators import MPSAccelerator from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY -from tests_pytorch.helpers.imports import Batch, Dataset, Example, Field, LabelField from tests_pytorch.helpers.runif import RunIf @@ -135,30 +133,3 @@ def to(self, *args, **kwargs): batch = trainer.strategy.batch_to_device(CustomBatchType(), torch.device("mps")) assert batch.a.type() == "torch.mps.FloatTensor" - - # torchtext.data.Batch - if not _TORCHTEXT_LEGACY: - return - - samples = [ - {"text": "PyTorch Lightning is awesome!", "label": 0}, - {"text": "Please make it work with torchtext", "label": 1}, - ] - - text_field = Field() - label_field = LabelField() - fields = {"text": ("text", text_field), "label": ("label", label_field)} - - examples = [Example.fromdict(sample, fields) for sample in samples] - dataset = Dataset(examples=examples, fields=fields.values()) - # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first - text_field.build_vocab(dataset) - label_field.build_vocab(dataset) - - batch = Batch(data=examples, dataset=dataset) - - with pytest.deprecated_call(match="The `torchtext.legacy.Batch` object is deprecated"): - batch = trainer.strategy.batch_to_device(batch, torch.device("mps")) - - assert batch.text.type() == "torch.mps.LongTensor" - assert batch.label.type() == "torch.mps.LongTensor" diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index e1fb0b57654c0..8af7dc6ba90f5 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -35,20 +35,8 @@ from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import device_parser -from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn from tests_pytorch.helpers.runif import RunIf -from 
tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator - - -@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.") -def test_v1_8_0_deprecated_torchtext_batch(): - - with pytest.deprecated_call(match="is deprecated and Lightning will remove support for it in v1.8"): - data_iterator, _ = get_dummy_torchtext_data_iterator(num_samples=3, batch_size=3) - batch = next(iter(data_iterator)) - _ = move_data_to_device(batch=batch, device=torch.device("cpu")) def test_v1_8_0_on_init_start_end(tmpdir): diff --git a/tests/tests_pytorch/helpers/imports.py b/tests/tests_pytorch/helpers/imports.py deleted file mode 100644 index 3fddb615b7ab1..0000000000000 --- a/tests/tests_pytorch/helpers/imports.py +++ /dev/null @@ -1,16 +0,0 @@ -import operator - -from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY - -if _TORCHTEXT_LEGACY: - if _compare_version("torchtext", operator.ge, "0.9.0"): - from torchtext.legacy.data import Batch, Dataset, Example, Field, Iterator, LabelField - else: - from torchtext.data import Batch, Dataset, Example, Field, Iterator, LabelField -else: - Batch = type(None) - Dataset = type(None) - Example = type(None) - Field = type(None) - Iterator = type(None) - LabelField = type(None) diff --git a/tests/tests_pytorch/helpers/torchtext_utils.py b/tests/tests_pytorch/helpers/torchtext_utils.py deleted file mode 100644 index 3c18005636bf1..0000000000000 --- a/tests/tests_pytorch/helpers/torchtext_utils.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import random -import string - -from tests_pytorch.helpers.imports import Dataset, Example, Field, Iterator - - -def _generate_random_string(length: int = 10): - return "".join(random.choices(string.ascii_letters, k=length)) - - -def get_dummy_torchtext_data_iterator(num_samples: int, batch_size: int, include_lengths: bool = False): - text_field = Field( - sequential=True, - pad_first=False, # nosec - init_token="", - eos_token="", # nosec - include_lengths=include_lengths, - ) # nosec - - dataset = Dataset( - [ - Example.fromdict({"text": _generate_random_string()}, {"text": ("text", text_field)}) - for _ in range(num_samples) - ], - {"text": text_field}, - ) - text_field.build_vocab(dataset) - - iterator = Iterator( - dataset, - batch_size=batch_size, - sort_key=None, - device=None, - batch_size_fn=None, - train=True, - repeat=False, - shuffle=None, - sort=None, - sort_within_batch=None, - ) - return iterator, text_field diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index 1a2d72a12118e..0a030e74273b1 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -28,9 +28,8 @@ from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY +from pytorch_lightning.utilities.imports import _compare_version from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.imports import Batch, Dataset, Example, Field, LabelField from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.simple_models import ClassificationModel @@ -269,33 +268,6 @@ def to(self, *args, **kwargs): batch = trainer.strategy.batch_to_device(CustomBatchType(), torch.device("cuda:0")) assert batch.a.type() == "torch.cuda.FloatTensor" - # torchtext.data.Batch - if not _TORCHTEXT_LEGACY: - return - - samples = [ - {"text": "PyTorch Lightning is awesome!", "label": 0}, - {"text": "Please make it work with torchtext", "label": 1}, - ] - - text_field = Field() - label_field = LabelField() - fields = {"text": ("text", text_field), "label": ("label", label_field)} - - examples = [Example.fromdict(sample, fields) for sample in samples] - dataset = Dataset(examples=examples, fields=fields.values()) - # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first - text_field.build_vocab(dataset) - label_field.build_vocab(dataset) - - batch = Batch(data=examples, dataset=dataset) - - with pytest.deprecated_call(match="The `torchtext.legacy.Batch` object is deprecated"): - batch = trainer.strategy.batch_to_device(batch, torch.device("cuda:0")) - - assert batch.text.type() == "torch.cuda.LongTensor" - assert batch.label.type() == "torch.cuda.LongTensor" - @RunIf(min_cuda_gpus=1) def test_non_blocking(): diff --git a/tests/tests_pytorch/utilities/test_apply_func_torchtext.py b/tests/tests_pytorch/utilities/test_apply_func_torchtext.py deleted file mode 100644 index 6c72959b54587..0000000000000 --- a/tests/tests_pytorch/utilities/test_apply_func_torchtext.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import pytest -import torch - -from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY -from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator - - -@pytest.mark.parametrize("include_lengths", [False, True]) -@pytest.mark.parametrize("device", [torch.device("cuda", 0)]) -@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.") -@RunIf(min_cuda_gpus=1) -def test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, device): - data_iterator, _ = get_dummy_torchtext_data_iterator(num_samples=3, batch_size=3, include_lengths=include_lengths) - data_iter = iter(data_iterator) - batch = next(data_iter) - - with pytest.deprecated_call(match="The `torchtext.legacy.Batch` object is deprecated"): - batch_on_device = move_data_to_device(batch, device) - - if include_lengths: - # tensor with data - assert batch_on_device.text[0].device == device - # tensor with length of data - assert batch_on_device.text[1].device == device - else: - assert batch_on_device.text.device == device - - -@pytest.mark.parametrize("include_lengths", [False, True]) -@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.") -def test_batch_move_data_to_device_torchtext_include_lengths_cpu(include_lengths): - test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, torch.device("cpu")) From 1ad452f95a7c408f5402b271278e9b30b0d7421b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Aug 2022 20:40:53 +0000 Subject: [PATCH 24/30] Update mlflow requirement from <1.28.0,>=1.0.0 to >=1.0.0,<1.29.0 in /requirements (#14311) Update mlflow requirement in /requirements Updates the requirements on [mlflow](https://github.com/mlflow/mlflow) to permit the latest version. - [Release notes](https://github.com/mlflow/mlflow/releases) - [Changelog](https://github.com/mlflow/mlflow/blob/master/CHANGELOG.md) - [Commits](https://github.com/mlflow/mlflow/compare/1.0.0...v1.28.0) --- updated-dependencies: - dependency-name: mlflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/loggers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index df83a077f8457..905823451973b 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -5,6 +5,6 @@ neptune-client>=0.10.0, <0.16.4 comet-ml>=3.1.12, <3.31.8 -mlflow>=1.0.0, <1.28.0 +mlflow>=1.0.0, <1.29.0 test_tube>=0.7.5, <=0.7.5 wandb>=0.10.22, <0.13.2 From 714137d4d65e8687455d671106d5080207f64734 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Aug 2022 20:41:36 +0000 Subject: [PATCH 25/30] Bump ravsamhq/notify-slack-action from 1 to 2 (#14290) Bumps [ravsamhq/notify-slack-action](https://github.com/ravsamhq/notify-slack-action) from 1 to 2. - [Release notes](https://github.com/ravsamhq/notify-slack-action/releases) - [Commits](https://github.com/ravsamhq/notify-slack-action/compare/v1...v2) --- updated-dependencies: - dependency-name: ravsamhq/notify-slack-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-pytorch-dockers.yml | 10 +++++----- .github/workflows/events-nightly.yml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 6cb28885e79ef..73f303c6cbc44 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -75,7 +75,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} timeout-minutes: 60 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} @@ -117,7 +117,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 95 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} @@ -155,7 +155,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 95 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} @@ -199,7 +199,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }} timeout-minutes: 10 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} @@ -235,7 +235,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }} timeout-minutes: 10 - - uses: ravsamhq/notify-slack-action@v1 + - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_TO_HUB == 'true' with: status: ${{ job.status }} diff --git 
a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 36907bf955999..13d3895bf365d 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -48,7 +48,7 @@ jobs: # report failure to Slack - name: Slack notification if: failure() && github.event_name == 'schedule' - uses: ravsamhq/notify-slack-action@v1 + uses: ravsamhq/notify-slack-action@v2 with: status: ${{ job.status }} token: ${{ secrets.GITHUB_TOKEN }} From 250c06e406a821b381393ea9e47152e6c8e2784c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Aug 2022 22:59:32 +0200 Subject: [PATCH 26/30] Remove deprecated HPC model hooks (#14315) Co-authored-by: rohitgr7 Co-authored-by: Jirka Borovec --- .../common/lightning_module.rst | 12 ----- src/pytorch_lightning/CHANGELOG.md | 4 ++ src/pytorch_lightning/core/saving.py | 26 ---------- .../trainer/configuration_validator.py | 17 ------ .../connectors/checkpoint_connector.py | 13 ----- .../deprecated_api/test_remove_1-8.py | 23 -------- .../connectors/test_checkpoint_connector.py | 52 ------------------- .../trainer/logging_/test_logger_connector.py | 6 +-- 8 files changed, 7 insertions(+), 146 deletions(-) diff --git a/docs/source-pytorch/common/lightning_module.rst b/docs/source-pytorch/common/lightning_module.rst index e0fc10097d002..1c3f423a001f8 100644 --- a/docs/source-pytorch/common/lightning_module.rst +++ b/docs/source-pytorch/common/lightning_module.rst @@ -1345,18 +1345,6 @@ load_from_checkpoint .. automethod:: pytorch_lightning.core.module.LightningModule.load_from_checkpoint :noindex: -on_hpc_save -~~~~~~~~~~~ - -.. automethod:: pytorch_lightning.core.module.LightningModule.on_hpc_save - :noindex: - -on_hpc_load -~~~~~~~~~~~ - -.. automethod:: pytorch_lightning.core.module.LightningModule.on_hpc_load - :noindex: - on_train_start ~~~~~~~~~~~~~~ diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 777ba96af3d5d..af45f1dc589d7 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -92,8 +92,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) +- Removed the deprecated `LightningModule.{on_hpc_load,on_hpc_save}` hooks in favor of the general purpose hooks `LightningModule.{on_load_checkpoint,on_save_checkpoint}` ([#14315](https://github.com/Lightning-AI/lightning/pull/14315)) + + - Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) + ### Fixed - Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index 2df7a661c2bd6..380338f5c0312 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -142,32 +142,6 @@ def load_from_checkpoint( **kwargs, ) - # ------------------------- - # OPTIONAL HOOKS - # ------------------------- - def on_hpc_save(self, checkpoint: Dict[str, Any]) -> None: - """Hook to do whatever you need right before Slurm manager saves the model. - - Args: - checkpoint: A dictionary in which you can save variables to save in a checkpoint. - Contents need to be pickleable. - - .. 
deprecated:: v1.6 - This method is deprecated in v1.6 and will be removed in v1.8. - Please use ``LightningModule.on_save_checkpoint`` instead. - """ - - def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None: - """Hook to do whatever you need right before Slurm manager loads the model. - - Args: - checkpoint: A dictionary with variables from the checkpoint. - - .. deprecated:: v1.6 - This method is deprecated in v1.6 and will be removed in v1.8. - Please use ``LightningModule.on_load_checkpoint`` instead. - """ - def _load_from_checkpoint( cls: Union[Type["ModelIO"], Type["pl.LightningModule"], Type["pl.LightningDataModule"]], diff --git a/src/pytorch_lightning/trainer/configuration_validator.py b/src/pytorch_lightning/trainer/configuration_validator.py index 9d277f5ac45c9..8bf68e1bedd62 100644 --- a/src/pytorch_lightning/trainer/configuration_validator.py +++ b/src/pytorch_lightning/trainer/configuration_validator.py @@ -48,8 +48,6 @@ def verify_loop_configurations(trainer: "pl.Trainer") -> None: __verify_batch_transfer_support(trainer) _check_deprecated_callback_hooks(trainer) - # TODO: Delete _check_on_hpc_hooks in v1.8 - _check_on_hpc_hooks(model) # TODO: Delete on_epoch_start/on_epoch_end hooks in v1.8 _check_on_epoch_start_end(model) # TODO: Delete CheckpointHooks off PrecisionPlugin in v1.8 @@ -209,21 +207,6 @@ def __check_training_step_requires_dataloader_iter(model: "pl.LightningModule") ) -# TODO: Delete _check_on_hpc_hooks in v1.8 -def _check_on_hpc_hooks(model: "pl.LightningModule") -> None: - if is_overridden("on_hpc_save", model): - rank_zero_deprecation( - "Method `LightningModule.on_hpc_save` is deprecated in v1.6 and" - " will be removed in v1.8. Please use `LightningModule.on_save_checkpoint` instead." - ) - - if is_overridden("on_hpc_load", model): - rank_zero_deprecation( - "Method `LightningModule.on_hpc_load` is deprecated in v1.6 and" - " will be removed in v1.8. Please use `LightningModule.on_load_checkpoint` instead." - ) - - # TODO: Remove on_epoch_start/on_epoch_end hooks in v1.8 def _check_on_epoch_start_end(model: "pl.LightningModule") -> None: hooks = ( diff --git a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py index e1dccd11a0d0a..f0b35e054d6d5 100644 --- a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -23,7 +23,6 @@ from torchmetrics import Metric import pytorch_lightning as pl -from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE @@ -168,16 +167,9 @@ def restore_model(self) -> None: if not self._loaded_checkpoint: return - model = self.trainer.lightning_module - # hook: give user access to checkpoint if needed. self.trainer._call_lightning_module_hook("on_load_checkpoint", self._loaded_checkpoint) - # TODO: remove this in v1.8. - # call hpc specific hook - if self._hpc_resume_path is not None: - model.on_hpc_load(self._loaded_checkpoint) - # restore model state_dict self.trainer.strategy.load_model_state_dict(self._loaded_checkpoint) @@ -438,11 +430,6 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: if datamodule is not None: self.trainer._call_lightning_datamodule_hook("on_save_checkpoint", checkpoint) - # TODO: remove this in v1.8. 
- environment = self.trainer._accelerator_connector.cluster_environment - if isinstance(environment, SLURMEnvironment) and environment.auto_requeue: - model.on_hpc_save(checkpoint) - return checkpoint def save_checkpoint( diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 8af7dc6ba90f5..07223b88b7710 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -84,29 +84,6 @@ def test_v1_8_0_deprecated_warning_positional_category(): rank_zero_warn("foo", FutureWarning) -def test_v1_8_0_deprecated_on_hpc_hooks(tmpdir): - class TestModelSave(BoringModel): - def on_hpc_save(self): - print("on_hpc_save override") - - class TestModelLoad(BoringModel): - def on_hpc_load(self): - print("on_hpc_load override") - - save_model = TestModelSave() - load_model = TestModelLoad() - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, fast_dev_run=True) - - with pytest.deprecated_call( - match=r"Method `LightningModule.on_hpc_save` is deprecated in v1.6 and will be removed in v1.8." - ): - trainer.fit(save_model) - with pytest.deprecated_call( - match=r"Method `LightningModule.on_hpc_load` is deprecated in v1.6 and will be removed in v1.8." - ): - trainer.fit(load_model) - - def test_v1_8_0_deprecated_run_stage(): trainer = Trainer() trainer._run_stage = Mock() diff --git a/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py b/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py index 02e3221f4dfd4..4762e769a2379 100644 --- a/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py @@ -12,68 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from unittest import mock from unittest.mock import Mock -import pytest import torch from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.trainer.states import TrainerFn -# TODO: remove HPCHookedModel in v1.8 -class HPCHookedModel(BoringModel): - def __init__(self): - super().__init__() - self.hpc_save_called = 0 - self.hpc_load_called = 0 - - def on_hpc_save(self, checkpoint): - assert "state_dict" in checkpoint - self.hpc_save_called += 1 - - def on_hpc_load(self, checkpoint): - assert "state_dict" in checkpoint - self.hpc_load_called += 1 - - -# TODO: remove test_hpc_hook_calls in v1.8 -@mock.patch( - "pytorch_lightning.trainer.connectors.accelerator_connector.AcceleratorConnector._is_slurm_managing_tasks", - return_value=True, -) -def test_hpc_hook_calls(mock_slurm_env, tmpdir): - model = HPCHookedModel() - trainer = Trainer(default_root_dir=tmpdir, max_steps=1, enable_checkpointing=False, logger=False) - environment = trainer._accelerator_connector.cluster_environment - assert isinstance(environment, SLURMEnvironment) - assert environment.auto_requeue - with pytest.deprecated_call( - match=r"Method `LightningModule.on_hpc_save` is deprecated in v1.6 and will be removed in v1.8." 
- ): - trainer.fit(model) - - # simulate snapshot on slurm - hpc_save_path = trainer._checkpoint_connector.hpc_save_path(tmpdir) - trainer.save_checkpoint(hpc_save_path) - assert model.hpc_save_called == 1 - assert model.hpc_load_called == 0 - - # new training run, restore from hpc checkpoint file automatically - assert set(os.listdir(tmpdir)) == {"hpc_ckpt_1.ckpt"} - trainer = Trainer(default_root_dir=tmpdir, max_steps=1, enable_checkpointing=False, logger=False) - with pytest.deprecated_call( - match=r"Method `LightningModule.on_hpc_save` is deprecated in v1.6 and will be removed in v1.8." - ): - trainer.fit(model) - assert model.hpc_save_called == 1 - assert model.hpc_load_called == 1 - - def test_preloaded_checkpoint_lifecycle(tmpdir): """Tests that the preloaded checkpoint contents gets cleared from memory when it is not required anymore.""" model = BoringModel() diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py index 1c346ac1e8bc5..266710abf9434 100644 --- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py +++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py @@ -250,7 +250,7 @@ def test_fx_validator_integration(tmpdir): limit_predict_batches=1, callbacks=callback, ) - with pytest.deprecated_call(match="is deprecated in"): + with pytest.deprecated_call(match="was deprecated in"): trainer.fit(model) not_supported.update( @@ -263,7 +263,7 @@ def test_fx_validator_integration(tmpdir): "on_test_end": "You can't", } ) - with pytest.deprecated_call(match="is deprecated in"): + with pytest.deprecated_call(match="was deprecated in"): trainer.test(model, verbose=False) not_supported.update({k: "result collection is not registered yet" for k in not_supported}) @@ -280,7 +280,7 @@ def test_fx_validator_integration(tmpdir): "on_predict_end": "result collection is not registered yet", } ) - with pytest.deprecated_call(match="is deprecated in"): + with pytest.deprecated_call(match="was deprecated in"): trainer.predict(model) From 0e30e4a5a01bc3cf2f6d36f70e16e0e36a339cf2 Mon Sep 17 00:00:00 2001 From: Mansy Date: Sat, 27 Aug 2022 01:55:22 +0200 Subject: [PATCH 27/30] [App][CI] Fix psutil requirement CI (#14413) --- requirements/app/test.txt | 1 + tests/tests_app/cli/test_cli.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/requirements/app/test.txt b/requirements/app/test.txt index ab5ef8f1e85ac..c6a2a61b2c29a 100644 --- a/requirements/app/test.txt +++ b/requirements/app/test.txt @@ -8,3 +8,4 @@ playwright==1.22.0 httpx trio pympler +psutil diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 1edc9d384cf30..6daa7be5b8e07 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -169,3 +169,6 @@ def test_cli_logout(exists: mock.MagicMock, unlink: mock.MagicMock, creds: bool) unlink.assert_called_once_with() else: unlink.assert_not_called() + + +# TODO: test for the other commands From f3574176e2a0f61c68093f39639dc110d5936e90 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sat, 27 Aug 2022 17:42:24 +0530 Subject: [PATCH 28/30] Change `trainer.should_stop` to not stop in between an epoch and run until `min_steps/min_epochs` only (#13890) --- docs/source-pytorch/common/trainer.rst | 60 +++++++++++++++++++ src/pytorch_lightning/CHANGELOG.md | 3 + .../loops/epoch/training_epoch_loop.py | 16 ++++- src/pytorch_lightning/loops/fit_loop.py | 24 ++++---- src/pytorch_lightning/utilities/warnings.py | 6 ++ 
.../callbacks/test_early_stopping.py | 20 ++++--- .../loops/epoch/test_training_epoch_loop.py | 34 +++++++++++ .../tests_pytorch/loops/test_training_loop.py | 33 +++++++++- tests/tests_pytorch/trainer/test_trainer.py | 7 ++- 9 files changed, 176 insertions(+), 27 deletions(-) diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index fe76ebea481aa..53148c9fab583 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -1745,3 +1745,63 @@ execution within that function, and the status of the Trainer. trainer.state.status # stage in ("train", "sanity_check", "validate", "test", "predict", "tune") trainer.state.stage + +should_stop +*********** + +If you want to terminate the training during ``.fit``, you can set ``trainer.should_stop=True`` to terminate the training +as soon as possible. Note that, it will respect the arguments ``min_steps`` and ``min_epochs`` to check whether to stop. If these +arguments are set and the ``current_epoch`` or ``global_step`` don't meet these minimum conditions, training will continue until +both conditions are met. If any of these arguments is not set, it won't be considered for the final decision. + + +.. code-block:: python + + # setting `trainer.should_stop` at any point of training will terminate it + class LitModel(LightningModule): + def training_step(self, *args, **kwargs): + self.trainer.should_stop = True + + + trainer = Trainer() + model = LitModel() + trainer.fit(model) + +.. code-block:: python + + # setting `trainer.should_stop` will stop training only after at least 5 epochs have run + class LitModel(LightningModule): + def training_step(self, *args, **kwargs): + if self.current_epoch == 2: + self.trainer.should_stop = True + + + trainer = Trainer(min_epochs=5, max_epochs=100) + model = LitModel() + trainer.fit(model) + +.. code-block:: python + + # setting `trainer.should_stop` will stop training only after at least 5 steps have run + class LitModel(LightningModule): + def training_step(self, *args, **kwargs): + if self.global_step == 2: + self.trainer.should_stop = True + + + trainer = Trainer(min_steps=5, max_epochs=100) + model = LitModel() + trainer.fit(model) + +.. code-block:: python + + # setting `trainer.should_stop` at any until both min_steps and min_epochs are satisfied + class LitModel(LightningModule): + def training_step(self, *args, **kwargs): + if self.global_step == 7: + self.trainer.should_stop = True + + + trainer = Trainer(min_steps=5, min_epochs=5, max_epochs=100) + model = LitModel() + trainer.fit(model) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index af45f1dc589d7..35775201e1098 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -45,6 +45,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) +- Changed `trainer.should_stop` to not stop in between an epoch and run until `min_steps/min_epochs` only ([#13890](https://github.com/Lightning-AI/lightning/pull/13890)) + + - When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([14325](https://github.com/Lightning-AI/lightning/pull/14325)) diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index 0fe93ca7104b6..6be4956b9bfd5 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -102,7 +102,21 @@ def _is_validation_done(self) -> bool: @property def done(self) -> bool: """Evaluates when to leave the loop.""" - return (self._is_training_done and self._is_validation_done) or self.trainer.should_stop + if self._is_training_done and self._is_validation_done: + return True + + if self.trainer.should_stop: + # early stopping + min_epochs = self.trainer.fit_loop.min_epochs + should_stop_early = self.trainer.fit_loop._should_stop_early + if not should_stop_early: + self._warning_cache.info( + f"Trainer was signaled to stop but the required `min_epochs={min_epochs!r}` or" + f" `min_steps={self.min_steps!r}` has not been met. Training will continue..." + ) + return should_stop_early + + return False def connect( # type: ignore[override] self, diff --git a/src/pytorch_lightning/loops/fit_loop.py b/src/pytorch_lightning/loops/fit_loop.py index 21273e70a5365..e6dcc581e247d 100644 --- a/src/pytorch_lightning/loops/fit_loop.py +++ b/src/pytorch_lightning/loops/fit_loop.py @@ -146,6 +146,12 @@ def _results(self) -> _ResultCollection: return self.epoch_loop.val_loop._results raise RuntimeError("`FitLoop._results` property isn't defined. Accessed outside of scope") + @property + def _should_stop_early(self) -> bool: + met_min_epochs = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True + met_min_steps = self.epoch_loop.global_step >= self.min_steps if self.min_steps else True + return met_min_epochs and met_min_steps + @property def done(self) -> bool: """Evaluates when to leave the loop.""" @@ -169,20 +175,10 @@ def done(self) -> bool: rank_zero_info(f"`Trainer.fit` stopped: `max_epochs={self.max_epochs!r}` reached.") return True - if self.trainer.should_stop: - # early stopping - met_min_epochs = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True - met_min_steps = self.epoch_loop.global_step >= self.min_steps if self.min_steps else True - if met_min_epochs and met_min_steps: - self.trainer.should_stop = True - rank_zero_debug("`Trainer.fit` stopped: `trainer.should_stop` was set.") - return True - else: - rank_zero_info( - f"Trainer was signaled to stop but the required `min_epochs={self.min_epochs!r}` or" - f" `min_steps={self.min_steps!r}` has not been met. Training will continue..." 
- ) - self.trainer.should_stop = False + if self.trainer.should_stop and self._should_stop_early: + rank_zero_debug("`Trainer.fit` stopped: `trainer.should_stop` was set.") + return True + return False @property diff --git a/src/pytorch_lightning/utilities/warnings.py b/src/pytorch_lightning/utilities/warnings.py index 61d3a2b8e24e3..61883dad51144 100644 --- a/src/pytorch_lightning/utilities/warnings.py +++ b/src/pytorch_lightning/utilities/warnings.py @@ -18,6 +18,7 @@ from pytorch_lightning.utilities.rank_zero import LightningDeprecationWarning as NewLightningDeprecationWarning from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation as new_rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_info as new_rank_zero_info from pytorch_lightning.utilities.rank_zero import rank_zero_warn as new_rank_zero_warn # enable our warnings @@ -39,6 +40,11 @@ def deprecation(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: self.add(message) new_rank_zero_deprecation(message, stacklevel=stacklevel, **kwargs) + def info(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: + if message not in self: + self.add(message) + new_rank_zero_info(message, stacklevel=stacklevel, **kwargs) + def rank_zero_warn(*args: Any, **kwargs: Any) -> Any: new_rank_zero_deprecation( diff --git a/tests/tests_pytorch/callbacks/test_early_stopping.py b/tests/tests_pytorch/callbacks/test_early_stopping.py index 8c17b2ad421e5..458df2ea23b3f 100644 --- a/tests/tests_pytorch/callbacks/test_early_stopping.py +++ b/tests/tests_pytorch/callbacks/test_early_stopping.py @@ -265,25 +265,28 @@ def validation_epoch_end(self, outputs): assert early_stopping.stopped_epoch == expected_stop_epoch -@pytest.mark.parametrize("limit_train_batches", (3, 5)) @pytest.mark.parametrize( - ["min_epochs", "min_steps"], + "limit_train_batches,min_epochs,min_steps,stop_step", [ # IF `min_steps` was set to a higher value than the `trainer.global_step` when `early_stopping` is being # triggered, THEN the trainer should continue until reaching `trainer.global_step == min_steps` and stop - (0, 10), + (3, 0, 10, 10), + (5, 0, 10, 10), # IF `min_epochs` resulted in a higher number of steps than the `trainer.global_step` when `early_stopping` is # being triggered, THEN the trainer should continue until reaching # `trainer.global_step` == `min_epochs * len(train_dataloader)` - (2, 0), + (3, 2, 0, 6), + (5, 2, 0, 10), # IF both `min_epochs` and `min_steps` are provided and higher than the `trainer.global_step` when # `early_stopping` is being triggered, THEN the highest between `min_epochs * len(train_dataloader)` and # `min_steps` would be reached - (1, 10), - (3, 10), + (3, 1, 10, 10), + (5, 1, 10, 10), + (3, 3, 10, 10), + (5, 3, 10, 15), ], ) -def test_min_epochs_min_steps_global_step(tmpdir, limit_train_batches, min_epochs, min_steps): +def test_min_epochs_min_steps_global_step(tmpdir, limit_train_batches, min_epochs, min_steps, stop_step): if min_steps: assert limit_train_batches < min_steps @@ -317,8 +320,7 @@ def training_step(self, batch, batch_idx): # epochs continue until min steps are reached assert trainer.current_epoch == expected_epochs # steps continue until min steps are reached AND the epoch is exhausted - # stopping mid-epoch is not supported - assert trainer.global_step == limit_train_batches * expected_epochs + assert trainer.global_step == stop_step def test_early_stopping_mode_options(): diff --git a/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py 
b/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py index d601542075004..3aaced7dc1bc1 100644 --- a/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py +++ b/tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging from unittest.mock import patch import pytest @@ -184,3 +185,36 @@ def test_no_val_on_train_epoch_loop_restart(tmpdir): ) as advance_mocked: trainer.fit(model, ckpt_path=ckpt_path) assert advance_mocked.call_count == 1 + + +@pytest.mark.parametrize( + "min_epochs, min_steps, current_epoch, global_step, early_stop, epoch_loop_done, raise_info_msg", + [ + (None, None, 1, 4, True, True, False), + (None, None, 1, 10, True, True, False), + (4, None, 1, 4, False, False, True), + (4, 2, 1, 4, False, False, True), + (4, None, 1, 10, False, True, False), + (4, 3, 1, 3, False, False, True), + (4, 10, 1, 10, False, True, False), + (None, 4, 1, 4, True, True, False), + ], +) +def test_should_stop_early_stopping_conditions_not_met( + caplog, min_epochs, min_steps, current_epoch, global_step, early_stop, epoch_loop_done, raise_info_msg +): + """Test that checks that info message is logged when users sets `should_stop` but min conditions are not + met.""" + trainer = Trainer(min_epochs=min_epochs, min_steps=min_steps, limit_val_batches=0) + trainer.num_training_batches = 10 + trainer.should_stop = True + trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.optimizer.step.total.completed = global_step + trainer.fit_loop.epoch_loop.batch_progress.current.ready = global_step + trainer.fit_loop.epoch_progress.current.completed = current_epoch - 1 + + message = f"min_epochs={min_epochs}` or `min_steps={min_steps}` has not been met. 
Training will continue" + with caplog.at_level(logging.INFO, logger="pytorch_lightning.loops"): + assert trainer.fit_loop.epoch_loop.done is epoch_loop_done + + assert (message in caplog.text) is raise_info_msg + assert trainer.fit_loop._should_stop_early is early_stop diff --git a/tests/tests_pytorch/loops/test_training_loop.py b/tests/tests_pytorch/loops/test_training_loop.py index bba0b9b7c5428..dc52a08f068ae 100644 --- a/tests/tests_pytorch/loops/test_training_loop.py +++ b/tests/tests_pytorch/loops/test_training_loop.py @@ -180,7 +180,6 @@ def test_fit_loop_done_log_messages(caplog): fit_loop.epoch_loop.min_steps = 100 assert not fit_loop.done - assert "was signaled to stop but" in caplog.text def test_warning_valid_train_step_end(tmpdir): @@ -198,3 +197,35 @@ def training_step_end(self, outputs): trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) trainer.fit(model) + + +@pytest.mark.parametrize( + "min_epochs, min_steps, current_epoch, early_stop, fit_loop_done, raise_debug_msg", + [ + (4, None, 100, True, True, False), + (4, None, 3, False, False, False), + (4, 10, 3, False, False, False), + (None, 10, 4, True, True, True), + (4, None, 4, True, True, True), + (4, 10, 4, True, True, True), + ], +) +def test_should_stop_early_stopping_conditions_met( + caplog, min_epochs, min_steps, current_epoch, early_stop, fit_loop_done, raise_debug_msg +): + """Test that checks that debug message is logged when users sets `should_stop` and min conditions are met.""" + trainer = Trainer(min_epochs=min_epochs, min_steps=min_steps, limit_val_batches=0, max_epochs=100) + trainer.num_training_batches = 10 + trainer.should_stop = True + trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.optimizer.step.total.completed = ( + current_epoch * trainer.num_training_batches + ) + trainer.fit_loop.epoch_loop.batch_progress.current.ready = 10 + trainer.fit_loop.epoch_progress.current.processed = current_epoch + + message = "`Trainer.fit` stopped: `trainer.should_stop` was set." + with caplog.at_level(level=logging.DEBUG, logger="pytorch_lightning.utilities.rank_zero"): + assert trainer.fit_loop.done is fit_loop_done + + assert (message in caplog.text) is raise_debug_msg + assert trainer.fit_loop._should_stop_early is early_stop diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 9506acee425d0..6febe1c7787cd 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -622,7 +622,10 @@ def training_step(self, batch, batch_idx): output["loss"] = output["loss"] * 0.0 # force minimal loss to trigger early stopping self.log("loss", output["loss"]) self.training_step_invoked += 1 - assert not self.trainer.should_stop + if self.current_epoch < 2: + assert not self.trainer.should_stop + else: + assert self.trainer.should_stop return output model = TestModel() @@ -641,7 +644,7 @@ def training_step(self, batch, batch_idx): message = f"min_epochs={min_epochs}` or `min_steps=None` has not been met. 
Training will continue" num_messages = sum(1 for record in caplog.records if message in record.message) - assert num_messages == min_epochs - 2 + assert num_messages == 1 assert model.training_step_invoked == min_epochs * 2 From af688dee69655aece60b17d7879cbbcaa396e8f1 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sun, 28 Aug 2022 01:14:54 +0530 Subject: [PATCH 29/30] Update changelog after v1.7.3 release (#14398) --- src/pytorch_lightning/CHANGELOG.md | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 35775201e1098..d6c233e4a17ca 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -103,17 +103,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) - - -- Fixed an `AttributeError` when accessing `LightningModule.logger` and the Trainer has multiple loggers ([#14234](https://github.com/Lightning-AI/lightning/pull/14234)) - - -- Added back support for `log`ging in the `configure_gradient_clipping` hook after unintended removal in v1.7.2 ([#14298](https://github.com/Lightning-AI/lightning/issues/14298)) - +- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) -- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846)) - Fixed incorrect values after transferring data to a MPS device ([#13285](https://github.com/Lightning-AI/lightning/issues/13285)) @@ -130,6 +123,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) +## [1.7.3] - 2022-08-25 + +### Fixed + +- Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) +- Fixed an `AttributeError` when accessing `LightningModule.logger` and the Trainer has multiple loggers ([#14234](https://github.com/Lightning-AI/lightning/pull/14234)) +- Added back support for `log`ging in the `configure_gradient_clipping` hook after unintended removal in v1.7.2 ([#14298](https://github.com/Lightning-AI/lightning/issues/14298)) +- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) + + ## [1.7.2] - 2022-08-17 ### Added From 03f2f324457cfcb1a492db46cdeeaa992687cc4b Mon Sep 17 00:00:00 2001 From: JongMok Lee Date: Sun, 28 Aug 2022 07:07:36 +0900 Subject: [PATCH 30/30] Fix mypy errors in `pytorch_lightning/strategies/sharded.py` (#14184) Co-authored-by: otaj --- pyproject.toml | 1 - src/pytorch_lightning/strategies/sharded.py | 19 ++++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 399e1e83efb3f..1f704e7aa20ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", - "pytorch_lightning.strategies.sharded", "pytorch_lightning.trainer.callback_hook", "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py index 3b77bc6ceeb70..6bf8e47022c45 100644 --- a/src/pytorch_lightning/strategies/sharded.py +++ b/src/pytorch_lightning/strategies/sharded.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from contextlib import contextmanager -from typing import Dict, Generator, List, Tuple, Union +from typing import Dict, Generator, List, Tuple from torch import Tensor from torch.nn import Module @@ -20,7 +20,7 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.enums import PrecisionType @@ -51,10 +51,11 @@ def connect(self, model: "pl.LightningModule") -> None: def setup(self, trainer: "pl.Trainer") -> None: # share ddp pids to all processes - self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts) + self._rank_0_will_call_children_scripts: bool = self.broadcast(self._rank_0_will_call_children_scripts) if self._should_run_deadlock_detection(): self._share_information_to_prevent_deadlock() + assert self.accelerator is not None self.accelerator.setup(trainer) # move the model to the correct device @@ -64,6 +65,7 @@ def setup(self, trainer: "pl.Trainer") -> None: trainer_fn = trainer.state.fn if trainer_fn == TrainerFn.FITTING: if self._layer_sync: + assert self.model is not None self.model = self._layer_sync.apply(self.model) self.setup_precision_plugin() @@ -73,7 +75,9 @@ def setup(self, trainer: "pl.Trainer") -> None: def configure_ddp(self) -> None: self._set_ddp_kwargs() - self.setup_optimizers(self.model.trainer) + assert self.lightning_module is not None + self.setup_optimizers(self.lightning_module.trainer) + assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) self.model, self.optimizers = self._setup_model_and_optimizers( model=_LightningModuleWrapperBase(self.model), optimizers=self.optimizers, @@ -97,12 +101,13 @@ def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer] return model, optimizers def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: - if self.model is not None and self.model.trainer.state.fn != TrainerFn.FITTING: + assert self.lightning_module is not None + if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING: return optimizers return self._reinit_optimizers_with_oss(optimizers) - def _reinit_optimizers_with_oss(self, optimizers: List[Union[Optimizer, LightningOptimizer]]) -> List["OSS"]: + def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS"]: for x, optimizer in enumerate(optimizers): if isinstance(optimizer, LightningOptimizer): optimizer = optimizer._optimizer @@ -135,7 +140,7 @@ def block_backward_sync(self) -> Generator: else: yield None - def post_training_step(self): + def post_training_step(self) -> None: pass @classmethod
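For reference on the hook removal in PATCH 26/30 above: the deprecation messages and the new CHANGELOG entry point users at the general-purpose ``on_save_checkpoint``/``on_load_checkpoint`` hooks. A minimal migration sketch is shown below; only the hook names and the ``checkpoint`` dict semantics come from the patch, while ``BoringModel`` and the ``extra_state`` key are illustrative choices, not part of any commit in this series.

.. code-block:: python

    from typing import Any, Dict

    from pytorch_lightning.demos.boring_classes import BoringModel


    class MigratedModel(BoringModel):
        # was `on_hpc_save`: stash extra state in the checkpoint dict (contents must be picklable)
        def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
            checkpoint["extra_state"] = {"step_of_interest": 0}  # hypothetical key

        # was `on_hpc_load`: read the stashed state back when the checkpoint is restored
        def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
            self.extra_state = checkpoint.get("extra_state")

After PATCH 26/30, overriding ``on_hpc_save``/``on_hpc_load`` is neither called by the checkpoint connector nor flagged by the configuration validator, so any remaining overrides are silently ignored.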