From 36c2a275b6a612a0453ebefb40a24ae4a9a0c9bb Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Thu, 29 Sep 2022 18:36:17 +0530
Subject: [PATCH 1/5] Avoid initializing optimizers during DeepSpeed evaluation

---
 src/pytorch_lightning/strategies/deepspeed.py | 14 +++-----------
 .../strategies/test_deepspeed_strategy.py     | 12 +++++++++---
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py
index 6e91cc3d54a4f..41588477b85f0 100644
--- a/src/pytorch_lightning/strategies/deepspeed.py
+++ b/src/pytorch_lightning/strategies/deepspeed.py
@@ -564,15 +564,7 @@ def _set_deepspeed_activation_checkpointing(self) -> None:
     def _initialize_deepspeed_inference(self, model: Module) -> None:
         # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly
         assert isinstance(self.config, dict)
-        optimizer, scheduler = None, None
-        if "optimizer" not in self.config:
-            rank_zero_info(
-                "You have not specified an optimizer or scheduler within the DeepSpeed config."
-                " Using `configure_optimizers` to define optimizer and scheduler."
-            )
-            optimizer, lr_scheduler, _ = self._init_optimizers()
-            if lr_scheduler is not None:
-                scheduler = lr_scheduler.scheduler
+
         # todo: this is required for DeepSpeed throughput timers
         inference_config = {"train_micro_batch_size_per_gpu": 1}
         if "fp16" in self.config:
@@ -590,8 +582,8 @@ def _initialize_deepspeed_inference(self, model: Module) -> None:
             args=argparse.Namespace(device_rank=self.root_device.index),
             config=inference_config,
             model=model,
-            optimizer=optimizer,
-            lr_scheduler=scheduler,
+            optimizer=None,
+            lr_scheduler=None,
             model_parameters=[],
             dist_init_required=False,
         )
diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
index 038058b44682b..e529414b90187 100644
--- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
+++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -468,10 +468,16 @@ def test_deepspeed_multigpu(tmpdir):
         enable_progress_bar=False,
         enable_model_summary=False,
     )
+
+    with mock.patch.object(
+        model, "configure_optimizers", wraps=model.configure_optimizers
+    ) as mock_configure_optimizers:
+        trainer.test(model)
+    assert mock_configure_optimizers.call_count == 0
+
     with mock.patch("deepspeed.init_distributed", wraps=deepspeed.init_distributed) as mock_deepspeed_distributed:
         trainer.fit(model)
     mock_deepspeed_distributed.assert_called_once()
-    trainer.test(model)

     _assert_save_model_is_equal(model, tmpdir, trainer)

@@ -655,8 +661,8 @@ def test_deepspeed_multigpu_stage_3(tmpdir):
         enable_progress_bar=False,
         enable_model_summary=False,
     )
-    trainer.fit(model)
     trainer.test(model)
+    trainer.fit(model)

     _assert_save_model_is_equal(model, tmpdir, trainer)

@@ -676,8 +682,8 @@ def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config
         enable_progress_bar=False,
         enable_model_summary=False,
     )
-    trainer.fit(model)
     trainer.test(model)
+    trainer.fit(model)

     _assert_save_model_is_equal(model, tmpdir, trainer)
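The net effect of PATCH 1/5 is that the inference engine is built with `optimizer=None` and `lr_scheduler=None`, so `LightningModule.configure_optimizers` is never invoked on an evaluation-only run. A minimal standalone sketch of the resulting `deepspeed.initialize` call follows; the `torch.nn.Linear` stand-in model and `device_rank=0` are illustrative assumptions, and the real strategy derives `inference_config` from its own DeepSpeed config:

```python
import argparse

import deepspeed
import torch

# DeepSpeed's throughput timers still read this key even at inference time,
# which is why the strategy keeps a minimal config (see the todo in the diff).
inference_config = {"train_micro_batch_size_per_gpu": 1}

model = torch.nn.Linear(32, 2)  # illustrative stand-in for the LightningModule

# Mirrors the patched call: no optimizer or scheduler is constructed, so no
# call into `configure_optimizers` is needed for evaluation.
engine, _, _, _ = deepspeed.initialize(
    args=argparse.Namespace(device_rank=0),  # the strategy uses root_device.index
    config=inference_config,
    model=model,
    optimizer=None,       # the fix: evaluation creates no optimizer
    lr_scheduler=None,    # and no LR scheduler
    model_parameters=[],
    dist_init_required=False,
)
```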
From 968450b33ca52a740421d145e9935d728abc4f87 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Thu, 29 Sep 2022 18:48:30 +0530
Subject: [PATCH 2/5] chlog

---
 src/pytorch_lightning/CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index c6868311cd1b3..bc888392bf279 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -276,6 +276,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Fixed an attribute error when running the tuner together with the `StochasticWeightAveraging` callback ([#14836](https://github.com/Lightning-AI/lightning/pull/14836))

+- Avoided initializing optimizers during deepspeed inference ([#14944](https://github.com/Lightning-AI/lightning/pull/14944))
+
+
 ## [1.7.7] - 2022-09-22

 ### Fixed

From 90c60cdf95bb102f3734326e82f1689b400d7c40 Mon Sep 17 00:00:00 2001
From: Rohit Gupta
Date: Fri, 21 Oct 2022 16:57:57 +0530
Subject: [PATCH 3/5] Apply suggestions from code review

---
 src/pytorch_lightning/strategies/deepspeed.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py
index f648b225768db..e2afe9bc80525 100644
--- a/src/pytorch_lightning/strategies/deepspeed.py
+++ b/src/pytorch_lightning/strategies/deepspeed.py
@@ -559,7 +559,6 @@ def _set_deepspeed_activation_checkpointing(self) -> None:
         )

     def _initialize_deepspeed_inference(self, model: Module) -> None:
-        # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly
         assert isinstance(self.config, dict)

         # todo: this is required for DeepSpeed throughput timers

From 6bbb75e7aff0fc389b934f9c50b8ff3ff211c6cd Mon Sep 17 00:00:00 2001
From: Rohit Gupta
Date: Fri, 21 Oct 2022 17:00:17 +0530
Subject: [PATCH 4/5] Apply suggestions from code review

---
 src/pytorch_lightning/CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 1c0d96137fe53..9bc9ff43bb0eb 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -181,6 +181,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `Trainer` support for PyTorch built without distributed support ([#14971](https://github.com/Lightning-AI/lightning/pull/14971))
 - Fixed batch normalization statistics calculation in `StochasticWeightAveraging` callback ([#14866](https://github.com/Lightning-AI/lightning/pull/14866))

+
 ## [1.7.7] - 2022-09-22

 ### Fixed
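The regression test added back in PATCH 1/5 rests on a generally useful `unittest.mock` pattern: `wraps=` keeps the original method's behaviour while recording every invocation, so the test can assert a hook was never triggered. A self-contained sketch of that pattern, using a hypothetical `Model` class with a `test_step` method rather than a real LightningModule:

```python
from unittest import mock


class Model:
    """Hypothetical stand-in for a LightningModule, for illustration only."""

    def configure_optimizers(self):
        return "optimizer"

    def test_step(self):
        # evaluation should never need an optimizer
        return "metrics"


model = Model()

# Patch the instance method with a mock that wraps the original: calls still
# go through, but the mock counts them.
with mock.patch.object(
    model, "configure_optimizers", wraps=model.configure_optimizers
) as mock_configure_optimizers:
    model.test_step()  # analogous to trainer.test(model) in the real test

# Fails only if the hook was actually invoked during evaluation.
assert mock_configure_optimizers.call_count == 0
```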
From 8cac543cd6a7ca18cbf053c101de4e87748b289d Mon Sep 17 00:00:00 2001
From: Rohit Gupta
Date: Fri, 21 Oct 2022 17:01:05 +0530
Subject: [PATCH 5/5] chlog

---
 src/pytorch_lightning/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 9bc9ff43bb0eb..e06db49a6d157 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -169,7 +169,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed an issue with `LightningLite.setup()` not setting the `.device` attribute correctly on the returned wrapper ([#14822](https://github.com/Lightning-AI/lightning/pull/14822))
 - Fixed an attribute error when running the tuner together with the `StochasticWeightAveraging` callback ([#14836](https://github.com/Lightning-AI/lightning/pull/14836))
-- Avoided initializing optimizers during deepspeed inference ([#14944](https://github.com/Lightning-AI/lightning/pull/14944))
 - Fixed MissingFieldException in offline mode for the `NeptuneLogger()` ([#14919](https://github.com/Lightning-AI/lightning/pull/14919))
 - Fixed wandb `save_dir` is overridden by `None` `dir` when using CLI ([#14878](https://github.com/Lightning-AI/lightning/pull/14878))
 - Fixed a missing call to `LightningDataModule.load_state_dict` hook while restoring checkpoint using `LightningDataModule.load_from_checkpoint` ([#14883](https://github.com/Lightning-AI/lightning/pull/14883))
@@ -180,6 +179,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Do not update on-plateau schedulers when reloading from an end-of-epoch checkpoint ([#14702](https://github.com/Lightning-AI/lightning/pull/14702))
 - Fixed `Trainer` support for PyTorch built without distributed support ([#14971](https://github.com/Lightning-AI/lightning/pull/14971))
 - Fixed batch normalization statistics calculation in `StochasticWeightAveraging` callback ([#14866](https://github.com/Lightning-AI/lightning/pull/14866))
+- Avoided initializing optimizers during deepspeed inference ([#14944](https://github.com/Lightning-AI/lightning/pull/14944))

 ## [1.7.7] - 2022-09-22
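Taken together, the series means an evaluation-only run no longer needs `configure_optimizers` to be exercised at all, which is also why the touched tests now call `trainer.test()` before `trainer.fit()`. A hedged user-facing sketch of that ordering; it assumes a multi-GPU machine with DeepSpeed installed, and uses the `BoringModel` helper from Lightning's demos module:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel

model = BoringModel()
trainer = Trainer(
    accelerator="gpu",
    devices=2,
    strategy="deepspeed_stage_3",
    fast_dev_run=True,
)

# With this series applied, evaluation can run before (or entirely without)
# training: the inference engine is created with optimizer=None.
trainer.test(model)  # no optimizer is initialized here
trainer.fit(model)   # optimizers are created only for training
```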