From ad3dd9fecf0b9d2b37b01db83c12afdc8a234b8c Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Mon, 28 Nov 2022 21:30:53 +0000
Subject: [PATCH] Fix issues with PL 1.8 (#5353)

* Fix issues with PL 1.8
Signed-off-by: SeanNaren

* Set scripting variable
Signed-off-by: SeanNaren

* Fix missing arg
Signed-off-by: SeanNaren

* Cleanup list
Signed-off-by: SeanNaren

* Fix reference
Signed-off-by: SeanNaren

* Try to fix hanging EMA test
Signed-off-by: SeanNaren

* Missing \
Signed-off-by: SeanNaren

* Add strategy
Signed-off-by: SeanNaren

* See if setting the chdir fixes the hanging DDP test
Signed-off-by: SeanNaren

* See if removing the subdir setter fixes the issue
Signed-off-by: SeanNaren

* Remove checks
Signed-off-by: SeanNaren

* Try [0,1] for devices
Signed-off-by: SeanNaren

* Add code back
Signed-off-by: SeanNaren

* Remove space
Signed-off-by: SeanNaren

* Update requirements
Signed-off-by: SeanNaren

* Swap import path
Signed-off-by: SeanNaren

* Update references
Signed-off-by: SeanNaren

* Fix deprecated variables
Signed-off-by: SeanNaren

* Fix missing var
Signed-off-by: SeanNaren

* Fix var
Signed-off-by: SeanNaren

* Revert changes
Signed-off-by: SeanNaren

* Address review
Signed-off-by: SeanNaren

Signed-off-by: SeanNaren
Co-authored-by: Oleksii Kuchaiev
Signed-off-by: Hainan Xu
---
 .../megatron_bart_pretraining.py | 2 +-
 .../megatron_bert_pretraining.py | 2 +-
 .../megatron_ckpt_to_nemo.py | 2 +-
 .../megatron_gpt_pretraining.py | 2 +-
 .../megatron_gpt_prompt_learning.py | 2 +-
 .../megatron_retro_cal_shape.py | 2 +-
 .../megatron_retro_mutransfer_pretrain.py | 2 +-
 .../megatron_retro_pretraining.py | 2 +-
 .../megatron_t5_lm_adaptation_finetune.py | 2 +-
 .../megatron_t5_pretraining.py | 2 +-
 .../megatron_t5_prompt_learning.py | 2 +-
 .../megatron_t5_seq2seq_eval.py | 2 +-
 .../megatron_t5_seq2seq_finetune.py | 2 +-
 .../tuning/megatron_gpt_adapter_tuning.py | 2 +-
 .../tuning/megatron_gpt_ia3_tuning.py | 2 +-
 .../tuning/megatron_t5_adapter_tuning.py | 2 +-
 .../tuning/megatron_t5_ia3_tuning.py | 2 +-
 .../megatron_nmt_training.py | 2 +-
 .../dialogue_gpt_classification_model.py | 2 +-
 .../dialogue/dialogue_gpt_generation_model.py | 2 +-
 .../nlp/models/dialogue/sgdqa_model.py | 2 +-
 .../language_modeling/megatron_base_model.py | 10 ++++-----
 .../megatron_retrieval_model.py | 10 ++++-----
 nemo/collections/nlp/parts/nlp_overrides.py | 4 ++--
 nemo/collections/tts/models/fastpitch.py | 11 +++++-----
 nemo/collections/tts/models/radtts.py | 11 +++++-----
 nemo/collections/tts/models/tacotron2.py | 11 +++++-----
 nemo/collections/tts/models/waveglow.py | 11 +++++-----
 nemo/core/classes/exportable.py | 4 ++--
 nemo/core/config/pytorch_lightning.py | 1 +
 nemo/utils/exp_manager.py | 22 -------------------
 nemo_text_processing/g2p/models/ctc_g2p.py | 2 +-
 nemo_text_processing/g2p/models/t5_g2p.py | 2 +-
 requirements/requirements_lightning.txt | 2 +-
 tests/core/test_config_utils.py | 3 ---
 tests/core/test_optimizers_schedulers.py | 9 ++++----
 tests/core_ptl/check_for_ranks.py | 2 +-
 tests/core_ptl/test_ptl_stateless_timer.py | 1 -
 .../nlp/Multitask_Prompt_and_PTuning.ipynb | 2 +-
 39 files changed, 66 insertions(+), 94 deletions(-)

diff --git a/examples/nlp/language_modeling/megatron_bart_pretraining.py b/examples/nlp/language_modeling/megatron_bart_pretraining.py
index 9a7300656f99..b08772c24348 100644
--- a/examples/nlp/language_modeling/megatron_bart_pretraining.py
+++ b/examples/nlp/language_modeling/megatron_bart_pretraining.py
@@ -13,11 +13,11 @@
 # limitations under the License.
+from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index b5c26259f711..e21a29a6f77a 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index 2e4987f2e18c..a0eec29f38fb 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -29,7 +29,7 @@ import torch from apex.transformer import parallel_state -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment +from lightning_lite.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 811d8833a99e..0563cdf703b1 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py index ddd6b8eb8d97..1d0debb924f1 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( MegatronGPTPromptLearningModel, diff --git a/examples/nlp/language_modeling/megatron_retro_cal_shape.py b/examples/nlp/language_modeling/megatron_retro_cal_shape.py index 06bec216e925..7e8cf1dca755 100644 --- a/examples/nlp/language_modeling/megatron_retro_cal_shape.py +++ b/examples/nlp/language_modeling/megatron_retro_cal_shape.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel diff --git a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py index 80804a2602e5..d755da52fe2f 100644 --- a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py +++ b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining.py b/examples/nlp/language_modeling/megatron_retro_pretraining.py index 374bb938583e..f9bde24ca1ba 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector diff --git a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py index 063147d66abb..3550d5e2918c 100644 --- a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py @@ -13,11 +13,11 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index 4f044cb3c34d..018cdeae4c24 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -13,11 +13,11 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model diff --git a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py index e91c7c178c94..68c9f2cf5b30 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_t5_prompt_learning_model import ( MegatronT5PromptLearningModel, diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index e78d34adee65..2d1e104660a2 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 84b78739f673..84dec0fac387 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -15,10 +15,10 @@ import os import tempfile +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py index 325d9767e167..aeabe18b3d9a 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py @@ -13,10 +13,10 @@ # limitations under the License. 
+from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTAdapterLearningModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py index db1b8ef723d5..8103be100b10 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTInfusedAdapterModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py b/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py index 5f3cae47024c..50e126e0de52 100644 --- a/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5AdapterLearningModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py b/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py index 5a72b7829bfe..6230231638c7 100644 --- a/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5InfusedAdapterModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/machine_translation/megatron_nmt_training.py b/examples/nlp/machine_translation/megatron_nmt_training.py index 0bd349225fb2..9299996efc24 100644 --- a/examples/nlp/machine_translation/megatron_nmt_training.py +++ b/examples/nlp/machine_translation/megatron_nmt_training.py @@ -13,11 +13,11 @@ # limitations under the License. 
+from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py index 9608a0320bd6..223b9238bb8a 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py @@ -710,7 +710,7 @@ def prepare_data(self): self.data_prepared = True def setup(self, stage=None): - super().setup() + super().setup(stage) if self.cfg.library == "megatron" and self.prompt_learning and stage == "fit": if self.cfg.virtual_prompt_style == VirtualPromptStyle.PROMPT_TUNING: self.language_model.init_new_prompts() diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py index 69ff6d37527e..c6c976a501a1 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py @@ -216,7 +216,7 @@ def mask_and_reduce_loss(self, loss_mask, output_tensor): return loss def setup(self, stage=None): - super().setup() + super().setup(stage) if self.cfg.library == "megatron" and self.prompt_learning: self.language_model.init_new_prompts() diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py index c8b6c468b988..2dd4011d74ac 100644 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ b/nemo/collections/nlp/models/dialogue/sgdqa_model.py @@ -226,7 +226,7 @@ def eval_step_helper(self, batch: List[torch.Tensor]): all_start_char_idx = [] all_end_char_idx = [] - if self.trainer.devices and self.trainer.world_size > 1: + if self.trainer.num_devices and self.trainer.world_size > 1: world_size = self.trainer.world_size for ind in range(world_size): all_example_id_num.append(torch.empty_like(example_id_num)) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 9d09020fc137..7119e47acb98 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -302,16 +302,16 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[in # If the grad scaler skipped its optimizer step due to infs/nans, # decrement the step of all schedulers. 
if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True: - schedulers = self.trainer.lr_schedulers + scheduler_cfgs = self.trainer.lr_scheduler_configs - if not schedulers or not self.trainer.lightning_module.automatic_optimization: + if not scheduler_cfgs or not self.trainer.lightning_module.automatic_optimization: return - for scheduler in schedulers: + for scheduler_cfg in scheduler_cfgs: # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up # as well as update the optimizer lr in all param groups - scheduler['scheduler'].last_epoch -= 2 - scheduler['scheduler'].step() + scheduler_cfg.scheduler.last_epoch -= 2 + scheduler_cfg.scheduler.step() # Removing the line below because it messes up train_valid_test_num_samples calculation. # self.trainer.fit_loop.max_steps = self.trainer.fit_loop.max_steps + 1 diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index 97efa254459a..66e43458d20e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -309,16 +309,16 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[in # If the grad scaler skipped its optimizer step due to infs/nans, # decrement the step of all schedulers. if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True: - schedulers = self.trainer.lr_schedulers + scheduler_cfgs = self.trainer.lr_scheduler_configs - if not schedulers or not self.trainer.lightning_module.automatic_optimization: + if not scheduler_cfgs or not self.trainer.lightning_module.automatic_optimization: return - for scheduler in schedulers: + for scheduler_cfg in scheduler_cfgs: # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up # as well as update the optimizer lr in all param groups - scheduler['scheduler'].last_epoch -= 2 - scheduler['scheduler'].step() + scheduler_cfg.scheduler.last_epoch -= 2 + scheduler_cfg.scheduler.step() # Increase the max step count by 1 diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 2e52be81ce34..c31be8a9109e 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -22,16 +22,16 @@ import pytorch_lightning as pl import torch +from lightning_lite.plugins import ClusterEnvironment +from lightning_lite.utilities.types import _PATH from omegaconf import OmegaConf from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.fetching import DataFetcher -from pytorch_lightning.utilities.types import _PATH from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook from torch.nn.parallel import DistributedDataParallel diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 211e8d31fba3..fd3a4412889a 100644 
--- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -19,7 +19,7 @@ from hydra.utils import instantiate from omegaconf import DictConfig, OmegaConf, open_dict from pytorch_lightning import Trainer -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.common.parts.preprocessing import parsers from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy, plot_spectrogram_to_numpy, process_batch @@ -219,11 +219,10 @@ def tb_logger(self): if self.logger is None and self.logger.experiment is None: return None tb_logger = self.logger.experiment - if isinstance(self.logger, LoggerCollection): - for logger in self.logger: - if isinstance(logger, TensorBoardLogger): - tb_logger = logger.experiment - break + for logger in self.trainer.loggers: + if isinstance(logger, TensorBoardLogger): + tb_logger = logger.experiment + break self._tb_logger = tb_logger return self._tb_logger diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index 30b6189c484f..2f51f70bd571 100644 --- a/nemo/collections/tts/models/radtts.py +++ b/nemo/collections/tts/models/radtts.py @@ -21,7 +21,7 @@ from hydra.utils import instantiate from omegaconf import DictConfig, OmegaConf from pytorch_lightning import Trainer -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy @@ -388,11 +388,10 @@ def tb_logger(self): if self.logger is None and self.logger.experiment is None: return None tb_logger = self.logger.experiment - if isinstance(self.logger, LoggerCollection): - for logger in self.logger: - if isinstance(logger, TensorBoardLogger): - tb_logger = logger.experiment - break + for logger in self.trainer.loggers: + if isinstance(logger, TensorBoardLogger): + tb_logger = logger.experiment + break self._tb_logger = tb_logger return self._tb_logger diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index d5e7747d4cbc..bbcc7d48af79 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -20,7 +20,7 @@ from hydra.utils import instantiate from omegaconf import MISSING, DictConfig, OmegaConf, open_dict from omegaconf.errors import ConfigAttributeError -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger, WandbLogger +from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from torch import nn from nemo.collections.common.parts.preprocessing import parsers @@ -284,11 +284,10 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): if self.logger is not None and self.logger.experiment is not None: logger = self.logger.experiment - if isinstance(self.logger, LoggerCollection): - for logger in self.logger: - if isinstance(logger, TensorBoardLogger): - logger = logger.experiment - break + for logger in self.trainer.loggers: + if isinstance(logger, TensorBoardLogger): + logger = logger.experiment + break if isinstance(logger, TensorBoardLogger): tacotron2_log_to_tb_func( logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False, diff --git a/nemo/collections/tts/models/waveglow.py b/nemo/collections/tts/models/waveglow.py 
index a1a522a44c0c..d54b77b61721 100644 --- a/nemo/collections/tts/models/waveglow.py +++ b/nemo/collections/tts/models/waveglow.py @@ -16,7 +16,7 @@ import torch from hydra.utils import instantiate from omegaconf import DictConfig, open_dict -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.tts.helpers.helpers import OperationMode, waveglow_log_to_tb_func from nemo.collections.tts.losses.waveglowloss import WaveGlowLoss @@ -124,11 +124,10 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): if self.logger is not None and self.logger.experiment is not None: tb_logger = self.logger.experiment - if isinstance(self.logger, LoggerCollection): - for logger in self.logger: - if isinstance(logger, TensorBoardLogger): - tb_logger = logger.experiment - break + for logger in self.trainer.loggers: + if isinstance(logger, TensorBoardLogger): + tb_logger = logger.experiment + break waveglow_log_to_tb_func( tb_logger, outputs[0].values(), diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py index e5f7b5231600..50266dab3dbe 100644 --- a/nemo/core/classes/exportable.py +++ b/nemo/core/classes/exportable.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os from abc import ABC from typing import List, Union import torch +from pytorch_lightning.core.module import _jit_is_scripting from torch.onnx import TrainingMode from nemo.core.classes import typecheck @@ -128,7 +128,7 @@ def _export( # Set module mode with torch.onnx.select_model_mode_for_export( self, training - ), torch.inference_mode(), torch.no_grad(), torch.jit.optimized_execution(True): + ), torch.inference_mode(), torch.no_grad(), torch.jit.optimized_execution(True), _jit_is_scripting(): if input_example is None: input_example = self.input_module.input_example() diff --git a/nemo/core/config/pytorch_lightning.py b/nemo/core/config/pytorch_lightning.py index 46a294bd1cca..70f3d6cad0a9 100644 --- a/nemo/core/config/pytorch_lightning.py +++ b/nemo/core/config/pytorch_lightning.py @@ -84,6 +84,7 @@ class TrainerConfig: strategy: Any = None enable_checkpointing: bool = False enable_model_summary: bool = True + inference_mode: bool = True # Register the trainer config. 
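Note: PL 1.8 removed LoggerCollection, which is why the fastpitch, radtts, tacotron2 and waveglow hunks above all switch to iterating trainer.loggers. A minimal standalone sketch of that pattern (the helper name find_tb_logger is illustrative and not part of this patch):

    from typing import Optional

    from pytorch_lightning import Trainer
    from pytorch_lightning.loggers import TensorBoardLogger

    def find_tb_logger(trainer: Trainer) -> Optional[TensorBoardLogger]:
        # trainer.loggers is a flat list of every configured logger, so no
        # LoggerCollection isinstance check is needed on PL >= 1.8.
        for logger in trainer.loggers:
            if isinstance(logger, TensorBoardLogger):
                return logger
        return None
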
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index d8d4c7a98477..207170a61fb8 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -32,7 +32,6 @@ from omegaconf import DictConfig, OmegaConf, open_dict from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.callbacks.timer import Interval, Timer -from pytorch_lightning.loggers import LoggerCollection as _LoggerCollection from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger from pytorch_lightning.loops import TrainingEpochLoop from pytorch_lightning.strategies.ddp import DDPStrategy @@ -691,24 +690,6 @@ def get_git_diff(): return "{}\n".format(err.output.decode("utf-8")) -class LoggerList(_LoggerCollection): - """ A thin wrapper on Lightning's LoggerCollection such that name and version are better aligned with exp_manager - """ - - def __init__(self, _logger_iterable, nemo_name=None, nemo_version=""): - super().__init__(_logger_iterable) - self._nemo_name = nemo_name - self._nemo_version = nemo_version - - @property - def name(self) -> str: - return self._nemo_name - - @property - def version(self) -> str: - return self._nemo_version - - def configure_loggers( trainer: 'pytorch_lightning.Trainer', exp_dir: [Path, str], @@ -759,9 +740,6 @@ def configure_loggers( logger_list.append(mlflow_logger) logging.info('MLFlowLogger has been set up') - logger_list = ( - LoggerList(logger_list, nemo_name=name, nemo_version=version) if len(logger_list) > 1 else logger_list[0] - ) trainer._logger_connector.configure_logger(logger_list) diff --git a/nemo_text_processing/g2p/models/ctc_g2p.py b/nemo_text_processing/g2p/models/ctc_g2p.py index 7f77ed6595e5..a456942c1ad8 100644 --- a/nemo_text_processing/g2p/models/ctc_g2p.py +++ b/nemo_text_processing/g2p/models/ctc_g2p.py @@ -69,7 +69,7 @@ class CTCG2PModel(G2PModel, ASRBPEMixin): def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.world_size = 1 if trainer is not None: - self.world_size = trainer.num_nodes * trainer.num_gpus + self.world_size = trainer.num_nodes * trainer.num_devices self.mode = cfg.model_name.lower() diff --git a/nemo_text_processing/g2p/models/t5_g2p.py b/nemo_text_processing/g2p/models/t5_g2p.py index 437b9bf57475..7ed1c917a880 100644 --- a/nemo_text_processing/g2p/models/t5_g2p.py +++ b/nemo_text_processing/g2p/models/t5_g2p.py @@ -58,7 +58,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.world_size = 1 if trainer is not None: - self.world_size = trainer.num_nodes * trainer.num_gpus + self.world_size = trainer.num_nodes * trainer.num_devices # Load appropriate tokenizer from HuggingFace self.model_name = cfg.model_name diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 8610e3f992f0..dcde2c6bd810 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,6 +1,6 @@ hydra-core>=1.2.0,<1.3 omegaconf>=2.2,<2.3 -pytorch-lightning>=1.7.0,<1.8 +pytorch-lightning>=1.8.3 pyyaml<6 # Pinned until omegaconf works with pyyaml>=6 torchmetrics>=0.4.1rc0 transformers>=4.0.1,<=4.21.2 diff --git a/tests/core/test_config_utils.py b/tests/core/test_config_utils.py index a1922a3c6f86..4eccd90afada 100644 --- a/tests/core/test_config_utils.py +++ b/tests/core/test_config_utils.py @@ -116,11 +116,8 @@ class DummyDataClass: @pytest.mark.unit def test_ptl_config(self): PTL_DEPRECATED = [ - 'distributed_backend', - 
'automatic_optimization', 'gpus', 'num_processes', - 'weights_save_path', ] result = config_utils.assert_dataclass_signature_match(ptl.Trainer, TrainerConfig, ignore_args=PTL_DEPRECATED) diff --git a/tests/core/test_optimizers_schedulers.py b/tests/core/test_optimizers_schedulers.py index 2797964e3455..c5163b0a8e63 100644 --- a/tests/core/test_optimizers_schedulers.py +++ b/tests/core/test_optimizers_schedulers.py @@ -20,6 +20,7 @@ import pytorch_lightning as pl import torch import torch.optim +from pytorch_lightning.utilities import rank_zero_only from nemo.core import config, optim from nemo.core.optim.lr_scheduler import AVAILABLE_SCHEDULERS @@ -85,7 +86,7 @@ def configure_optimizers(self): class Callback(pl.callbacks.Callback): - @pl.utilities.distributed.rank_zero_only + @rank_zero_only def on_train_end(self, trainer, module): count = module.my_opt.param_groups[0]['count'] if trainer.global_step != count or trainer.global_step != module.max_steps: @@ -110,13 +111,13 @@ class SchedulerNoOpCallback(Callback): def on_train_batch_end(self, trainer: pl.Trainer, pl_module, outputs, batch, batch_idx): # pl_module.max_steps is "original" max steps without trainer extra steps. if (trainer.global_step + 1) % 3 == 0 and (trainer.global_step + 1) < pl_module.max_steps: - schedulers = trainer.lr_schedulers + schedulers = trainer.lr_scheduler_configs for scheduler in schedulers: # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up # as well as update the optimizer lr in all param groups - scheduler['scheduler'].last_epoch -= 2 - scheduler['scheduler'].step() + scheduler.scheduler.last_epoch -= 2 + scheduler.scheduler.step() # Increase the max step count by 1 trainer.fit_loop.max_steps = trainer.fit_loop.max_steps + 1 diff --git a/tests/core_ptl/check_for_ranks.py b/tests/core_ptl/check_for_ranks.py index 4ae967593bc1..d8f785957131 100644 --- a/tests/core_ptl/check_for_ranks.py +++ b/tests/core_ptl/check_for_ranks.py @@ -18,7 +18,7 @@ import torch from omegaconf import OmegaConf from pytorch_lightning import Trainer -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities import rank_zero_only from nemo.core import ModelPT from nemo.utils import logging diff --git a/tests/core_ptl/test_ptl_stateless_timer.py b/tests/core_ptl/test_ptl_stateless_timer.py index f63f56bdf446..c20cac4fecf0 100644 --- a/tests/core_ptl/test_ptl_stateless_timer.py +++ b/tests/core_ptl/test_ptl_stateless_timer.py @@ -19,7 +19,6 @@ import torch from omegaconf import OmegaConf from pytorch_lightning import Trainer -from pytorch_lightning.utilities.distributed import rank_zero_only from nemo.core import ModelPT from nemo.utils import logging diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb index b03316bfce02..512a38bc90cc 100644 --- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb +++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb @@ -826,7 +826,7 @@ "import torch\n", "import pytorch_lightning as pl\n", "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n", - "from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment\n", + "from lightning_lite.plugins.environments import TorchElasticEnvironment\n", "\n", "# lets modify some trainer configs\n", "# checks if we have GPU available and uses it\n",
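For reference, the two PL 1.8 migration patterns that recur throughout this patch, sketched as standalone code (assumes pytorch-lightning>=1.8.3, which bundles the lightning_lite package; the Trainer arguments here are illustrative only):

    # 1. Import move: TorchElasticEnvironment (and ClusterEnvironment/_PATH) now live
    #    under lightning_lite rather than pytorch_lightning.plugins/utilities.
    from lightning_lite.plugins.environments import TorchElasticEnvironment
    from pytorch_lightning import Trainer

    trainer = Trainer(accelerator="cpu", devices=1, plugins=[TorchElasticEnvironment()])

    # 2. API move: Trainer.lr_schedulers (a list of dicts) became
    #    Trainer.lr_scheduler_configs (a list of LRSchedulerConfig objects),
    #    so scheduler['scheduler'] accesses become scheduler_cfg.scheduler.
    for scheduler_cfg in trainer.lr_scheduler_configs:
        scheduler_cfg.scheduler.last_epoch -= 2
        scheduler_cfg.scheduler.step()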