From 477baaaf47d62a01b96fd712616f611f886d8513 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Nov 2022 15:50:13 +0000 Subject: [PATCH 01/22] Fix issues with PL 1.8 Signed-off-by: SeanNaren --- nemo/collections/nlp/parts/nlp_overrides.py | 4 ++-- nemo/collections/tts/models/fastpitch.py | 11 +++++------ nemo/collections/tts/models/radtts.py | 11 +++++------ nemo/collections/tts/models/tacotron2.py | 11 +++++------ nemo/collections/tts/models/waveglow.py | 11 +++++------ nemo/utils/exp_manager.py | 22 --------------------- requirements/requirements_lightning.txt | 2 +- tests/core/test_optimizers_schedulers.py | 3 ++- tests/core_ptl/check_for_ranks.py | 2 +- tests/core_ptl/test_ptl_stateless_timer.py | 1 - 10 files changed, 26 insertions(+), 52 deletions(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 2e52be81ce34..c31be8a9109e 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -22,16 +22,16 @@ import pytorch_lightning as pl import torch +from lightning_lite.plugins import ClusterEnvironment +from lightning_lite.utilities.types import _PATH from omegaconf import OmegaConf from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.fetching import DataFetcher -from pytorch_lightning.utilities.types import _PATH from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook from torch.nn.parallel import DistributedDataParallel diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 880004a47d1d..a173dda87a4e 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -19,7 +19,7 @@ from hydra.utils import instantiate from omegaconf import DictConfig, OmegaConf, open_dict from pytorch_lightning import Trainer -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.common.parts.preprocessing import parsers from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy, plot_spectrogram_to_numpy, process_batch @@ -228,11 +228,10 @@ def tb_logger(self): if self.logger is None and self.logger.experiment is None: return None tb_logger = self.logger.experiment - if isinstance(self.logger, LoggerCollection): - for logger in self.logger: - if isinstance(logger, TensorBoardLogger): - tb_logger = logger.experiment - break + for logger in self.trainer.loggers: + if isinstance(logger, TensorBoardLogger): + tb_logger = logger.experiment + break self._tb_logger = tb_logger return self._tb_logger diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index 47251b4a3f61..c94467491e19 100644 --- a/nemo/collections/tts/models/radtts.py +++ b/nemo/collections/tts/models/radtts.py @@ -21,7 +21,7 @@ from hydra.utils import instantiate from omegaconf import DictConfig, OmegaConf from pytorch_lightning import Trainer -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy @@ -389,11 +389,10 @@ def tb_logger(self): if self.logger is None and self.logger.experiment is None: return None tb_logger = self.logger.experiment - if isinstance(self.logger, LoggerCollection): - for logger in self.logger: - if isinstance(logger, TensorBoardLogger): - tb_logger = logger.experiment - break + for logger in self.trainer.loggers: + if isinstance(logger, TensorBoardLogger): + tb_logger = logger.experiment + break self._tb_logger = tb_logger return self._tb_logger diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index d5e7747d4cbc..bbcc7d48af79 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -20,7 +20,7 @@ from hydra.utils import instantiate from omegaconf import MISSING, DictConfig, OmegaConf, open_dict from omegaconf.errors import ConfigAttributeError -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger, WandbLogger +from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from torch import nn from nemo.collections.common.parts.preprocessing import parsers @@ -284,11 +284,10 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): if self.logger is not None and self.logger.experiment is not None: logger = self.logger.experiment - if isinstance(self.logger, LoggerCollection): - for logger in self.logger: - if isinstance(logger, TensorBoardLogger): - logger = logger.experiment - break + for logger in self.trainer.loggers: + if isinstance(logger, TensorBoardLogger): + logger = logger.experiment + break if isinstance(logger, TensorBoardLogger): tacotron2_log_to_tb_func( logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False, diff --git a/nemo/collections/tts/models/waveglow.py b/nemo/collections/tts/models/waveglow.py index a1a522a44c0c..d54b77b61721 100644 --- a/nemo/collections/tts/models/waveglow.py +++ b/nemo/collections/tts/models/waveglow.py @@ -16,7 +16,7 @@ import torch from hydra.utils import instantiate from omegaconf import DictConfig, open_dict -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.tts.helpers.helpers import OperationMode, waveglow_log_to_tb_func from nemo.collections.tts.losses.waveglowloss import WaveGlowLoss @@ -124,11 +124,10 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): if self.logger is not None and self.logger.experiment is not None: tb_logger = self.logger.experiment - if isinstance(self.logger, LoggerCollection): - for logger in self.logger: - if isinstance(logger, TensorBoardLogger): - tb_logger = logger.experiment - break + for logger in self.trainer.loggers: + if isinstance(logger, TensorBoardLogger): + tb_logger = logger.experiment + break waveglow_log_to_tb_func( tb_logger, outputs[0].values(), diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 4e15943b5e2e..18a3b1f6ae94 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -32,7 +32,6 @@ from omegaconf import DictConfig, OmegaConf, open_dict from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.callbacks.timer import Interval, Timer -from pytorch_lightning.loggers import LoggerCollection as _LoggerCollection from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from pytorch_lightning.loops import TrainingEpochLoop from pytorch_lightning.strategies.ddp import DDPStrategy @@ -658,24 +657,6 @@ def get_git_diff(): return "{}\n".format(err.output.decode("utf-8")) -class LoggerList(_LoggerCollection): - """ A thin wrapper on Lightning's LoggerCollection such that name and version are better aligned with exp_manager - """ - - def __init__(self, _logger_iterable, nemo_name=None, nemo_version=""): - super().__init__(_logger_iterable) - self._nemo_name = nemo_name - self._nemo_version = nemo_version - - @property - def name(self) -> str: - return self._nemo_name - - @property - def version(self) -> str: - return self._nemo_version - - def configure_loggers( trainer: 'pytorch_lightning.Trainer', exp_dir: [Path, str], @@ -718,9 +699,6 @@ def configure_loggers( logger_list.append(wandb_logger) logging.info("WandBLogger has been set up") - logger_list = ( - LoggerList(logger_list, nemo_name=name, nemo_version=version) if len(logger_list) > 1 else logger_list[0] - ) trainer._logger_connector.configure_logger(logger_list) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 8610e3f992f0..b4da4c37a4dc 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,6 +1,6 @@ hydra-core>=1.2.0,<1.3 omegaconf>=2.2,<2.3 -pytorch-lightning>=1.7.0,<1.8 +pytorch-lightning>=1.8.0 pyyaml<6 # Pinned until omegaconf works with pyyaml>=6 torchmetrics>=0.4.1rc0 transformers>=4.0.1,<=4.21.2 diff --git a/tests/core/test_optimizers_schedulers.py b/tests/core/test_optimizers_schedulers.py index 1e3bf2896c99..6c822cf6add0 100644 --- a/tests/core/test_optimizers_schedulers.py +++ b/tests/core/test_optimizers_schedulers.py @@ -20,6 +20,7 @@ import pytorch_lightning as pl import torch import torch.optim +from pytorch_lightning.utilities import rank_zero_only from nemo.core import config, optim from nemo.core.optim.lr_scheduler import AVAILABLE_SCHEDULERS @@ -85,7 +86,7 @@ def configure_optimizers(self): class Callback(pl.callbacks.Callback): - @pl.utilities.distributed.rank_zero_only + @rank_zero_only def on_train_end(self, trainer, module): count = module.my_opt.param_groups[0]['count'] if trainer.global_step != count or trainer.global_step != module.max_steps: diff --git a/tests/core_ptl/check_for_ranks.py b/tests/core_ptl/check_for_ranks.py index 4ae967593bc1..d8f785957131 100644 --- a/tests/core_ptl/check_for_ranks.py +++ b/tests/core_ptl/check_for_ranks.py @@ -18,7 +18,7 @@ import torch from omegaconf import OmegaConf from pytorch_lightning import Trainer -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities import rank_zero_only from nemo.core import ModelPT from nemo.utils import logging diff --git a/tests/core_ptl/test_ptl_stateless_timer.py b/tests/core_ptl/test_ptl_stateless_timer.py index f63f56bdf446..c20cac4fecf0 100644 --- a/tests/core_ptl/test_ptl_stateless_timer.py +++ b/tests/core_ptl/test_ptl_stateless_timer.py @@ -19,7 +19,6 @@ import torch from omegaconf import OmegaConf from pytorch_lightning import Trainer -from pytorch_lightning.utilities.distributed import rank_zero_only from nemo.core import ModelPT from nemo.utils import logging From c6eb78c04d0815a87605dd6914ee86ef91acd990 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Nov 2022 16:19:09 +0000 Subject: [PATCH 02/22] Set scripting variable Signed-off-by: SeanNaren --- nemo/core/classes/exportable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py index 5a9ab55a4ee7..f31a92f2a411 100644 --- a/nemo/core/classes/exportable.py +++ b/nemo/core/classes/exportable.py @@ -16,6 +16,7 @@ from typing import List, Union import torch +from pytorch_lightning.core.module import _jit_is_scripting from torch.onnx import TrainingMode from nemo.core.classes import typecheck @@ -128,7 +129,7 @@ def _export( # Set module mode with torch.onnx.select_model_mode_for_export( self, training - ), torch.inference_mode(), torch.jit.optimized_execution(True): + ), torch.inference_mode(), torch.jit.optimized_execution(True), _jit_is_scripting(): if input_example is None: input_example = self.input_module.input_example() From 4cc63013ea5a2606898d7057adb1140d503eb047 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Nov 2022 16:50:02 +0000 Subject: [PATCH 03/22] Fix missing arg Signed-off-by: SeanNaren --- nemo/core/config/pytorch_lightning.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/core/config/pytorch_lightning.py b/nemo/core/config/pytorch_lightning.py index 46a294bd1cca..70f3d6cad0a9 100644 --- a/nemo/core/config/pytorch_lightning.py +++ b/nemo/core/config/pytorch_lightning.py @@ -84,6 +84,7 @@ class TrainerConfig: strategy: Any = None enable_checkpointing: bool = False enable_model_summary: bool = True + inference_mode: bool = True # Register the trainer config. From 25e0f77c4a379ce904d18553ba7b4fb1b3d6f6a1 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Nov 2022 16:57:09 +0000 Subject: [PATCH 04/22] Cleanup list Signed-off-by: SeanNaren --- tests/core/test_config_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/core/test_config_utils.py b/tests/core/test_config_utils.py index a1922a3c6f86..4eccd90afada 100644 --- a/tests/core/test_config_utils.py +++ b/tests/core/test_config_utils.py @@ -116,11 +116,8 @@ class DummyDataClass: @pytest.mark.unit def test_ptl_config(self): PTL_DEPRECATED = [ - 'distributed_backend', - 'automatic_optimization', 'gpus', 'num_processes', - 'weights_save_path', ] result = config_utils.assert_dataclass_signature_match(ptl.Trainer, TrainerConfig, ignore_args=PTL_DEPRECATED) From 60db72b02f7a0b592ab0a7c53b8d8a799c7ab0da Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Nov 2022 10:44:02 +0000 Subject: [PATCH 05/22] Fix reference Signed-off-by: SeanNaren --- tests/core/test_optimizers_schedulers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/core/test_optimizers_schedulers.py b/tests/core/test_optimizers_schedulers.py index 6c822cf6add0..968bbea11d7a 100644 --- a/tests/core/test_optimizers_schedulers.py +++ b/tests/core/test_optimizers_schedulers.py @@ -111,13 +111,13 @@ class SchedulerNoOpCallback(Callback): def on_train_batch_end(self, trainer: pl.Trainer, pl_module, outputs, batch, batch_idx): # pl_module.max_steps is "original" max steps without trainer extra steps. if (trainer.global_step + 1) % 3 == 0 and (trainer.global_step + 1) < pl_module.max_steps: - schedulers = trainer.lr_schedulers + schedulers = trainer.lr_scheduler_configs for scheduler in schedulers: # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up # as well as update the optimizer lr in all param groups - scheduler['scheduler'].last_epoch -= 2 - scheduler['scheduler'].step() + scheduler.scheduler.last_epoch -= 2 + scheduler.scheduler.step() # Increase the max step count by 1 trainer.fit_loop.max_steps = trainer.fit_loop.max_steps + 1 From 6ef98e776500d57e22f8fa5ea86d5eeed8e6a340 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Nov 2022 12:03:42 +0000 Subject: [PATCH 06/22] Try to fix hanging EMA test Signed-off-by: SeanNaren --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index c7d16ec5560e..3ad689d46eff 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -225,6 +225,7 @@ pipeline { stage('Speech to Text EMA') { steps { sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ + --config-path="../conf/" --config-name="config" model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ trainer.devices=2 \ From e25704ca8fc581a71dac07d5a1bb12b930d23aea Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Nov 2022 14:54:16 +0000 Subject: [PATCH 07/22] Missing \ Signed-off-by: SeanNaren --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3ad689d46eff..add629fcb288 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -225,7 +225,7 @@ pipeline { stage('Speech to Text EMA') { steps { sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - --config-path="../conf/" --config-name="config" + --config-path="../conf/" --config-name="config" \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ trainer.devices=2 \ From 4da05ebfe32fd5a9163ed5735534e8a7dad0ef24 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Nov 2022 15:33:37 +0000 Subject: [PATCH 08/22] Add strategy Signed-off-by: SeanNaren --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index add629fcb288..54584022d287 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -229,6 +229,7 @@ pipeline { model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ trainer.devices=2 \ + trainer.strategy="ddp" \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ +exp_manager.ema.enable=True \ From 4e6913cc3acfe4bf60895cc4cd1d5c00756ac2f1 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Nov 2022 16:38:33 +0000 Subject: [PATCH 09/22] See if setting the chdir fixes the hanging DDP test Signed-off-by: SeanNaren --- nemo/core/config/hydra_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/core/config/hydra_runner.py b/nemo/core/config/hydra_runner.py index 41d4557d6f36..de65e6b968df 100644 --- a/nemo/core/config/hydra_runner.py +++ b/nemo/core/config/hydra_runner.py @@ -67,6 +67,9 @@ def wrapper(cfg_passthrough: Optional[DictConfig] = None) -> Any: # Set run.dir ONLY for ExpManager "compatibility" - to be removed. overrides.append("hydra.run.dir=.") + # Set working directory to the job's output directory + overrides.append("hydra.job.chdir=True") + # Check if user set the schema. if schema is not None: # Create config store. From 07d429210aa100b0bec626363d99aa22185e071c Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 10 Nov 2022 11:49:38 +0000 Subject: [PATCH 10/22] See if removing the subdir setter fixes the issue Signed-off-by: SeanNaren --- nemo/core/config/hydra_runner.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nemo/core/config/hydra_runner.py b/nemo/core/config/hydra_runner.py index de65e6b968df..16376507a774 100644 --- a/nemo/core/config/hydra_runner.py +++ b/nemo/core/config/hydra_runner.py @@ -57,9 +57,6 @@ def wrapper(cfg_passthrough: Optional[DictConfig] = None) -> Any: # Get overriding args in dot string format overrides = parsed_args.overrides # type: list - # Disable the creation of .hydra subdir - # https://hydra.cc/docs/tutorials/basic/running_your_app/working_directory - overrides.append("hydra.output_subdir=null") # Hydra logging outputs only to stdout (no log file). # https://hydra.cc/docs/configure_hydra/logging overrides.append("hydra/job_logging=stdout") @@ -67,9 +64,6 @@ def wrapper(cfg_passthrough: Optional[DictConfig] = None) -> Any: # Set run.dir ONLY for ExpManager "compatibility" - to be removed. overrides.append("hydra.run.dir=.") - # Set working directory to the job's output directory - overrides.append("hydra.job.chdir=True") - # Check if user set the schema. if schema is not None: # Create config store. From 15087e515b7b148fa56c6d0871d3f6e7fb86212b Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 10 Nov 2022 12:09:13 +0000 Subject: [PATCH 11/22] Remove checks Signed-off-by: SeanNaren --- tests/hydra/test_hydra_runner.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/hydra/test_hydra_runner.py b/tests/hydra/test_hydra_runner.py index 1da0a914cfaf..803d6aa6af8a 100644 --- a/tests/hydra/test_hydra_runner.py +++ b/tests/hydra/test_hydra_runner.py @@ -41,8 +41,6 @@ def test_config1(self): # Run the call as subprocess. subprocess.check_call(call, shell=True, stdout=sys.stdout, stderr=sys.stdout) - # Make sure that .hydra dir is not present. - assert not path.exists(f".hydra") # Make sure that default hydra log file is not present. assert not path.exists(f"my_app.log") @@ -67,8 +65,6 @@ def test_config2(self): # Run the call as subprocess. subprocess.check_call(call, shell=True, stdout=sys.stdout, stderr=sys.stdout) - # Make sure that .hydra dir is not present. - assert not path.exists(f".hydra") # Make sure that default hydra log file is not present. assert not path.exists(f"my_app.log") From 4a01cec27a7ec1063a43a0ce1ca0a9fc50aba5c0 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 10 Nov 2022 12:55:50 +0000 Subject: [PATCH 12/22] Try [0,1] for devices Signed-off-by: SeanNaren --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2239b5cb698c..b677855d003d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -228,7 +228,7 @@ pipeline { --config-path="../conf/" --config-name="config" \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ + trainer.devices=[0,1] \ trainer.strategy="ddp" \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ From b1add4039ec559bb1f9264cf7b7eabe074d9162a Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 15 Nov 2022 13:08:19 +0000 Subject: [PATCH 13/22] Add code back Signed-off-by: SeanNaren --- nemo/core/config/hydra_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nemo/core/config/hydra_runner.py b/nemo/core/config/hydra_runner.py index daa11643098d..4f61b9275e15 100644 --- a/nemo/core/config/hydra_runner.py +++ b/nemo/core/config/hydra_runner.py @@ -58,6 +58,10 @@ def wrapper(cfg_passthrough: Optional[DictConfig] = None) -> Any: # Get overriding args in dot string format overrides = parsed_args.overrides # type: list + # Disable the creation of .hydra subdir + # https://hydra.cc/docs/tutorials/basic/running_your_app/working_directory + overrides.append("hydra.output_subdir=null") + # Hydra logging outputs only to stdout (no log file). # https://hydra.cc/docs/configure_hydra/logging overrides.append("hydra/job_logging=stdout") From 8c7d7e00c7b66abe09859d811b0870c680823e5f Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 21 Nov 2022 16:56:46 +0000 Subject: [PATCH 14/22] Remove space Signed-off-by: SeanNaren --- nemo/core/config/hydra_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/core/config/hydra_runner.py b/nemo/core/config/hydra_runner.py index 4f61b9275e15..6c6c9b47e0fd 100644 --- a/nemo/core/config/hydra_runner.py +++ b/nemo/core/config/hydra_runner.py @@ -61,7 +61,6 @@ def wrapper(cfg_passthrough: Optional[DictConfig] = None) -> Any: # Disable the creation of .hydra subdir # https://hydra.cc/docs/tutorials/basic/running_your_app/working_directory overrides.append("hydra.output_subdir=null") - # Hydra logging outputs only to stdout (no log file). # https://hydra.cc/docs/configure_hydra/logging overrides.append("hydra/job_logging=stdout") From 9e8ab6b00d1c3c20f37d0e05ae63b3d7b8082d29 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 23 Nov 2022 11:10:58 +0000 Subject: [PATCH 15/22] Update requirements Signed-off-by: SeanNaren --- requirements/requirements_lightning.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index b4da4c37a4dc..dcde2c6bd810 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,6 +1,6 @@ hydra-core>=1.2.0,<1.3 omegaconf>=2.2,<2.3 -pytorch-lightning>=1.8.0 +pytorch-lightning>=1.8.3 pyyaml<6 # Pinned until omegaconf works with pyyaml>=6 torchmetrics>=0.4.1rc0 transformers>=4.0.1,<=4.21.2 From b33501d0f3f594e34c11482fc0af76b6e23f80e7 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 23 Nov 2022 13:47:10 +0000 Subject: [PATCH 16/22] Swap import path Signed-off-by: SeanNaren --- examples/nlp/language_modeling/megatron_bart_pretraining.py | 2 +- examples/nlp/language_modeling/megatron_bert_pretraining.py | 2 +- examples/nlp/language_modeling/megatron_ckpt_to_nemo.py | 2 +- examples/nlp/language_modeling/megatron_gpt_pretraining.py | 2 +- examples/nlp/language_modeling/megatron_gpt_prompt_learning.py | 2 +- examples/nlp/language_modeling/megatron_retro_cal_shape.py | 2 +- .../nlp/language_modeling/megatron_retro_mutransfer_pretrain.py | 2 +- examples/nlp/language_modeling/megatron_retro_pretraining.py | 2 +- .../nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py | 2 +- examples/nlp/language_modeling/megatron_t5_pretraining.py | 2 +- examples/nlp/language_modeling/megatron_t5_prompt_learning.py | 2 +- examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py | 2 +- examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py | 2 +- .../nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py | 2 +- .../nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py | 2 +- .../nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py | 2 +- examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py | 2 +- examples/nlp/machine_translation/megatron_nmt_training.py | 2 +- tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb | 2 +- 19 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_bart_pretraining.py b/examples/nlp/language_modeling/megatron_bart_pretraining.py index 9a7300656f99..b08772c24348 100644 --- a/examples/nlp/language_modeling/megatron_bart_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bart_pretraining.py @@ -13,11 +13,11 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index b5c26259f711..e21a29a6f77a 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index 2e4987f2e18c..a0eec29f38fb 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -29,7 +29,7 @@ import torch from apex.transformer import parallel_state -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment +from lightning_lite.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 811d8833a99e..0563cdf703b1 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py index ddd6b8eb8d97..1d0debb924f1 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( MegatronGPTPromptLearningModel, diff --git a/examples/nlp/language_modeling/megatron_retro_cal_shape.py b/examples/nlp/language_modeling/megatron_retro_cal_shape.py index 06bec216e925..7e8cf1dca755 100644 --- a/examples/nlp/language_modeling/megatron_retro_cal_shape.py +++ b/examples/nlp/language_modeling/megatron_retro_cal_shape.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel diff --git a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py index 80804a2602e5..d755da52fe2f 100644 --- a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py +++ b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining.py b/examples/nlp/language_modeling/megatron_retro_pretraining.py index 374bb938583e..f9bde24ca1ba 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector diff --git a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py index 063147d66abb..3550d5e2918c 100644 --- a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py @@ -13,11 +13,11 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index 4f044cb3c34d..018cdeae4c24 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -13,11 +13,11 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model diff --git a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py index e91c7c178c94..68c9f2cf5b30 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_t5_prompt_learning_model import ( MegatronT5PromptLearningModel, diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index e78d34adee65..2d1e104660a2 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 84b78739f673..84dec0fac387 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -15,10 +15,10 @@ import os import tempfile +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py index 325d9767e167..aeabe18b3d9a 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTAdapterLearningModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py index db1b8ef723d5..8103be100b10 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTInfusedAdapterModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py b/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py index 5f3cae47024c..50e126e0de52 100644 --- a/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5AdapterLearningModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py b/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py index 5a72b7829bfe..6230231638c7 100644 --- a/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py @@ -13,10 +13,10 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5InfusedAdapterModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/machine_translation/megatron_nmt_training.py b/examples/nlp/machine_translation/megatron_nmt_training.py index 0bd349225fb2..9299996efc24 100644 --- a/examples/nlp/machine_translation/megatron_nmt_training.py +++ b/examples/nlp/machine_translation/megatron_nmt_training.py @@ -13,11 +13,11 @@ # limitations under the License. +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb index b03316bfce02..512a38bc90cc 100644 --- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb +++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb @@ -826,7 +826,7 @@ "import torch\n", "import pytorch_lightning as pl\n", "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n", - "from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment\n", + "from lightning_lite.plugins.environments import TorchElasticEnvironment\n", "\n", "# lets modify some trainer configs\n", "# checks if we have GPU available and uses it\n", From 580654cfaf279d1050b9d6cf7d74f140423debea Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 23 Nov 2022 15:09:47 +0000 Subject: [PATCH 17/22] Update references Signed-off-by: SeanNaren --- .../models/language_modeling/megatron_base_model.py | 10 +++++----- .../language_modeling/megatron_retrieval_model.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 9d09020fc137..7119e47acb98 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -302,16 +302,16 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[in # If the grad scaler skipped its optimizer step due to infs/nans, # decrement the step of all schedulers. if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True: - schedulers = self.trainer.lr_schedulers + scheduler_cfgs = self.trainer.lr_scheduler_configs - if not schedulers or not self.trainer.lightning_module.automatic_optimization: + if not scheduler_cfgs or not self.trainer.lightning_module.automatic_optimization: return - for scheduler in schedulers: + for scheduler_cfg in scheduler_cfgs: # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up # as well as update the optimizer lr in all param groups - scheduler['scheduler'].last_epoch -= 2 - scheduler['scheduler'].step() + scheduler_cfg.scheduler.last_epoch -= 2 + scheduler_cfg.scheduler.step() # Removing the line below because it messes up train_valid_test_num_samples calculation. # self.trainer.fit_loop.max_steps = self.trainer.fit_loop.max_steps + 1 diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index 97efa254459a..66e43458d20e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -309,16 +309,16 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[in # If the grad scaler skipped its optimizer step due to infs/nans, # decrement the step of all schedulers. if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True: - schedulers = self.trainer.lr_schedulers + scheduler_cfgs = self.trainer.lr_scheduler_configs - if not schedulers or not self.trainer.lightning_module.automatic_optimization: + if not scheduler_cfgs or not self.trainer.lightning_module.automatic_optimization: return - for scheduler in schedulers: + for scheduler_cfg in scheduler_cfgs: # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up # as well as update the optimizer lr in all param groups - scheduler['scheduler'].last_epoch -= 2 - scheduler['scheduler'].step() + scheduler_cfg.scheduler.last_epoch -= 2 + scheduler_cfg.scheduler.step() # Increase the max step count by 1 From ed8df6f317901be00fae6b66f6ea52fb984f3291 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 23 Nov 2022 16:00:04 +0000 Subject: [PATCH 18/22] Fix deprecated variables Signed-off-by: SeanNaren --- nemo_text_processing/g2p/models/ctc_g2p.py | 2 +- nemo_text_processing/g2p/models/t5_g2p.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/g2p/models/ctc_g2p.py b/nemo_text_processing/g2p/models/ctc_g2p.py index 7f77ed6595e5..a456942c1ad8 100644 --- a/nemo_text_processing/g2p/models/ctc_g2p.py +++ b/nemo_text_processing/g2p/models/ctc_g2p.py @@ -69,7 +69,7 @@ class CTCG2PModel(G2PModel, ASRBPEMixin): def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.world_size = 1 if trainer is not None: - self.world_size = trainer.num_nodes * trainer.num_gpus + self.world_size = trainer.num_nodes * trainer.num_devices self.mode = cfg.model_name.lower() diff --git a/nemo_text_processing/g2p/models/t5_g2p.py b/nemo_text_processing/g2p/models/t5_g2p.py index 437b9bf57475..7ed1c917a880 100644 --- a/nemo_text_processing/g2p/models/t5_g2p.py +++ b/nemo_text_processing/g2p/models/t5_g2p.py @@ -58,7 +58,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.world_size = 1 if trainer is not None: - self.world_size = trainer.num_nodes * trainer.num_gpus + self.world_size = trainer.num_nodes * trainer.num_devices # Load appropriate tokenizer from HuggingFace self.model_name = cfg.model_name From 12d7ea21f552a68a5b5d4cdf33bf4ae889252e4a Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 23 Nov 2022 16:38:40 +0000 Subject: [PATCH 19/22] Fix missing var Signed-off-by: SeanNaren --- .../nlp/models/dialogue/dialogue_gpt_classification_model.py | 2 +- .../nlp/models/dialogue/dialogue_gpt_generation_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py index 9608a0320bd6..223b9238bb8a 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py @@ -710,7 +710,7 @@ def prepare_data(self): self.data_prepared = True def setup(self, stage=None): - super().setup() + super().setup(stage) if self.cfg.library == "megatron" and self.prompt_learning and stage == "fit": if self.cfg.virtual_prompt_style == VirtualPromptStyle.PROMPT_TUNING: self.language_model.init_new_prompts() diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py index 69ff6d37527e..c6c976a501a1 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py @@ -216,7 +216,7 @@ def mask_and_reduce_loss(self, loss_mask, output_tensor): return loss def setup(self, stage=None): - super().setup() + super().setup(stage) if self.cfg.library == "megatron" and self.prompt_learning: self.language_model.init_new_prompts() From 0e8e59ab31049fa3e88f15dce51d7aaabd31ed44 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 23 Nov 2022 18:25:13 +0000 Subject: [PATCH 20/22] Fix var Signed-off-by: SeanNaren --- nemo/collections/nlp/models/dialogue/sgdqa_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py index c8b6c468b988..2dd4011d74ac 100644 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ b/nemo/collections/nlp/models/dialogue/sgdqa_model.py @@ -226,7 +226,7 @@ def eval_step_helper(self, batch: List[torch.Tensor]): all_start_char_idx = [] all_end_char_idx = [] - if self.trainer.devices and self.trainer.world_size > 1: + if self.trainer.num_devices and self.trainer.world_size > 1: world_size = self.trainer.world_size for ind in range(world_size): all_example_id_num.append(torch.empty_like(example_id_num)) From d672304e6656b597aadec779da7edaf3ce89be04 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 23 Nov 2022 21:00:03 +0000 Subject: [PATCH 21/22] Revert changes Signed-off-by: SeanNaren --- Jenkinsfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 36c23ecc9c08..2003e468e6a9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -225,11 +225,9 @@ pipeline { stage('Speech to Text EMA') { steps { sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - --config-path="../conf/" --config-name="config" \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0,1] \ - trainer.strategy="ddp" \ + trainer.devices=2 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ +exp_manager.ema.enable=True \ From d6143783c0f134d219337b6e2f5e43359cd9aa10 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 24 Nov 2022 10:31:35 +0000 Subject: [PATCH 22/22] Address review Signed-off-by: SeanNaren --- nemo/core/classes/exportable.py | 1 - tests/hydra/test_hydra_runner.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py index 53b47cf48bad..50266dab3dbe 100644 --- a/nemo/core/classes/exportable.py +++ b/nemo/core/classes/exportable.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os from abc import ABC from typing import List, Union diff --git a/tests/hydra/test_hydra_runner.py b/tests/hydra/test_hydra_runner.py index 803d6aa6af8a..1da0a914cfaf 100644 --- a/tests/hydra/test_hydra_runner.py +++ b/tests/hydra/test_hydra_runner.py @@ -41,6 +41,8 @@ def test_config1(self): # Run the call as subprocess. subprocess.check_call(call, shell=True, stdout=sys.stdout, stderr=sys.stdout) + # Make sure that .hydra dir is not present. + assert not path.exists(f".hydra") # Make sure that default hydra log file is not present. assert not path.exists(f"my_app.log") @@ -65,6 +67,8 @@ def test_config2(self): # Run the call as subprocess. subprocess.check_call(call, shell=True, stdout=sys.stdout, stderr=sys.stdout) + # Make sure that .hydra dir is not present. + assert not path.exists(f".hydra") # Make sure that default hydra log file is not present. assert not path.exists(f"my_app.log")