Fix issues with PL 1.8 #5353

Merged: 29 commits, Nov 28, 2022
Changes shown from 26 commits

Commits (29)
477baaa  Fix issues with PL 1.8 (SeanNaren, Nov 8, 2022)
c6eb78c  Set scripting variable (SeanNaren, Nov 8, 2022)
4cc6301  Fix missing arg (SeanNaren, Nov 8, 2022)
25e0f77  Cleanup list (SeanNaren, Nov 8, 2022)
60db72b  Fix reference (SeanNaren, Nov 9, 2022)
0ed8d1e  Merge branch 'main' into feat/lightning_1.8_support (SeanNaren, Nov 9, 2022)
6ef98e7  Try to fix hanging EMA test (SeanNaren, Nov 9, 2022)
e25704c  Missing \ (SeanNaren, Nov 9, 2022)
4da05eb  Add strategy (SeanNaren, Nov 9, 2022)
4e6913c  See if setting the chdir fixes the hanging DDP test (SeanNaren, Nov 9, 2022)
07d4292  See if removing the subdir setter fixes the issue (SeanNaren, Nov 10, 2022)
9c6307a  Merge branch 'main' into feat/lightning_1.8_support (SeanNaren, Nov 10, 2022)
15087e5  Remove checks (SeanNaren, Nov 10, 2022)
4a01cec  Try [0,1] for devices (SeanNaren, Nov 10, 2022)
6c3e417  Merge branch 'main' into feat/lightning_1.8_support (okuchaiev, Nov 14, 2022)
1aad730  Merge branch 'main' into feat/lightning_1.8_support (okuchaiev, Nov 15, 2022)
b1add40  Add code back (SeanNaren, Nov 15, 2022)
8c7d7e0  Remove space (SeanNaren, Nov 21, 2022)
dce4918  Merge branch 'main' into feat/lightning_1.8_support (SeanNaren, Nov 23, 2022)
9e8ab6b  Update requirements (SeanNaren, Nov 23, 2022)
b33501d  Swap import path (SeanNaren, Nov 23, 2022)
580654c  Update references (SeanNaren, Nov 23, 2022)
ed8df6f  Fix deprecated variables (SeanNaren, Nov 23, 2022)
12d7ea2  Fix missing var (SeanNaren, Nov 23, 2022)
0e8e59a  Fix var (SeanNaren, Nov 23, 2022)
8efe28f  Merge branch 'main' into feat/lightning_1.8_support (SeanNaren, Nov 23, 2022)
d672304  Revert changes (SeanNaren, Nov 23, 2022)
706b19c  Merge branch 'main' into feat/lightning_1.8_support (SeanNaren, Nov 23, 2022)
d614378  Address review (SeanNaren, Nov 24, 2022)
4 changes: 3 additions & 1 deletion Jenkinsfile
@@ -225,9 +225,11 @@ pipeline {
     stage('Speech to Text EMA') {
       steps {
         sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \
+        --config-path="../conf/" --config-name="config" \
         model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
         model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-        trainer.devices=2 \
+        trainer.devices=[0,1] \
+        trainer.strategy="ddp" \
         trainer.accelerator="gpu" \
         +trainer.fast_dev_run=True \
         +exp_manager.ema.enable=True \

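For reference, a minimal Python sketch (not part of this PR) of what the Hydra overrides above correspond to on the PL 1.8 Trainer API; the mapping from the CLI overrides to Trainer arguments is an assumption for illustration:

```python
# Hypothetical equivalent of the Jenkins overrides above: pin the test to
# GPUs 0 and 1 explicitly and name the DDP strategy instead of relying on
# auto-selection. `devices` accepts either a count or a list of indices.
from pytorch_lightning import Trainer

trainer = Trainer(
    accelerator="gpu",
    devices=[0, 1],   # explicit GPU indices, matching trainer.devices=[0,1]
    strategy="ddp",
    fast_dev_run=True,
)
```
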
@@ -13,11 +13,11 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelSummary
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

 from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel

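The import change above recurs throughout the example scripts that follow: `TorchElasticEnvironment` now comes from `lightning_lite.plugins.environments`. As a rough compatibility sketch (not taken from this PR), code that must run on both PL 1.7 and 1.8 could fall back to the old path:

```python
# Version-tolerant import: PL 1.8 exposes the cluster-environment plugins
# through lightning_lite, while earlier releases keep them in pytorch_lightning.
try:
    from lightning_lite.plugins.environments import TorchElasticEnvironment
except ImportError:  # PyTorch Lightning < 1.8
    from pytorch_lightning.plugins.environments.torchelastic_environment import (
        TorchElasticEnvironment,
    )
```
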
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

 from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel

2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/megatron_ckpt_to_nemo.py
@@ -29,7 +29,7 @@

 import torch
 from apex.transformer import parallel_state
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from pytorch_lightning.trainer.trainer import Trainer

 from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel

2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/megatron_gpt_pretraining.py
@@ -13,10 +13,10 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

 from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel

@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

 from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import (
     MegatronGPTPromptLearningModel,

2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/megatron_retro_cal_shape.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin

 from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel

@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

@@ -13,11 +13,11 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelSummary
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

 from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model

2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/megatron_t5_pretraining.py
@@ -13,11 +13,11 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelSummary
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

 from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model

@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

 from nemo.collections.nlp.models.language_modeling.megatron_t5_prompt_learning_model import (
     MegatronT5PromptLearningModel,

2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin

 from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel

@@ -15,10 +15,10 @@
 import os
 import tempfile

+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

 from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel

@@ -13,10 +13,10 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

 from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTAdapterLearningModel
 from nemo.collections.nlp.parts.nlp_overrides import (

@@ -13,10 +13,10 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

 from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTInfusedAdapterModel
 from nemo.collections.nlp.parts.nlp_overrides import (

@@ -13,10 +13,10 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

 from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5AdapterLearningModel
 from nemo.collections.nlp.parts.nlp_overrides import (

@@ -13,10 +13,10 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

 from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5InfusedAdapterModel
 from nemo.collections.nlp.parts.nlp_overrides import (

2 changes: 1 addition & 1 deletion examples/nlp/machine_translation/megatron_nmt_training.py
@@ -13,11 +13,11 @@
 # limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelSummary
 from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

 from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel

@@ -710,7 +710,7 @@ def prepare_data(self):
         self.data_prepared = True

     def setup(self, stage=None):
-        super().setup()
+        super().setup(stage)
         if self.cfg.library == "megatron" and self.prompt_learning and stage == "fit":
             if self.cfg.virtual_prompt_style == VirtualPromptStyle.PROMPT_TUNING:
                 self.language_model.init_new_prompts()

@@ -216,7 +216,7 @@ def mask_and_reduce_loss(self, loss_mask, output_tensor):
         return loss

     def setup(self, stage=None):
-        super().setup()
+        super().setup(stage)
         if self.cfg.library == "megatron" and self.prompt_learning:
             self.language_model.init_new_prompts()

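Both setup() overrides above now forward the `stage` argument to the parent class. A minimal sketch of the pattern, assuming a typical PL 1.8 LightningModule subclass (the class name is illustrative):

```python
import pytorch_lightning as pl


class ExampleModule(pl.LightningModule):  # illustrative subclass, not from this PR
    def setup(self, stage=None):
        # Forward `stage` explicitly; parent setup() hooks in the PL 1.8
        # class hierarchy expect it rather than defaulting it internally.
        super().setup(stage)
        # model-specific setup would follow here
```
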
2 changes: 1 addition & 1 deletion nemo/collections/nlp/models/dialogue/sgdqa_model.py
@@ -226,7 +226,7 @@ def eval_step_helper(self, batch: List[torch.Tensor]):
         all_start_char_idx = []
         all_end_char_idx = []

-        if self.trainer.devices and self.trainer.world_size > 1:
+        if self.trainer.num_devices and self.trainer.world_size > 1:
             world_size = self.trainer.world_size
             for ind in range(world_size):
                 all_example_id_num.append(torch.empty_like(example_id_num))

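A small sketch of the guard used above, assuming a constructed Trainer: in PL 1.8, `Trainer.num_devices` reports the per-node device count as an int, which is what the multi-rank gather check needs. The helper name is illustrative:

```python
def should_gather_across_ranks(trainer) -> bool:
    # True only when more than one rank participates; num_devices is the
    # integer device count in PL 1.8, replacing the earlier `devices` check.
    return bool(trainer.num_devices) and trainer.world_size > 1
```
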
@@ -302,16 +302,16 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[int] = 0):
         # If the grad scaler skipped its optimizer step due to infs/nans,
         # decrement the step of all schedulers.
         if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True:
-            schedulers = self.trainer.lr_schedulers
+            scheduler_cfgs = self.trainer.lr_scheduler_configs

-            if not schedulers or not self.trainer.lightning_module.automatic_optimization:
+            if not scheduler_cfgs or not self.trainer.lightning_module.automatic_optimization:
                 return

-            for scheduler in schedulers:
+            for scheduler_cfg in scheduler_cfgs:
                 # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up
                 # as well as update the optimizer lr in all param groups
-                scheduler['scheduler'].last_epoch -= 2
-                scheduler['scheduler'].step()
+                scheduler_cfg.scheduler.last_epoch -= 2
+                scheduler_cfg.scheduler.step()

             # Removing the line below because it messes up train_valid_test_num_samples calculation.
             # self.trainer.fit_loop.max_steps = self.trainer.fit_loop.max_steps + 1

@@ -309,16 +309,16 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[int] = 0):
         # If the grad scaler skipped its optimizer step due to infs/nans,
         # decrement the step of all schedulers.
         if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True:
-            schedulers = self.trainer.lr_schedulers
+            scheduler_cfgs = self.trainer.lr_scheduler_configs

-            if not schedulers or not self.trainer.lightning_module.automatic_optimization:
+            if not scheduler_cfgs or not self.trainer.lightning_module.automatic_optimization:
                 return

-            for scheduler in schedulers:
+            for scheduler_cfg in scheduler_cfgs:
                 # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up
                 # as well as update the optimizer lr in all param groups
-                scheduler['scheduler'].last_epoch -= 2
-                scheduler['scheduler'].step()
+                scheduler_cfg.scheduler.last_epoch -= 2
+                scheduler_cfg.scheduler.step()

             # Increase the max step count by 1

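Both hunks above switch from the removed `Trainer.lr_schedulers` list of dicts to `Trainer.lr_scheduler_configs`, whose entries carry the scheduler object directly. A condensed sketch of the rewind logic, assuming a trainer whose grad scaler skipped a step (the function name is illustrative):

```python
def rewind_skipped_scheduler_step(trainer) -> None:
    # Each entry is an LRSchedulerConfig in PL 1.8; `.scheduler` is the
    # underlying torch scheduler that previously lived under the 'scheduler' key.
    for scheduler_cfg in trainer.lr_scheduler_configs:
        scheduler_cfg.scheduler.last_epoch -= 2  # undo the skipped optimizer step
        scheduler_cfg.scheduler.step()           # re-sync LRs across param groups
```
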
4 changes: 2 additions & 2 deletions nemo/collections/nlp/parts/nlp_overrides.py
@@ -22,16 +22,16 @@

 import pytorch_lightning as pl
 import torch
+from lightning_lite.plugins import ClusterEnvironment
+from lightning_lite.utilities.types import _PATH
 from omegaconf import OmegaConf
 from pytorch_lightning.overrides import LightningDistributedModule
-from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
 from pytorch_lightning.strategies.ddp import DDPStrategy
 from pytorch_lightning.trainer.trainer import Trainer
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.fetching import DataFetcher
-from pytorch_lightning.utilities.types import _PATH
 from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook
 from torch.nn.parallel import DistributedDataParallel

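`ClusterEnvironment` and the `_PATH` alias follow the same relocation into `lightning_lite`. As a hedged compatibility sketch, not part of this PR, for code that has to support both release lines:

```python
# PL 1.8 moves these symbols into lightning_lite; earlier releases keep them
# under pytorch_lightning, so fall back when the new package is unavailable.
try:
    from lightning_lite.plugins import ClusterEnvironment
    from lightning_lite.utilities.types import _PATH
except ImportError:  # PyTorch Lightning < 1.8
    from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
    from pytorch_lightning.utilities.types import _PATH
```
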
11 changes: 5 additions & 6 deletions nemo/collections/tts/models/fastpitch.py
@@ -19,7 +19,7 @@
 from hydra.utils import instantiate
 from omegaconf import DictConfig, OmegaConf, open_dict
 from pytorch_lightning import Trainer
-from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger
+from pytorch_lightning.loggers import TensorBoardLogger

 from nemo.collections.common.parts.preprocessing import parsers
 from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy, plot_spectrogram_to_numpy, process_batch
@@ -219,11 +219,10 @@ def tb_logger(self):
         if self.logger is None and self.logger.experiment is None:
             return None
         tb_logger = self.logger.experiment
-        if isinstance(self.logger, LoggerCollection):
-            for logger in self.logger:
-                if isinstance(logger, TensorBoardLogger):
-                    tb_logger = logger.experiment
-                    break
+        for logger in self.trainer.loggers:
+            if isinstance(logger, TensorBoardLogger):
+                tb_logger = logger.experiment
+                break
         self._tb_logger = tb_logger
         return self._tb_logger

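The same `tb_logger` rewrite appears in radtts.py and tacotron2.py below: `LoggerCollection` is gone in PL 1.8 and `Trainer.loggers` is a plain list. A standalone sketch of the lookup (the helper name is illustrative):

```python
from pytorch_lightning.loggers import TensorBoardLogger


def find_tensorboard_experiment(trainer):
    # Trainer.loggers is a list in PL 1.8; return the first TensorBoard
    # logger's experiment handle, or None when none is configured.
    for logger in trainer.loggers:
        if isinstance(logger, TensorBoardLogger):
            return logger.experiment
    return None
```
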
11 changes: 5 additions & 6 deletions nemo/collections/tts/models/radtts.py
@@ -21,7 +21,7 @@
 from hydra.utils import instantiate
 from omegaconf import DictConfig, OmegaConf
 from pytorch_lightning import Trainer
-from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger
+from pytorch_lightning.loggers import TensorBoardLogger

 from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer
 from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy
@@ -388,11 +388,10 @@ def tb_logger(self):
         if self.logger is None and self.logger.experiment is None:
             return None
         tb_logger = self.logger.experiment
-        if isinstance(self.logger, LoggerCollection):
-            for logger in self.logger:
-                if isinstance(logger, TensorBoardLogger):
-                    tb_logger = logger.experiment
-                    break
+        for logger in self.trainer.loggers:
+            if isinstance(logger, TensorBoardLogger):
+                tb_logger = logger.experiment
+                break
         self._tb_logger = tb_logger
         return self._tb_logger

11 changes: 5 additions & 6 deletions nemo/collections/tts/models/tacotron2.py
@@ -20,7 +20,7 @@
 from hydra.utils import instantiate
 from omegaconf import MISSING, DictConfig, OmegaConf, open_dict
 from omegaconf.errors import ConfigAttributeError
-from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger, WandbLogger
+from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
 from torch import nn

 from nemo.collections.common.parts.preprocessing import parsers
@@ -284,11 +284,10 @@ def validation_step(self, batch, batch_idx):
     def validation_epoch_end(self, outputs):
         if self.logger is not None and self.logger.experiment is not None:
             logger = self.logger.experiment
-            if isinstance(self.logger, LoggerCollection):
-                for logger in self.logger:
-                    if isinstance(logger, TensorBoardLogger):
-                        logger = logger.experiment
-                        break
+            for logger in self.trainer.loggers:
+                if isinstance(logger, TensorBoardLogger):
+                    logger = logger.experiment
+                    break
             if isinstance(logger, TensorBoardLogger):
                 tacotron2_log_to_tb_func(
                     logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False,