
Commit

Merge pull request #895 from NVIDIA/flatten_pl_config
Flatten pl.trainer to trainer
ericharper authored Jul 23, 2020
2 parents 54472fd + d4af44c commit 32f659e
Showing 18 changed files with 140 additions and 152 deletions.
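The change is mechanical but touches every entry point: the PyTorch Lightning Trainer arguments, previously nested under a top-level "pl" key (pl.trainer.*), now sit directly under a top-level "trainer" section. Hydra command-line overrides shorten accordingly (pl.trainer.gpus=[0] becomes trainer.gpus=[0]), and the example scripts build the trainer from cfg.trainer instead of cfg.pl.trainer. Below is a minimal sketch of the before/after layout; the values are illustrative rather than copied from any particular NeMo config, and it assumes omegaconf and a 2020-era pytorch_lightning are installed.

# Sketch of the layout change; keys/values here are illustrative only.
from omegaconf import OmegaConf
import pytorch_lightning as pl

old_cfg = OmegaConf.create(
"""
pl:            # old layout: Trainer args nested under a 'pl' key
  trainer:
    gpus: 0
    max_epochs: 5
"""
)

new_cfg = OmegaConf.create(
"""
trainer:       # new layout: 'trainer' is a top-level section
  gpus: 0
  max_epochs: 5
"""
)

# Scripts previously did pl.Trainer(**cfg.pl.trainer); after this PR they do:
trainer = pl.Trainer(**new_cfg.trainer)   # gpus/max_epochs match the PL API of this era

# Hydra CLI overrides change the same way:
#   before: python speech_to_text.py pl.trainer.gpus=[0] +pl.trainer.fast_dev_run=True
#   after:  python speech_to_text.py trainer.gpus=[0] +trainer.fast_dev_run=True

The diffs below apply this same substitution across the Jenkins CI commands, the example YAML configs, and the training scripts.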
58 changes: 29 additions & 29 deletions Jenkinsfile
@@ -103,8 +103,8 @@ pipeline {
sh 'python examples/asr/speech_to_text.py \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
- pl.trainer.gpus=[0] \
- +pl.trainer.fast_dev_run=True \
+ trainer.gpus=[0] \
+ +trainer.fast_dev_run=True \
exp_manager.root_dir=examples/asr/speech_to_text_results'
sh 'rm -rf examples/asr/speech_to_text_results'
}
@@ -115,8 +115,8 @@ pipeline {
sh 'python examples/asr/speech_to_label.py \
model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
- pl.trainer.gpus=[1] \
- +pl.trainer.fast_dev_run=True \
+ trainer.gpus=[1] \
+ +trainer.fast_dev_run=True \
model.preprocessor.cls=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
model.preprocessor.params=null \
exp_manager.root_dir=examples/asr/speech_to_label_results'
@@ -131,8 +131,8 @@ pipeline {
model.validation_ds.batch_size=2 \
model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \
- pl.trainer.gpus=[1] \
- +pl.trainer.fast_dev_run=True \
+ trainer.gpus=[1] \
+ +trainer.fast_dev_run=True \
exp_manager.root_dir=examples/speaker_recognition/speaker_recognition_results'
sh 'rm -rf examples/speaker_recognition/speaker_recognition_results'
}
@@ -161,10 +161,10 @@ pipeline {
model.validation_ds.use_cache=false \
model.language_model.pretrained_model_name=bert-base-uncased \
model.version_2_with_negative=false \
- pl.trainer.precision=16 \
- pl.trainer.amp_level=O1 \
- pl.trainer.gpus=[0] \
- +pl.trainer.fast_dev_run=true \
+ trainer.precision=16 \
+ trainer.amp_level=O1 \
+ trainer.gpus=[0] \
+ +trainer.fast_dev_run=true \
exp_manager.root_dir=exp_bert_squad_1.1 \
'
sh 'rm -rf examples/nlp/question_answering/exp_bert_squad_1.1'
@@ -179,10 +179,10 @@ pipeline {
model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
model.language_model.pretrained_model_name=bert-base-uncased \
model.version_2_with_negative=true \
- pl.trainer.precision=16 \
- pl.trainer.amp_level=O1 \
- pl.trainer.gpus=[1] \
- +pl.trainer.fast_dev_run=true \
+ trainer.precision=16 \
+ trainer.amp_level=O1 \
+ trainer.gpus=[1] \
+ +trainer.fast_dev_run=true \
exp_manager.root_dir=exp_bert_squad_2.0 \
'
sh 'rm -rf examples/nlp/question_answering/exp_bert_squad_2.0'
@@ -210,10 +210,10 @@ pipeline {
model.language_model.do_lower_case=true \
model.language_model.pretrained_model_name=roberta-base \
model.version_2_with_negative=false \
- pl.trainer.precision=16 \
- pl.trainer.amp_level=O1 \
- pl.trainer.gpus=[0] \
- +pl.trainer.fast_dev_run=true \
+ trainer.precision=16 \
+ trainer.amp_level=O1 \
+ trainer.gpus=[0] \
+ +trainer.fast_dev_run=true \
exp_manager.root_dir=exp_roberta_squad_1.1 \
'
sh 'rm -rf examples/nlp/question_answering/exp_roberta_squad_1.1'
@@ -229,10 +229,10 @@ pipeline {
model.language_model.do_lower_case=true \
model.language_model.pretrained_model_name=roberta-base \
model.version_2_with_negative=true \
- pl.trainer.precision=16 \
- pl.trainer.amp_level=O1 \
- pl.trainer.gpus=[1] \
- +pl.trainer.fast_dev_run=true \
+ trainer.precision=16 \
+ trainer.amp_level=O1 \
+ trainer.gpus=[1] \
+ +trainer.fast_dev_run=true \
exp_manager.root_dir=exp_roberta_squad_2.0 \
'
sh 'rm -rf examples/nlp/question_answering/exp_roberta_squad_2.0'
@@ -261,8 +261,8 @@ pipeline {
model.train_ds.batch_size=10 \
model.train_ds.use_cache=false \
model.language_model.do_lower_case=true \
- pl.trainer.gpus=[0] \
- +pl.trainer.fast_dev_run=true \
+ trainer.gpus=[0] \
+ +trainer.fast_dev_run=true \
exp_manager.root_dir=exp_bert_base_uncased \
'
sh 'rm -rf examples/nlp/text_classification/exp_bert_base_uncased'
@@ -309,8 +309,8 @@ pipeline {
sh 'cd examples/nlp/token_classification && \
python ner.py \
model.data_dir=/home/TestData/nlp/token_classification_punctuation/ \
- pl.trainer.gpus=[0] \
- +pl.trainer.fast_dev_run=true \
+ trainer.gpus=[0] \
+ +trainer.fast_dev_run=true \
model.use_cache=false \
'
}
@@ -329,9 +329,9 @@ pipeline {
model.data_dir=/home/TestData/nlp/token_classification_punctuation/ \
model.language_model.pretrained_model_name=distilbert-base-uncased \
model.use_cache=false \
- pl.trainer.gpus=[0,1] \
- pl.trainer.distributed_backend=ddp \
- +pl.trainer.fast_dev_run=true \
+ trainer.gpus=[0,1] \
+ trainer.distributed_backend=ddp \
+ +trainer.fast_dev_run=true \
exp_manager.root_dir=exp_distilbert_base_uncased \
'
sh 'rm -rf examples/nlp/token_classification/exp_distilbert_base_uncased'
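One detail worth noting in the CI commands above: fast_dev_run is not declared in any of the trainer blocks in the configs below, and Hydra refuses to override a key that is not already present in the composed config, so the CI appends it with the leading "+" (+trainer.fast_dev_run=true). The sketch below shows the same guard at the OmegaConf level; the keys and values are illustrative, and it assumes omegaconf is installed.

# Illustrative sketch: why the CI uses '+trainer.fast_dev_run=true' rather than a plain override.
from omegaconf import OmegaConf, open_dict

cfg = OmegaConf.create({"trainer": {"gpus": 0, "max_epochs": 5}})
OmegaConf.set_struct(cfg, True)      # Hydra hands scripts struct-mode configs

cfg.trainer.gpus = 1                 # declared key: a plain override works
try:
    cfg.trainer.fast_dev_run = True  # undeclared key: struct mode rejects it
except Exception as err:
    print(type(err).__name__)        # e.g. an attribute/key error from omegaconf

with open_dict(cfg):                 # roughly what the '+' prefix allows
    cfg.trainer.fast_dev_run = True
print(cfg.trainer.fast_dev_run)      # True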
19 changes: 9 additions & 10 deletions examples/asr/conf/config.yaml
@@ -158,16 +158,15 @@ model:
min_lr: 0.0
last_epoch: -1

- pl:
-   trainer:
-     gpus: 0 # number of gpus
-     max_epochs: 5
-     max_steps: null # computed at runtime if not set
-     num_nodes: 1
-     distributed_backend: ddp
-     accumulate_grad_batches: 1
-     checkpoint_callback: False # Provided by exp_manager
-     logger: False # Provided by exp_manager
+ trainer:
+   gpus: 0 # number of gpus
+   max_epochs: 5
+   max_steps: null # computed at runtime if not set
+   num_nodes: 1
+   distributed_backend: ddp
+   accumulate_grad_batches: 1
+   checkpoint_callback: False # Provided by exp_manager
+   logger: False # Provided by exp_manager

exp_manager:
root_dir: null
19 changes: 9 additions & 10 deletions examples/asr/conf/matchboxnet_3x1x64_v1.yaml
@@ -160,16 +160,15 @@ model:
min_lr: 0.001
last_epoch: -1

- pl:
-   trainer:
-     gpus: 0 # number of gpus
-     max_epochs: 200
-     max_steps: null # computed at runtime if not set
-     num_nodes: 1
-     distributed_backend: ddp
-     accumulate_grad_batches: 1
-     checkpoint_callback: False # Provided by exp_manager
-     logger: False # Provided by exp_manager
+ trainer:
+   gpus: 0 # number of gpus
+   max_epochs: 200
+   max_steps: null # computed at runtime if not set
+   num_nodes: 1
+   distributed_backend: ddp
+   accumulate_grad_batches: 1
+   checkpoint_callback: False # Provided by exp_manager
+   logger: False # Provided by exp_manager

exp_manager:
root_dir: null
19 changes: 9 additions & 10 deletions examples/asr/conf/matchboxnet_3x1x64_v2.yaml
@@ -160,16 +160,15 @@ model:
min_lr: 0.001
last_epoch: -1

- pl:
-   trainer:
-     gpus: 0 # number of gpus
-     max_epochs: 200
-     max_steps: null # computed at runtime if not set
-     num_nodes: 1
-     distributed_backend: ddp
-     accumulate_grad_batches: 1
-     checkpoint_callback: False # Provided by exp_manager
-     logger: False # Provided by exp_manager
+ trainer:
+   gpus: 0 # number of gpus
+   max_epochs: 200
+   max_steps: null # computed at runtime if not set
+   num_nodes: 1
+   distributed_backend: ddp
+   accumulate_grad_batches: 1
+   checkpoint_callback: False # Provided by exp_manager
+   logger: False # Provided by exp_manager

exp_manager:
root_dir: null
19 changes: 9 additions & 10 deletions examples/asr/experimental/configs/config_bpe.yaml
@@ -155,16 +155,15 @@ model:
min_lr: 1e-6
last_epoch: -1

- pl:
-   trainer:
-     gpus: 0 # number of gpus
-     max_epochs: 5
-     max_steps: null # computed at runtime if not set
-     num_nodes: 1
-     distributed_backend: ddp
-     accumulate_grad_batches: 1
-     checkpoint_callback: False # Provided by exp_manager
-     logger: False # Provided by exp_manager
+ trainer:
+   gpus: 0 # number of gpus
+   max_epochs: 5
+   max_steps: null # computed at runtime if not set
+   num_nodes: 1
+   distributed_backend: ddp
+   accumulate_grad_batches: 1
+   checkpoint_callback: False # Provided by exp_manager
+   logger: False # Provided by exp_manager

exp_manager:
root_dir: null
2 changes: 1 addition & 1 deletion examples/asr/speech_to_label.py
@@ -33,7 +33,7 @@

@hydra_runner(config_path="conf", config_name="matchboxnet_3x1x64_v1.yaml")
def main(cfg):
- trainer = pl.Trainer(**cfg.pl.trainer)
+ trainer = pl.Trainer(**cfg.trainer)
exp_manager(trainer, cfg.get("exp_manager", None))
asr_model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer)

16 changes: 8 additions & 8 deletions examples/asr/speech_to_text.py
@@ -25,14 +25,14 @@
model.train_ds.manifest_filepath="/Users/okuchaiev/Data/an4_dataset/an4_train.json" \
model.validation_ds.manifest_filepath="/Users/okuchaiev/Data/an4_dataset/an4_val.json" \
hydra.run.dir="." \
- pl.trainer.gpus=0 \
- pl.trainer.max_epochs=50
+ trainer.gpus=0 \
+ trainer.max_epochs=50
Add PyTorch Lightning Trainer arguments from CLI:
python speech_to_text.py \
... \
- +pl.trainer.fast_dev_run=true
+ +trainer.fast_dev_run=true
Hydra logs will be found in "$(./outputs/$(date +"%y-%m-%d")/$(date +"%H-%M-%S")/.hydra)"
PTL logs will be found in "$(./outputs/$(date +"%y-%m-%d")/$(date +"%H-%M-%S")/lightning_logs)"
@@ -42,8 +42,8 @@
model.train_ds.manifest_filepath="./an4/train_manifest.json" \
model.validation_ds.manifest_filepath="./an4/test_manifest.json" \
hydra.run.dir="." \
- pl.trainer.gpus=2 \
- pl.trainer.max_epochs=2 \
+ trainer.gpus=2 \
+ trainer.max_epochs=2 \
model.optim.args.params.betas=[0.8,0.5] \
model.optim.args.params.weight_decay=0.0001
@@ -52,8 +52,8 @@
model.train_ds.manifest_filepath="./an4/train_manifest.json" \
model.validation_ds.manifest_filepath="./an4/test_manifest.json" \
hydra.run.dir="." \
- pl.trainer.gpus=2 \
- pl.trainer.max_epochs=2 \
+ trainer.gpus=2 \
+ trainer.max_epochs=2 \
model.optim.name=adamw \
model.optim.lr=0.001 \
~model.optim.args \
@@ -65,7 +65,7 @@

@hydra_runner(config_path="conf", config_name="config")
def main(cfg):
- trainer = pl.Trainer(**cfg.pl.trainer)
+ trainer = pl.Trainer(**cfg.trainer)
exp_manager(trainer, cfg.get("exp_manager", None))
asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)

8 changes: 4 additions & 4 deletions examples/asr/speech_to_text_bpe.py
@@ -17,9 +17,9 @@
model.train_ds.manifest_filepath="./an4/train_manifest.json" \
model.validation_ds.manifest_filepath="./an4/test_manifest.json" \
model.tokenizer.path="./an4/tokenizer/LibriSpeechTokenizer/librispeech_tokenizer_bpe_v1024/" \
- pl.trainer.gpus=2 \
- pl.trainer.distributed_backend="ddp" \
- pl.trainer.max_epochs=100 \
+ trainer.gpus=2 \
+ trainer.distributed_backend="ddp" \
+ trainer.max_epochs=100 \
model.optim.name="adamw" \
model.optim.lr=0.1 \
model.optim.args.params.betas=[0.9,0.999] \
@@ -40,7 +40,7 @@
@hydra_runner(config_path="experimental/configs/", config_name="config_bpe")
def main(cfg):
logging.info(f'Hydra config: {cfg.pretty()}')
- trainer = pl.Trainer(**cfg.pl.trainer)
+ trainer = pl.Trainer(**cfg.trainer)
exp_manager(trainer, cfg.get("exp_manager", None))
asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

28 changes: 13 additions & 15 deletions examples/nlp/question_answering/conf/config.yaml
@@ -1,20 +1,19 @@
# Question Answering with SQUAD
name: &name QA

- pl:
-   trainer:
-     gpus: 1 # the number of gpus, 0 for CPU, or list with gpu indices
-     num_nodes: 1
-     max_epochs: 2 # the number of training epochs
-     max_steps: null # precedence over max_epochs
-     accumulate_grad_batches: 1 # accumulates grads every k batches
-     precision: 16 # 16 to use AMP
-     amp_level: O1 # O1 or O2 if using AMP
-     distributed_backend: ddp
-     gradient_clip_val: 0.0
-     val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch
-     checkpoint_callback: false # provided by exp_manager
-     logger: false # provided by exp_manager
+ trainer:
+   gpus: 1 # the number of gpus, 0 for CPU, or list with gpu indices
+   num_nodes: 1
+   max_epochs: 2 # the number of training epochs
+   max_steps: null # precedence over max_epochs
+   accumulate_grad_batches: 1 # accumulates grads every k batches
+   precision: 16 # 16 to use AMP
+   amp_level: O1 # O1 or O2 if using AMP
+   distributed_backend: ddp
+   gradient_clip_val: 0.0
+   val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch
+   checkpoint_callback: false # provided by exp_manager
+   logger: false # provided by exp_manager

model:

@@ -127,7 +126,6 @@ model:
log_softmax: false
use_transformer_init: true

- pl: null # used at runtime

exp_manager:
root_dir: null # where to store logs and checkpoints
@@ -25,7 +25,7 @@
@hydra_runner(config_path="conf", config_name="config")
def main(cfg: DictConfig) -> None:
logging.info(f'Config: {cfg.pretty()}')
- trainer = pl.Trainer(**cfg.pl.trainer)
+ trainer = pl.Trainer(**cfg.trainer)
exp_manager(trainer, cfg.get("exp_manager", None))
question_answering_model = QAModel(cfg.model, trainer=trainer)
trainer.fit(question_answering_model)
@@ -16,17 +16,16 @@

# Config file for text classification with pretrained BERT models

- pl:
-   trainer:
-     gpus: 1 # the number of gpus, 0 for CPU
-     num_nodes: 1
-     max_epochs: 100
-     max_steps: null # precedence over max_epochs
-     accumulate_grad_batches: 1 # accumulates grads every k batches
-     amp_level: O0 # O1/O2 for mixed precision
-     distributed_backend: ddp
-     checkpoint_callback: False # Provided by exp_manager
-     logger: False # Provided by exp_manager
+ trainer:
+   gpus: 1 # the number of gpus, 0 for CPU
+   num_nodes: 1
+   max_epochs: 100
+   max_steps: null # precedence over max_epochs
+   accumulate_grad_batches: 1 # accumulates grads every k batches
+   amp_level: O0 # O1/O2 for mixed precision
+   distributed_backend: ddp
+   checkpoint_callback: False # Provided by exp_manager
+   logger: False # Provided by exp_manager

model:
data_dir: ??? # /path/to/data
@@ -91,7 +90,6 @@ model:
warmup_ratio: 0.1
last_epoch: -1

- pl: null # used at runtime

exp_manager:
root_dir: null # root_dir for your experiment, if None, defaults to "./NeMo_experiments"