From 03695567aeeb5190c5f94ae1d0d0b3a760876d05 Mon Sep 17 00:00:00 2001
From: Harish Rao
Date: Mon, 6 May 2024 14:36:31 -0700
Subject: [PATCH] If training restarts, it now restarts from the last checkpoint - Fix for Issue #29

---
 nemo/examples/nlp/language_modeling/test_llama.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nemo/examples/nlp/language_modeling/test_llama.sh b/nemo/examples/nlp/language_modeling/test_llama.sh
index 707d9b2..7d7f6ef 100755
--- a/nemo/examples/nlp/language_modeling/test_llama.sh
+++ b/nemo/examples/nlp/language_modeling/test_llama.sh
@@ -72,8 +72,8 @@ $MAYBE_COMPILE torchrun $DISTRIBUTED_ARGS megatron_gpt_pretraining.py \
     model.wrap_with_zero=$wrap_with_zero \
     model.zero_use_master_weight=$zero_use_master_weight \
     exp_manager.create_tensorboard_logger=$CREATE_TB_LOGGER \
-    exp_manager.resume_if_exists=False \
-    exp_manager.resume_ignore_no_checkpoint=False \
+    exp_manager.resume_if_exists=True \
+    exp_manager.resume_ignore_no_checkpoint=True \
     exp_manager.create_checkpoint_callback=$CHECKPOINT_CALLBACK \
     exp_manager.explicit_log_dir=$EXPLICIT_LOGDIR \
     +exp_manager.checkpoint_callback_params.train_time_interval=36000 \
@@ -81,3 +81,5 @@ $MAYBE_COMPILE torchrun $DISTRIBUTED_ARGS megatron_gpt_pretraining.py \
 # Note: to resume training using a checkpoint, please add the following configuration above, adjusting for your checkpoint path
 # model.resume_from_checkpoint='/efs/checkpoint/megatron_gpt--step\=1085-consumed_samples\=69632.0-last.ckpt' \
+# +exp_manager.checkpoint_callback_params.train_time_interval=36000 is in seconds and will cause checkpoints to be created every
+# 10 hours. Change this appropriately for your use case.
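
Note on the overrides above (illustrative, not part of the patch): NeMo's exp_manager forwards
checkpoint_callback_params to PyTorch Lightning's ModelCheckpoint callback, and resume_if_exists=True
automates restarting from the last saved checkpoint. The sketch below is a minimal, hypothetical plain
PyTorch Lightning example of the same two ideas; TinyModel, the "checkpoints" directory, and the toy
dataloader are placeholders I introduced for illustration, not anything from test_llama.sh.

# Minimal sketch, assuming plain PyTorch Lightning rather than NeMo's actual exp_manager wiring.
from datetime import timedelta

import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint


class TinyModel(pl.LightningModule):
    """Hypothetical stand-in for the real pretraining model."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)


# train_time_interval is wall-clock training time between checkpoints:
# timedelta(seconds=36000) == 10 hours, matching the override in the script.
# In this toy run the interval never elapses; the point is the parameter mapping.
checkpoint_cb = ModelCheckpoint(
    dirpath="checkpoints",                      # placeholder directory
    save_last=True,                             # keeps a last.ckpt to resume from
    train_time_interval=timedelta(seconds=36000),
)

data = DataLoader(TensorDataset(torch.randn(64, 8), torch.randn(64, 1)), batch_size=8)
trainer = pl.Trainer(max_epochs=1, callbacks=[checkpoint_cb], logger=False)
trainer.fit(TinyModel(), data)

# On restart, resuming from a saved checkpoint looks like this; exp_manager.resume_if_exists=True
# does the equivalent lookup of the last checkpoint automatically:
# trainer.fit(TinyModel(), data, ckpt_path="checkpoints/last.ckpt")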