From 03695567aeeb5190c5f94ae1d0d0b3a760876d05 Mon Sep 17 00:00:00 2001
From: Harish Rao
Date: Mon, 6 May 2024 14:36:31 -0700
Subject: [PATCH] If training restarts, it now restarts from the last checkpoint - Fix for Issue #29

---
 nemo/examples/nlp/language_modeling/test_llama.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nemo/examples/nlp/language_modeling/test_llama.sh b/nemo/examples/nlp/language_modeling/test_llama.sh
index 707d9b2..7d7f6ef 100755
--- a/nemo/examples/nlp/language_modeling/test_llama.sh
+++ b/nemo/examples/nlp/language_modeling/test_llama.sh
@@ -72,8 +72,8 @@ $MAYBE_COMPILE torchrun $DISTRIBUTED_ARGS megatron_gpt_pretraining.py \
     model.wrap_with_zero=$wrap_with_zero \
     model.zero_use_master_weight=$zero_use_master_weight \
     exp_manager.create_tensorboard_logger=$CREATE_TB_LOGGER \
-    exp_manager.resume_if_exists=False \
-    exp_manager.resume_ignore_no_checkpoint=False \
+    exp_manager.resume_if_exists=True \
+    exp_manager.resume_ignore_no_checkpoint=True \
     exp_manager.create_checkpoint_callback=$CHECKPOINT_CALLBACK \
     exp_manager.explicit_log_dir=$EXPLICIT_LOGDIR \
     +exp_manager.checkpoint_callback_params.train_time_interval=36000 \
@@ -81,3 +81,5 @@ $MAYBE_COMPILE torchrun $DISTRIBUTED_ARGS megatron_gpt_pretraining.py \
 # Note: to resume training using a checkpoint, please add the following configuration above, adjusting for your checkpoint path
 # model.resume_from_checkpoint='/efs/checkpoint/megatron_gpt--step\=1085-consumed_samples\=69632.0-last.ckpt' \
+# +exp_manager.checkpoint_callback_params.train_time_interval=36000 is in seconds and will cause checkpoints to be created every
+# 10 hours. Change this appropriately for your use case.
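
Note on the overrides above (illustrative, not part of the patch): NeMo's exp_manager forwards
checkpoint_callback_params to PyTorch Lightning's ModelCheckpoint callback, and resume_if_exists=True
automates restarting from the last saved checkpoint. The sketch below is a minimal, hypothetical plain
PyTorch Lightning example of the same two ideas; TinyModel, the "checkpoints" directory, and the toy
dataloader are placeholders I introduced for illustration, not anything from test_llama.sh.

# Minimal sketch, assuming plain PyTorch Lightning rather than NeMo's actual exp_manager wiring.
from datetime import timedelta

import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint


class TinyModel(pl.LightningModule):
    """Hypothetical stand-in for the real pretraining model."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)


# train_time_interval is wall-clock training time between checkpoints:
# timedelta(seconds=36000) == 10 hours, matching the override in the script.
# In this toy run the interval never elapses; the point is the parameter mapping.
checkpoint_cb = ModelCheckpoint(
    dirpath="checkpoints",                      # placeholder directory
    save_last=True,                             # keeps a last.ckpt to resume from
    train_time_interval=timedelta(seconds=36000),
)

data = DataLoader(TensorDataset(torch.randn(64, 8), torch.randn(64, 1)), batch_size=8)
trainer = pl.Trainer(max_epochs=1, callbacks=[checkpoint_cb], logger=False)
trainer.fit(TinyModel(), data)

# On restart, resuming from a saved checkpoint looks like this; exp_manager.resume_if_exists=True
# does the equivalent lookup of the last checkpoint automatically:
# trainer.fit(TinyModel(), data, ckpt_path="checkpoints/last.ckpt")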