microsoft · mrwyattii · Oct 12, 2023 · Oct 5, 2023 · Oct 11, 2023 · Oct 11, 2023
@@ -130,7 +130,9 @@ def create_critic_model(model_name_or_path,
     critic_model = create_hf_model(AutoModel, model_name_or_path, tokenizer,
                                    ds_config, rlhf_training, dropout)
     end = time.time()
-    if torch.distributed.get_rank() == 0:
+    # Log once: when torch.distributed is uninitialized, or on rank 0.
+    if (not torch.distributed.is_initialized()
+            or torch.distributed.get_rank() == 0):
         print(f"> Creating model from_config took {end - start} seconds")
 
     critic_model = RewardModel(
@@ -152,8 +154,10 @@ def create_critic_model(model_name_or_path,
         start = time.time()
         model_ckpt_state_dict = torch.load(model_ckpt_path, map_location='cpu')
         end = time.time()
-        if torch.distributed.get_rank() == 0:
+        # Log once: when torch.distributed is uninitialized, or on rank 0.
+        if (not torch.distributed.is_initialized()
+                or torch.distributed.get_rank() == 0):
             print(f"> torch.load took {end - start} seconds")
 
         # load critic model from checkpoint with zero-stage 3 compatibility
         # this functionality may be moved to DS checkpoint load API in future
@@ -163,7 +167,10 @@ def create_critic_model(model_name_or_path,
                                    "",
                                    zero_stage=zero_stage)
         end = time.time()
-        if torch.distributed.get_rank() == 0:
+
+        # Log once: when torch.distributed is uninitialized, or on rank 0.
+        if (not torch.distributed.is_initialized()
+                or torch.distributed.get_rank() == 0):
             print(f"> Loading model state dict took {end - start} seconds")
 
     return critic_model