Remove unnecessary attention mask (#8733)
* pass a config to GPTDataset

Signed-off-by: Xiaowei Ren <[email protected]>

* set attention mask to None if dataloader does not have it

Signed-off-by: Xiaowei Ren <[email protected]>

* fix function name

Signed-off-by: Xiaowei Ren <[email protected]>

* fix nsys profile

Signed-off-by: Xiaowei Ren <[email protected]>

* dataset config variable name change

Signed-off-by: Xiaowei Ren <[email protected]>

* Apply isort and black reformatting

Signed-off-by: xrennvidia <[email protected]>

---------

Signed-off-by: Xiaowei Ren <[email protected]>
Signed-off-by: xrennvidia <[email protected]>
Co-authored-by: xrennvidia <[email protected]>
Signed-off-by: Marc Romeyn <[email protected]>
2 people authored and marcromeyn committed Jun 7, 2024
1 parent ef5f072 commit 92937fb
Showing 2 changed files with 3 additions and 1 deletion.
@@ -1126,6 +1126,7 @@ def get_batch(self, data_iterator, tuning):
     'tokens': data["tokens"],
     'labels': data["labels"],
     'loss_mask': data["loss_mask"],
+    'attention_mask': None if "attention_mask" not in data else data["attention_mask"],
     'position_ids': data["position_ids"],
 }
 if "attention_mask" in data:
@@ -1497,6 +1498,7 @@ def build_train_valid_test_datasets(self):
"reset_position_ids": self.reset_position_ids,
"reset_attention_mask": self.reset_attention_mask,
"eod_mask_loss": self.eod_mask_loss,
"create_attention_mask": not self.get_attention_mask_from_fusion,
"mmap_bin_files": self.cfg.data.get("mmap_bin_files", True),
}

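The new create_attention_mask entry ties dataset-side mask creation to the model configuration: when the mask comes from the fused softmax kernel (get_attention_mask_from_fusion is True), the dataset is told not to materialize one. A hedged sketch of how such a kwargs dict could be assembled before being passed to the GPTDataset config mentioned in the commit message; the build_dataset_kwargs helper and its argument names are assumptions, not code from this diff:

# Sketch only: derive the dataset kwargs from model-side settings.
def build_dataset_kwargs(cfg, reset_position_ids, reset_attention_mask,
                         eod_mask_loss, get_attention_mask_from_fusion):
    return {
        "reset_position_ids": reset_position_ids,
        "reset_attention_mask": reset_attention_mask,
        "eod_mask_loss": eod_mask_loss,
        # Skip building a full attention mask when the fused kernel
        # already handles masking internally.
        "create_attention_mask": not get_attention_mask_from_fusion,
        "mmap_bin_files": cfg.data.get("mmap_bin_files", True),
    }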
2 changes: 1 addition & 1 deletion nemo/core/optim/distributed_adam.py
@@ -122,7 +122,7 @@ def __init__(
 ):

     # Initialize process groups
-    if 'process_group' not in kwargs and not parallel_state.is_unitialized():
+    if 'process_group' not in kwargs and parallel_state.is_initialized():
         kwargs['process_group'] = parallel_state.get_data_parallel_group(with_context_parallel=True)
     if disable_distributed_parameters:
         world_size = torch.distributed.get_world_size()
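The optimizer fix replaces the double negative not parallel_state.is_unitialized() (note the misspelled helper name in the old code) with a direct parallel_state.is_initialized() check before defaulting the optimizer's process group to the data-parallel group. A minimal sketch of the guard as a standalone helper; the resolve_process_group name is illustrative only, and it assumes megatron.core parallel state has been set up elsewhere:

# Sketch only: default the optimizer's process group to the
# data-parallel group, but only when parallel state exists.
from megatron.core import parallel_state

def resolve_process_group(kwargs):
    if 'process_group' not in kwargs and parallel_state.is_initialized():
        kwargs['process_group'] = parallel_state.get_data_parallel_group(
            with_context_parallel=True
        )
    return kwargs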
