diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index 85f9af6e3df2..9340b1f7c504 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -218,14 +218,20 @@ def vocab(self): @property def pad_id(self): + if getattr(self, 'pad_token') is None: + return None return self.tokens_to_ids([getattr(self, 'pad_token')])[0] @property def bos_id(self): + if getattr(self, 'bos_token') is None: + return None return self.tokens_to_ids([getattr(self, 'bos_token')])[0] @property def eos_id(self): + if getattr(self, 'eos_token') is None: + return None return self.tokens_to_ids([getattr(self, 'eos_token')])[0] @property @@ -235,18 +241,26 @@ def eod(self): @property def sep_id(self): + if getattr(self, 'sep_token') is None: + return None return self.tokens_to_ids([getattr(self, 'sep_token')])[0] @property def cls_id(self): + if getattr(self, 'cls_token') is None: + return None return self.tokens_to_ids([getattr(self, 'cls_token')])[0] @property def unk_id(self): + if getattr(self, 'unk_token') is None: + return None return self.tokens_to_ids([getattr(self, 'unk_token')])[0] @property def mask_id(self): + if getattr(self, 'mask_token') is None: + return None return self.tokens_to_ids([getattr(self, 'mask_token')])[0] @property diff --git a/scripts/nlp_language_modeling/preprocess_data_for_megatron.py b/scripts/nlp_language_modeling/preprocess_data_for_megatron.py index d347febd55d6..fa7acfdfe783 100644 --- a/scripts/nlp_language_modeling/preprocess_data_for_megatron.py +++ b/scripts/nlp_language_modeling/preprocess_data_for_megatron.py @@ -309,7 +309,7 @@ def main(): output_bin_files[key], impl=args.dataset_impl, chunk_size=args.chunk_size, - pad_id=tokenizer.pad_id if hasattr(tokenizer, "pad_id") else 0, + pad_id=tokenizer.pad_id if getattr(tokenizer, "pad_id", None) is not None else 0, retrieval_db=args.retrieval_db, vocab_size=tokenizer.vocab_size, stride=args.chunk_stride_size,