From 60fc43fb25ae7f16eaec7ac68b8fed317f32081f Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 27 Feb 2024 09:50:53 -0800 Subject: [PATCH] Handle in HF AutoTokenizer with pad_token=None (#8068) * check if none before encode special token Signed-off-by: Huiying Li * handle when pad_id does not exist for hf Autotokenizer Signed-off-by: Huiying Li * refactor pad_id assignment to use getattr for cleaner code readability Signed-off-by: Huiying Li --------- Signed-off-by: Huiying Li Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> --- .../tokenizers/huggingface/auto_tokenizer.py | 14 ++++++++++++++ .../preprocess_data_for_megatron.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index 85f9af6e3df2..9340b1f7c504 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -218,14 +218,20 @@ def vocab(self): @property def pad_id(self): + if getattr(self, 'pad_token') is None: + return None return self.tokens_to_ids([getattr(self, 'pad_token')])[0] @property def bos_id(self): + if getattr(self, 'bos_token') is None: + return None return self.tokens_to_ids([getattr(self, 'bos_token')])[0] @property def eos_id(self): + if getattr(self, 'eos_token') is None: + return None return self.tokens_to_ids([getattr(self, 'eos_token')])[0] @property @@ -235,18 +241,26 @@ def eod(self): @property def sep_id(self): + if getattr(self, 'sep_token') is None: + return None return self.tokens_to_ids([getattr(self, 'sep_token')])[0] @property def cls_id(self): + if getattr(self, 'cls_token') is None: + return None return self.tokens_to_ids([getattr(self, 'cls_token')])[0] @property def unk_id(self): + if getattr(self, 'unk_token') is None: + return None return self.tokens_to_ids([getattr(self, 'unk_token')])[0] @property def mask_id(self): + if getattr(self, 'mask_token') is None: + return None return self.tokens_to_ids([getattr(self, 'mask_token')])[0] @property diff --git a/scripts/nlp_language_modeling/preprocess_data_for_megatron.py b/scripts/nlp_language_modeling/preprocess_data_for_megatron.py index d347febd55d6..fa7acfdfe783 100644 --- a/scripts/nlp_language_modeling/preprocess_data_for_megatron.py +++ b/scripts/nlp_language_modeling/preprocess_data_for_megatron.py @@ -309,7 +309,7 @@ def main(): output_bin_files[key], impl=args.dataset_impl, chunk_size=args.chunk_size, - pad_id=tokenizer.pad_id if hasattr(tokenizer, "pad_id") else 0, + pad_id=tokenizer.pad_id if getattr(tokenizer, "pad_id", None) is not None else 0, retrieval_db=args.retrieval_db, vocab_size=tokenizer.vocab_size, stride=args.chunk_stride_size,