From 56ff9730e456a7bc4ee5aa339460da00ebd2f402 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Fri, 20 Dec 2024 13:24:16 +0000
Subject: [PATCH 1/4] Support GLM Edge

Support ByteLevel post-processing wrapped in the Sequence post-processor

---
 python/openvino_tokenizers/hf_parser.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
index 1476c3a6..8ae7348a 100644
--- a/python/openvino_tokenizers/hf_parser.py
+++ b/python/openvino_tokenizers/hf_parser.py
@@ -277,6 +277,7 @@ def tokenization_model(self) -> None:
         "TemplateProcessing": CombineSegmentsStep.from_hf_json_template_postprocessor,
         "BertProcessing": CombineSegmentsStep.from_hf_json_bert_postprocessor,
         "RobertaProcessing": CombineSegmentsStep.from_hf_json_roberta_processor,
+        "ByteLevel": lambda *args: [],  # return no handle for ByteLevel
     }

     def post_tokenization(self) -> None:
@@ -316,7 +317,7 @@ def post_tokenization(self) -> None:
             post_processor_json, self.number_of_inputs, self.add_special_tokens
         )

-        self.num_of_added_tokens += combine_segments_step.number_of_added_tokens
+        self.num_of_added_tokens += getattr(combine_segments_step, "number_of_added_tokens", 0)

         self.add_truncation()
         self.pipeline.add_steps(combine_segments_step)

From 81b7122442bd1be583c8584957d3d392b9184890 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Fri, 20 Dec 2024 13:25:08 +0000
Subject: [PATCH 2/4] Support GLM Edge

Support ByteLevel post-processing wrapped in the Sequence post-processor

---
 python/openvino_tokenizers/hf_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
index 8ae7348a..f2bca62e 100644
--- a/python/openvino_tokenizers/hf_parser.py
+++ b/python/openvino_tokenizers/hf_parser.py
@@ -277,7 +277,7 @@ def tokenization_model(self) -> None:
         "TemplateProcessing": CombineSegmentsStep.from_hf_json_template_postprocessor,
         "BertProcessing": CombineSegmentsStep.from_hf_json_bert_postprocessor,
         "RobertaProcessing": CombineSegmentsStep.from_hf_json_roberta_processor,
-        "ByteLevel": lambda *args: [],  # return no handle for ByteLevel
+        "ByteLevel": lambda *args: [],  # return no handle for ByteLevel so add_steps skips it
     }

     def post_tokenization(self) -> None:

From 254672e62e2bb85a39e28c44a6ebc380f41fa6a5 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Fri, 20 Dec 2024 13:26:22 +0000
Subject: [PATCH 3/4] Support GLM Edge

Support ByteLevel post-processing wrapped in the Sequence post-processor

---
 python/openvino_tokenizers/hf_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
index f2bca62e..d22b39c9 100644
--- a/python/openvino_tokenizers/hf_parser.py
+++ b/python/openvino_tokenizers/hf_parser.py
@@ -277,7 +277,7 @@ def tokenization_model(self) -> None:
         "TemplateProcessing": CombineSegmentsStep.from_hf_json_template_postprocessor,
         "BertProcessing": CombineSegmentsStep.from_hf_json_bert_postprocessor,
         "RobertaProcessing": CombineSegmentsStep.from_hf_json_roberta_processor,
-        "ByteLevel": lambda *args: [],  # return no handle for ByteLevel so add_steps skips it
+        "ByteLevel": lambda *args: list(),  # return no handle for ByteLevel so add_steps skips it
     }

     def post_tokenization(self) -> None:
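For context, the three patches above handle GLM Edge tokenizers whose tokenizer.json declares the post-processor as a Sequence that contains a ByteLevel step next to the template step; mapping "ByteLevel" to a handler that returns an empty list lets the parser walk the sequence and simply produce no pipeline step for it. The snippet below is a simplified sketch of that dispatch, not the actual hf_parser.py code: the JSON content and the loop are illustrative assumptions, and only the "ByteLevel" -> empty-list mapping mirrors the patch.

# Simplified, runnable sketch of the dispatch; the post-processor JSON is hypothetical.
post_processor_json = {
    "type": "Sequence",
    "processors": [
        {"type": "ByteLevel", "trim_offsets": False},
        {"type": "TemplateProcessing", "single": "<gmask><sop>$A"},  # template content is hypothetical
    ],
}

handlers = {
    "TemplateProcessing": lambda step: [("combine_segments", step["single"])],
    "ByteLevel": lambda *args: [],  # no handle for ByteLevel, so nothing is added to the pipeline
}

steps = []
for processor in post_processor_json["processors"]:
    # Each handler returns a list of pipeline steps; an empty list means the step is skipped.
    steps.extend(handlers[processor["type"]](processor))

print(steps)  # only the TemplateProcessing step survives; ByteLevel contributed nothing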
From a6173e530cce832eb1ff2b1149119c053eede6da Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Fri, 20 Dec 2024 15:01:44 +0000
Subject: [PATCH 4/4] Support ModernBERT

---
 python/openvino_tokenizers/utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py
index ea1027e8..70951b1e 100644
--- a/python/openvino_tokenizers/utils.py
+++ b/python/openvino_tokenizers/utils.py
@@ -214,7 +214,12 @@ def unicode_to_bytes() -> Dict[str, int]:

 def apply_unicode_to_bytes(token: str) -> str:
     bytes_encoder = unicode_to_bytes()
-    return bytes(bytes_encoder[char] for char in token)
+    try:
+        return bytes(bytes_encoder[char] for char in token)
+    except KeyError:
+        # tokens that were not bytes-to-chars encoded
+        # ModernBERT adds such tokens to the vocab directly, which is wrong, but we need to handle it
+        return token


 def get_hf_tokenizer_attribute(
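For context, a minimal standalone sketch of the behaviour the last hunk guards against: byte-level BPE vocab entries are stored in the GPT-2 bytes-to-unicode alphabet, so every character of a token is expected to be a key of the byte decoder, while a token stored as plain text (as ModernBERT does for some added tokens) can contain characters such as a raw space that have no byte mapping, which raises KeyError. The reimplementation below mirrors the intent of the patched utils.py and is an assumption for illustration, not the library code.

# Illustrative sketch only; mirrors the patched apply_unicode_to_bytes behaviour.
from functools import lru_cache
from typing import Dict, Union


@lru_cache()
def unicode_to_bytes() -> Dict[str, int]:
    # Inverse of the GPT-2 bytes-to-unicode table: printable stand-in characters -> byte values.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return {chr(c): b for c, b in zip(cs, bs)}


def apply_unicode_to_bytes(token: str) -> Union[bytes, str]:
    bytes_encoder = unicode_to_bytes()
    try:
        return bytes(bytes_encoder[char] for char in token)
    except KeyError:
        # The token was not bytes-to-chars encoded (e.g. a plain-text added token),
        # so return it unchanged instead of failing.
        return token


print(apply_unicode_to_bytes("Ġhello"))     # b' hello' -- a byte-level encoded vocab entry
print(apply_unicode_to_bytes("new token"))  # 'new token' -- the raw space has no byte mapping, so the token passes through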