openvinotoolkit · apaniukov · Dec 23, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
@@ -276,6 +276,7 @@ def tokenization_model(self) -> None:
         "TemplateProcessing": CombineSegmentsStep.from_hf_json_template_postprocessor,
         "BertProcessing": CombineSegmentsStep.from_hf_json_bert_postprocessor,
         "RobertaProcessing": CombineSegmentsStep.from_hf_json_roberta_processor,
+        "ByteLevel": lambda *args: list(),  # return an empty step list for ByteLevel so add_steps skips it
     }
 
     def post_tokenization(self) -> None:
@@ -315,7 +316,7 @@ def post_tokenization(self) -> None:
                 post_processor_json, self.number_of_inputs, self.add_special_tokens
             )
 
-        self.num_of_added_tokens += combine_segments_step.number_of_added_tokens
+        self.num_of_added_tokens += getattr(combine_segments_step, "number_of_added_tokens", 0)
 
         self.add_truncation()
         self.pipeline.add_steps(combine_segments_step)

diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py
@@ -214,7 +214,12 @@ def unicode_to_bytes() -> Dict[str, int]:
 
 def apply_unicode_to_bytes(token: str) -> str:
     bytes_encoder = unicode_to_bytes()
-    return bytes(bytes_encoder[char] for char in token)
+    try:
+        return bytes(bytes_encoder[char] for char in token)
+    except KeyError:
+        # tokens that were not bytes-to-chars encoded
+        # ModernBERT adds such tokens to the vocab directly, which is wrong, but we still need to handle them
+        return token
 
 
 def get_hf_tokenizer_attribute(