diff --git a/README.md b/README.md
index aae1dfc88..4e77fe686 100644
--- a/README.md
+++ b/README.md
@@ -264,12 +264,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
BPE |
- 95.84 |
+ 96.74 |
3439 |
SentencePiece |
- 86.36 |
+ 86.08 |
2896 |
@@ -279,7 +279,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
WordPiece |
- 82.55 |
+ 90.43 |
533 |
@@ -300,13 +300,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
BPE |
EleutherAI/gpt-j-6b |
- 98.34 |
+ 98.90 |
181 |
BPE |
EleutherAI/gpt-neo-125m |
- 98.34 |
+ 98.90 |
181 |
@@ -330,7 +330,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
BPE |
Salesforce/codegen-16B-multi |
- 97.24 |
+ 97.79 |
181 |
@@ -354,7 +354,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
BPE |
facebook/bart-large-mnli |
- 97.24 |
+ 98.90 |
181 |
@@ -372,31 +372,31 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
BPE |
gpt2 |
- 97.24 |
+ 98.90 |
181 |
BPE |
laion/CLIP-ViT-bigG-14-laion2B-39B-b160k |
- 61.33 |
+ 65.19 |
181 |
BPE |
microsoft/deberta-base |
- 96.13 |
+ 98.90 |
181 |
BPE |
roberta-base |
- 96.13 |
+ 98.90 |
181 |
BPE |
sentence-transformers/all-roberta-large-v1 |
- 96.13 |
+ 98.90 |
181 |
@@ -456,7 +456,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
SentencePiece |
camembert-base_slow |
- 75.14 |
+ 74.03 |
181 |
@@ -486,13 +486,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
SentencePiece |
xlm-roberta-base |
- 98.90 |
+ 97.24 |
181 |
SentencePiece |
xlm-roberta-base_slow |
- 98.90 |
+ 97.24 |
181 |
@@ -528,19 +528,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
WordPiece |
ProsusAI/finbert |
- 80.49 |
+ 95.12 |
41 |
WordPiece |
bert-base-multilingual-cased |
- 80.49 |
+ 95.12 |
41 |
WordPiece |
bert-large-cased |
- 80.49 |
+ 95.12 |
41 |
@@ -552,13 +552,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
WordPiece |
distilbert-base-uncased-finetuned-sst-2-english |
- 80.49 |
+ 95.12 |
41 |
WordPiece |
google/electra-base-discriminator |
- 80.49 |
+ 95.12 |
41 |
@@ -588,7 +588,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
WordPiece |
rasa/LaBSE |
- 73.17 |
+ 87.80 |
41 |
@@ -600,7 +600,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
WordPiece |
squeezebert/squeezebert-uncased |
- 80.49 |
+ 95.12 |
41 |
diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
index 9a0396a05..fcc20fd41 100644
--- a/python/openvino_tokenizers/hf_parser.py
+++ b/python/openvino_tokenizers/hf_parser.py
@@ -51,14 +51,21 @@
WhitespaceSplitStep,
WordPieceTokenizationStep,
)
+from .utils import filter_re2_incompatible
-def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep:
+def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> List[RegexNormalizationStep]:
regex_search_pattern = normalizer_dict["pattern"].get("String") or normalizer_dict["pattern"]["Regex"]
- return RegexNormalizationStep(
- regex_search_pattern=regex_search_pattern,
- replace_term=normalizer_dict["content"],
- )
+ filtered_pattern = filter_re2_incompatible(regex_search_pattern)
+ if filtered_pattern == "":
+ return []
+
+ return [
+ RegexNormalizationStep(
+ regex_search_pattern=regex_search_pattern,
+ replace_term=normalizer_dict["content"],
+ )
+ ]
def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[NormalizationStep]:
@@ -368,7 +375,6 @@ def modify_sentencepiece_model(
sp_model_path: Path,
add_tokens: Dict[int, str],
skip_special_tokens: bool = False,
- reference_vocab: Optional[List[str]] = None,
) -> None:
model_pb = import_protobuf()
model = model_pb.ModelProto()
@@ -573,7 +579,7 @@ def convert_tiktoken_model_tokenizer(
pipeline.add_steps(
[
NormalizeUnicode("NFC"),
- RegexSplitStep(split_pattern),
+ RegexSplitStep(split_pattern, behaviour="contiguous"),
BytesToCharsStep(),
BPETokenizationStep.from_tiktoken_encoding(encoding),
TruncationStep.from_hf_object(hf_tokenizer),
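A minimal usage sketch (not part of the diff) of the new `parse_replace_normalizer` contract, with illustrative inputs mirroring the `{"pattern": ..., "content": ...}` layout of a Hugging Face `Replace` normalizer: a fully re2-incompatible regex is now filtered out and yields no step, while a compatible pattern still yields a single `RegexNormalizationStep`.

```python
# Illustrative sketch: expected behaviour of the updated parse_replace_normalizer,
# which now returns a list of steps instead of a single step.
from openvino_tokenizers.hf_parser import parse_replace_normalizer

plain = {"pattern": {"String": "``"}, "content": '"'}            # re2-compatible
lookahead = {"pattern": {"Regex": r"\s+(?!\S)"}, "content": ""}  # negative lookahead

print(len(parse_replace_normalizer(plain)))      # 1 -> one RegexNormalizationStep
print(len(parse_replace_normalizer(lookahead)))  # 0 -> pattern filtered out, no step
```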
diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py
index 3cab78b08..51fe4cc43 100644
--- a/python/openvino_tokenizers/tokenizer_pipeline.py
+++ b/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -2,6 +2,7 @@
# Copyright (C) 2018-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
+import logging
import weakref
from dataclasses import dataclass, field
from functools import singledispatchmethod
@@ -25,6 +26,10 @@
TOKENIZER_NAME,
)
from .str_pack import pack_string, pack_strings
+from .utils import has_incompatible_re2_op
+
+
+logger = logging.getLogger(__name__)
@dataclass
@@ -98,6 +103,15 @@ class RegexNormalizationStep(NormalizationStep):
regex_search_pattern: str
replace_term: str
+ def __post_init__(self):
+ self.vet_search_pattern()
+
+ def vet_search_pattern(self) -> None:
+ if has_incompatible_re2_op(self.regex_search_pattern):
+ logger.warning(
+ "RegexNormalization pattern is not supported, operation output might differ from the original tokenizer."
+ )
+
@classmethod
def strip_accents_regex(cls) -> "RegexNormalizationStep":
return cls(regex_search_pattern=r"\p{Mn}", replace_term="")
@@ -168,6 +182,20 @@ class RegexSplitStep(PreTokenizatinStep):
invert: bool = False
behaviour: str = "remove"
+ def __post_init__(self):
+ self.vet_split_pattern()
+
+ def vet_split_pattern(self) -> None:
+ if r"(?!\S)" in self.split_pattern:
+ # rewrite regex pattern to get results closer to qwen.cpp results
+ logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
+ self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])")
+
+ if has_incompatible_re2_op(self.split_pattern):
+ logger.warning(
+ "RegexSplit pattern is not supported, operation output might differ from the original tokenizer."
+ )
+
@classmethod
def bert_whitespace_splitter(cls) -> "RegexSplitStep":
return cls(split_pattern=r"\s+", invert=False)
@@ -481,6 +509,7 @@ def set_token_id(self, vocab: Optional[List[str]]) -> None:
def token_id(self) -> Optional[int]:
return self._token_id
+
@dataclass
class TokenWithTypeId:
token_type_id: Optional[int] = None
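A short sketch (not part of the diff) of what the new `__post_init__` vetting does for `RegexSplitStep`: the re2-unsupported negative lookahead `(?!\S)` is rewritten to `(?:$|[^\S])` at construction time, and a warning is logged.

```python
# Illustrative sketch: the lookahead rewrite performed by
# RegexSplitStep.__post_init__ via vet_split_pattern.
import logging

from openvino_tokenizers.tokenizer_pipeline import RegexSplitStep

logging.basicConfig(level=logging.WARNING)

# Qwen-style split pattern containing the unsupported negative lookahead.
step = RegexSplitStep(split_pattern=r"\s+(?!\S)|\s+")

# The pattern was rewritten in place when the dataclass was constructed.
print(step.split_pattern)  # \s+(?:$|[^\S])|\s+
```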
diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py
index c31f8ed08..55f2a6792 100644
--- a/python/openvino_tokenizers/utils.py
+++ b/python/openvino_tokenizers/utils.py
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
import logging
+import re
from typing import Dict, Optional, Sequence, Tuple, Union
from openvino import Model, Type
@@ -87,7 +88,7 @@ def greedy_decoder(input) -> Model:
def add_greedy_decoding(
- text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
+ text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
) -> Model:
ppp = PrePostProcessor(text_generation_model)
ppp.output(logits_output).postprocess().custom(greedy_decoder)
@@ -109,3 +110,22 @@ def change_outputs_type(model: Model, output_type: Type) -> Model:
for idx, _ in enumerate(model.outputs):
ppp.output(idx).tensor().set_element_type(output_type)
return ppp.build()
+
+
+def has_incompatible_re2_op(pattern: str) -> bool:
+ return "(?=" in pattern or "(?!" in pattern or "(?<=" in pattern or "(? str:
+ not_filtered = []
+
+ for subpattern in (match.group() for match in _subpattern_regex.finditer(pattern)):
+ if has_incompatible_re2_op(subpattern):
+ logging.warning(f"Subpattern `{subpattern}` is not supported by re2 and filtered out.")
+ continue
+ not_filtered.append(subpattern)
+
+ return "|".join(not_filtered)
diff --git a/tests/conftest.py b/tests/conftest.py
index 353d73ae6..5cd1ed82d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -56,11 +56,9 @@ def add_tokenizer_type(row):
new_readme.write(old_readme)
new_readme.write(
"## Test Results\n\n"
- "This report is autogenerated and includes tokenizers and detokenizers tests. "
- "The `Output Matched, %` column shows the percent of test strings "
- "for which the results of OpenVINO and Hugingface Tokenizers are the same. "
- "To update the report run `pytest tokenizers_test.py --update_readme` in "
- "`modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n"
+ "This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column "
+ "shows the percent of test strings for which the results of OpenVINO and Hugingface Tokenizers are the same. "
+ "To update the report run `pytest --update_readme tokenizers_test.py` in `tests` directory.\n\n"
"### Output Match by Tokenizer Type\n\n"
)
is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
diff --git a/tests/pass_rates.json b/tests/pass_rates.json
index 04ed42352..2e58bdf36 100644
--- a/tests/pass_rates.json
+++ b/tests/pass_rates.json
@@ -1,3 +1,3 @@
{
- "tokenizers_test.py::test_": 0.9110740586355426
+ "tokenizers_test.py::test_": 0.9201055995553703
}
\ No newline at end of file
diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py
index 7a8acd78d..8a3d046d9 100644
--- a/tests/tokenizers_test.py
+++ b/tests/tokenizers_test.py
@@ -278,7 +278,7 @@ def test_hf_wordpiece_tokenizers(wordpiece_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
packed_strings = pack_strings([test_string])
- hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
+ hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)
for output_name, hf_result in hf_tokenized.items():
@@ -298,7 +298,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
packed_strings = pack_strings(test_string)
- hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True)
+ hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True, truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)
for output_name, hf_result in hf_tokenized.items():
@@ -317,7 +317,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
def test_sentencepiece_model_tokenizer(sentencepice_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = sentencepice_tokenizers
- hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
+ hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(pack_strings([test_string]))
for output_name, hf_result in hf_tokenized.items():
@@ -364,7 +364,7 @@ def test_hf_bpe_tokenizers_outputs(bpe_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = bpe_tokenizers
packed_strings = pack_strings([test_string])
- hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
+ hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)
for output_name, hf_result in hf_tokenized.items():
@@ -410,7 +410,7 @@ def test_bpe_detokenizer(
def test_tiktoken_tokenizers(tiktoken_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = tiktoken_tokenizers
- hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
+ hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(pack_strings([test_string]))
for output_name, hf_result in hf_tokenized.items():
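For context, a small illustration (not part of the diff) of why `truncation=True` is now passed to the Hugging Face tokenizers in these tests: without it, Hugging Face returns the full token sequence for very long test strings, while the OpenVINO tokenizer applies its `TruncationStep`, so the output shapes would not match.

```python
# Illustrative sketch: effect of truncation=True on a long input with a
# WordPiece tokenizer such as bert-base-uncased.
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
long_text = "test " * 1000  # far longer than the 512-token model limit

full = hf_tokenizer([long_text], return_tensors="np")
capped = hf_tokenizer([long_text], return_tensors="np", truncation=True)

print(full["input_ids"].shape[-1])    # roughly 1002 tokens, exceeds model_max_length
print(capped["input_ids"].shape[-1])  # 512: capped at the model_max_length
```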