Check Regex For Supported Operations
Rewrite regex pattern from QWEN
Filter Normalization Patterns
Log Regex Filtering and Rewriting
Add truncation to HF tokenizers in tests
apaniukov committed Feb 12, 2024
1 parent 73e3592 commit c53ccb7
Showing 7 changed files with 94 additions and 41 deletions.
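For context on the regex checks this commit introduces: the tokenizer operations rely on RE2, which rejects lookahead and lookbehind assertions at compile time, so such patterns have to be detected and then rewritten or filtered during conversion. A minimal illustration, assuming the google-re2 Python binding is installed (it is not a dependency added by this commit):

```python
# Not part of the commit: shows why lookaround patterns need special handling.
# Assumes `pip install google-re2`, which exposes an `re`-like `re2` module.
import re2

try:
    re2.compile(r"\s+(?!\S)")  # negative lookahead, as in the Qwen split pattern
except Exception as exc:  # re2 refuses lookaround at parse time
    print(f"re2 rejected the pattern: {exc}")
```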
44 changes: 22 additions & 22 deletions README.md
@@ -264,12 +264,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tbody>
<tr>
<td >BPE</td>
<td >95.84</td>
<td >96.74</td>
<td >3439</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >86.36</td>
<td >86.08</td>
<td >2896</td>
</tr>
<tr>
@@ -279,7 +279,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >WordPiece</td>
<td >82.55</td>
<td >90.43</td>
<td >533</td>
</tr>
</tbody>
@@ -300,13 +300,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-j-6b</td>
<td >98.34</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-neo-125m</td>
<td >98.34</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
@@ -330,7 +330,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >BPE</td>
<td >Salesforce/codegen-16B-multi</td>
<td >97.24</td>
<td >97.79</td>
<td >181</td>
</tr>
<tr>
@@ -354,7 +354,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >BPE</td>
<td >facebook/bart-large-mnli</td>
<td >97.24</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
@@ -372,31 +372,31 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >BPE</td>
<td >gpt2</td>
<td >97.24</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
<td >61.33</td>
<td >65.19</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >microsoft/deberta-base</td>
<td >96.13</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >roberta-base</td>
<td >96.13</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >sentence-transformers/all-roberta-large-v1</td>
<td >96.13</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
@@ -456,7 +456,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >camembert-base_slow</td>
<td >75.14</td>
<td >74.03</td>
<td >181</td>
</tr>
<tr>
@@ -486,13 +486,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >xlm-roberta-base</td>
<td >98.90</td>
<td >97.24</td>
<td >181</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >xlm-roberta-base_slow</td>
<td >98.90</td>
<td >97.24</td>
<td >181</td>
</tr>
<tr>
@@ -528,19 +528,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >WordPiece</td>
<td >ProsusAI/finbert</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
<td >WordPiece</td>
<td >bert-base-multilingual-cased</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
<td >WordPiece</td>
<td >bert-large-cased</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
@@ -552,13 +552,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >WordPiece</td>
<td >distilbert-base-uncased-finetuned-sst-2-english</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
<td >WordPiece</td>
<td >google/electra-base-discriminator</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
@@ -588,7 +588,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >WordPiece</td>
<td >rasa/LaBSE</td>
<td >73.17</td>
<td >87.80</td>
<td >41</td>
</tr>
<tr>
@@ -600,7 +600,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >WordPiece</td>
<td >squeezebert/squeezebert-uncased</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
</tbody>
20 changes: 13 additions & 7 deletions python/openvino_tokenizers/hf_parser.py
@@ -51,14 +51,21 @@
WhitespaceSplitStep,
WordPieceTokenizationStep,
)
from .utils import filter_re2_incompatible


def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep:
def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> List[RegexNormalizationStep]:
regex_search_pattern = normalizer_dict["pattern"].get("String") or normalizer_dict["pattern"]["Regex"]
return RegexNormalizationStep(
regex_search_pattern=regex_search_pattern,
replace_term=normalizer_dict["content"],
)
filtered_pattern = filter_re2_incompatible(regex_search_pattern)
if filtered_pattern == "":
return []

return [
RegexNormalizationStep(
regex_search_pattern=regex_search_pattern,
replace_term=normalizer_dict["content"],
)
]


def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[NormalizationStep]:
@@ -368,7 +375,6 @@ def modify_sentencepiece_model(
sp_model_path: Path,
add_tokens: Dict[int, str],
skip_special_tokens: bool = False,
reference_vocab: Optional[List[str]] = None,
) -> None:
model_pb = import_protobuf()
model = model_pb.ModelProto()
@@ -573,7 +579,7 @@ def convert_tiktoken_model_tokenizer(
pipeline.add_steps(
[
NormalizeUnicode("NFC"),
RegexSplitStep(split_pattern),
RegexSplitStep(split_pattern, behaviour="contiguous"),
BytesToCharsStep(),
BPETokenizationStep.from_tiktoken_encoding(encoding),
TruncationStep.from_hf_object(hf_tokenizer),
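A hedged sketch of how the updated `parse_replace_normalizer` behaves: a Replace normalizer whose entire pattern relies on lookbehind is filtered down to an empty string and produces no step, while an RE2-compatible pattern still yields a single `RegexNormalizationStep`. The two normalizer dicts below are invented examples, not taken from a real tokenizer.json:

```python
from openvino_tokenizers.hf_parser import parse_replace_normalizer

# Hypothetical "Replace" normalizer entries in tokenizer.json form.
compatible = {"pattern": {"Regex": r"\s+"}, "content": " "}
incompatible = {"pattern": {"Regex": r"(?<=\d)\s+"}, "content": " "}

print(parse_replace_normalizer(compatible))    # [RegexNormalizationStep(...)] -- one step emitted
print(parse_replace_normalizer(incompatible))  # [] -- lookbehind filtered out, step skipped
```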
29 changes: 29 additions & 0 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -2,6 +2,7 @@
# Copyright (C) 2018-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import logging
import weakref
from dataclasses import dataclass, field
from functools import singledispatchmethod
@@ -25,6 +26,10 @@
TOKENIZER_NAME,
)
from .str_pack import pack_string, pack_strings
from .utils import has_incompatible_re2_op


logger = logging.getLogger(__name__)


@dataclass
@@ -98,6 +103,15 @@ class RegexNormalizationStep(NormalizationStep):
regex_search_pattern: str
replace_term: str

def __post_init__(self):
self.vet_search_pattern()

def vet_search_pattern(self) -> None:
if has_incompatible_re2_op(self.regex_search_pattern):
logger.warning(
"RegexNormalization pattern is not supported, operation output might differ from the original tokenizer."
)

@classmethod
def strip_accents_regex(cls) -> "RegexNormalizationStep":
return cls(regex_search_pattern=r"\p{Mn}", replace_term="")
@@ -168,6 +182,20 @@ class RegexSplitStep(PreTokenizatinStep):
invert: bool = False
behaviour: str = "remove"

def __post_init__(self):
self.vet_split_pattern()

def vet_split_pattern(self) -> None:
if r"(?!\S)" in self.split_pattern:
# rewrite regex pattern to get results closer to qwen.cpp results
logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])")

if has_incompatible_re2_op(self.split_pattern):
logger.warning(
"RegexSplit pattern is not supported, operation output might differ from the original tokenizer."
)

@classmethod
def bert_whitespace_splitter(cls) -> "RegexSplitStep":
return cls(split_pattern=r"\s+", invert=False)
@@ -481,6 +509,7 @@ def set_token_id(self, vocab: Optional[List[str]]) -> None:
def token_id(self) -> Optional[int]:
return self._token_id


@dataclass
class TokenWithTypeId:
token_type_id: Optional[int] = None
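To illustrate the `(?!\S)` rewrite in `vet_split_pattern` (the demo below uses Python's `re`, which does support lookahead, purely to compare the two forms): `(?!\S)` asserts that no non-whitespace character follows, while the replacement `(?:$|[^\S])` consumes either end-of-input or one whitespace character. The results are close to, but not identical with, the original pattern, which is why the rewrite is logged as a warning.

```python
import re

# Whitespace clause from a Qwen-style pretokenization pattern (illustrative excerpt).
original = r"\s+(?!\S)"
rewritten = original.replace(r"(?!\S)", r"(?:$|[^\S])")
print(rewritten)  # \s+(?:$|[^\S])

text = "a  b   "
print([m.group() for m in re.finditer(original, text)])   # [' ', '   ']
print([m.group() for m in re.finditer(rewritten, text)])  # ['  ', '   ']
```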
22 changes: 21 additions & 1 deletion python/openvino_tokenizers/utils.py
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import logging
import re
from typing import Dict, Optional, Sequence, Tuple, Union

from openvino import Model, Type
@@ -87,7 +88,7 @@ def greedy_decoder(input) -> Model:


def add_greedy_decoding(
text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
) -> Model:
ppp = PrePostProcessor(text_generation_model)
ppp.output(logits_output).postprocess().custom(greedy_decoder)
@@ -109,3 +110,22 @@ def change_outputs_type(model: Model, output_type: Type) -> Model:
for idx, _ in enumerate(model.outputs):
ppp.output(idx).tensor().set_element_type(output_type)
return ppp.build()


def has_incompatible_re2_op(pattern: str) -> bool:
return "(?=" in pattern or "(?!" in pattern or "(?<=" in pattern or "(?<!" in pattern


_subpattern_regex = re.compile(r"(?:[^()|]+|\([^)]*\))+")


def filter_re2_incompatible(pattern: str) -> str:
not_filtered = []

for subpattern in (match.group() for match in _subpattern_regex.finditer(pattern)):
if has_incompatible_re2_op(subpattern):
logging.warning(f"Subpattern `{subpattern}` is not supported by re2 and filtered out.")
continue
not_filtered.append(subpattern)

return "|".join(not_filtered)
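A quick usage sketch for the two helpers above; the pattern is an invented example. Top-level alternatives that contain lookaround are dropped (each with a warning), and the surviving alternatives are re-joined:

```python
from openvino_tokenizers.utils import filter_re2_incompatible, has_incompatible_re2_op

pattern = r"(?=\d)|\p{L}+|\s+"  # made-up pattern with one lookahead alternative

print(has_incompatible_re2_op(pattern))  # True -- "(?=" triggers the check
print(filter_re2_incompatible(pattern))  # \p{L}+|\s+ -- the lookahead alternative is dropped
```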
8 changes: 3 additions & 5 deletions tests/conftest.py
@@ -56,11 +56,9 @@ def add_tokenizer_type(row):
new_readme.write(old_readme)
new_readme.write(
"## Test Results\n\n"
"This report is autogenerated and includes tokenizers and detokenizers tests. "
"The `Output Matched, %` column shows the percent of test strings "
"for which the results of OpenVINO and Hugingface Tokenizers are the same. "
"To update the report run `pytest tokenizers_test.py --update_readme` in "
"`modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n"
"This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column "
"shows the percent of test strings for which the results of OpenVINO and Hugingface Tokenizers are the same. "
"To update the report run `pytest --update_readme tokenizers_test.py` in `tests` directory.\n\n"
"### Output Match by Tokenizer Type\n\n"
)
is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
2 changes: 1 addition & 1 deletion tests/pass_rates.json
@@ -1,3 +1,3 @@
{
"tokenizers_test.py::test_": 0.9110740586355426
"tokenizers_test.py::test_": 0.9201055995553703
}
10 changes: 5 additions & 5 deletions tests/tokenizers_test.py
@@ -278,7 +278,7 @@ def test_hf_wordpiece_tokenizers(wordpiece_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
packed_strings = pack_strings([test_string])

hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)

for output_name, hf_result in hf_tokenized.items():
@@ -298,7 +298,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
packed_strings = pack_strings(test_string)

hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True)
hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True, truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)

for output_name, hf_result in hf_tokenized.items():
@@ -317,7 +317,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
def test_sentencepiece_model_tokenizer(sentencepice_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = sentencepice_tokenizers

hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(pack_strings([test_string]))

for output_name, hf_result in hf_tokenized.items():
@@ -364,7 +364,7 @@ def test_hf_bpe_tokenizers_outputs(bpe_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = bpe_tokenizers
packed_strings = pack_strings([test_string])

hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)

for output_name, hf_result in hf_tokenized.items():
@@ -410,7 +410,7 @@ def test_bpe_detokenizer(
def test_tiktoken_tokenizers(tiktoken_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = tiktoken_tokenizers

hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(pack_strings([test_string]))

for output_name, hf_result in hf_tokenized.items():
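The test changes pass `truncation=True` so that the Hugging Face tokenizer caps sequences at its `model_max_length`, mirroring the `TruncationStep` the conversion pipeline builds from the same tokenizer (see `TruncationStep.from_hf_object` in the tiktoken path above); without it, over-long test strings would differ in length for reasons unrelated to this commit. A small sketch of the effect, using a placeholder checkpoint:

```python
from transformers import AutoTokenizer

# Placeholder model; any tokenizer with a finite model_max_length shows the effect.
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "some very long test string " * 1000
full = hf_tokenizer([text], return_tensors="np")
capped = hf_tokenizer([text], return_tensors="np", truncation=True)

print(full["input_ids"].shape)    # (1, N) with N well above the model maximum
print(capped["input_ids"].shape)  # (1, 512) -- clipped to hf_tokenizer.model_max_length
```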
