Skip to content

Commit

Permalink
Update Prepend Regex (#317)
Browse files Browse the repository at this point in the history
(cherry picked from commit 5ccd56d)
  • Loading branch information
apaniukov committed Nov 15, 2024
1 parent e30c99f commit 5af75b5
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 75 deletions.
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >88.32</td>
<td >89.19</td>
<td >6633</td>
</tr>
<tr>
Expand Down Expand Up @@ -621,19 +621,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
<td >94.33</td>
<td >100.00</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy_sp_backend</td>
<td >95.14</td>
<td >98.38</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_sp_backend</td>
<td >96.76</td>
<td >100.00</td>
<td >247</td>
</tr>
<tr>
Expand Down Expand Up @@ -669,19 +669,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct</td>
<td >95.14</td>
<td >100.00</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct_legacy_sp_backend</td>
<td >94.33</td>
<td >97.57</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct_sp_backend</td>
<td >95.95</td>
<td >99.19</td>
<td >247</td>
</tr>
<tr>
Expand Down
2 changes: 1 addition & 1 deletion python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def replace_spaces_metaspace(cls, replace_term=r"▁") -> "RegexNormalizationSte

@classmethod
def prepend_regex(cls, string: str) -> "RegexNormalizationStep":
    """Create a normalization step that prepends `string` to the input.

    `([\\s\\S])` matches any first character, including a newline; the older
    pattern `(.)` missed inputs that start with "\\n". `$1` re-inserts the
    matched character after the prepended string. An empty input has no
    first character to match, so nothing is prepended.
    """
    # NOTE: stale pre-#317 pattern r"(^)(.)" with replace r"{string}$2" removed —
    # it was unreachable dead code left over from the diff.
    return cls(regex_search_pattern=r"(?:^)([\s\S])", replace_term=rf"{string}$1")

@classmethod
def prepend_with_check_regex(cls, string: str, check_string: str) -> "RegexNormalizationStep":
Expand Down
27 changes: 21 additions & 6 deletions src/regex_normalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

#include "regex_normalization.hpp"
#include "utils.hpp"
#include <regex>


using namespace ov;
Expand All @@ -18,20 +17,36 @@ namespace {
* @return std::string Reformatted replace pattern
*/
std::string reformat_replace_pattern(std::string replace_pattern) {
    // Convert backslash-style group references (\1 .. \9) into RE2-style
    // dollar references ($1 .. $9). Implemented with plain string search
    // to avoid std::regex (the <regex> dependency was removed).
    for (char digit = '1'; digit <= '9'; ++digit) {
        const std::string from = "\\" + std::string(1, digit);
        const std::string to = "$" + std::string(1, digit);
        size_t pos = 0;
        while ((pos = replace_pattern.find(from, pos)) != std::string::npos) {
            replace_pattern.replace(pos, from.length(), to);
            // Skip past the replacement so a produced "$N" is never rescanned.
            pos += to.length();
        }
    }
    return replace_pattern;
}

// Legacy search patterns mapped to their modern replacements; consulted by
// fix_search_pattern() below to keep previously serialized tokenizers working.
// The second entry mirrors the Python-side prepend_regex change: `(.)` does
// not match "\n", while `[\s\S]` matches any character.
const std::map<std::string, std::string> search_pattern_rewrites = {
{R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))", R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))"},
{R"((^)(.))", R"((^)([\s\S]))"}
};

/**
* @brief Fix old search pattern for backward compatibility
*
* @param search_pattern Search pattern to replace
* @return std::string Replaced search pattern
*/
std::string fix_search_pattern(std::string search_pattern) {
if (search_pattern == R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))") {
return R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))";
std::string fix_search_pattern(const std::string search_pattern) {
const auto it = search_pattern_rewrites.find(search_pattern);
if (it == search_pattern_rewrites.end()) {
return search_pattern;
}
return search_pattern;
std::cerr << "Replace search pattern: `" << search_pattern << "` -> `" << it->second << "`" << std::endl;
return it->second;
}

} // namespace
Expand Down
14 changes: 13 additions & 1 deletion tests/layer_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,24 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled
replace_term=r"\1",
)
),
("", "", RegexNormalizationStep.prepend_regex("▁")),
("\n", "▁\n", RegexNormalizationStep.prepend_regex("▁")),
("n", "▁n", RegexNormalizationStep.prepend_regex("▁")),
(" ", "▁ ", RegexNormalizationStep.prepend_regex("▁")),
( # test backward compatibility with old regex
"\n",
"▁\n",
RegexNormalizationStep(
regex_search_pattern=r"(^)(.)",
replace_term=r"▁\2",
)
),
]
)
def test_regex_normalization(test_string, expected, layer):
    """Compile a single-normalizer model and compare its output with `expected`.

    The model runs on a one-element batch, so the result tensor is indexed
    with [0] to obtain the normalized string for comparison.
    """
    compiled_model = create_normalization_model(layer)
    res_ov = compiled_model([test_string])[0]
    # NOTE: stale pre-#317 assertion `res_ov == expected` removed — the
    # committed check compares the first batch element.
    assert res_ov[0] == expected


############################################
Expand Down
2 changes: 1 addition & 1 deletion tests/pass_rates.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
    "tests/tokenizers_test.py::test_": 0.9297414485305926
}
Loading

0 comments on commit 5af75b5

Please sign in to comment.