Skip to content

Commit

Permalink
Update Prepend Regex (#317)
Browse files Browse the repository at this point in the history
(cherry picked from commit 5ccd56d)
  • Loading branch information
apaniukov committed Nov 15, 2024
1 parent e30c99f commit 5af75b5
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 75 deletions.
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >88.32</td>
<td >89.19</td>
<td >6633</td>
</tr>
<tr>
Expand Down Expand Up @@ -621,19 +621,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
<td >94.33</td>
<td >100.00</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy_sp_backend</td>
<td >95.14</td>
<td >98.38</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_sp_backend</td>
<td >96.76</td>
<td >100.00</td>
<td >247</td>
</tr>
<tr>
Expand Down Expand Up @@ -669,19 +669,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct</td>
<td >95.14</td>
<td >100.00</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct_legacy_sp_backend</td>
<td >94.33</td>
<td >97.57</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct_sp_backend</td>
<td >95.95</td>
<td >99.19</td>
<td >247</td>
</tr>
<tr>
Expand Down
2 changes: 1 addition & 1 deletion python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def replace_spaces_metaspace(cls, replace_term=r"▁") -> "RegexNormalizationSte

@classmethod
def prepend_regex(cls, string: str) -> "RegexNormalizationStep":
    """Create a normalization step that prepends `string` to the input.

    `([\\s\\S])` matches any first character, including a newline; the older
    pattern `(.)` missed inputs that start with "\\n". `$1` re-inserts the
    matched character after the prepended string. An empty input has no
    first character to match, so nothing is prepended.
    """
    # NOTE: stale pre-#317 pattern r"(^)(.)" with replace r"{string}$2" removed —
    # it was unreachable dead code left over from the diff.
    return cls(regex_search_pattern=r"(?:^)([\s\S])", replace_term=rf"{string}$1")

@classmethod
def prepend_with_check_regex(cls, string: str, check_string: str) -> "RegexNormalizationStep":
Expand Down
27 changes: 21 additions & 6 deletions src/regex_normalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

#include "regex_normalization.hpp"
#include "utils.hpp"
#include <regex>


using namespace ov;
Expand All @@ -18,20 +17,36 @@ namespace {
* @return std::string Reformatted replace pattern
*/
std::string reformat_replace_pattern(std::string replace_pattern) {
    // Convert backslash-style group references (\1 .. \9) into RE2-style
    // dollar references ($1 .. $9). Implemented with plain string search
    // to avoid std::regex (the <regex> dependency was removed).
    for (char digit = '1'; digit <= '9'; ++digit) {
        const std::string from = "\\" + std::string(1, digit);
        const std::string to = "$" + std::string(1, digit);
        size_t pos = 0;
        while ((pos = replace_pattern.find(from, pos)) != std::string::npos) {
            replace_pattern.replace(pos, from.length(), to);
            // Skip past the replacement so a produced "$N" is never rescanned.
            pos += to.length();
        }
    }
    return replace_pattern;
}

// Legacy search patterns mapped to their modern replacements; consulted by
// fix_search_pattern() below to keep previously serialized tokenizers working.
// The second entry mirrors the Python-side prepend_regex change: `(.)` does
// not match "\n", while `[\s\S]` matches any character.
const std::map<std::string, std::string> search_pattern_rewrites = {
{R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))", R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))"},
{R"((^)(.))", R"((^)([\s\S]))"}
};

/**
* @brief Fix old search pattern for backward compatibility
*
* @param search_pattern Search pattern to replace
* @return std::string Replaced search pattern
*/
std::string fix_search_pattern(std::string search_pattern) {
if (search_pattern == R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))") {
return R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))";
std::string fix_search_pattern(const std::string search_pattern) {
const auto it = search_pattern_rewrites.find(search_pattern);
if (it == search_pattern_rewrites.end()) {
return search_pattern;
}
return search_pattern;
std::cerr << "Replace search pattern: `" << search_pattern << "` -> `" << it->second << "`" << std::endl;
return it->second;
}

} // namespace
Expand Down
14 changes: 13 additions & 1 deletion tests/layer_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,24 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled
replace_term=r"\1",
)
),
("", "", RegexNormalizationStep.prepend_regex("▁")),
("\n", "▁\n", RegexNormalizationStep.prepend_regex("▁")),
("n", "▁n", RegexNormalizationStep.prepend_regex("▁")),
(" ", "▁ ", RegexNormalizationStep.prepend_regex("▁")),
( # test backward compatibility with old regex
"\n",
"▁\n",
RegexNormalizationStep(
regex_search_pattern=r"(^)(.)",
replace_term=r"▁\2",
)
),
]
)
def test_regex_normalization(test_string, expected, layer):
    """Compile a single-normalizer model and compare its output with `expected`.

    The model runs on a one-element batch, so the result tensor is indexed
    with [0] to obtain the normalized string for comparison.
    """
    compiled_model = create_normalization_model(layer)
    res_ov = compiled_model([test_string])[0]
    # NOTE: stale pre-#317 assertion `res_ov == expected` removed — the
    # committed check compares the first batch element.
    assert res_ov[0] == expected


############################################
Expand Down
2 changes: 1 addition & 1 deletion tests/pass_rates.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
    "tests/tokenizers_test.py::test_": 0.9297414485305926
}
Loading

0 comments on commit 5af75b5

Please sign in to comment.