Merge branch 'openvinotoolkit:master' into master

mryzhov · Jun 19, 2024 · a8d1630 · a8d1630
2 parents dd5a49f + 950ad70
commit a8d1630
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -341,8 +341,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >79.06</td>
-      <td >4340</td>
+      <td >79.41</td>
+      <td >4774</td>
     </tr>
     <tr>
       <td >Tiktoken</td>
@@ -591,6 +591,18 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
       <td >100.00</td>
       <td >217</td>
     </tr>
+    <tr>
+      <td >SentencePiece</td>
+      <td >rinna/bilingual-gpt-neox-4b</td>
+      <td >75.12</td>
+      <td >217</td>
+    </tr>
+    <tr>
+      <td >SentencePiece</td>
+      <td >rinna/bilingual-gpt-neox-4b_slow</td>
+      <td >90.78</td>
+      <td >217</td>
+    </tr>
     <tr>
       <td >SentencePiece</td>
       <td >t5-base</td>

diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
@@ -448,7 +448,7 @@ def modify_sentencepiece_model(
         else:
             new_piece = model.pieces[idx]
 
-        if skip_special_tokens and new_piece.type != 2:  # type 2 is for unk symbol
+        if skip_special_tokens and new_piece.type not in (2, 4):  # type 2 is for unk symbol, type 4 is for userdef symbol
             new_piece.type = 3  # make it control symbol so it will not decode during detokenization
         elif not skip_special_tokens and new_piece.type == 3:
             new_piece.type = 4  # change control type to userdef type
@@ -489,19 +489,30 @@ def convert_sentencepiece_model_tokenizer(
         raise OVTypeError("Cannot convert tokenizer of this type without `.model` file.")
 
     is_chatglm = getattr(hf_tokenizer, "name", None) == "GLMTokenizer"
+    add_bos_token = add_eos_token = None
     if is_chatglm:
         add_eos_token = False
-    elif hasattr(hf_tokenizer, "add_eos_token"):
+    elif hasattr(hf_tokenizer, "build_inputs_with_special_tokens"):
+        _fake_token_id = -0.5
+        try:
+            _ids = hf_tokenizer.build_inputs_with_special_tokens([_fake_token_id])
+            add_bos_token = _ids[0] != _fake_token_id
+            add_eos_token = _ids[-1] != _fake_token_id
+        except:
+            pass
+
+    if add_eos_token is None and hasattr(hf_tokenizer, "add_eos_token"):
         add_eos_token = hf_tokenizer.add_eos_token or False
-    else:
+    elif add_eos_token is None:
         add_eos_token = (
             getattr(hf_tokenizer, "truncation_side", "") == "right"
             or getattr(hf_tokenizer, "padding_side", "") == "right"
         )
 
-    add_bos_token = (
-        getattr(hf_tokenizer, "add_bos_token", add_eos_token) and hf_tokenizer.bos_token_id is not None
-    ) or False
+    if add_bos_token is None:
+        add_bos_token = (
+            getattr(hf_tokenizer, "add_bos_token", add_eos_token) and hf_tokenizer.bos_token_id is not None
+        ) or False
 
     if add_special_tokens is False:
         add_bos_token = add_eos_token = False

diff --git a/tests/pass_rates.json b/tests/pass_rates.json
@@ -1,3 +1,3 @@
 {
-    "tests/tokenizers_test.py::test_": 0.8824711639286963
+    "tests/tokenizers_test.py::test_": 0.8805354436773868
 }
diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py
@@ -128,6 +128,7 @@ def unpack_strings(strings):
     "THUDM/chatglm3-6b",
     "t5-base",
     "facebook/musicgen-small",
+    "rinna/bilingual-gpt-neox-4b",
 ]
 tiktiken_models = [
     "Qwen/Qwen-14B-Chat",