Skip to content

Commit

Permalink
Merge branch 'openvinotoolkit:master' into master
Browse files: browse the repository at this point in the history
  • Loading branch information
mryzhov authored Jun 19, 2024
2 parents dd5a49f + 950ad70 commit a8d1630
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 9 deletions.
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -341,8 +341,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >79.06</td>
<td >4340</td>
<td >79.41</td>
<td >4774</td>
</tr>
<tr>
<td >Tiktoken</td>
Expand Down Expand Up @@ -591,6 +591,18 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >100.00</td>
<td >217</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >rinna/bilingual-gpt-neox-4b</td>
<td >75.12</td>
<td >217</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >rinna/bilingual-gpt-neox-4b_slow</td>
<td >90.78</td>
<td >217</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >t5-base</td>
Expand Down
23 changes: 17 additions & 6 deletions python/openvino_tokenizers/hf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ def modify_sentencepiece_model(
else:
new_piece = model.pieces[idx]

if skip_special_tokens and new_piece.type != 2: # type 2 is for unk symbol
if skip_special_tokens and new_piece.type not in (2, 4): # type 2 is for unk symbol
new_piece.type = 3 # make it control symbol so it will not decode during detokenization
elif not skip_special_tokens and new_piece.type == 3:
new_piece.type = 4 # change control type to userdef type
Expand Down Expand Up @@ -489,19 +489,30 @@ def convert_sentencepiece_model_tokenizer(
raise OVTypeError("Cannot convert tokenizer of this type without `.model` file.")

is_chatglm = getattr(hf_tokenizer, "name", None) == "GLMTokenizer"
add_bos_token = add_eos_token = None
if is_chatglm:
add_eos_token = False
elif hasattr(hf_tokenizer, "add_eos_token"):
elif hasattr(hf_tokenizer, "build_inputs_with_special_tokens"):
_fake_token_id = -0.5
try:
_ids = hf_tokenizer.build_inputs_with_special_tokens([_fake_token_id])
add_bos_token = _ids[0] != _fake_token_id
add_eos_token = _ids[-1] != _fake_token_id
except:
pass

if add_eos_token is None and hasattr(hf_tokenizer, "add_eos_token"):
add_eos_token = hf_tokenizer.add_eos_token or False
else:
elif add_eos_token is None:
add_eos_token = (
getattr(hf_tokenizer, "truncation_side", "") == "right"
or getattr(hf_tokenizer, "padding_side", "") == "right"
)

add_bos_token = (
getattr(hf_tokenizer, "add_bos_token", add_eos_token) and hf_tokenizer.bos_token_id is not None
) or False
if add_bos_token is None:
add_bos_token = (
getattr(hf_tokenizer, "add_bos_token", add_eos_token) and hf_tokenizer.bos_token_id is not None
) or False

if add_special_tokens is False:
add_bos_token = add_eos_token = False
Expand Down
2 changes: 1 addition & 1 deletion tests/pass_rates.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
"tests/tokenizers_test.py::test_": 0.8824711639286963
"tests/tokenizers_test.py::test_": 0.8805354436773868
}
1 change: 1 addition & 0 deletions tests/tokenizers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def unpack_strings(strings):
"THUDM/chatglm3-6b",
"t5-base",
"facebook/musicgen-small",
"rinna/bilingual-gpt-neox-4b",
]
tiktiken_models = [
"Qwen/Qwen-14B-Chat",
Expand Down

0 comments on commit a8d1630

Please sign in to comment.