[bugfix][WIP] Fix Chinese tokenization bug in 1.10.12 #6755

Merged
5 changes: 4 additions & 1 deletion rasa/nlu/tokenizers/convert_tokenizer.py
@@ -71,8 +71,11 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
             # clean tokens (remove special chars and empty tokens)
             split_token_strings = self._clean_tokens(split_token_strings)

+            # ConveRT models were removed from the official GitHub repo
+            # (https://github.com/RasaHQ/rasa/issues/6806),
+            # so here we pass a fake UNK token to make the tests pass for now
             tokens_out += train_utils.align_tokens(
-                split_token_strings, token_end, token_start
+                split_token_strings, token_end, token_start, ""
             )

         return tokens_out
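The hunk's own context shows that empty sub-tokens are already filtered out by _clean_tokens, so no ConveRT sub-token can ever equal the empty string passed here; the new UNK length handling therefore leaves ConveRT behaviour unchanged. A minimal sketch of that reasoning, using made-up sub-token strings that are not taken from the PR:

    split_token_strings = ["fore", "cast"]  # hypothetical cleaned sub-tokens; "" never appears
    unk_token = ""                          # the fake UNK token passed above
    for sub_token in split_token_strings:
        # the special case never triggers, so lengths are computed as before
        string_len = len(sub_token) if sub_token != unk_token else 1
        print(sub_token, string_len)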
2 changes: 1 addition & 1 deletion rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -222,7 +222,7 @@ def _tokenize_example(
             token_ids_out += split_token_ids

             tokens_out += train_utils.align_tokens(
-                split_token_strings, token.end, token.start
+                split_token_strings, token.end, token.start, self.tokenizer.unk_token
             )

         return tokens_out, token_ids_out
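The underlying cause of the Chinese misalignment is that HuggingFace tokenizers replace characters they cannot encode with their UNK token, whose surface length (for BERT, len("[UNK]") == 5) no longer matches the single character it stands for. A minimal sketch of that behaviour, assuming the transformers library and the bert-base-chinese weights are available and that the chosen character is out of the model's vocabulary:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    print(tokenizer.unk_token)       # "[UNK]" for BERT-style tokenizers
    print(tokenizer.tokenize("𤭢"))   # assumed out-of-vocabulary character -> ["[UNK]"]
    # The character occupies one position in the message text, but the sub-token
    # string is five characters long, which is why align_tokens now needs unk_token.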
10 changes: 6 additions & 4 deletions rasa/utils/train_utils.py
@@ -80,7 +80,7 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:


 def align_tokens(
-    tokens_in: List[Text], token_end: int, token_start: int
+    tokens_in: List[Text], token_end: int, token_start: int, unk_token: str
 ) -> List[Token]:
     """Align sub-tokens of the language model with tokens returned by the WhitespaceTokenizer.

@@ -95,22 +95,24 @@ def align_tokens(
     current_token_offset = token_start

     for index, string in enumerate(tokens_in):
+        string_len = len(string) if string != unk_token else 1

         if index == 0:
             if index == len(tokens_in) - 1:
                 s_token_end = token_end
             else:
-                s_token_end = current_token_offset + len(string)
+                s_token_end = current_token_offset + string_len
             tokens_out.append(Token(string, token_start, end=s_token_end))
         elif index == len(tokens_in) - 1:
             tokens_out.append(Token(string, current_token_offset, end=token_end))
         else:
             tokens_out.append(
                 Token(
-                    string, current_token_offset, end=current_token_offset + len(string)
+                    string, current_token_offset, end=current_token_offset + string_len
                 )
             )

-        current_token_offset += len(string)
+        current_token_offset += string_len

     return tokens_out
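To see why the UNK-aware length matters, consider a whitespace token of three Chinese characters spanning offsets 0..3 whose middle character the language model cannot encode. The sketch below is self-contained and not taken from the PR or Rasa's test suite; the Token class is a simplified stand-in for rasa.nlu.tokenizers.tokenizer.Token, and align_tokens here is a condensed but behaviourally equivalent re-implementation of the patched function:

    from typing import List, NamedTuple


    class Token(NamedTuple):
        # simplified stand-in for rasa.nlu.tokenizers.tokenizer.Token
        text: str
        start: int
        end: int


    def align_tokens(
        tokens_in: List[str], token_end: int, token_start: int, unk_token: str
    ) -> List[Token]:
        # condensed re-implementation of the patched function above
        tokens_out = []
        current_token_offset = token_start
        for index, string in enumerate(tokens_in):
            # an UNK placeholder stands for exactly one original character
            string_len = len(string) if string != unk_token else 1
            start = token_start if index == 0 else current_token_offset
            end = token_end if index == len(tokens_in) - 1 else current_token_offset + string_len
            tokens_out.append(Token(string, start, end))
            current_token_offset += string_len
        return tokens_out


    # Sub-tokens for a hypothetical three-character token such as "下雨天"
    # (offsets 0..3), assuming the middle character could not be encoded and
    # was therefore mapped to "[UNK]" by the language model.
    sub_tokens = ["下", "[UNK]", "天"]

    print(align_tokens(sub_tokens, 3, 0, "[UNK]"))
    # [Token(text='下', start=0, end=1), Token(text='[UNK]', start=1, end=2),
    #  Token(text='天', start=2, end=3)] -- offsets stay inside the original token

    print(align_tokens(sub_tokens, 3, 0, ""))
    # Without UNK handling, "[UNK]" is measured as 5 characters: the middle token
    # ends at offset 6 and the last token starts at 6 while ending at 3, which is
    # the misalignment this PR fixes.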
