Merge pull request #6755 from howl-anderson/bugfix/chinese_tokenization_in_rasa_v1

[bugfix][WIP] Fix the bug of Chinese tokenization in 1.10.12
tabergma authored Sep 30, 2020
2 parents 7257a24 + 1f503e1 commit 0dce587
Showing 4 changed files with 59 additions and 8 deletions.
1 change: 1 addition & 0 deletions changelog/6755.bugfix.rst
@@ -0,0 +1 @@
Treat the length of an OOV token as 1 to fix a token alignment issue when OOV tokens occur.
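
For context, a minimal worked illustration of the alignment problem this entry describes (the example text and its sub-token split are assumptions taken from the new test case below, not part of the changelog):

# The whitespace-level token "去东畈村" spans characters 0..4 in the original text.
whitespace_token = "去东畈村"
sub_tokens = ["去", "东", "[UNK]", "村"]  # "畈" is out of vocabulary for bert-base-chinese

# Using raw string lengths, the "[UNK]" placeholder counts as 5 characters,
# so the summed sub-token lengths overshoot the token boundary (1 + 1 + 5 + 1 = 8 > 4).
raw_length = sum(len(t) for t in sub_tokens)

# Treating the OOV placeholder as length 1 keeps the total at 4 and the offsets aligned.
fixed_length = sum(1 if t == "[UNK]" else len(t) for t in sub_tokens)

print(raw_length, fixed_length)  # 8 4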
2 changes: 1 addition & 1 deletion rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -222,7 +222,7 @@ def _tokenize_example(
token_ids_out += split_token_ids

tokens_out += train_utils.align_tokens(
split_token_strings, token.end, token.start
split_token_strings, token.end, token.start, self.tokenizer.unk_token
)

return tokens_out, token_ids_out
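
The only change here is that `_tokenize_example` now forwards `self.tokenizer.unk_token` to `align_tokens`, since the OOV placeholder string differs between tokenizers and cannot be hard-coded. A hedged sketch of what the underlying HuggingFace tokenizer reports for the Chinese example used in the tests (the model is downloaded on first run, and the exact sub-token output is an assumption based on the test's note that "畈" is out of vocabulary):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
print(tokenizer.unk_token)            # "[UNK]" for BERT-style tokenizers
print(tokenizer.tokenize("去东畈村"))  # expected: ["去", "东", "[UNK]", "村"]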
20 changes: 16 additions & 4 deletions rasa/utils/train_utils.py
@@ -80,7 +80,10 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:


def align_tokens(
tokens_in: List[Text], token_end: int, token_start: int
tokens_in: List[Text],
token_end: int,
token_start: int,
unk_token: Optional[Text] = None,
) -> List[Token]:
"""Align sub-tokens of Language model with tokens return by the WhitespaceTokenizer.
@@ -95,22 +98,31 @@ def align_tokens(
current_token_offset = token_start

for index, string in enumerate(tokens_in):
# There is no absolute guarantee that the length of an OOV token is always 1.
# However, some documents (e.g.
# https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#22-tokenization)
# show that it is very likely to be 1 in most cases.
# In most languages (except Chinese), OOV tokens tend to be emoji characters.
# Chinese has a very large character set, so some rare characters may become OOV.
# This is not a perfect solution, but in practice it resolves most OOV-related issues.
string_len = len(string) if unk_token is None or string != unk_token else 1

if index == 0:
if index == len(tokens_in) - 1:
s_token_end = token_end
else:
s_token_end = current_token_offset + len(string)
s_token_end = current_token_offset + string_len
tokens_out.append(Token(string, token_start, end=s_token_end))
elif index == len(tokens_in) - 1:
tokens_out.append(Token(string, current_token_offset, end=token_end))
else:
tokens_out.append(
Token(
string, current_token_offset, end=current_token_offset + len(string)
string, current_token_offset, end=current_token_offset + string_len
)
)

current_token_offset += len(string)
current_token_offset += string_len

return tokens_out

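
A hedged usage sketch of the patched helper (the call uses the signature shown in the diff above; the sub-token list is what bert-base-chinese is expected to produce for "去东畈村"):

from rasa.utils.train_utils import align_tokens

# "畈" became the OOV placeholder; the whitespace token spans characters 0..4.
sub_tokens = ["去", "东", "[UNK]", "村"]
tokens = align_tokens(sub_tokens, token_end=4, token_start=0, unk_token="[UNK]")

# Without unk_token, len("[UNK]") == 5 would push every following offset past the
# original token boundary; with the fix each sub-token spans exactly one character.
print([(t.text, t.start, t.end) for t in tokens])
# -> [("去", 0, 1), ("东", 1, 2), ("[UNK]", 2, 3), ("村", 3, 4)]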
44 changes: 41 additions & 3 deletions tests/nlu/tokenizers/test_lm_tokenizer.py
@@ -15,10 +15,11 @@
# TODO: need to fix this failing test
@pytest.mark.xfail(strict=False)
@pytest.mark.parametrize(
"model_name, texts, expected_tokens, expected_indices, expected_num_token_ids",
"model_name, model_weights, texts, expected_tokens, expected_indices, expected_num_token_ids",
[
(
"bert",
None,
[
"Good evening.",
"you're",
@@ -66,8 +67,32 @@
],
[4, 4, 5, 5, 13],
),
(
"bert",
"bert-base-chinese",
[
"晚上好", # normal & easy case
"没问题!", # `!` is a Chinese punctuation
"去东畈村", # `畈` is a OOV token for bert-base-chinese
"好的😃", # include a emoji which is common in Chinese text-based chat
],
[
["晚", "上", "好"],
["没", "问", "题", "!"],
["去", "东", "畈", "村"],
["好", "的", "😃"],
],
[
[(0, 1), (1, 2), (2, 3)],
[(0, 1), (1, 2), (2, 3), (3, 4)],
[(0, 1), (1, 2), (2, 3), (3, 4)],
[(0, 1), (1, 2), (2, 3)],
],
[3, 4, 4, 3],
),
(
"gpt",
None,
[
"Good evening.",
"hello",
@@ -106,6 +131,7 @@
),
(
"gpt2",
None,
[
"Good evening.",
"hello",
@@ -158,6 +184,7 @@
),
(
"xlnet",
None,
[
"Good evening.",
"hello",
@@ -208,6 +235,7 @@
),
(
"distilbert",
None,
[
"Good evening.",
"you're",
@@ -257,6 +285,7 @@
),
(
"roberta",
None,
[
"Good evening.",
"hello",
@@ -310,10 +339,19 @@
],
)
def test_lm_tokenizer_edge_cases(
model_name, texts, expected_tokens, expected_indices, expected_num_token_ids
model_name,
model_weights,
texts,
expected_tokens,
expected_indices,
expected_num_token_ids,
):

transformers_config = {"model_name": model_name}
if model_weights is None:
model_weights_config = {}
else:
model_weights_config = {"model_weights": model_weights}
transformers_config = {**{"model_name": model_name}, **model_weights_config}

transformers_nlp = HFTransformersNLP(transformers_config)
lm_tokenizer = LanguageModelTokenizer()
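
For completeness, an end-to-end sketch of the scenario the new parametrize case covers (component and constant names are taken from Rasa 1.10.x and should be treated as assumptions; running this downloads the bert-base-chinese weights):

from rasa.nlu.constants import TEXT
from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
from rasa.nlu.training_data import Message
from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP

transformers_nlp = HFTransformersNLP(
    {"model_name": "bert", "model_weights": "bert-base-chinese"}
)
lm_tokenizer = LanguageModelTokenizer()

message = Message("去东畈村")  # "畈" is OOV for bert-base-chinese
transformers_nlp.process(message)
tokens = lm_tokenizer.tokenize(message, TEXT)

# Per the expected_tokens/expected_indices above, the offsets should stay aligned:
# [("去", 0, 1), ("东", 1, 2), ("畈", 2, 3), ("村", 3, 4)]
print([(t.text, t.start, t.end) for t in tokens])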
