[bugfix][WIP] Fix Chinese tokenization bug in 1.10.12 #6755

Merged
5 changes: 4 additions & 1 deletion rasa/nlu/tokenizers/convert_tokenizer.py
@@ -71,8 +71,11 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
             # clean tokens (remove special chars and empty tokens)
             split_token_strings = self._clean_tokens(split_token_strings)

+            # ConveRT models were removed from the official GitHub repo
+            # (https://github.com/RasaHQ/rasa/issues/6806),
+            # so here we pass a fake UNK token to make the tests pass for now
             tokens_out += train_utils.align_tokens(
-                split_token_strings, token_end, token_start
+                split_token_strings, token_end, token_start, ""
             )

         return tokens_out
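The hunk's own context shows that empty sub-tokens are already filtered out by _clean_tokens, so no ConveRT sub-token can ever equal the empty string passed here; the new UNK length handling therefore leaves ConveRT behaviour unchanged. A minimal sketch of that reasoning, using made-up sub-token strings that are not taken from the PR:

    split_token_strings = ["fore", "cast"]  # hypothetical cleaned sub-tokens; "" never appears
    unk_token = ""                          # the fake UNK token passed above
    for sub_token in split_token_strings:
        # the special case never triggers, so lengths are computed as before
        string_len = len(sub_token) if sub_token != unk_token else 1
        print(sub_token, string_len)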
2 changes: 1 addition & 1 deletion rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -222,7 +222,7 @@ def _tokenize_example(
             token_ids_out += split_token_ids

             tokens_out += train_utils.align_tokens(
-                split_token_strings, token.end, token.start
+                split_token_strings, token.end, token.start, self.tokenizer.unk_token
             )

         return tokens_out, token_ids_out
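The underlying cause of the Chinese misalignment is that HuggingFace tokenizers replace characters they cannot encode with their UNK token, whose surface length (for BERT, len("[UNK]") == 5) no longer matches the single character it stands for. A minimal sketch of that behaviour, assuming the transformers library and the bert-base-chinese weights are available and that the chosen character is out of the model's vocabulary:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    print(tokenizer.unk_token)       # "[UNK]" for BERT-style tokenizers
    print(tokenizer.tokenize("𤭢"))   # assumed out-of-vocabulary character -> ["[UNK]"]
    # The character occupies one position in the message text, but the sub-token
    # string is five characters long, which is why align_tokens now needs unk_token.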
10 changes: 6 additions & 4 deletions rasa/utils/train_utils.py
@@ -80,7 +80,7 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:


 def align_tokens(
-    tokens_in: List[Text], token_end: int, token_start: int
+    tokens_in: List[Text], token_end: int, token_start: int, unk_token: str
 ) -> List[Token]:
     """Align sub-tokens of the language model with tokens returned by the WhitespaceTokenizer.

@@ -95,22 +95,24 @@ def align_tokens(
     current_token_offset = token_start

     for index, string in enumerate(tokens_in):
+        string_len = len(string) if string != unk_token else 1

         if index == 0:
             if index == len(tokens_in) - 1:
                 s_token_end = token_end
             else:
-                s_token_end = current_token_offset + len(string)
+                s_token_end = current_token_offset + string_len
             tokens_out.append(Token(string, token_start, end=s_token_end))
         elif index == len(tokens_in) - 1:
             tokens_out.append(Token(string, current_token_offset, end=token_end))
         else:
             tokens_out.append(
                 Token(
-                    string, current_token_offset, end=current_token_offset + len(string)
+                    string, current_token_offset, end=current_token_offset + string_len
                 )
             )

-        current_token_offset += len(string)
+        current_token_offset += string_len

     return tokens_out
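To see why the UNK-aware length matters, consider a whitespace token of three Chinese characters spanning offsets 0..3 whose middle character the language model cannot encode. The sketch below is self-contained and not taken from the PR or Rasa's test suite; the Token class is a simplified stand-in for rasa.nlu.tokenizers.tokenizer.Token, and align_tokens here is a condensed but behaviourally equivalent re-implementation of the patched function:

    from typing import List, NamedTuple


    class Token(NamedTuple):
        # simplified stand-in for rasa.nlu.tokenizers.tokenizer.Token
        text: str
        start: int
        end: int


    def align_tokens(
        tokens_in: List[str], token_end: int, token_start: int, unk_token: str
    ) -> List[Token]:
        # condensed re-implementation of the patched function above
        tokens_out = []
        current_token_offset = token_start
        for index, string in enumerate(tokens_in):
            # an UNK placeholder stands for exactly one original character
            string_len = len(string) if string != unk_token else 1
            start = token_start if index == 0 else current_token_offset
            end = token_end if index == len(tokens_in) - 1 else current_token_offset + string_len
            tokens_out.append(Token(string, start, end))
            current_token_offset += string_len
        return tokens_out


    # Sub-tokens for a hypothetical three-character token such as "下雨天"
    # (offsets 0..3), assuming the middle character could not be encoded and
    # was therefore mapped to "[UNK]" by the language model.
    sub_tokens = ["下", "[UNK]", "天"]

    print(align_tokens(sub_tokens, 3, 0, "[UNK]"))
    # [Token(text='下', start=0, end=1), Token(text='[UNK]', start=1, end=2),
    #  Token(text='天', start=2, end=3)] -- offsets stay inside the original token

    print(align_tokens(sub_tokens, 3, 0, ""))
    # Without UNK handling, "[UNK]" is measured as 5 characters: the middle token
    # ends at offset 6 and the last token starts at 6 while ending at 3, which is
    # the misalignment this PR fixes.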
