Skip to content

Commit

Permalink
[Bugfix] Fix edge-case crash when using chat with the Mistral Tekken …
Browse files Browse the repository at this point in the history
…Tokenizer (vllm-project#10051)

Signed-off-by: Travis Johnson <[email protected]>
Signed-off-by: Sumit Dubey <[email protected]>
  • Loading branch information
tjohnson31415 authored and sumitd2 committed Nov 14, 2024
1 parent 4edbd2f commit 1bc3c94
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 5 deletions.
9 changes: 6 additions & 3 deletions tests/models/decoder_only/language/test_mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,22 @@

MODELS = [
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mistral-7B-Instruct-v0.3",
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
]

MISTRAL_FORMAT_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3",
# uses the v3-Tekken tokenizer
"mistralai/Ministral-8B-Instruct-2410",
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
]

SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
SYMBOLIC_LANG_PROMPTS = [
"勇敢な船乗りについての詩を書く", # japanese
"寫一首關於勇敢的水手的詩", # chinese
"ပုံပြင်လေးပြောပြပါ်:\n", # burmese
"Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625
]

# for function calling
Expand Down
8 changes: 6 additions & 2 deletions vllm/transformers_utils/tokenizers/mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def decode(self,
skip_special_tokens: bool = True) -> str:
assert (
skip_special_tokens
), "Skipping special tokens is not supported for Mistral tokenizers."
), "skip_special_tokens=False is not supported for Mistral tokenizers."

if isinstance(ids, int):
ids = [ids]
Expand All @@ -268,12 +268,16 @@ def convert_ids_to_tokens(
# TODO(Patrick) - potentially allow special tokens to not be skipped
assert (
skip_special_tokens
), "Skipping special tokens is not supported for Mistral tokenizers."
), "skip_special_tokens=False is not supported for Mistral tokenizers."

assert isinstance(self.tokenizer,
(Tekkenizer, SentencePieceTokenizer)), type(
self.tokenizer)

if isinstance(self.tokenizer, Tekkenizer):
# skip special tokens
ids = [i for i in ids if i > self.tokenizer.num_special_tokens]

tokens = [self.tokenizer.id_to_piece(id) for id in ids]

if any("�" in t for t in tokens):
Expand Down

0 comments on commit 1bc3c94

Please sign in to comment.