From a888b63ffa86ce736cc007aed2b40b36746bdfbd Mon Sep 17 00:00:00 2001
From: Flavia Beo
Date: Wed, 6 Nov 2024 14:59:45 -0300
Subject: [PATCH] Revert "[Bugfix] Fix edge-case crash when using chat with
 the Mistral Tekken Tokenizer (#10051)"

This reverts commit 2bcbae704c0d52913c6a2887260fc6bde6c20361.
---
 tests/models/decoder_only/language/test_mistral.py | 9 +++------
 vllm/transformers_utils/tokenizers/mistral.py      | 8 ++------
 2 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py
index 6ec4b7e7e3f71..5be44c54a717c 100644
--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -10,22 +10,19 @@
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    # Mistral-Nemo is too big for CI, but passes locally
+    # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
 MISTRAL_FORMAT_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.3",
-    # uses the v3-Tekken tokenizer
-    "mistralai/Ministral-8B-Instruct-2410",
-    # Mistral-Nemo is to big for CI, but passes locally
-    # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 SYMBOLIC_LANG_PROMPTS = [
     "勇敢な船乗りについての詩を書く", # japanese
     "寫一首關於勇敢的水手的詩", # chinese
-    "ပုံပြင်လေးပြောပြပါ်:\n", # burmese
-    "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625
 ]
 
 # for function calling
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index ccffdcc2a4df2..896f70bc1dafd 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -254,7 +254,7 @@ def decode(self,
                skip_special_tokens: bool = True) -> str:
         assert (
             skip_special_tokens
-        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
+        ), "Skipping special tokens is not supported for Mistral tokenizers."
 
         if isinstance(ids, int):
             ids = [ids]
@@ -268,16 +268,12 @@ def convert_ids_to_tokens(
         # TODO(Patrick) - potentially allow special tokens to not be skipped
         assert (
             skip_special_tokens
-        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
+        ), "Skipping special tokens is not supported for Mistral tokenizers."
 
         assert isinstance(self.tokenizer,
                           (Tekkenizer, SentencePieceTokenizer)), type(
                               self.tokenizer)
 
-        if isinstance(self.tokenizer, Tekkenizer):
-            # skip special tokens
-            ids = [i for i in ids if i > self.tokenizer.num_special_tokens]
-
         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
 
         if any("�" in t for t in tokens):
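
For context on the behavioral change: the final hunk restores the unguarded
path in convert_ids_to_tokens, where every token ID is passed straight to
id_to_piece. The guard being removed was added in #10051 to avoid an
edge-case crash during chat with the Tekken tokenizer. Below is a minimal
sketch of the two paths, assuming only what the hunk shows (a
Tekkenizer-like tokenizer exposing num_special_tokens and id_to_piece, with
special-token IDs occupying the lowest range); the helper names are
illustrative, not vLLM API.

    from typing import List

    def pieces_with_guard(tokenizer, ids: List[int]) -> List[str]:
        # Pre-revert behavior: mirror the deleted filter by dropping IDs
        # in the special-token range before piece lookup.
        ids = [i for i in ids if i > tokenizer.num_special_tokens]
        return [tokenizer.id_to_piece(i) for i in ids]

    def pieces_without_guard(tokenizer, ids: List[int]) -> List[str]:
        # Post-revert behavior: all IDs, special tokens included, go
        # directly to id_to_piece.
        return [tokenizer.id_to_piece(i) for i in ids]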