diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 15bdae38d1d46..11ac28e758c39 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -176,6 +176,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
     )
+    add_special_tokens: Optional[bool] = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to False (as is the "
+            "default)."),
+    )
     include_stop_str_in_output: Optional[bool] = Field(
         default=False,
         description=(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 7b52e10952462..afd87f49c1c45 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -163,7 +163,9 @@ async def create_chat_completion(
         try:
             # Tokenize/detokenize depending on prompt format (string/token list)
             prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
-                request, prompt=prompt, add_special_tokens=False)
+                request,
+                prompt=prompt,
+                add_special_tokens=request.add_special_tokens)
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
             decoding_config = await self.engine.get_decoding_config()
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index ae659d19c878b..6b5a62efc7f20 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -131,7 +131,8 @@ def _validate_prompt_and_tokenize(
             prompt_ids: Optional[List[int]] = None,
             truncate_prompt_tokens: Optional[Annotated[int,
                                                        Field(ge=1)]] = None,
-            add_special_tokens: bool = True) -> Tuple[List[int], str]:
+            add_special_tokens: Optional[bool] = True
+    ) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
         if (prompt and prompt_ids):
@@ -139,11 +140,12 @@
                 "Only one of prompt or prompt_ids should be provided.")
 
         if prompt_ids is None:
-            # When using OpenAIServingChat for chat completions, the
-            # special tokens (e.g., BOS) have already been added by the
-            # chat template. Therefore, we do not need to add them again.
-            # Set add_special_tokens to False to avoid adding the BOS tokens
-            # again.
+            # When using OpenAIServingChat for chat completions, for
+            # most models the special tokens (e.g., BOS) have already
+            # been added by the chat template. Therefore, we do not
+            # need to add them again.
+            # Set add_special_tokens to False (by default) to avoid
+            # adding the BOS tokens again.
             tokenizer_kwargs: Dict[str, Any] = {
                 "add_special_tokens": add_special_tokens
            }
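
For illustration, a minimal client-side sketch of how the new request field could be exercised, assuming a vLLM OpenAI-compatible server at http://localhost:8000/v1 and a placeholder model name; since add_special_tokens is a vLLM extension rather than part of the official OpenAI schema, the openai Python client would forward it via extra_body:

# Hypothetical usage of the add_special_tokens request field added by this patch.
# The base_url, api_key, and model name below are assumptions; adjust them to
# match your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    # Opt in to tokenizer special tokens (e.g. BOS) on top of the chat
    # template; per the patch, the default remains False.
    extra_body={"add_special_tokens": True},
)
print(response.choices[0].message.content)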