Skip to content

Commit

Permalink
Changes pending from vllm-project/vllm#2976
Browse files Browse the repository at this point in the history
Include matched stop string/token in responses

[Cherry-picked from open upstream PR vllm-project/vllm#2976]

Currently a finish_reason of "stop" is returned if any of the following are encountered:
- One of the provided stop strings
- One of the provided stop tokens
- The EOS token

It can be useful to know specifically which of these caused the sequence generation to stop, especially since by default the matched stop strings/tokens are omitted from the output text (and possibly from the output token_ids as well).

This PR adds a "stop_reason" field to the CompletionOutput class which will contain the matched stop string or integer token id. It will be None otherwise, including the EOS token case. This means in particular that EOS can be inferred from (finish_reason == "stop" and stop_reason is None).

I've also added it to the OpenAI server responses, but I'm not sure whether it should be included there, since it isn't part of the official API.

Signed-off-by: Joe Runde <[email protected]>
  • Loading branch information
njhill authored and joerunde committed Mar 11, 2024
1 parent d326f87 commit eff6f5e
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 5 deletions.
7 changes: 5 additions & 2 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,12 +1013,15 @@ def _check_stop(self, seq: Sequence,
if seq.output_text.endswith(stop_str):
self._finalize_sequence(seq, sampling_params, stop_str)
seq.status = SequenceStatus.FINISHED_STOPPED
seq.stop_reason = stop_str
return
if seq.get_last_token_id() in sampling_params.stop_token_ids:
last_token_id = seq.get_last_token_id()
if last_token_id in sampling_params.stop_token_ids:
stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens(
seq.get_last_token_id())
last_token_id)
self._finalize_sequence(seq, sampling_params, stop_str)
seq.status = SequenceStatus.FINISHED_STOPPED
seq.stop_reason = last_token_id
return

# Check if the sequence has generated the EOS token.
Expand Down
4 changes: 4 additions & 0 deletions vllm/entrypoints/openai/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ class CompletionResponseChoice(BaseModel):
text: str
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length"]] = None
stop_reason: Union[None, int, str] = None


class CompletionResponse(BaseModel):
Expand All @@ -270,6 +271,7 @@ class CompletionResponseStreamChoice(BaseModel):
text: str
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length"]] = None
stop_reason: Union[None, int, str] = None


class CompletionStreamResponse(BaseModel):
Expand All @@ -291,6 +293,7 @@ class ChatCompletionResponseChoice(BaseModel):
message: ChatMessage
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length"]] = None
stop_reason: Union[None, int, str] = None


class ChatCompletionResponse(BaseModel):
Expand All @@ -312,6 +315,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
delta: DeltaMessage
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length"]] = None
stop_reason: Union[None, int, str] = None


class ChatCompletionStreamResponse(BaseModel):
Expand Down
4 changes: 3 additions & 1 deletion vllm/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,8 @@ async def chat_completion_stream_generator(
index=i,
delta=DeltaMessage(content=delta_text),
logprobs=logprobs,
finish_reason=output.finish_reason)
finish_reason=output.finish_reason,
stop_reason=output.stop_reason)
chunk = ChatCompletionStreamResponse(
id=request_id,
object=chunk_object_type,
Expand Down Expand Up @@ -271,6 +272,7 @@ async def chat_completion_full_generator(
message=ChatMessage(role=role, content=output.text),
logprobs=logprobs,
finish_reason=output.finish_reason,
stop_reason=output.stop_reason,
)
choices.append(choice_data)

Expand Down
4 changes: 4 additions & 0 deletions vllm/entrypoints/openai/serving_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ async def completion_stream_generator(
previous_texts[i] = output.text
previous_num_tokens[i] = len(output.token_ids)
finish_reason = output.finish_reason
stop_reason = output.stop_reason
response_json = CompletionStreamResponse(
id=request_id,
created=created_time,
Expand All @@ -271,6 +272,7 @@ async def completion_stream_generator(
text=delta_text,
logprobs=logprobs,
finish_reason=finish_reason,
stop_reason=stop_reason,
)
]).model_dump_json()
yield f"data: {response_json}\n\n"
Expand All @@ -295,6 +297,7 @@ async def completion_stream_generator(
text="",
logprobs=logprobs,
finish_reason=output.finish_reason,
stop_reason=output.stop_reason,
)
],
usage=final_usage,
Expand Down Expand Up @@ -354,6 +357,7 @@ def request_output_to_completion_response(
text=output_text,
logprobs=logprobs,
finish_reason=output.finish_reason,
stop_reason=output.stop_reason,
)
choices.append(choice_data)

Expand Down
10 changes: 8 additions & 2 deletions vllm/outputs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import List, Optional, Union
import time

from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
Expand All @@ -18,6 +18,9 @@ class CompletionOutput:
logprobs: The log probabilities of the top probability words at each
position if the logprobs are requested.
finish_reason: The reason why the sequence is finished.
stop_reason: The stop string or token id that caused the completion to stop,
None if the completion finished for some other reason including
encountering the EOS token.
lora_request: The LoRA request that was used to generate the output.
"""

Expand All @@ -29,6 +32,7 @@ def __init__(
cumulative_logprob: float,
logprobs: Optional[SampleLogprobs],
finish_reason: Optional[str] = None,
stop_reason: Union[int, str, None] = None,
lora_request: Optional[LoRARequest] = None,
) -> None:
self.index = index
Expand All @@ -37,6 +41,7 @@ def __init__(
self.cumulative_logprob = cumulative_logprob
self.logprobs = logprobs
self.finish_reason = finish_reason
self.stop_reason = stop_reason
self.lora_request = lora_request

def finished(self) -> bool:
Expand All @@ -48,7 +53,8 @@ def __repr__(self) -> str:
f"token_ids={self.token_ids}, "
f"cumulative_logprob={self.cumulative_logprob}, "
f"logprobs={self.logprobs}, "
f"finish_reason={self.finish_reason})")
f"finish_reason={self.finish_reason}, "
f"stop_reason={self.stop_reason})")


class RequestOutput:
Expand Down
1 change: 1 addition & 0 deletions vllm/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ def __init__(
# Initialize the logical token blocks with the prompt token ids.
self._append_tokens_to_blocks(prompt_token_ids)
self.status = SequenceStatus.WAITING
self.stop_reason: Union[int, str, None] = None

# Used for incremental detokenization
self.prefix_offset = 0
Expand Down

0 comments on commit eff6f5e

Please sign in to comment.