Changes pending from vllm-project/vllm#2976
Signed-off-by: Joe Runde <[email protected]>
Authored by njhill, committed by joerunde on Mar 11, 2024
1 parent: 0efdf51, commit: 8c9f28e
Showing 6 changed files with 27 additions and 6 deletions.
vllm/engine/llm_engine.py (5 additions, 2 deletions)
@@ -1013,12 +1013,15 @@ def _check_stop(self, seq: Sequence,
             if seq.output_text.endswith(stop_str):
                 self._finalize_sequence(seq, sampling_params, stop_str)
                 seq.status = SequenceStatus.FINISHED_STOPPED
+                seq.stop_reason = stop_str
                 return
-        if seq.get_last_token_id() in sampling_params.stop_token_ids:
+        last_token_id = seq.get_last_token_id()
+        if last_token_id in sampling_params.stop_token_ids:
             stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens(
-                seq.get_last_token_id())
+                last_token_id)
             self._finalize_sequence(seq, sampling_params, stop_str)
             seq.status = SequenceStatus.FINISHED_STOPPED
+            seq.stop_reason = last_token_id
             return

         # Check if the sequence has generated the EOS token.
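To make the new semantics concrete, here is a minimal standalone sketch (not vLLM code) of how stop_reason is populated alongside finish_reason: a matched stop string yields that string, a matched stop token yields its integer id, and anything else (including EOS) leaves it as None.

# Standalone sketch; the function name and arguments are illustrative only.
def resolve_stop(output_text: str, last_token_id: int,
                 stop: list, stop_token_ids: list):
    for stop_str in stop:
        if output_text.endswith(stop_str):
            return "stop", stop_str       # stop_reason is the matched stop string
    if last_token_id in stop_token_ids:
        return "stop", last_token_id      # stop_reason is the stop token id
    return None, None                     # not stopped here (EOS/length handled elsewhere)

print(resolve_stop("Hello world###", 42, ["###"], [50256]))  # ('stop', '###')
print(resolve_stop("Hello", 50256, ["###"], [50256]))        # ('stop', 50256)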
vllm/entrypoints/openai/protocol.py (4 additions)
@@ -254,6 +254,7 @@ class CompletionResponseChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class CompletionResponse(BaseModel):
@@ -270,6 +271,7 @@ class CompletionResponseStreamChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class CompletionStreamResponse(BaseModel):
@@ -291,6 +293,7 @@ class ChatCompletionResponseChoice(BaseModel):
     message: ChatMessage
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class ChatCompletionResponse(BaseModel):
@@ -312,6 +315,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     delta: DeltaMessage
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class ChatCompletionStreamResponse(BaseModel):
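The typing is the same in all four choice models: stop_reason may be None, a token id (int), or a stop string (str). A self-contained sketch (not the vLLM classes, and assuming pydantic v2) of how such a field serializes:

# Standalone pydantic sketch; ChoiceSketch is illustrative, not a vLLM class.
from typing import Literal, Optional, Union
from pydantic import BaseModel

class ChoiceSketch(BaseModel):
    text: str
    finish_reason: Optional[Literal["stop", "length"]] = None
    stop_reason: Union[None, int, str] = None

print(ChoiceSketch(text="Hi###", finish_reason="stop", stop_reason="###").model_dump_json())
# {"text":"Hi###","finish_reason":"stop","stop_reason":"###"}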
vllm/entrypoints/openai/serving_chat.py (3 additions, 1 deletion)
@@ -213,7 +213,8 @@ async def chat_completion_stream_generator(
                     index=i,
                     delta=DeltaMessage(content=delta_text),
                     logprobs=logprobs,
-                    finish_reason=output.finish_reason)
+                    finish_reason=output.finish_reason,
+                    stop_reason=output.stop_reason)
                 chunk = ChatCompletionStreamResponse(
                     id=request_id,
                     object=chunk_object_type,
@@ -271,6 +272,7 @@ async def chat_completion_full_generator(
                 message=ChatMessage(role=role, content=output.text),
                 logprobs=logprobs,
                 finish_reason=output.finish_reason,
+                stop_reason=output.stop_reason,
             )
             choices.append(choice_data)
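Client side, the chat completions endpoint now carries stop_reason on every choice. A hedged usage sketch (not part of the commit); the server URL, model name, and stop settings are assumptions:

# Assumes a vLLM OpenAI-compatible server is already running locally.
import requests

payload = {
    "model": "my-model",  # hypothetical model name
    "messages": [{"role": "user", "content": "Reply with OK and then say DONE."}],
    "stop": ["DONE"],     # a matched stop string is echoed back as stop_reason
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload).json()
choice = resp["choices"][0]
print(choice["finish_reason"], choice["stop_reason"])  # e.g. stop DONE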
vllm/entrypoints/openai/serving_completion.py (4 additions)
@@ -261,6 +261,7 @@ async def completion_stream_generator(
                     previous_texts[i] = output.text
                     previous_num_tokens[i] = len(output.token_ids)
                     finish_reason = output.finish_reason
+                    stop_reason = output.stop_reason
                     response_json = CompletionStreamResponse(
                         id=request_id,
                         created=created_time,
@@ -271,6 +272,7 @@
                             text=delta_text,
                             logprobs=logprobs,
                             finish_reason=finish_reason,
+                            stop_reason=stop_reason,
                         )
                     ]).model_dump_json()
                 yield f"data: {response_json}\n\n"
@@ -295,6 +297,7 @@ async def completion_stream_generator(
                         text="",
                         logprobs=logprobs,
                         finish_reason=output.finish_reason,
+                        stop_reason=output.stop_reason,
                     )
                 ],
                 usage=final_usage,
@@ -354,6 +357,7 @@ def request_output_to_completion_response(
                 text=output_text,
                 logprobs=logprobs,
                 finish_reason=output.finish_reason,
+                stop_reason=output.stop_reason,
             )
             choices.append(choice_data)
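For the completions endpoint, stop_reason also distinguishes which stop condition fired. A hedged sketch (not part of the commit); the URL, model name, stop string, and token id are assumptions:

# Assumes a vLLM OpenAI-compatible server is already running locally.
import requests

payload = {
    "model": "my-model",          # hypothetical model name
    "prompt": "List three colors:",
    "max_tokens": 64,
    "stop": ["\n\n"],             # -> stop_reason would be the string "\n\n"
    "stop_token_ids": [128009],   # -> stop_reason would be the integer 128009
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload).json()
choice = resp["choices"][0]
print(choice["finish_reason"])  # "stop" or "length"
print(choice["stop_reason"])    # matched stop string, token id, or None (EOS/length)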
vllm/outputs.py (10 additions, 3 deletions)
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Union
 import time

 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
@@ -18,6 +18,9 @@ class CompletionOutput:
         logprobs: The log probabilities of the top probability words at each
             position if the logprobs are requested.
         finish_reason: The reason why the sequence is finished.
+        stop_reason: The stop string or token id that caused the completion to stop,
+            None if the completion finished for some other reason including
+            encountering the EOS token.
         lora_request: The LoRA request that was used to generate the output.
     """

@@ -29,6 +32,7 @@ def __init__(
         cumulative_logprob: float,
         logprobs: Optional[SampleLogprobs],
         finish_reason: Optional[str] = None,
+        stop_reason: Union[int, str, None] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.index = index
@@ -37,6 +41,7 @@ def __init__(
         self.cumulative_logprob = cumulative_logprob
         self.logprobs = logprobs
         self.finish_reason = finish_reason
+        self.stop_reason = stop_reason
         self.lora_request = lora_request

     def finished(self) -> bool:
@@ -48,7 +53,8 @@ def __repr__(self) -> str:
                 f"token_ids={self.token_ids}, "
                 f"cumulative_logprob={self.cumulative_logprob}, "
                 f"logprobs={self.logprobs}, "
-                f"finish_reason={self.finish_reason})")
+                f"finish_reason={self.finish_reason}, "
+                f"stop_reason={self.stop_reason})")


 class RequestOutput:
@@ -111,7 +117,8 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
                             seq.get_output_token_ids(),
                             seq.get_cumulative_logprob(),
                             seq.output_logprobs if include_logprobs else None,
-                             SequenceStatus.get_finished_reason(seq.status))
+                             SequenceStatus.get_finished_reason(seq.status),
+                             seq.stop_reason)
             for seq in top_n_seqs
         ]
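With CompletionOutput carrying the field, the offline API exposes it directly. A hedged sketch (not from the commit); the model name and stop settings are assumptions:

# Assumes the model can be downloaded; any small model would do.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(max_tokens=64, stop=["\n"], stop_token_ids=[2])
outputs = llm.generate(["The capital of France is"], params)

completion = outputs[0].outputs[0]
print(completion.finish_reason)  # "stop" or "length"
print(completion.stop_reason)    # "\n", 2, or None (EOS / length / other)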
vllm/sequence.py (1 addition)
@@ -159,6 +159,7 @@ def __init__(
         # Initialize the logical token blocks with the prompt token ids.
         self._append_tokens_to_blocks(prompt_token_ids)
         self.status = SequenceStatus.WAITING
+        self.stop_reason: Union[int, str, None] = None

         # Used for incremental detokenization
         self.prefix_offset = 0
