[Frontend] Add Usage data in each chunk for chat_serving. vllm-projec…
yecohn authored and jimpang committed Jul 24, 2024
1 parent 13540d6 commit 5b654e2
Showing 2 changed files with 78 additions and 12 deletions.
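
For context, `continuous_usage_stats` is a vLLM-side extension of the OpenAI `stream_options` field: when it is set alongside `include_usage`, the serving code in this commit attaches a usage block to every streamed chunk rather than only the final one. Below is a minimal client-side sketch of the new behaviour, not part of the commit itself; the base URL and model name are placeholders, and it assumes an `openai` Python client recent enough to accept `stream_options`:

# Sketch only: placeholder server URL and model name; not part of this commit.
import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="placeholder-model",
        messages=[{"role": "user", "content": "Say hello"}],
        max_tokens=10,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True
        })
    async for chunk in stream:
        # With continuous_usage_stats, each chunk should carry usage data.
        if chunk.usage is not None:
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens,
                  chunk.usage.total_tokens)


asyncio.run(main())
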
40 changes: 32 additions & 8 deletions tests/entrypoints/openai/test_chat.py
@@ -295,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     async for chunk in stream:
         assert chunk.usage is None
 
-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True})
+    # Test stream=True, stream_options={"include_usage": True,
+    # "continuous_usage_stats": False}}
+    stream = await client.chat.completions.create(model=model_name,
+                                                  messages=messages,
+                                                  max_tokens=10,
+                                                  temperature=0.0,
+                                                  stream=True,
+                                                  stream_options={
+                                                      "include_usage":
+                                                      True,
+                                                      "continuous_usage_stats":
+                                                      False
+                                                  })
 
     async for chunk in stream:
         if chunk.choices[0].finish_reason is None:
@@ -338,6 +343,25 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
                                              stream=False,
                                              stream_options={"include_usage": True})
 
+    # Test stream=True, stream_options={"include_usage": True,
+    # "continuous_usage_stats": True}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True
+        },
+    )
+    async for chunk in stream:
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+
 
 # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
 # (i.e. using the same ordering as in the Completions API tests), the test
50 changes: 46 additions & 4 deletions vllm/entrypoints/openai/serving_chat.py
@@ -247,7 +247,15 @@ async def chat_completion_stream_generator(
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            usage = UsageInfo(prompt_tokens=prompt_tokens,
+                                              completion_tokens=0,
+                                              total_tokens=prompt_tokens)
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
+
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
 
@@ -277,7 +285,18 @@ async def chat_completion_stream_generator(
                                 model=model_name)
                             if (request.stream_options and
                                     request.stream_options.include_usage):
-                                chunk.usage = None
+                                if (request.stream_options.
+                                        continuous_usage_stats):
+                                    prompt_tokens = len(
+                                        res.prompt_token_ids)
+                                    usage = UsageInfo(
+                                        prompt_tokens=prompt_tokens,
+                                        completion_tokens=0,
+                                        total_tokens=prompt_tokens)
+                                    chunk.usage = usage
+                                else:
+                                    chunk.usage = None
+
                             data = chunk.model_dump_json(
                                 exclude_unset=True)
                             yield f"data: {data}\n\n"
@@ -336,7 +355,19 @@ async def chat_completion_stream_generator(
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens +
+                                completion_tokens,
+                            )
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
+
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
                 else:
@@ -356,7 +387,18 @@ async def chat_completion_stream_generator(
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens +
+                                completion_tokens,
+                            )
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
                     finish_reason_sent[i] = True
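
The same per-chunk usage computation is repeated inline in each branch above (with an empty output sequence for the initial role and echo chunks). As a rough illustration only, not part of this commit, the hypothetical helper below shows what that computation amounts to; `UsageInfo` is the response model from vllm.entrypoints.openai.protocol:

# Hypothetical helper, for illustration: equivalent to the inline usage
# construction in the branches above.
from typing import Sequence

from vllm.entrypoints.openai.protocol import UsageInfo


def build_chunk_usage(prompt_token_ids: Sequence[int],
                      output_token_ids: Sequence[int]) -> UsageInfo:
    prompt_tokens = len(prompt_token_ids)
    completion_tokens = len(output_token_ids)
    return UsageInfo(prompt_tokens=prompt_tokens,
                     completion_tokens=completion_tokens,
                     total_tokens=prompt_tokens + completion_tokens)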
