From 5b654e253057acc5ddcf0266696b951358bc28e9 Mon Sep 17 00:00:00 2001
From: Yehoshua Cohen <61619195+yecohn@users.noreply.github.com>
Date: Tue, 23 Jul 2024 21:41:55 +0300
Subject: [PATCH] [Frontend] Add Usage data in each chunk for chat_serving.
 #6540 (#6652)

---
 tests/entrypoints/openai/test_chat.py   | 40 ++++++++++++++++----
 vllm/entrypoints/openai/serving_chat.py | 50 +++++++++++++++++++++++--
 2 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 1abaa01ae192a..c96d602b63438 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -295,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     async for chunk in stream:
         assert chunk.usage is None
 
-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True})
+    # Test stream=True, stream_options={"include_usage": True,
+    #                                    "continuous_usage_stats": False}
+    stream = await client.chat.completions.create(model=model_name,
+                                                  messages=messages,
+                                                  max_tokens=10,
+                                                  temperature=0.0,
+                                                  stream=True,
+                                                  stream_options={
+                                                      "include_usage":
+                                                      True,
+                                                      "continuous_usage_stats":
+                                                      False
+                                                  })
 
     async for chunk in stream:
         if chunk.choices[0].finish_reason is None:
@@ -338,6 +343,25 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
             stream=False,
             stream_options={"include_usage": True})
 
+    # Test stream=True, stream_options={"include_usage": True,
+    #                                    "continuous_usage_stats": True}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True
+        },
+    )
+    async for chunk in stream:
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+
 
 # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
 # (i.e. using the same ordering as in the Completions API tests), the test
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index b21c2bc513186..3899509ef3ff4 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -247,7 +247,15 @@ async def chat_completion_stream_generator(
                             model=model_name)
                         if (request.stream_options
                                 and request.stream_options.include_usage):
-                            chunk.usage = None
+                            if (request.stream_options.continuous_usage_stats):
+                                prompt_tokens = len(res.prompt_token_ids)
+                                usage = UsageInfo(prompt_tokens=prompt_tokens,
+                                                  completion_tokens=0,
+                                                  total_tokens=prompt_tokens)
+                                chunk.usage = usage
+                            else:
+                                chunk.usage = None
+
                         data = chunk.model_dump_json(exclude_unset=True)
                         yield f"data: {data}\n\n"
 
@@ -277,7 +285,18 @@ async def chat_completion_stream_generator(
                                     model=model_name)
                                 if (request.stream_options and
                                         request.stream_options.include_usage):
-                                    chunk.usage = None
+                                    if (request.stream_options.
+                                            continuous_usage_stats):
+                                        prompt_tokens = len(
+                                            res.prompt_token_ids)
+                                        usage = UsageInfo(
+                                            prompt_tokens=prompt_tokens,
+                                            completion_tokens=0,
+                                            total_tokens=prompt_tokens)
+                                        chunk.usage = usage
+                                    else:
+                                        chunk.usage = None
+
                                 data = chunk.model_dump_json(
                                     exclude_unset=True)
                                 yield f"data: {data}\n\n"
@@ -336,7 +355,19 @@ async def chat_completion_stream_generator(
                             model=model_name)
                         if (request.stream_options
                                 and request.stream_options.include_usage):
-                            chunk.usage = None
+                            if (request.stream_options.continuous_usage_stats):
+                                prompt_tokens = len(res.prompt_token_ids)
+                                completion_tokens = len(output.token_ids)
+                                usage = UsageInfo(
+                                    prompt_tokens=prompt_tokens,
+                                    completion_tokens=completion_tokens,
+                                    total_tokens=prompt_tokens +
+                                    completion_tokens,
+                                )
+                                chunk.usage = usage
+                            else:
+                                chunk.usage = None
+
                         data = chunk.model_dump_json(exclude_unset=True)
                         yield f"data: {data}\n\n"
                     else:
@@ -356,7 +387,18 @@ async def chat_completion_stream_generator(
                             model=model_name)
                         if (request.stream_options
                                 and request.stream_options.include_usage):
-                            chunk.usage = None
+                            if (request.stream_options.continuous_usage_stats):
+                                prompt_tokens = len(res.prompt_token_ids)
+                                completion_tokens = len(output.token_ids)
+                                usage = UsageInfo(
+                                    prompt_tokens=prompt_tokens,
+                                    completion_tokens=completion_tokens,
+                                    total_tokens=prompt_tokens +
+                                    completion_tokens,
+                                )
+                                chunk.usage = usage
+                            else:
+                                chunk.usage = None
                         data = chunk.model_dump_json(exclude_unset=True)
                         yield f"data: {data}\n\n"
                         finish_reason_sent[i] = True
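
Usage sketch (not part of the patch above): with this change applied, a client opts in to per-chunk usage reporting by sending both "include_usage" and the new "continuous_usage_stats" flag in stream_options, exactly as the new test does. The snippet below is a minimal, hypothetical client example; the base_url, api_key, and model name are placeholders for a locally running vLLM OpenAI-compatible server.

# Hypothetical client-side sketch (not part of this patch). It mirrors the
# new test: with continuous_usage_stats=True, every streamed chunk carries a
# populated `usage` field instead of only the final usage-only chunk.
import asyncio

import openai


async def main() -> None:
    # Placeholder endpoint and model name for a local vLLM server.
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="your-model-name",
        messages=[{"role": "user", "content": "Say hello"}],
        max_tokens=10,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True
        },
    )
    async for chunk in stream:
        # chunk.usage reports prompt_tokens, completion_tokens, total_tokens.
        print(chunk.usage)


asyncio.run(main())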