diff --git a/swift/llm/deploy.py b/swift/llm/deploy.py
index 0baa80f6a..7c6bb4f90 100644
--- a/swift/llm/deploy.py
+++ b/swift/llm/deploy.py
@@ -516,9 +516,7 @@ async def _generate_full():
         prompt_tokens=num_prompt_tokens,
         completion_tokens=num_generated_tokens,
         total_tokens=num_prompt_tokens + num_generated_tokens)
-    finish_reason = None
-    if output.status.name == 'FINISH':
-        finish_reason = 'stop'
+    finish_reason = 'stop' if output.status.name == 'FINISH' else None
     if isinstance(request, ChatCompletionRequest):
         action, action_input = split_action_action_input(response)
@@ -554,31 +552,25 @@ async def _generate_stream():
     async with llm_engine.safe_run(session_id):
         async_iter = generator.async_stream_infer(
             session_id=session_id, **inputs, stream_output=True, gen_config=generation_config).__aiter__()
-        is_finished = False
         response = None
-        while not is_finished:
-            try:
-                output = await async_iter.__anext__()
-            except StopAsyncIteration:
-                is_finished = True
+        async for output in async_iter:
             num_generated_tokens = len(output.token_ids)
             usage_info = UsageInfo(
                 prompt_tokens=num_prompt_tokens,
                 completion_tokens=num_generated_tokens,
                 total_tokens=num_prompt_tokens + num_generated_tokens,
             )
+            is_finished = output.status.name == 'FINISH'
             delta_text = template.generate_ids_to_response(
                 output.token_ids, is_finished, return_delta=True, print_idx=print_idx)
-            finish_reason = None
-            if output.status.name == 'FINISH':
-                finish_reason = 'stop'
-            if not delta_text and finish_reason != 'stop':
+            if not delta_text and not is_finished:
                 continue
+            finish_reason = 'stop' if is_finished else None
             total_response += delta_text
             if isinstance(request, ChatCompletionRequest):
                 toolcall = None
-                if finish_reason == 'stop':
+                if is_finished:
                     action, action_input = split_action_action_input(total_response)
                     if action is not None:
                         toolcall = [
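
For context on the streaming-loop change above: replacing the hand-driven `__aiter__()`/`__anext__()` loop with `async for` lets Python terminate the loop on `StopAsyncIteration` itself. The snippet below is a minimal, self-contained sketch of the two idioms, not code from deploy.py; `fake_stream` and its chunks are invented for illustration.

import asyncio


async def fake_stream():
    # Stand-in for an async chunk generator such as async_stream_infer().
    for chunk in ('Hel', 'lo', '!'):
        yield chunk


async def manual_loop():
    # Old-style: drive the iterator by hand and catch StopAsyncIteration.
    it = fake_stream().__aiter__()
    while True:
        try:
            chunk = await it.__anext__()
        except StopAsyncIteration:
            break
        print('manual:', chunk)


async def async_for_loop():
    # New-style: `async for` calls __anext__ and stops on StopAsyncIteration for us.
    async for chunk in fake_stream():
        print('async for:', chunk)


asyncio.run(manual_loop())
asyncio.run(async_for_loop())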