diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 252f970742495..9551b4f2091dd 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -581,6 +581,9 @@ async def chat_completion_full_generator( final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) assert final_res is not None diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index db31b1153d97e..570232be38379 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -189,7 +189,13 @@ async def create_completion( try: async for i, res in result_generator: final_res_batch[i] = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + try: for i, final_res in enumerate(final_res_batch): assert final_res is not None @@ -211,8 +217,6 @@ async def create_completion( tokenizer, request_metadata, ) - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e1db99ec2f88c..90cd9145fac1e 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -225,7 +225,10 @@ async def create_embedding( try: async for i, res in result_generator: final_res_batch[i] = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + try: for final_res in final_res_batch: assert final_res is not None @@ -235,8 +238,6 @@ async def create_embedding( response = request_output_to_embedding_response( final_res_batch_checked, request_id, created_time, model_name, encoding_format) - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e))