diff --git a/README.md b/README.md
index 22bc2e4cd..c544f7cc7 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ python examples/inference/api_server_openai/query_http_requests.py
 
 # using OpenAI SDK
 # please install openai in current env by running: pip install openai>=1.0
-export OPENAI_API_BASE=http://localhost:8000/v1
+export OPENAI_BASE_URL=http://localhost:8000/v1
 export OPENAI_API_KEY="not_a_real_key"
 python examples/inference/api_server_openai/query_openai_sdk.py
 ```
diff --git a/docs/serve.md b/docs/serve.md
index 0611f60e1..2beed2b18 100644
--- a/docs/serve.md
+++ b/docs/serve.md
@@ -64,7 +64,7 @@ python examples/inference/api_server_openai/query_http_requests.py
 
 # using OpenAI SDK
 # please install openai in current env by running: pip install openai>=1.0
-export OPENAI_API_BASE=http://localhost:8000/v1
+export OPENAI_BASE_URL=http://localhost:8000/v1
 export OPENAI_API_KEY="not_a_real_key"
 python examples/inference/api_server_openai/query_openai_sdk.py
 ```
diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py
index 234a62ebb..37487a761 100644
--- a/examples/inference/api_server_openai/query_http_requests.py
+++ b/examples/inference/api_server_openai/query_http_requests.py
@@ -67,7 +67,7 @@ response = s.post(url, json=body, proxies=proxies, stream=args.streaming_response)  # type: ignore
 
 for chunk in response.iter_lines(decode_unicode=True):
     try:
-        if chunk is not None:
+        if chunk is not None and chunk != "":
             if args.streaming_response:
                 # Get data from reponse chunk
                 chunk_data = chunk.split("data: ")[1]
diff --git a/examples/inference/api_server_openai/query_openai_sdk.py b/examples/inference/api_server_openai/query_openai_sdk.py
index 3eaa1f404..ed6622e12 100644
--- a/examples/inference/api_server_openai/query_openai_sdk.py
+++ b/examples/inference/api_server_openai/query_openai_sdk.py
@@ -58,4 +58,11 @@
     temperature=args.temperature,
     top_p=args.top_p,
 )
-print(chat_completion)
+if args.streaming_response:
+    for chunk in chat_completion:
+        content = chunk.choices[0].delta.content
+        if content is not None:
+            print(content, end="")
+    print("")
+else:
+    print(chat_completion)
diff --git a/inference/api_openai_backend/router_app.py b/inference/api_openai_backend/router_app.py
index f622e1275..7f511760f 100644
--- a/inference/api_openai_backend/router_app.py
+++ b/inference/api_openai_backend/router_app.py
@@ -108,7 +108,7 @@ async def _completions_wrapper(
                 logger.error(f"{subresult_dict['error']}")
                 all_results.pop()
                 had_error = True
-                yield "data: " + ModelResponse(**subresult_dict).json() + "\n"
+                yield "data: " + ModelResponse(**subresult_dict).json() + "\n\n"
                 # Return early in case of an error
                 break
             choices = [
@@ -125,7 +125,7 @@
             model=body.model,
             choices=choices,
             usage=usage,
-        ).json() + "\n"
+        ).json() + "\n\n"
         if had_error:
             # Return early in case of an error
             break
@@ -141,8 +141,8 @@
         model=body.model,
         choices=choices,
         usage=usage,
-    ).json() + "\n"
-    yield "data: [DONE]\n"
+    ).json() + "\n\n"
+    yield "data: [DONE]\n\n"
 
 
 async def _chat_completions_wrapper(
@@ -167,7 +167,7 @@
         model=body.model,
         choices=choices,
         usage=None,
-    ).json() + "\n"
+    ).json() + "\n\n"
 
     all_results = []
     async for results in generator:
@@ -182,7 +182,7 @@
                 subresult_dict["finish_reason"] = None
                 all_results.pop()
                 had_error = True
-                yield "data: " + ModelResponse(**subresult_dict).json() + "\n"
+                yield "data: " + ModelResponse(**subresult_dict).json() + "\n\n"
                 # Return early in case of an error
                 break
             else:
@@ -200,7 +200,7 @@
             model=body.model,
             choices=choices,
             usage=None,
-        ).json() + "\n"
+        ).json() + "\n\n"
         if had_error:
             # Return early in case of an error
             break
@@ -223,8 +223,8 @@
         model=body.model,
         choices=choices,
         usage=usage,
-    ).json() + "\n"
-    yield "data: [DONE]\n"
+    ).json() + "\n\n"
+    yield "data: [DONE]\n\n"
 
 
 class Router: