From 2709c0009aa434fbf2ef0fe48ca3094a2268b190 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+HMellor@users.noreply.github.com>
Date: Fri, 19 Jan 2024 04:34:08 +0000
Subject: [PATCH] Support OpenAI API server in `benchmark_serving.py` (#2172)

---
 .gitignore                      |  3 ++
 benchmarks/benchmark_serving.py | 80 ++++++++++++++++++++-------------
 2 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index 58572718850e6..b5195629e5cf3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -181,3 +181,6 @@ _build/
 # hip files generated by PyTorch
 *.hip
 *_hip*
+
+# Benchmark dataset
+*.json
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 3a80e679191e3..6b5dd097cea09 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -24,6 +24,7 @@
 
 import aiohttp
 import numpy as np
+from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -40,15 +41,10 @@ def sample_requests(
     with open(dataset_path) as f:
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
-    dataset = [
-        data for data in dataset
-        if len(data["conversations"]) >= 2
-    ]
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     # Only keep the first two turns of each conversation.
-    dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
-        for data in dataset
-    ]
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]
 
     # Tokenize the prompts and completions.
     prompts = [prompt for prompt, _ in dataset]
@@ -98,6 +94,7 @@ async def get_request(
 
 async def send_request(
     backend: str,
+    model: str,
     api_url: str,
     prompt: str,
     prompt_len: int,
@@ -120,6 +117,8 @@ async def send_request(
             "ignore_eos": True,
             "stream": False,
         }
+        if model is not None:
+            pload["model"] = model
     elif backend == "tgi":
         assert not use_beam_search
         params = {
@@ -137,7 +136,8 @@ async def send_request(
     timeout = aiohttp.ClientTimeout(total=3 * 3600)
     async with aiohttp.ClientSession(timeout=timeout) as session:
         while True:
-            async with session.post(api_url, headers=headers, json=pload) as response:
+            async with session.post(api_url, headers=headers,
+                                    json=pload) as response:
                 chunks = []
                 async for chunk, _ in response.content.iter_chunks():
                     chunks.append(chunk)
@@ -155,6 +155,7 @@ async def send_request(
 
 async def benchmark(
     backend: str,
+    model: str,
     api_url: str,
     input_requests: List[Tuple[str, int, int]],
     best_of: int,
@@ -164,11 +165,11 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        task = asyncio.create_task(send_request(backend, api_url, prompt,
-                                                prompt_len, output_len,
-                                                best_of, use_beam_search))
+        task = asyncio.create_task(
+            send_request(backend, model, api_url, prompt, prompt_len,
+                         output_len, best_of, use_beam_search))
        tasks.append(task)
-    await asyncio.gather(*tasks)
+    await tqdm.gather(*tasks)
 
 
 def main(args: argparse.Namespace):
@@ -176,13 +177,15 @@ def main(args: argparse.Namespace):
     random.seed(args.seed)
     np.random.seed(args.seed)
 
-    api_url = f"http://{args.host}:{args.port}/generate"
-    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
+    api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+    tokenizer = get_tokenizer(args.tokenizer,
+                              trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
 
     benchmark_start_time = time.perf_counter()
-    asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
-                          args.use_beam_search, args.request_rate))
+    asyncio.run(
+        benchmark(args.backend, args.model, api_url, input_requests,
+                  args.best_of, args.use_beam_search, args.request_rate))
     benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
@@ -196,10 +199,8 @@ def main(args: argparse.Namespace):
         for prompt_len, output_len, latency in REQUEST_LATENCY
     ])
     print(f"Average latency per token: {avg_per_token_latency:.2f} s")
-    avg_per_output_token_latency = np.mean([
-        latency / output_len
-        for _, output_len, latency in REQUEST_LATENCY
-    ])
+    avg_per_output_token_latency = np.mean(
+        [latency / output_len for _, output_len, latency in REQUEST_LATENCY])
     print("Average latency per output token: "
           f"{avg_per_output_token_latency:.2f} s")
 
@@ -207,27 +208,42 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Benchmark the online serving throughput.")
-    parser.add_argument("--backend", type=str, default="vllm",
+    parser.add_argument("--backend",
+                        type=str,
+                        default="vllm",
                         choices=["vllm", "tgi"])
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--dataset", type=str, required=True,
+    parser.add_argument("--endpoint", type=str, default="/generate")
+    parser.add_argument("--model", type=str, default=None)
+    parser.add_argument("--dataset",
+                        type=str,
+                        required=True,
                         help="Path to the dataset.")
-    parser.add_argument("--tokenizer", type=str, required=True,
+    parser.add_argument("--tokenizer",
+                        type=str,
+                        required=True,
                         help="Name or path of the tokenizer.")
-    parser.add_argument("--best-of", type=int, default=1,
+    parser.add_argument("--best-of",
+                        type=int,
+                        default=1,
                         help="Generates `best_of` sequences per prompt and "
-                             "returns the best one.")
+                        "returns the best one.")
     parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--num-prompts", type=int, default=1000,
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=1000,
                         help="Number of prompts to process.")
-    parser.add_argument("--request-rate", type=float, default=float("inf"),
+    parser.add_argument("--request-rate",
+                        type=float,
+                        default=float("inf"),
                         help="Number of requests per second. If this is inf, "
-                             "then all the requests are sent at time 0. "
-                             "Otherwise, we use Poisson process to synthesize "
-                             "the request arrival times.")
+                        "then all the requests are sent at time 0. "
+                        "Otherwise, we use Poisson process to synthesize "
+                        "the request arrival times.")
    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument('--trust-remote-code', action='store_true',
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
                         help='trust remote code from huggingface')
     args = parser.parse_args()
     main(args)
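
With the new `--endpoint` and `--model` flags, the script can target an
OpenAI-compatible completions route instead of only the legacy `/generate`
route. As a sketch, one possible invocation might look like the following;
the endpoint path, model/tokenizer names, and dataset file are illustrative
placeholders, not values taken from the patch:

    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --endpoint /v1/completions \
        --model <model-name> \
        --tokenizer <tokenizer-name> \
        --dataset <path/to/sharegpt.json> \
        --num-prompts 1000 \
        --request-rate 10

When `--model` is given, the request payload gains a "model" field, which
OpenAI-style servers require; leaving `--endpoint` at its `/generate` default
and `--model` unset preserves the previous behavior.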