From 6e3db4d860ca381d9df29c894c4141ee5805df6c Mon Sep 17 00:00:00 2001 From: Duyi-Wang Date: Mon, 29 Apr 2024 16:09:46 +0800 Subject: [PATCH] [Benchmark] Calculate throughput using avg latency. --- benchmark/benchmark.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 4d38673c..5ae2fd80 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -240,9 +240,7 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup print(f"Next token P90 Latency:\t{np.percentile(next_token_times, 90):.2f} ms") print(f"Next token Avg Latency:\t{np.mean(next_token_times):.2f} ms") print(f"Next token Latency:\t{np.percentile(next_token_times, 90):.2f} ms") - print( - f"Throughput without 1st token:\t{1000 / np.percentile(next_token_times, 90) * args.batch_size:.2f} tokens/s" - ) + print(f"Throughput without 1st token:\t{1000 / np.mean(next_token_times) * args.batch_size:.2f} tokens/s") print("=" * 120, "\n" * 3) if args.csv != "": @@ -262,7 +260,7 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup "2nd_min(ms)": round(np.min(next_token_times), 2), "2nd_P90(ms)": round(np.percentile(next_token_times, 90), 2), "2nd_avg(ms)": round(np.mean(next_token_times), 2), - "throughput_wo_1st (tokens/s)": round(1000 / np.percentile(next_token_times, 90) * args.batch_size, 2), + "throughput_wo_1st (tokens/s)": round(1000 / np.mean(next_token_times) * args.batch_size, 2), **arg_dict, "Fake_model": True if os.environ.get("XFT_FAKE_MODEL", "-1") == "1" else False, "Response": response,