From 6e3db4d860ca381d9df29c894c4141ee5805df6c Mon Sep 17 00:00:00 2001
From: Duyi-Wang <duyi.wang@intel.com>
Date: Mon, 29 Apr 2024 16:09:46 +0800
Subject: [PATCH] [Benchmark] Calculate throughput using average next-token latency instead of P90.

---
 benchmark/benchmark.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 4d38673c..5ae2fd80 100644
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -240,9 +240,7 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup
         print(f"Next token P90 Latency:\t{np.percentile(next_token_times, 90):.2f} ms")
         print(f"Next token Avg Latency:\t{np.mean(next_token_times):.2f} ms")
         print(f"Next token Latency:\t{np.percentile(next_token_times, 90):.2f} ms")
-        print(
-            f"Throughput without 1st token:\t{1000 / np.percentile(next_token_times, 90) * args.batch_size:.2f} tokens/s"
-        )
+        print(f"Throughput without 1st token:\t{1000 / np.mean(next_token_times) * args.batch_size:.2f} tokens/s")
         print("=" * 120, "\n" * 3)
 
         if args.csv != "":
@@ -262,7 +260,7 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup
                 "2nd_min(ms)": round(np.min(next_token_times), 2),
                 "2nd_P90(ms)": round(np.percentile(next_token_times, 90), 2),
                 "2nd_avg(ms)": round(np.mean(next_token_times), 2),
-                "throughput_wo_1st (tokens/s)": round(1000 / np.percentile(next_token_times, 90) * args.batch_size, 2),
+                "throughput_wo_1st (tokens/s)": round(1000 / np.mean(next_token_times) * args.batch_size, 2),
                 **arg_dict,
                 "Fake_model": True if os.environ.get("XFT_FAKE_MODEL", "-1") == "1" else False,
                 "Response": response,