[Benchmark] Add KPI of max, min, avg and p90 latency (#77)
* Add KPI of max, min, avg and p90 latency

* Update var names
JunxiChhen authored Nov 27, 2023
1 parent b5a5fc0 commit f205d37
Showing 1 changed file with 13 additions and 5 deletions.
benchmark/benchmark.py (18 changes: 13 additions & 5 deletions)
@@ -22,6 +22,7 @@
 from transformers import AutoTokenizer, TextStreamer
 import json
 import pathlib
+import numpy as np
 
 import argparse
 
@@ -163,18 +164,25 @@ def build_inputs_baichuan(tokenizer, query: List[str], padding, history: List[Tu
     output_token_nums = int(torch.numel(generated_ids) / args.batch_size) - input_token_nums
     # Sort the execution times in ascending order
     remained_token_times.sort()
-    # Get the 90th element (index 89) from the sorted list
-    latency_90 = remained_token_times[int(args.iteration * 0.9) - 1] * 1000 / (output_token_nums - 1)
+    # Get the max, min, avg and P90 next-token latency from the sorted list
+    next_token_latency_max = remained_token_times[-1] * 1000 / (output_token_nums - 1)
+    next_token_latency_min = remained_token_times[0] * 1000 / (output_token_nums - 1)
+    next_token_latency_avg = np.mean(remained_token_times) * 1000 / (output_token_nums - 1)
+    next_token_latency_90 = remained_token_times[int(args.iteration * 0.9) - 1] * 1000 / (output_token_nums - 1)
     # Calculate total latency
     inference_latency = sum(total_times) / len(total_times)
     # Calculate the first token latency
     first_token_latency = sum(first_token_times) / len(first_token_times) * 1000
-    Next_token_throughput = 1000 / latency_90 * args.batch_size
+    Next_token_throughput = 1000 / next_token_latency_90 * args.batch_size
     print("\n")
     print("=" * 50 + args.model_name + " Final Performance" + "=" * 50)
     print(f"Inference Latency:\t{inference_latency:.2f} s")
-    print(f"First token Latency:\t{first_token_latency:.2f} ms")
-    print(f"Next token Latency:\t{latency_90:.2f} ms")
+    print(f"First token Avg Latency:\t{first_token_latency:.2f} ms")
+    print(f"Next token Max Latency:\t{next_token_latency_max:.2f} ms")
+    print(f"Next token Min Latency:\t{next_token_latency_min:.2f} ms")
+    print(f"Next token P90 Latency:\t{next_token_latency_90:.2f} ms")
+    print(f"Next token Avg Latency:\t{next_token_latency_avg:.2f} ms")
+    print(f"Next token Latency:\t{next_token_latency_90:.2f} ms")
     print(f"Throughput without 1st token:\t{Next_token_throughput:.2f} tokens/s")
 else:
     for i in range(args.warmup + args.iteration):
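For reference, the KPI math in this diff can be reproduced standalone. The sketch below is illustrative and not part of the commit: the function name `next_token_kpis` and its parameters are hypothetical, and it uses `np.percentile` in place of the commit's index-based P90 (`remained_token_times[int(args.iteration * 0.9) - 1]`); the two differ slightly because `np.percentile` interpolates between sorted samples.

```python
# Minimal standalone sketch of the latency KPIs added in this commit.
# All names here are illustrative assumptions, not values from benchmark.py:
#   iteration_times_s: per-iteration wall time for all tokens after the first (seconds)
#   output_token_nums: tokens generated per request; batch_size: requests per iteration
import numpy as np

def next_token_kpis(iteration_times_s, output_token_nums, batch_size):
    times = sorted(iteration_times_s)
    # Per-token milliseconds: each iteration covers output_token_nums - 1 decode steps
    scale = 1000 / (output_token_nums - 1)
    kpis = {
        "max_ms": times[-1] * scale,
        "min_ms": times[0] * scale,
        "avg_ms": float(np.mean(times)) * scale,
        # The commit indexes the sorted list at int(iteration * 0.9) - 1;
        # np.percentile(times, 90) is close but interpolates between samples
        "p90_ms": float(np.percentile(times, 90)) * scale,
    }
    # Throughput excludes the first token, mirroring Next_token_throughput
    kpis["throughput_tok_s"] = 1000 / kpis["p90_ms"] * batch_size
    return kpis

# Example: 10 measured iterations, 128 output tokens per request, batch size 4
print(next_token_kpis([2.50 + 0.01 * i for i in range(10)], 128, 4))
```

As a quick sanity check of the throughput line, a P90 next-token latency of 25 ms at batch size 4 gives 1000 / 25 * 4 = 160 tokens/s.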
