[Benchmark] Add KPI of max, min, avg and p90 latency (#77)
* Add KPI of max, min, avg and p90 latency

* Update var names
JunxiChhen authored Nov 27, 2023
1 parent b5a5fc0 commit f205d37
Showing 1 changed file with 13 additions and 5 deletions.
benchmark/benchmark.py (18 changes: 13 additions & 5 deletions)
@@ -22,6 +22,7 @@
 from transformers import AutoTokenizer, TextStreamer
 import json
 import pathlib
+import numpy as np
 
 import argparse
 
@@ -163,18 +164,25 @@ def build_inputs_baichuan(tokenizer, query: List[str], padding, history: List[Tu
     output_token_nums = int(torch.numel(generated_ids) / args.batch_size) - input_token_nums
     # Sort the execution times in ascending order
     remained_token_times.sort()
-    # Get the 90th element (index 89) from the sorted list
-    latency_90 = remained_token_times[int(args.iteration * 0.9) - 1] * 1000 / (output_token_nums - 1)
+    # Get the max, min, avg and P90 next-token latency from the sorted list
+    next_token_latency_max = remained_token_times[-1] * 1000 / (output_token_nums - 1)
+    next_token_latency_min = remained_token_times[0] * 1000 / (output_token_nums - 1)
+    next_token_latency_avg = np.mean(remained_token_times) * 1000 / (output_token_nums - 1)
+    next_token_latency_90 = remained_token_times[int(args.iteration * 0.9) - 1] * 1000 / (output_token_nums - 1)
     # Calculate total latency
     inference_latency = sum(total_times) / len(total_times)
     # Calculate the first token latency
     first_token_latency = sum(first_token_times) / len(first_token_times) * 1000
-    Next_token_throughput = 1000 / latency_90 * args.batch_size
+    Next_token_throughput = 1000 / next_token_latency_90 * args.batch_size
     print("\n")
     print("=" * 50 + args.model_name + " Final Performance" + "=" * 50)
     print(f"Inference Latency:\t{inference_latency:.2f} s")
-    print(f"First token Latency:\t{first_token_latency:.2f} ms")
-    print(f"Next token Latency:\t{latency_90:.2f} ms")
+    print(f"First token Avg Latency:\t{first_token_latency:.2f} ms")
+    print(f"Next token Max Latency:\t{next_token_latency_max:.2f} ms")
+    print(f"Next token Min Latency:\t{next_token_latency_min:.2f} ms")
+    print(f"Next token P90 Latency:\t{next_token_latency_90:.2f} ms")
+    print(f"Next token Avg Latency:\t{next_token_latency_avg:.2f} ms")
+    print(f"Next token Latency:\t{next_token_latency_90:.2f} ms")
     print(f"Throughput without 1st token:\t{Next_token_throughput:.2f} tokens/s")
 else:
     for i in range(args.warmup + args.iteration):
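For reference, the KPI math in this diff can be reproduced standalone. The sketch below is illustrative and not part of the commit: the function name `next_token_kpis` and its parameters are hypothetical, and it uses `np.percentile` in place of the commit's index-based P90 (`remained_token_times[int(args.iteration * 0.9) - 1]`); the two differ slightly because `np.percentile` interpolates between sorted samples.

```python
# Minimal standalone sketch of the latency KPIs added in this commit.
# All names here are illustrative assumptions, not values from benchmark.py:
#   iteration_times_s: per-iteration wall time for all tokens after the first (seconds)
#   output_token_nums: tokens generated per request; batch_size: requests per iteration
import numpy as np

def next_token_kpis(iteration_times_s, output_token_nums, batch_size):
    times = sorted(iteration_times_s)
    # Per-token milliseconds: each iteration covers output_token_nums - 1 decode steps
    scale = 1000 / (output_token_nums - 1)
    kpis = {
        "max_ms": times[-1] * scale,
        "min_ms": times[0] * scale,
        "avg_ms": float(np.mean(times)) * scale,
        # The commit indexes the sorted list at int(iteration * 0.9) - 1;
        # np.percentile(times, 90) is close but interpolates between samples
        "p90_ms": float(np.percentile(times, 90)) * scale,
    }
    # Throughput excludes the first token, mirroring Next_token_throughput
    kpis["throughput_tok_s"] = 1000 / kpis["p90_ms"] * batch_size
    return kpis

# Example: 10 measured iterations, 128 output tokens per request, batch size 4
print(next_token_kpis([2.50 + 0.01 * i for i in range(10)], 128, 4))
```

As a quick sanity check of the throughput line, a P90 next-token latency of 25 ms at batch size 4 gives 1000 / 25 * 4 = 160 tokens/s.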
