Fix wrong token latency when batch size is greater than 1 (#1244)
Fix the wrong 2nd token latency reported when batch size is greater than 1.

Reproduce:

```
python benchmark.py -m /mnt/llm_irs/models_6c715998_ww45.4_optimum/llama-2-7b-chat/pytorch/dldt/FP16 -n 1 --genai -ic 128 -bs 16
```

Before the fix:

```
[ INFO ] [Average] P[0] Input token size: 128, 1st token latency: 0.36 ms/16tokens, 2nd token latency: 1958228200.33 ms/16tokens, 2nd tokens throughput: 0.00 16tokenss/s
```

After the fix:

```
[ INFO ] [Average] P[0] Input token size: 128, 1st token latency: 91.54 ms/16tokens, 2nd token latency: 69.81 ms/16tokens, 2nd tokens throughput: 229.18 tokens/s
```
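The commit message does not include the diff itself, but the symptom (a near-zero 1st token latency alongside a 2nd token latency in the billions of milliseconds) is characteristic of raw timestamps being averaged as if they were durations. Below is a minimal sketch of the kind of per-token latency bookkeeping involved; the names (`generate_with_latency`, `generate_step`) are illustrative and not taken from the actual change.

```python
import time

def generate_with_latency(generate_step, num_tokens, batch_size):
    """Run `generate_step` num_tokens times and report latencies in ms.

    `generate_step` is a stand-in for one decoding step that produces
    one token per sequence in the batch. Assumes num_tokens >= 1.
    """
    timestamps = []
    start = time.perf_counter()
    for _ in range(num_tokens):
        generate_step()
        timestamps.append(time.perf_counter())

    # 1st token latency: time from start to the end of the first step.
    first_token_ms = (timestamps[0] - start) * 1e3

    # 2nd+ token latency: average the *differences* between consecutive
    # timestamps. Averaging the raw timestamps instead of the deltas
    # yields the huge bogus values seen before the fix.
    deltas = [b - a for a, b in zip(timestamps, timestamps[1:])]
    second_token_ms = sum(deltas) / len(deltas) * 1e3 if deltas else 0.0

    # Throughput counts every generated token, so batch size scales the
    # token count, not the per-step latency.
    total_s = timestamps[-1] - start
    throughput = num_tokens * batch_size / total_s
    return first_token_ms, second_token_ms, throughput
```

Note how batching enters only the throughput calculation: each decode step still takes one wall-clock interval, but it produces `batch_size` tokens, which is why the fixed output reports 229.18 tokens/s at a 69.81 ms step latency.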
3 changed files, 6 additions, 6 deletions.