Skip to content

Commit

Permalink
[test] Ensure that the first token generation is not included into TP…
Browse files Browse the repository at this point in the history
…OT (#1414)

CVS-155098
  • Loading branch information
pavel-esir authored Dec 23, 2024
1 parent 5d68567 commit c09207c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/cpp/src/perf_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {

auto ttft = tok_times[0] - start_time_val;
raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
raw_metrics.m_times_to_first_token.emplace_back(ttft);
num_generated_tokens = batch_sizes[0];

// The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
Expand Down
3 changes: 2 additions & 1 deletion tests/python_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

def pytest_make_parametrize_id(config, val, argname):
if argname in ['prompt', 'prompts', 'batched_prompts']:
return f'{val}'
# Print only first 1000 characters of long prompts.
return f'{val[:1000]}'
elif argname == 'model_descr':
return f"{val[0]}"
elif argname == 'chat_config':
Expand Down
10 changes: 9 additions & 1 deletion tests/python_tests/test_generate_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,12 @@ def test_perf_metrics(model_descr, generation_config, prompt):
assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
assert mean_ttft > 0 and mean_ttft < 1000.0

raw_metrics = perf_metrics.raw_metrics
durations = np.array(raw_metrics.m_durations) / 1000
# Check that prefill is not included in durations for TPOT calculation.
# For the very long prompt prefill is slow and TTFT is much larger than any other token generation duration.
assert np.all(mean_ttft > durations * 2)

mean_tpot, std_tpot = perf_metrics.get_tpot()
assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
assert mean_tpot > 0 and mean_ttft < 1000.0
Expand All @@ -822,7 +828,9 @@ def test_perf_metrics(model_descr, generation_config, prompt):
assert std_detok_duration == 0

# assert that calculating statistics manually from the raw counters we get the same results as from PerfMetrics
raw_metrics = perf_metrics.raw_metrics
assert np.allclose(mean_tpot, np.mean(durations))
assert np.allclose(std_tpot, np.std(durations))

raw_dur = np.array(raw_metrics.generate_durations) / 1000
assert np.allclose(mean_gen_duration, np.mean(raw_dur))
assert np.allclose(std_gen_duration, np.std(raw_dur))
Expand Down

0 comments on commit c09207c

Please sign in to comment.