From c09207cd497e250e8b3e7ad442cec3bc4181827e Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@gmail.com>
Date: Mon, 23 Dec 2024 12:33:47 +0100
Subject: [PATCH] [test] Ensure that the first token generation is not included
 into TPOT (#1414)

CVS-155098
---
 src/cpp/src/perf_metrics.cpp            |  2 +-
 tests/python_tests/conftest.py          |  3 ++-
 tests/python_tests/test_generate_api.py | 10 +++++++++-
 3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp
index 3bd6252c78..3725dc0cfc 100644
--- a/src/cpp/src/perf_metrics.cpp
+++ b/src/cpp/src/perf_metrics.cpp
@@ -101,7 +101,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
 
         auto ttft = tok_times[0] - start_time_val;
         raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
-        raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
+        raw_metrics.m_times_to_first_token.emplace_back(ttft);
         num_generated_tokens = batch_sizes[0];
         
         // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py
index f98f47ecf3..e159045601 100644
--- a/tests/python_tests/conftest.py
+++ b/tests/python_tests/conftest.py
@@ -3,7 +3,8 @@
 
 def pytest_make_parametrize_id(config, val, argname):
     if argname in ['prompt', 'prompts', 'batched_prompts']:
-        return f'{val}'
+        # Print only first 1000 characters of long prompts.
+        return f'{val[:1000]}'
     elif argname == 'model_descr':
         return f"{val[0]}"
     elif argname == 'chat_config':
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index d15747be63..9bb9eff49c 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -798,6 +798,12 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
     assert mean_ttft > 0 and mean_ttft < 1000.0
 
+    raw_metrics = perf_metrics.raw_metrics
+    durations = np.array(raw_metrics.m_durations) / 1000
+    # Check that prefill is not included in durations for TPOT calculation.
+    # For a very long prompt, prefill is slow, so TTFT is much larger than the duration of any other token generation step.
+    assert np.all(mean_ttft > durations * 2)
+
     mean_tpot, std_tpot = perf_metrics.get_tpot()
     assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
     assert mean_tpot > 0 and mean_ttft < 1000.0
@@ -822,7 +828,9 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert std_detok_duration == 0
     
     # assert that calculating statistics manually from the raw counters we get the same restults as from PerfMetrics
-    raw_metrics = perf_metrics.raw_metrics
+    assert np.allclose(mean_tpot, np.mean(durations))
+    assert np.allclose(std_tpot, np.std(durations))
+
     raw_dur = np.array(raw_metrics.generate_durations) / 1000
     assert np.allclose(mean_gen_duration, np.mean(raw_dur))
     assert np.allclose(std_gen_duration, np.std(raw_dur))