From c09207cd497e250e8b3e7ad442cec3bc4181827e Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 23 Dec 2024 12:33:47 +0100 Subject: [PATCH] [test] Ensure that the first token generation is not included into TPOT (#1414) CVS-155098 --- src/cpp/src/perf_metrics.cpp | 2 +- tests/python_tests/conftest.py | 3 ++- tests/python_tests/test_generate_api.py | 10 +++++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 3bd6252c78..3725dc0cfc 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -101,7 +101,7 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { auto ttft = tok_times[0] - start_time_val; raw_metrics.m_times_to_first_token = std::vector(); - raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]); + raw_metrics.m_times_to_first_token.emplace_back(ttft); num_generated_tokens = batch_sizes[0]; // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens. diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py index f98f47ecf3..e159045601 100644 --- a/tests/python_tests/conftest.py +++ b/tests/python_tests/conftest.py @@ -3,7 +3,8 @@ def pytest_make_parametrize_id(config, val, argname): if argname in ['prompt', 'prompts', 'batched_prompts']: - return f'{val}' + # Print only first 1000 characters of long prompts. 
+ return f'{val[:1000]}' elif argname == 'model_descr': return f"{val[0]}" elif argname == 'chat_config': diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index d15747be63..9bb9eff49c 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -798,6 +798,12 @@ def test_perf_metrics(model_descr, generation_config, prompt): assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std) assert mean_ttft > 0 and mean_ttft < 1000.0 + raw_metrics = perf_metrics.raw_metrics + durations = np.array(raw_metrics.m_durations) / 1000 + # Check that prefill is not included in durations for TPOT calculation. + # For the very long prompt prefill is slow and TTFT is much larger than any other token generation duration. + assert np.all(mean_ttft > durations * 2) + mean_tpot, std_tpot = perf_metrics.get_tpot() assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std) assert mean_tpot > 0 and mean_ttft < 1000.0 @@ -822,7 +828,9 @@ def test_perf_metrics(model_descr, generation_config, prompt): assert std_detok_duration == 0 # assert that calculating statistics manually from the raw counters we get the same restults as from PerfMetrics - raw_metrics = perf_metrics.raw_metrics + assert np.allclose(mean_tpot, np.mean(durations)) + assert np.allclose(std_tpot, np.std(durations)) + raw_dur = np.array(raw_metrics.generate_durations) / 1000 assert np.allclose(mean_gen_duration, np.mean(raw_dur)) assert np.allclose(std_gen_duration, np.std(raw_dur))