Skip to content

Commit

Permalink
[test] Ensure that the first token generation is not included into TP…
Browse files Browse the repository at this point in the history
…OT (#1414)

CVS-155098
  • Loading branch information
pavel-esir authored Dec 23, 2024
1 parent 5d68567 commit c09207c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/cpp/src/perf_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {

auto ttft = tok_times[0] - start_time_val;
raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
raw_metrics.m_times_to_first_token.emplace_back(ttft);
num_generated_tokens = batch_sizes[0];

// The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
Expand Down
3 changes: 2 additions & 1 deletion tests/python_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

def pytest_make_parametrize_id(config, val, argname):
if argname in ['prompt', 'prompts', 'batched_prompts']:
return f'{val}'
# Print only first 1000 characters of long prompts.
return f'{val[:1000]}'
elif argname == 'model_descr':
return f"{val[0]}"
elif argname == 'chat_config':
Expand Down
10 changes: 9 additions & 1 deletion tests/python_tests/test_generate_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,12 @@ def test_perf_metrics(model_descr, generation_config, prompt):
assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
assert mean_ttft > 0 and mean_ttft < 1000.0

raw_metrics = perf_metrics.raw_metrics
durations = np.array(raw_metrics.m_durations) / 1000
# Check that prefill is not included in durations for TPOT calculation.
# For the very long prompt prefill is slow and TTFT is much larger than any other token generation duration.
assert np.all(mean_ttft > durations * 2)

mean_tpot, std_tpot = perf_metrics.get_tpot()
assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
assert mean_tpot > 0 and mean_ttft < 1000.0
Expand All @@ -822,7 +828,9 @@ def test_perf_metrics(model_descr, generation_config, prompt):
assert std_detok_duration == 0

# assert that calculating statistics manually from the raw counters we get the same results as from PerfMetrics
raw_metrics = perf_metrics.raw_metrics
assert np.allclose(mean_tpot, np.mean(durations))
assert np.allclose(std_tpot, np.std(durations))

raw_dur = np.array(raw_metrics.generate_durations) / 1000
assert np.allclose(mean_gen_duration, np.mean(raw_dur))
assert np.allclose(std_gen_duration, np.std(raw_dur))
Expand Down

0 comments on commit c09207c

Please sign in to comment.