From fce5a69e395b9a09cf149c2153f2f80c021a1ca3 Mon Sep 17 00:00:00 2001
From: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Date: Tue, 5 Nov 2024 00:53:24 +0200
Subject: [PATCH] [Frontend] Add max_tokens prometheus metric (#9881)

Signed-off-by: Tomer Asida
Signed-off-by: Linkun Chen
---
 tests/entrypoints/openai/test_metrics.py | 11 +++++++++--
 tests/metrics/test_metrics.py            |  1 +
 vllm/engine/llm_engine.py                |  4 ++++
 vllm/engine/metrics.py                   |  8 ++++++++
 vllm/engine/metrics_types.py             |  1 +
 5 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index b3f1fea91d13e..6523c8b6297c6 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -70,10 +70,14 @@ async def client(server):
     [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
      ("_count", _NUM_REQUESTS)],
     "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
+    "vllm:request_params_max_tokens":
+    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+     ("_count", _NUM_REQUESTS)],
     "vllm:prompt_tokens":
     [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens":
-    [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
+    "vllm:generation_tokens": [
+        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
+    ],
     "vllm:request_success": [("_total", _NUM_REQUESTS)],
 }
 
@@ -149,6 +153,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_params_n_sum",
     "vllm:request_params_n_bucket",
     "vllm:request_params_n_count",
+    "vllm:request_params_max_tokens_sum",
+    "vllm:request_params_max_tokens_bucket",
+    "vllm:request_params_max_tokens_count",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 7a361ef320810..4a824c7acef21 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -365,6 +365,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
             "vllm:request_prompt_tokens",
             "vllm:request_generation_tokens",
             "vllm:request_params_n",
+            "vllm:request_params_max_tokens",
         ]
         for metric_name in request_histogram_metrics:
             metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index b12d29c4a8503..2c584218485c8 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1685,6 +1685,7 @@ def _get_stats(self,
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
         n_requests: List[int] = []
+        max_tokens_requests: List[int] = []
         finished_reason_requests: List[str] = []
 
         # Lora requests
@@ -1792,6 +1793,8 @@ def _get_stats(self,
                     ])
                     if seq_group.sampling_params is not None:
                         n_requests.append(seq_group.sampling_params.n)
+                        max_tokens_requests.append(
+                            seq_group.sampling_params.max_tokens)
                     finished_reason_requests.extend([
                         SequenceStatus.get_finished_reason(seq.status)
                         for seq in seq_group.get_finished_seqs()
@@ -1847,6 +1850,7 @@ def _get_stats(self,
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
             n_requests=n_requests,
+            max_tokens_requests=max_tokens_requests,
             finished_reason_requests=finished_reason_requests,
             max_lora=str(max_lora_stat),
             waiting_lora_adapters=list(waiting_lora_adapters.keys()),
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 9ed30e1e99857..3e3357ed74633 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -179,6 +179,12 @@ def __init__(self, labelnames: List[str], max_model_len: int):
             labelnames=labelnames,
             buckets=[1, 2, 5, 10, 20],
         )
+        self.histogram_max_tokens_request = self._histogram_cls(
+            name="vllm:request_params_max_tokens",
+            documentation="Histogram of the max_tokens request parameter.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
         self.counter_request_success = self._counter_cls(
             name="vllm:request_success_total",
             documentation="Count of successfully processed requests.",
@@ -547,6 +553,8 @@ def _log_prometheus(self, stats: Stats) -> None:
             self.metrics.histogram_num_generation_tokens_request,
             stats.num_generation_tokens_requests)
         self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
+        self._log_histogram(self.metrics.histogram_max_tokens_request,
+                            stats.max_tokens_requests)
 
     def _log_prometheus_interval(self, prompt_throughput: float,
                                  generation_throughput: float) -> None:
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 510dd04bb3e55..25b7a7479672a 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -53,6 +53,7 @@ class Stats:
     num_prompt_tokens_requests: List[int]
    num_generation_tokens_requests: List[int]
     n_requests: List[int]
+    max_tokens_requests: List[int]
     finished_reason_requests: List[str]
     waiting_lora_adapters: List[str]
     running_lora_adapters: List[str]
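
For reference, below is a minimal, self-contained sketch (not part of the patch) of how the new vllm:request_params_max_tokens histogram behaves once registered. It uses the standalone prometheus_client package directly; one_two_five_buckets is a hypothetical stand-in for vLLM's build_1_2_5_buckets helper, assumed here to produce a 1-2-5 series capped at max_model_len.

    # Minimal sketch, not vLLM code: a max_tokens histogram with 1-2-5
    # buckets built on prometheus_client directly.
    from typing import List

    from prometheus_client import Histogram, generate_latest


    def one_two_five_buckets(max_value: int) -> List[int]:
        # Hypothetical stand-in for vLLM's build_1_2_5_buckets: yields
        # 1, 2, 5, 10, 20, 50, ... and stops before exceeding max_value.
        buckets: List[int] = []
        exponent = 0
        while True:
            for mantissa in (1, 2, 5):
                value = mantissa * 10**exponent
                if value > max_value:
                    return buckets
                buckets.append(value)
            exponent += 1


    histogram_max_tokens = Histogram(
        name="vllm:request_params_max_tokens",
        documentation="Histogram of the max_tokens request parameter.",
        labelnames=["model_name"],
        buckets=one_two_five_buckets(4096),  # pretend max_model_len == 4096
    )

    # One observation per finished request; each observe() call updates the
    # exported _sum, _count and _bucket samples for this metric.
    for requested_max_tokens in (16, 128, 1024):
        histogram_max_tokens.labels(model_name="demo-model").observe(
            requested_max_tokens)

    print(generate_latest().decode())

Observing one value per finished request is what makes the exported _sum, _count and _bucket samples line up with the expectations asserted in the tests above.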