-
-
Notifications
You must be signed in to change notification settings - Fork 5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Bugfix] Revert to aioprometheus to avoid 307 redirect #4511
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,11 +2,10 @@ | |
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING | ||
from typing import Counter as CollectionsCounter | ||
from typing import Dict, List, Optional, Protocol, Union | ||
from typing import Dict, List, Optional, Union | ||
|
||
import numpy as np | ||
from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info, | ||
disable_created_metrics) | ||
from aioprometheus import Counter, Gauge, Histogram | ||
|
||
from vllm.logger import init_logger | ||
|
||
|
@@ -15,8 +14,6 @@ | |
|
||
logger = init_logger(__name__) | ||
|
||
disable_created_metrics() | ||
|
||
# The begin-* and end* here are used by the documentation generator | ||
# to extract the metrics definitions. | ||
|
||
|
@@ -25,62 +22,41 @@ | |
class Metrics: | ||
labelname_finish_reason = "finished_reason" | ||
|
||
def __init__(self, labelnames: List[str], max_model_len: int): | ||
# Unregister any existing vLLM collectors | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was a hack to enable our CI to pass before
I just updated the CI to run with `--forked` There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We had problems using --forked before due to a CUDA issue (and we removed it). #3631 It seems the metrics tests also use vLLM instances, and I am a little concerned it will introduce a similar issue again. Why don't we just keep this hack instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As long as the test does not import torch, --forked is fine There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC, the vllm runner should import torch? https://github.com/vllm-project/vllm/blob/main/tests/metrics/test_metrics.py |
||
for collector in list(REGISTRY._collector_to_names): | ||
if hasattr(collector, "_name") and "vllm" in collector._name: | ||
REGISTRY.unregister(collector) | ||
|
||
# Config Information | ||
self.info_cache_config = Info( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This had to be removed since |
||
name='vllm:cache_config', | ||
documentation='information of cache_config') | ||
|
||
def __init__(self, max_model_len: int): | ||
# System stats | ||
# Scheduler State | ||
self.gauge_scheduler_running = Gauge( | ||
name="vllm:num_requests_running", | ||
documentation="Number of requests currently running on GPU.", | ||
labelnames=labelnames) | ||
"vllm:num_requests_running", | ||
"Number of requests currently running on GPU.") | ||
self.gauge_scheduler_waiting = Gauge( | ||
name="vllm:num_requests_waiting", | ||
documentation="Number of requests waiting to be processed.", | ||
labelnames=labelnames) | ||
"vllm:num_requests_waiting", | ||
"Number of requests waiting to be processed.") | ||
self.gauge_scheduler_swapped = Gauge( | ||
name="vllm:num_requests_swapped", | ||
documentation="Number of requests swapped to CPU.", | ||
labelnames=labelnames) | ||
"vllm:num_requests_swapped", "Number of requests swapped to CPU.") | ||
# KV Cache Usage in % | ||
self.gauge_gpu_cache_usage = Gauge( | ||
name="vllm:gpu_cache_usage_perc", | ||
documentation="GPU KV-cache usage. 1 means 100 percent usage.", | ||
labelnames=labelnames) | ||
"vllm:gpu_cache_usage_perc", | ||
"GPU KV-cache usage. 1 means 100 percent usage.") | ||
self.gauge_cpu_cache_usage = Gauge( | ||
name="vllm:cpu_cache_usage_perc", | ||
documentation="CPU KV-cache usage. 1 means 100 percent usage.", | ||
labelnames=labelnames) | ||
"vllm:cpu_cache_usage_perc", | ||
"CPU KV-cache usage. 1 means 100 percent usage.") | ||
|
||
# Iteration stats | ||
self.counter_prompt_tokens = Counter( | ||
name="vllm:prompt_tokens_total", | ||
documentation="Number of prefill tokens processed.", | ||
labelnames=labelnames) | ||
"vllm:prompt_tokens_total", "Number of prefill tokens processed.") | ||
self.counter_generation_tokens = Counter( | ||
name="vllm:generation_tokens_total", | ||
documentation="Number of generation tokens processed.", | ||
labelnames=labelnames) | ||
"vllm:generation_tokens_total", | ||
"Number of generation tokens processed.") | ||
self.histogram_time_to_first_token = Histogram( | ||
name="vllm:time_to_first_token_seconds", | ||
documentation="Histogram of time to first token in seconds.", | ||
labelnames=labelnames, | ||
"vllm:time_to_first_token_seconds", | ||
"Histogram of time to first token in seconds.", | ||
buckets=[ | ||
0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, | ||
0.75, 1.0, 2.5, 5.0, 7.5, 10.0 | ||
]) | ||
self.histogram_time_per_output_token = Histogram( | ||
name="vllm:time_per_output_token_seconds", | ||
documentation="Histogram of time per output token in seconds.", | ||
labelnames=labelnames, | ||
"vllm:time_per_output_token_seconds", | ||
"Histogram of time per output token in seconds.", | ||
buckets=[ | ||
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, | ||
1.0, 2.5 | ||
|
@@ -89,51 +65,42 @@ def __init__(self, labelnames: List[str], max_model_len: int): | |
# Request stats | ||
# Latency | ||
self.histogram_e2e_time_request = Histogram( | ||
name="vllm:e2e_request_latency_seconds", | ||
documentation="Histogram of end to end request latency in seconds.", | ||
labelnames=labelnames, | ||
"vllm:e2e_request_latency_seconds", | ||
"Histogram of end to end request latency in seconds.", | ||
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) | ||
# Metadata | ||
self.histogram_num_prompt_tokens_request = Histogram( | ||
name="vllm:request_prompt_tokens", | ||
documentation="Number of prefill tokens processed.", | ||
labelnames=labelnames, | ||
"vllm:request_prompt_tokens", | ||
"Number of prefill tokens processed.", | ||
buckets=build_1_2_5_buckets(max_model_len), | ||
) | ||
self.histogram_num_generation_tokens_request = Histogram( | ||
name="vllm:request_generation_tokens", | ||
documentation="Number of generation tokens processed.", | ||
labelnames=labelnames, | ||
"vllm:request_generation_tokens", | ||
"Number of generation tokens processed.", | ||
buckets=build_1_2_5_buckets(max_model_len), | ||
) | ||
self.histogram_best_of_request = Histogram( | ||
name="vllm:request_params_best_of", | ||
documentation="Histogram of the best_of request parameter.", | ||
labelnames=labelnames, | ||
"vllm:request_params_best_of", | ||
"Histogram of the best_of request parameter.", | ||
buckets=[1, 2, 5, 10, 20], | ||
) | ||
self.histogram_n_request = Histogram( | ||
name="vllm:request_params_n", | ||
documentation="Histogram of the n request parameter.", | ||
labelnames=labelnames, | ||
"vllm:request_params_n", | ||
"Histogram of the n request parameter.", | ||
buckets=[1, 2, 5, 10, 20], | ||
) | ||
self.counter_request_success = Counter( | ||
name="vllm:request_success", | ||
documentation="Count of successfully processed requests.", | ||
labelnames=labelnames + [Metrics.labelname_finish_reason]) | ||
"vllm:request_success_total", | ||
"Count of successfully processed requests.") | ||
|
||
# Deprecated in favor of vllm:prompt_tokens_total | ||
self.gauge_avg_prompt_throughput = Gauge( | ||
name="vllm:avg_prompt_throughput_toks_per_s", | ||
documentation="Average prefill throughput in tokens/s.", | ||
labelnames=labelnames, | ||
) | ||
"vllm:avg_prompt_throughput_toks_per_s", | ||
"Average prefill throughput in tokens/s.") | ||
# Deprecated in favor of vllm:generation_tokens_total | ||
self.gauge_avg_generation_throughput = Gauge( | ||
name="vllm:avg_generation_throughput_toks_per_s", | ||
documentation="Average generation throughput in tokens/s.", | ||
labelnames=labelnames, | ||
"vllm:avg_generation_throughput_toks_per_s", | ||
"Average generation throughput in tokens/s.", | ||
) | ||
|
||
|
||
|
@@ -195,12 +162,6 @@ class Stats: | |
spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None | ||
|
||
|
||
class SupportsMetricsInfo(Protocol): | ||
|
||
def metrics_info(self) -> Dict[str, str]: | ||
... | ||
|
||
|
||
class StatLogger: | ||
"""StatLogger is used LLMEngine to log to Promethus and Stdout.""" | ||
|
||
|
@@ -216,12 +177,7 @@ def __init__(self, local_interval: float, labels: Dict[str, str], | |
|
||
# Prometheus metrics | ||
self.labels = labels | ||
self.metrics = Metrics(labelnames=list(labels.keys()), | ||
max_model_len=max_model_len) | ||
|
||
def info(self, type: str, obj: SupportsMetricsInfo) -> None: | ||
if type == "cache_config": | ||
self.metrics.info_cache_config.info(obj.metrics_info()) | ||
self.metrics = Metrics(max_model_len=max_model_len) | ||
|
||
def _get_throughput(self, tracked_stats: List[int], now: float) -> float: | ||
return float(np.sum(tracked_stats) / (now - self.last_local_log)) | ||
|
@@ -274,23 +230,23 @@ def _log_prometheus(self, stats: Stats) -> None: | |
|
||
def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None: | ||
# Convenience function for logging to gauge. | ||
gauge.labels(**self.labels).set(data) | ||
gauge.set(self.labels, data) | ||
|
||
def _log_counter(self, counter: Counter, data: Union[int, float]) -> None: | ||
# Convenience function for logging to counter. | ||
counter.labels(**self.labels).inc(data) | ||
counter.add(self.labels, data) | ||
|
||
def _log_counter_labels(self, counter: Counter, data: CollectionsCounter, | ||
label_key: str) -> None: | ||
# Convenience function for collection counter of labels. | ||
for label, count in data.items(): | ||
counter.labels(**{**self.labels, label_key: label}).inc(count) | ||
counter.add({**self.labels, label_key: label}, count) | ||
|
||
def _log_histogram(self, histogram: Histogram, | ||
data: Union[List[int], List[float]]) -> None: | ||
# Convenience function for logging list to histogram. | ||
for datum in data: | ||
histogram.labels(**self.labels).observe(datum) | ||
histogram.observe(self.labels, datum) | ||
|
||
def _log_prometheus_interval(self, prompt_throughput: float, | ||
generation_throughput: float) -> None: | ||
|
@@ -301,10 +257,10 @@ def _log_prometheus_interval(self, prompt_throughput: float, | |
# Which log raw data and calculate summaries using rate() on the | ||
# grafana/prometheus side. See | ||
# https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 | ||
self.metrics.gauge_avg_prompt_throughput.labels( | ||
**self.labels).set(prompt_throughput) | ||
self.metrics.gauge_avg_generation_throughput.labels( | ||
**self.labels).set(generation_throughput) | ||
self._log_gauge(self.metrics.gauge_avg_prompt_throughput, | ||
prompt_throughput) | ||
self._log_gauge(self.metrics.gauge_avg_generation_throughput, | ||
generation_throughput) | ||
|
||
def log(self, stats: Stats) -> None: | ||
"""Called by LLMEngine. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do not understand what this does / why it was there before, but the equivalent does not seem to exist for
aioprometheus
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://github.com/prometheus/client_python/blob/master/prometheus_client/metrics.py#L74
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://prometheus.github.io/client_python/instrumenting/#disabling-_created-metrics