[MISC] Add lora requests to metrics (vllm-project#9477)
Co-authored-by: Kunjan Patel <kunjanp_google_com@vllm.us-central1-a.c.kunjanp-gke-dev-2.internal>
Signed-off-by: Alvant <[email protected]>
2 people authored and Alvant committed Oct 26, 2024
1 parent 0fa30c6 commit c69a58f
Showing 3 changed files with 54 additions and 2 deletions.
vllm/engine/llm_engine.py (23 additions, 1 deletion)

@@ -1,4 +1,5 @@
 import time
+from collections import Counter as collectionsCounter
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -1664,6 +1665,25 @@ def _get_stats(self,
         n_requests: List[int] = []
         finished_reason_requests: List[str] = []
 
+        # Lora requests
+        running_lora_adapters = dict(
+            collectionsCounter([
+                running_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for running_request in scheduler.running
+                if running_request.lora_request
+            ]))
+        waiting_lora_adapters = dict(
+            collectionsCounter([
+                waiting_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for waiting_request in scheduler.waiting
+                if waiting_request.lora_request
+            ]))
+        max_lora_stat = "0"
+        if self.lora_config:
+            max_lora_stat = str(self.lora_config.max_loras)
+
         # NOTE: This loop assumes prefill seq_groups are before
         # decode seq_groups in scheduled_seq_groups.
         if scheduler_outputs is not None:
@@ -1785,7 +1805,9 @@ def _get_stats(self,
             num_generation_tokens_requests=num_generation_tokens_requests,
             n_requests=n_requests,
             finished_reason_requests=finished_reason_requests,
-        )
+            max_lora=str(max_lora_stat),
+            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
+            running_lora_adapters=list(running_lora_adapters.keys()))
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
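The aggregation in `_get_stats` is plain `collections.Counter` applied across the schedulers' running and waiting queues. Below is a minimal, self-contained sketch of the same pattern; the stand-in classes and adapter names are illustrative only, not vLLM types.

```python
# Count how many queued requests use each LoRA adapter, mirroring the
# comprehension in _get_stats. FakeLoRARequest/FakeSequenceGroup and the
# adapter names are made up for this demo.
from collections import Counter
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeLoRARequest:
    lora_name: str


@dataclass
class FakeSequenceGroup:
    lora_request: Optional[FakeLoRARequest] = None


running = [
    FakeSequenceGroup(FakeLoRARequest("sql_adapter")),
    FakeSequenceGroup(FakeLoRARequest("sql_adapter")),
    FakeSequenceGroup(FakeLoRARequest("chat_adapter")),
    FakeSequenceGroup(),  # base-model request, no adapter attached
]

# Requests without an adapter are skipped by the `if` clause.
running_lora_adapters = dict(
    Counter(r.lora_request.lora_name for r in running if r.lora_request))
print(running_lora_adapters)  # {'sql_adapter': 2, 'chat_adapter': 1}
```

Note that only the adapter names survive into `Stats` (via `list(running_lora_adapters.keys())`), so the per-adapter counts are computed and then discarded.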
vllm/engine/metrics.py (28 additions, 1 deletion)

@@ -34,7 +34,11 @@ class Metrics:
     See https://prometheus.github.io/client_python/multiprocess/ for more
     details on limitations.
     """
+
     labelname_finish_reason = "finished_reason"
+    labelname_waiting_lora_adapters = "waiting_lora_adapters"
+    labelname_running_lora_adapters = "running_lora_adapters"
+    labelname_max_lora = "max_lora"
     _gauge_cls = prometheus_client.Gauge
     _counter_cls = prometheus_client.Counter
     _histogram_cls = prometheus_client.Histogram
@@ -55,6 +59,16 @@ def __init__(self, labelnames: List[str], max_model_len: int):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+        self.gauge_lora_info = self._gauge_cls(
+            name="vllm:lora_requests_info",
+            documentation="Running stats on lora requests.",
+            labelnames=[
+                self.labelname_running_lora_adapters,
+                self.labelname_max_lora,
+                self.labelname_waiting_lora_adapters,
+            ],
+            multiprocess_mode="livemostrecent",
+        )
         self.gauge_scheduler_swapped = self._gauge_cls(
             name="vllm:num_requests_swapped",
             documentation="Number of requests swapped to CPU.",
@@ -426,6 +440,9 @@ def _log_histogram(self, histogram, data: Union[List[int],
         for datum in data:
             histogram.labels(**self.labels).observe(datum)
 
+    def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
+        gauge.labels(**data).set(1)
+
     def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
@@ -442,7 +459,17 @@ def _log_prometheus(self, stats: Stats) -> None:
                         stats.cpu_prefix_cache_hit_rate)
         self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
                         stats.gpu_prefix_cache_hit_rate)
-
+        # Including max-lora in metric, in future this property of lora
+        # config maybe extended to be dynamic.
+        lora_info = {
+            self.metrics.labelname_running_lora_adapters:
+            ",".join(stats.running_lora_adapters),
+            self.metrics.labelname_waiting_lora_adapters:
+            ",".join(stats.waiting_lora_adapters),
+            self.metrics.labelname_max_lora:
+            stats.max_lora,
+        }
+        self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
         # Iteration level data
         self._log_counter(self.metrics.counter_num_preemption,
                           stats.num_preemption_iter)
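The new gauge follows the Prometheus "info metric" idiom: the interesting state is carried in label values, and `_log_gauge_string` pins the sample value to 1. Here is a sketch of that idiom with `prometheus_client`; the `demo_lora_requests_info` metric name and the adapter names are hypothetical, not part of vLLM.

```python
# Info-metric idiom: encode state in label values, set the sample to 1.
from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()  # fresh registry keeps the demo output clean
gauge = Gauge(
    "demo_lora_requests_info",
    "Demo of encoding state in label values.",
    labelnames=["running_lora_adapters", "max_lora", "waiting_lora_adapters"],
    registry=registry,
)

# Equivalent to _log_gauge_string(gauge, data): one labelled sample, value 1.
gauge.labels(
    running_lora_adapters="sql_adapter,chat_adapter",
    max_lora="8",
    waiting_lora_adapters="",
).set(1)

print(generate_latest(registry).decode())
# Prints roughly:
#   # HELP demo_lora_requests_info Demo of encoding state in label values.
#   # TYPE demo_lora_requests_info gauge
#   demo_lora_requests_info{running_lora_adapters="sql_adapter,chat_adapter",max_lora="8",waiting_lora_adapters=""} 1.0
```

In the real gauge, `multiprocess_mode="livemostrecent"` means that under multiprocess serving only the most recently written sample from a live process is exported, rather than a sum across workers.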
vllm/engine/metrics_types.py (3 additions)

@@ -51,6 +51,9 @@ class Stats:
     num_generation_tokens_requests: List[int]
     n_requests: List[int]
     finished_reason_requests: List[str]
+    waiting_lora_adapters: List[str]
+    running_lora_adapters: List[str]
+    max_lora: str
 
     spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
 
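For context, these fields ride on the `Stats` dataclass from `_get_stats` to `_log_prometheus`, which joins each adapter list into a single comma-separated label value. A trimmed stand-in (the real `Stats` has many more fields) showing just that hand-off:

```python
# Trimmed stand-in for Stats with only the fields added by this commit.
from dataclasses import dataclass
from typing import List


@dataclass
class TrimmedStats:
    waiting_lora_adapters: List[str]
    running_lora_adapters: List[str]
    max_lora: str


stats = TrimmedStats(
    waiting_lora_adapters=["chat_adapter"],
    running_lora_adapters=["sql_adapter", "chat_adapter"],
    max_lora="8",
)
# _log_prometheus flattens each list into one label value:
print(",".join(stats.running_lora_adapters))  # sql_adapter,chat_adapter
```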
