[Bugfix] Revert to aioprometheus to avoid 307 redirect #4511

Closed
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
@@ -94,7 +94,7 @@ steps:
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader

- label: Metrics Test
command: pytest -v -s metrics
command: pytest -v -s metrics --forked

- label: Quantization Test
command: pytest -v -s quantization
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -86,7 +86,7 @@ def setup(app):
"torch",
"transformers",
"psutil",
"prometheus_client",
"aioprometheus",
"sentencepiece",
"vllm.cuda_utils",
"vllm._C",
3 changes: 1 addition & 2 deletions requirements-common.txt
@@ -11,8 +11,7 @@ fastapi
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
aioprometheus[starlette]
tiktoken == 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.9.8
outlines == 0.0.34 # Requires torch >= 2.1.0
9 changes: 4 additions & 5 deletions tests/metrics/test_metrics.py
@@ -30,9 +30,8 @@ def test_metric_counter_prompt_tokens(

_ = vllm_model.generate_greedy(example_prompts, max_tokens)
stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get()

metric_count = stat_logger.metrics.counter_prompt_tokens.get_value(
stat_logger.labels)
assert vllm_prompt_token_count == metric_count, (
f"prompt token count: {vllm_prompt_token_count!r}\n"
f"metric: {metric_count!r}")
@@ -55,8 +54,8 @@ def test_metric_counter_generation_tokens(
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
metric_count = stat_logger.metrics.counter_generation_tokens.get_value(
stat_logger.labels)
vllm_generation_count = 0
for i in range(len(example_prompts)):
vllm_output_ids, vllm_output_str = vllm_outputs[i]
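The restored read path queries an aioprometheus collector with a labels dict via get_value(), instead of reaching into prometheus_client's private _value attribute. A minimal sketch of that round trip (hypothetical label values; it assumes the aioprometheus API used elsewhere in this diff):

```python
from aioprometheus import Counter

counter_prompt_tokens = Counter("vllm:prompt_tokens_total",
                                "Number of prefill tokens processed.")
labels = {"model_name": "facebook/opt-125m"}  # made-up label set

counter_prompt_tokens.add(labels, 42)                  # what StatLogger._log_counter does
assert counter_prompt_tokens.get_value(labels) == 42   # what the test reads back
```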
1 change: 0 additions & 1 deletion vllm/engine/llm_engine.py
@@ -220,7 +220,6 @@ def __init__(
local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
labels=dict(model_name=model_config.model),
max_model_len=self.model_config.max_model_len)
self.stat_logger.info("cache_config", self.cache_config)

# Create sequence output processor, e.g. for beam search or
# speculative decoding.
132 changes: 44 additions & 88 deletions vllm/engine/metrics.py
@@ -2,11 +2,10 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Counter as CollectionsCounter
from typing import Dict, List, Optional, Protocol, Union
from typing import Dict, List, Optional, Union

import numpy as np
from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
disable_created_metrics)
from aioprometheus import Counter, Gauge, Histogram

from vllm.logger import init_logger

@@ -15,8 +14,6 @@

logger = init_logger(__name__)

disable_created_metrics()
Collaborator (Author): I do not understand what this does / why it was there before, but the equivalent does not seem to exist for aioprometheus.


# The begin-* and end* here are used by the documentation generator
# to extract the metrics definitions.

@@ -25,62 +22,41 @@
class Metrics:
labelname_finish_reason = "finished_reason"

def __init__(self, labelnames: List[str], max_model_len: int):
# Unregister any existing vLLM collectors
Collaborator (Author): This was a hack to enable our CI to pass before. REGISTRY is a global value; we were touching internal state since our CI runs the LLM twice. I just updated the CI to run with --forked, which obviates the need for this hack.

Collaborator: We had problems using --forked before due to a CUDA issue (and we removed it, see #3631). It seems like the metrics tests also use vLLM instances, and I am a little concerned this will reintroduce a similar issue. Why don't we just keep this hack instead?

Collaborator (Author): As long as the test does not import torch, --forked is fine.

for collector in list(REGISTRY._collector_to_names):
if hasattr(collector, "_name") and "vllm" in collector._name:
REGISTRY.unregister(collector)
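For context, a minimal sketch of the collision this hack worked around (hypothetical metric name; it assumes stock prometheus_client behaviour and is not code from the PR): prometheus_client keeps a process-global REGISTRY, so declaring a metric with the same name twice in one process, as happens when a second engine is built in the same CI run, is rejected. Running each test in its own process via --forked gives every run a fresh registry without touching its internals.

```python
from prometheus_client import Counter

Counter("vllm_demo_requests", "First registration in this process succeeds.")
try:
    # A second engine instance re-declaring the same metric in the same
    # process is rejected by the global registry.
    Counter("vllm_demo_requests", "Second registration fails.")
except ValueError as exc:
    print(f"duplicate registration rejected: {exc}")
```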

# Config Information
self.info_cache_config = Info(
Collaborator (Author): This had to be removed since Info does not exist in aioprometheus, AFAIK.

name='vllm:cache_config',
documentation='information of cache_config')
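For context, Info in prometheus_client exposes a dict of key/value pairs as labels on a constant `<name>_info` sample. A rough sketch of what the removed metric emitted (hypothetical config values; assumes stock prometheus_client behaviour, not code from this PR):

```python
from prometheus_client import CollectorRegistry, Info, generate_latest

registry = CollectorRegistry()  # private registry so only this metric is exposed
info_cache_config = Info(name="vllm:cache_config",
                         documentation="information of cache_config",
                         registry=registry)
info_cache_config.info({"block_size": "16", "gpu_memory_utilization": "0.9"})

# Prints roughly:
#   vllm:cache_config_info{block_size="16",gpu_memory_utilization="0.9"} 1.0
print(generate_latest(registry).decode())
```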

def __init__(self, max_model_len: int):
# System stats
# Scheduler State
self.gauge_scheduler_running = Gauge(
name="vllm:num_requests_running",
documentation="Number of requests currently running on GPU.",
labelnames=labelnames)
"vllm:num_requests_running",
"Number of requests currently running on GPU.")
self.gauge_scheduler_waiting = Gauge(
name="vllm:num_requests_waiting",
documentation="Number of requests waiting to be processed.",
labelnames=labelnames)
"vllm:num_requests_waiting",
"Number of requests waiting to be processed.")
self.gauge_scheduler_swapped = Gauge(
name="vllm:num_requests_swapped",
documentation="Number of requests swapped to CPU.",
labelnames=labelnames)
"vllm:num_requests_swapped", "Number of requests swapped to CPU.")
# KV Cache Usage in %
self.gauge_gpu_cache_usage = Gauge(
name="vllm:gpu_cache_usage_perc",
documentation="GPU KV-cache usage. 1 means 100 percent usage.",
labelnames=labelnames)
"vllm:gpu_cache_usage_perc",
"GPU KV-cache usage. 1 means 100 percent usage.")
self.gauge_cpu_cache_usage = Gauge(
name="vllm:cpu_cache_usage_perc",
documentation="CPU KV-cache usage. 1 means 100 percent usage.",
labelnames=labelnames)
"vllm:cpu_cache_usage_perc",
"CPU KV-cache usage. 1 means 100 percent usage.")

# Iteration stats
self.counter_prompt_tokens = Counter(
name="vllm:prompt_tokens_total",
documentation="Number of prefill tokens processed.",
labelnames=labelnames)
"vllm:prompt_tokens_total", "Number of prefill tokens processed.")
self.counter_generation_tokens = Counter(
name="vllm:generation_tokens_total",
documentation="Number of generation tokens processed.",
labelnames=labelnames)
"vllm:generation_tokens_total",
"Number of generation tokens processed.")
self.histogram_time_to_first_token = Histogram(
name="vllm:time_to_first_token_seconds",
documentation="Histogram of time to first token in seconds.",
labelnames=labelnames,
"vllm:time_to_first_token_seconds",
"Histogram of time to first token in seconds.",
buckets=[
0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
0.75, 1.0, 2.5, 5.0, 7.5, 10.0
])
self.histogram_time_per_output_token = Histogram(
name="vllm:time_per_output_token_seconds",
documentation="Histogram of time per output token in seconds.",
labelnames=labelnames,
"vllm:time_per_output_token_seconds",
"Histogram of time per output token in seconds.",
buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5
@@ -89,51 +65,42 @@ def __init__(self, labelnames: List[str], max_model_len: int):
# Request stats
# Latency
self.histogram_e2e_time_request = Histogram(
name="vllm:e2e_request_latency_seconds",
documentation="Histogram of end to end request latency in seconds.",
labelnames=labelnames,
"vllm:e2e_request_latency_seconds",
"Histogram of end to end request latency in seconds.",
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
# Metadata
self.histogram_num_prompt_tokens_request = Histogram(
name="vllm:request_prompt_tokens",
documentation="Number of prefill tokens processed.",
labelnames=labelnames,
"vllm:request_prompt_tokens",
"Number of prefill tokens processed.",
buckets=build_1_2_5_buckets(max_model_len),
)
self.histogram_num_generation_tokens_request = Histogram(
name="vllm:request_generation_tokens",
documentation="Number of generation tokens processed.",
labelnames=labelnames,
"vllm:request_generation_tokens",
"Number of generation tokens processed.",
buckets=build_1_2_5_buckets(max_model_len),
)
self.histogram_best_of_request = Histogram(
name="vllm:request_params_best_of",
documentation="Histogram of the best_of request parameter.",
labelnames=labelnames,
"vllm:request_params_best_of",
"Histogram of the best_of request parameter.",
buckets=[1, 2, 5, 10, 20],
)
self.histogram_n_request = Histogram(
name="vllm:request_params_n",
documentation="Histogram of the n request parameter.",
labelnames=labelnames,
"vllm:request_params_n",
"Histogram of the n request parameter.",
buckets=[1, 2, 5, 10, 20],
)
self.counter_request_success = Counter(
name="vllm:request_success",
documentation="Count of successfully processed requests.",
labelnames=labelnames + [Metrics.labelname_finish_reason])
"vllm:request_success_total",
"Count of successfully processed requests.")

# Deprecated in favor of vllm:prompt_tokens_total
self.gauge_avg_prompt_throughput = Gauge(
name="vllm:avg_prompt_throughput_toks_per_s",
documentation="Average prefill throughput in tokens/s.",
labelnames=labelnames,
)
"vllm:avg_prompt_throughput_toks_per_s",
"Average prefill throughput in tokens/s.")
# Deprecated in favor of vllm:generation_tokens_total
self.gauge_avg_generation_throughput = Gauge(
name="vllm:avg_generation_throughput_toks_per_s",
documentation="Average generation throughput in tokens/s.",
labelnames=labelnames,
"vllm:avg_generation_throughput_toks_per_s",
"Average generation throughput in tokens/s.",
)
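The constructor changes above follow one pattern: prometheus_client declares its label names up front and binds them per call, while aioprometheus takes only a name and a description and receives the label set with each observation. A minimal side-by-side sketch (hypothetical label values; not code from this PR):

```python
from aioprometheus import Gauge

# aioprometheus: no labelnames argument in the constructor...
gauge_running = Gauge("vllm:num_requests_running",
                      "Number of requests currently running on GPU.")

# ...the labels travel with every observation instead.
gauge_running.set({"model_name": "facebook/opt-125m"}, 3)

# The prometheus_client form being reverted looked roughly like:
#   Gauge(name="vllm:num_requests_running",
#         documentation="Number of requests currently running on GPU.",
#         labelnames=["model_name"]).labels(model_name="...").set(3)
```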


@@ -195,12 +162,6 @@ class Stats:
spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None


class SupportsMetricsInfo(Protocol):

def metrics_info(self) -> Dict[str, str]:
...


class StatLogger:
"""StatLogger is used LLMEngine to log to Promethus and Stdout."""

@@ -216,12 +177,7 @@ def __init__(self, local_interval: float, labels: Dict[str, str],

# Prometheus metrics
self.labels = labels
self.metrics = Metrics(labelnames=list(labels.keys()),
max_model_len=max_model_len)

def info(self, type: str, obj: SupportsMetricsInfo) -> None:
if type == "cache_config":
self.metrics.info_cache_config.info(obj.metrics_info())
self.metrics = Metrics(max_model_len=max_model_len)

def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
return float(np.sum(tracked_stats) / (now - self.last_local_log))
@@ -274,23 +230,23 @@ def _log_prometheus(self, stats: Stats) -> None:

def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge.
gauge.labels(**self.labels).set(data)
gauge.set(self.labels, data)

def _log_counter(self, counter: Counter, data: Union[int, float]) -> None:
# Convenience function for logging to counter.
counter.labels(**self.labels).inc(data)
counter.add(self.labels, data)

def _log_counter_labels(self, counter: Counter, data: CollectionsCounter,
label_key: str) -> None:
# Convenience function for collection counter of labels.
for label, count in data.items():
counter.labels(**{**self.labels, label_key: label}).inc(count)
counter.add({**self.labels, label_key: label}, count)

def _log_histogram(self, histogram: Histogram,
data: Union[List[int], List[float]]) -> None:
# Convenience function for logging list to histogram.
for datum in data:
histogram.labels(**self.labels).observe(datum)
histogram.observe(self.labels, datum)

def _log_prometheus_interval(self, prompt_throughput: float,
generation_throughput: float) -> None:
@@ -301,10 +257,10 @@ def _log_prometheus_interval(self, prompt_throughput: float,
# Which log raw data and calculate summaries using rate() on the
# grafana/prometheus side. See
# https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
self.metrics.gauge_avg_prompt_throughput.labels(
**self.labels).set(prompt_throughput)
self.metrics.gauge_avg_generation_throughput.labels(
**self.labels).set(generation_throughput)
self._log_gauge(self.metrics.gauge_avg_prompt_throughput,
prompt_throughput)
self._log_gauge(self.metrics.gauge_avg_generation_throughput,
generation_throughput)

def log(self, stats: Stats) -> None:
"""Called by LLMEngine.
8 changes: 4 additions & 4 deletions vllm/entrypoints/openai/api_server.py
@@ -7,11 +7,12 @@

import fastapi
import uvicorn
from aioprometheus import MetricsMiddleware
from aioprometheus.asgi.starlette import metrics
from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app

import vllm
from vllm.engine.arg_utils import AsyncEngineArgs
@@ -54,9 +55,8 @@ def parse_args():
return parser.parse_args()


# Add prometheus asgi middleware to route /metrics requests
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)
app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics
app.add_route("/metrics", metrics) # Exposes HTTP metrics
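The 307 in the title comes from the mount-based setup removed here: with the Starlette versions in use at the time, a sub-application mounted at "/metrics" only answers under "/metrics/", so a bare GET /metrics is first redirected with a 307, while a plain route serves the path directly. A rough repro sketch (hypothetical apps; assumes FastAPI with the httpx-based TestClient, not code from this PR):

```python
import fastapi
from fastapi.responses import PlainTextResponse
from fastapi.testclient import TestClient

# prometheus_client style: mount a metrics sub-application under /metrics.
sub_app = fastapi.FastAPI()
app_with_mount = fastapi.FastAPI()
app_with_mount.mount("/metrics", sub_app)

# aioprometheus style: a plain route on /metrics.
app_with_route = fastapi.FastAPI()


@app_with_route.get("/metrics", response_class=PlainTextResponse)
async def metrics_stub() -> str:  # stand-in for the aioprometheus handler
    return "vllm_demo_requests_total 1.0\n"


print(TestClient(app_with_mount).get("/metrics", follow_redirects=False).status_code)  # expected: 307
print(TestClient(app_with_route).get("/metrics", follow_redirects=False).status_code)  # expected: 200
```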


@app.exception_handler(RequestValidationError)