Usage Stats Collection #2852

Merged · 62 commits · Mar 29, 2024

Commits
e0e1386
Write info to local json
yhu422 Feb 8, 2024
739f4a1
Merge branch 'main' of github.com:vllm-project/vllm into usage
yhu422 Feb 8, 2024
c33b4cc
add usage context
yhu422 Feb 9, 2024
b74e3a6
removed usage_context from Engine_args
yhu422 Feb 9, 2024
c988e07
Move IO to another process
yhu422 Feb 9, 2024
88c5187
added http request
yhu422 Feb 13, 2024
85adbab
Merge branch 'main' of github.com:vllm-project/vllm into usage
yhu422 Feb 13, 2024
33c9dff
Added additional arg for from_engine_args
yhu422 Feb 13, 2024
ad609f0
comments
yhu422 Feb 13, 2024
8a2f18a
Write info to local json
yhu422 Feb 8, 2024
8e9e5be
Merge branch 'usage' of https://github.com/yhu422/vllm into usage
yhu422 Feb 13, 2024
f537692
Added Comments
yhu422 Feb 13, 2024
0f1ba7f
.
yhu422 Feb 13, 2024
abc3948
Collect usage info on engine initialization
yhu422 Feb 8, 2024
ec54145
Merge branch 'usage' of https://github.com/yhu422/vllm into usage
yhu422 Feb 13, 2024
f84ccaa
Write usage to local file for testing
yhu422 Feb 13, 2024
b08ba86
Fixed Formatting
yhu422 Feb 13, 2024
83ff459
Merge branch 'vllm-project:main' into usage
yhu422 Feb 13, 2024
73b689a
formatting changes
yhu422 Feb 13, 2024
86da72f
Merge branch 'usage' of https://github.com/yhu422/vllm into usage
yhu422 Feb 13, 2024
9c9a188
Minor bug fixed
yhu422 Feb 13, 2024
d2f84cf
tmp
yhu422 Feb 13, 2024
4e888e0
Merge branch 'main' of github.com:vllm-project/vllm into usage
yhu422 Feb 13, 2024
eb48061
Fixed Bug
yhu422 Feb 14, 2024
0684c06
Add Google Cloud Run service URL
yhu422 Feb 14, 2024
8e9890e
More GPU CPU Mem info
yhu422 Feb 16, 2024
5cf652a
Merge branch 'main' of github.com:vllm-project/vllm into usage
yhu422 Feb 27, 2024
d910b05
Added context constant
yhu422 Feb 27, 2024
8cf264b
Formatting & CPU Info
yhu422 Feb 27, 2024
93b8773
Update vllm/usage/usage_lib.py
yhu422 Feb 27, 2024
fe39b84
Added CPU info, new stat file path
yhu422 Feb 27, 2024
fc6e374
added gpu memory
yhu422 Feb 27, 2024
ab23171
added memory
yhu422 Feb 28, 2024
686c84a
Distinguish production/testing usage, added custom domain
yhu422 Mar 1, 2024
877eb78
formatting
yhu422 Mar 1, 2024
bc89a66
Merge branch 'main' of github.com:vllm-project/vllm into usage
yhu422 Mar 1, 2024
36fd304
Merge branch 'main' of github.com:vllm-project/vllm into usage
yhu422 Mar 5, 2024
e54f15b
test/prod distinction
yhu422 Mar 5, 2024
4e35b3b
Remove cpuinfo import
yhu422 Mar 5, 2024
a1597fb
ruff
yhu422 Mar 5, 2024
c580797
Merge branch 'main' of github.com:vllm-project/vllm into usage
yhu422 Mar 14, 2024
84353d4
fixed merge
yhu422 Mar 14, 2024
f2e69fc
Pass up model architecture info for GPUExecutor
yhu422 Mar 14, 2024
4e19967
formatting
yhu422 Mar 14, 2024
f327f3c
formatting
yhu422 Mar 14, 2024
d9c8a44
Get architecture directly from configs
yhu422 Mar 14, 2024
59f0f10
Merge branch 'main' of github.com:vllm-project/vllm into usage
simon-mo Mar 16, 2024
f34259a
edits round
simon-mo Mar 17, 2024
30df77c
ruff
simon-mo Mar 17, 2024
60b652b
Merge branch 'main' of github.com:vllm-project/vllm into usage
simon-mo Mar 28, 2024
be91bab
fix format
simon-mo Mar 28, 2024
4f04743
finish all code level functionality
simon-mo Mar 28, 2024
f4bf862
add wip doc
simon-mo Mar 28, 2024
6b968db
Merge branch 'main' of github.com:vllm-project/vllm into usage
simon-mo Mar 28, 2024
2006788
revert some fixes
simon-mo Mar 28, 2024
db715c8
more fixes
simon-mo Mar 28, 2024
2c1e557
finish doc, readability pass
simon-mo Mar 28, 2024
42e66b8
edit pass
simon-mo Mar 28, 2024
a4e5742
fix doc and isort
simon-mo Mar 28, 2024
9652830
Merge branch 'main' of github.com:vllm-project/vllm into usage
simon-mo Mar 29, 2024
ba63b44
bad merge
simon-mo Mar 29, 2024
58fb78d
add to amd req txt
simon-mo Mar 29, 2024
Files changed
2 changes: 2 additions & 0 deletions .buildkite/test-template.j2
@@ -53,6 +53,8 @@ steps:
           nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
       {% endif %}
       env:
+        - name: VLLM_USAGE_SOURCE
+          value: ci-test
         - name: HF_TOKEN
           valueFrom:
             secretKeyRef:
2 changes: 2 additions & 0 deletions Dockerfile
@@ -127,5 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 COPY --from=build /workspace/vllm/*.so /workspace/vllm/
 COPY vllm vllm

+ENV VLLM_USAGE_SOURCE production-docker-image
+
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -73,6 +73,7 @@ Documentation
    serving/deploying_with_docker
    serving/distributed_serving
    serving/metrics
+   serving/usage_stats
    serving/integrations

 .. toctree::
57 changes: 57 additions & 0 deletions docs/source/serving/usage_stats.md
@@ -0,0 +1,57 @@
# Usage Stats Collection

vLLM collects anonymous usage data by default to help the engineering team understand which hardware and model configurations are widely used, so that effort can be prioritized on the most common workloads. The collected data is transparent, contains no sensitive information, and will be publicly released for the community's benefit.

## What data is collected?

You can see the up-to-date list of data collected by vLLM in [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py).

Here is an example as of v0.4.0:

```json
{
"uuid": "fbe880e9-084d-4cab-a395-8984c50f1109",
"provider": "GCP",
"num_cpu": 24,
"cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz",
"cpu_family_model_stepping": "6,85,7",
"total_memory": 101261135872,
"architecture": "x86_64",
"platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31",
"gpu_count": 2,
"gpu_type": "NVIDIA L4",
"gpu_memory_per_device": 23580639232,
"model_architecture": "OPTForCausalLM",
"vllm_version": "0.3.2+cu123",
"context": "LLM_CLASS",
"log_time": 1711663373492490000,
"source": "production",
"dtype": "torch.float16",
"tensor_parallel_size": 1,
"block_size": 16,
"gpu_memory_utilization": 0.9,
"quantization": null,
"kv_cache_dtype": "auto",
"enable_lora": false,
"enable_prefix_caching": false,
"enforce_eager": false,
"disable_custom_all_reduce": true
}
```

You can preview the collected data by running the following command:

```bash
tail ~/.config/vllm/usage_stats.json
```
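
If you prefer to inspect the file programmatically, here is a minimal Python sketch (it assumes the default path above and that each line of the file holds one self-contained JSON record):

```python
import json
from pathlib import Path

# Default stats location used above; adjust if your config dir differs.
stats_path = Path.home() / ".config" / "vllm" / "usage_stats.json"

# Assume one JSON record per line; pretty-print the most recent entry.
records = [json.loads(line)
           for line in stats_path.read_text().splitlines() if line.strip()]
print(json.dumps(records[-1], indent=2))
```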

## Opt-out of Usage Stats Collection

You can opt out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:

```bash
# Any of the following methods can disable usage stats collection
export VLLM_NO_USAGE_STATS=1
export DO_NOT_TRACK=1
mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
```
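
All three switches have the same effect. As a mental model, the check can be as small as the sketch below; this is illustrative rather than vLLM's exact implementation:

```python
import os
from pathlib import Path


def usage_stats_enabled() -> bool:
    """Return False if any documented opt-out switch is set (sketch)."""
    if os.environ.get("VLLM_NO_USAGE_STATS") == "1":
        return False
    if os.environ.get("DO_NOT_TRACK") == "1":
        return False
    if (Path.home() / ".config" / "vllm" / "do_not_track").exists():
        return False
    return True
```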
3 changes: 3 additions & 0 deletions requirements-neuron.txt
@@ -7,3 +7,6 @@ fastapi
 uvicorn[standard]
 pydantic >= 2.0 # Required for OpenAI server.
 prometheus_client >= 0.18.0
+requests
+psutil
+py-cpuinfo
2 changes: 2 additions & 0 deletions requirements-rocm.txt
@@ -2,6 +2,8 @@ cmake>=3.21
 ninja # For faster builds.
 typing-extensions>=4.8.0
 starlette
+requests
+py-cpuinfo
 psutil
 ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
3 changes: 3 additions & 0 deletions requirements.txt
@@ -5,6 +5,9 @@ ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 torch == 2.1.2
+requests
+psutil
+py-cpuinfo
 transformers >= 4.39.1 # Required for StarCoder2 & Llava.
 xformers == 0.0.23.post1 # Required for CUDA 12.1.
 fastapi
29 changes: 18 additions & 11 deletions vllm/engine/async_llm_engine.py
@@ -16,6 +16,7 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import MultiModalData
+from vllm.usage.usage_lib import UsageContext

 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = int(
@@ -319,9 +320,12 @@ def __init__(self,
         self._errored_with: Optional[BaseException] = None

     @classmethod
-    def from_engine_args(cls,
-                         engine_args: AsyncEngineArgs,
-                         start_engine_loop: bool = True) -> "AsyncLLMEngine":
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+    ) -> "AsyncLLMEngine":
         """Creates an async LLM engine from the engine arguments."""
         # Create the engine configs.
         engine_configs = engine_args.create_engine_configs()
@@ -341,14 +345,17 @@ def from_engine_args(cls,
         from vllm.executor.gpu_executor import GPUExecutorAsync
         executor_class = GPUExecutorAsync
         # Create the async LLM engine.
-        engine = cls(parallel_config.worker_use_ray,
-                     engine_args.engine_use_ray,
-                     *engine_configs,
-                     executor_class,
-                     log_requests=not engine_args.disable_log_requests,
-                     log_stats=not engine_args.disable_log_stats,
-                     max_log_len=engine_args.max_log_len,
-                     start_engine_loop=start_engine_loop)
+        engine = cls(
+            parallel_config.worker_use_ray,
+            engine_args.engine_use_ray,
+            *engine_configs,
+            executor_class,
+            log_requests=not engine_args.disable_log_requests,
+            log_stats=not engine_args.disable_log_stats,
+            max_log_len=engine_args.max_log_len,
+            start_engine_loop=start_engine_loop,
+            usage_context=usage_context,
+        )
         return engine

     @property
53 changes: 49 additions & 4 deletions vllm/engine/llm_engine.py
@@ -13,6 +13,7 @@
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.model_executor.model_loader import get_architecture_class_name
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (MultiModalData, SamplerOutput, Sequence,
@@ -21,6 +22,8 @@
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                      get_tokenizer_group)
+from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
+                                  usage_message)
 from vllm.utils import Counter

 logger = init_logger(__name__)
@@ -53,6 +56,7 @@ class LLMEngine:
         executor_class: The model executor class for managing distributed
             execution.
         log_stats: Whether to log statistics.
+        usage_context: Specified entry point, used for usage info collection
     """

     def __init__(
@@ -66,6 +70,7 @@ def __init__(
         vision_language_config: Optional["VisionLanguageConfig"],
         executor_class: Type[ExecutorBase],
         log_stats: bool,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
     ) -> None:
         logger.info(
             f"Initializing an LLM engine (v{vllm.__version__}) with config: "
@@ -108,6 +113,39 @@ def __init__(
                                              device_config, lora_config,
                                              vision_language_config)

+        # If usage stat is enabled, collect relevant info.
+        if is_usage_stats_enabled():
+            usage_message.report_usage(
+                get_architecture_class_name(model_config),
+                usage_context,
+                extra_kvs={
+                    # Common configuration
+                    "dtype":
+                    str(model_config.dtype),
+                    "tensor_parallel_size":
+                    parallel_config.tensor_parallel_size,
+                    "block_size":
+                    cache_config.block_size,
+                    "gpu_memory_utilization":
+                    cache_config.gpu_memory_utilization,
+
+                    # Quantization
+                    "quantization":
+                    model_config.quantization,
+                    "kv_cache_dtype":
+                    cache_config.cache_dtype,
+
+                    # Feature flags
+                    "enable_lora":
+                    bool(lora_config),
+                    "enable_prefix_caching":
+                    cache_config.enable_prefix_caching,
+                    "enforce_eager":
+                    model_config.enforce_eager,
+                    "disable_custom_all_reduce":
+                    parallel_config.disable_custom_all_reduce,
+                })

         # Ping the tokenizer to ensure liveness if it runs in a
         # different process.
         self.tokenizer.ping()
@@ -125,7 +163,11 @@ def __init__(
             self.stat_logger.info("cache_config", self.cache_config)

     @classmethod
-    def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+    ) -> "LLMEngine":
         """Creates an LLM engine from the engine arguments."""
         # Create the engine configs.
         engine_configs = engine_args.create_engine_configs()
@@ -147,9 +189,12 @@ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
             executor_class = GPUExecutor

         # Create the LLM engine.
-        engine = cls(*engine_configs,
-                     executor_class=executor_class,
-                     log_stats=not engine_args.disable_log_stats)
+        engine = cls(
+            *engine_configs,
+            executor_class=executor_class,
+            log_stats=not engine_args.disable_log_stats,
+            usage_context=usage_context,
+        )
         return engine

     def __reduce__(self):
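
With both `from_engine_args` overloads above accepting a `usage_context`, each entry point can label the engine it builds. A minimal sketch of a direct caller (the model name is arbitrary; `ENGINE_CONTEXT` is the default when no entry point overrides it):

```python
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.usage.usage_lib import UsageContext

# usage_context only labels which entry point created the engine;
# it does not change engine behavior.
engine = LLMEngine.from_engine_args(
    EngineArgs(model="facebook/opt-125m"),
    usage_context=UsageContext.ENGINE_CONTEXT,
)
```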
5 changes: 3 additions & 2 deletions vllm/entrypoints/api_server.py
@@ -18,6 +18,7 @@
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
+from vllm.usage.usage_lib import UsageContext
 from vllm.utils import random_uuid

 TIMEOUT_KEEP_ALIVE = 5  # seconds.
@@ -100,9 +101,9 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
         help="FastAPI root_path when app is behind a path based routing proxy")
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
-
     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args)
+    engine = AsyncLLMEngine.from_engine_args(
+        engine_args, usage_context=UsageContext.API_SERVER)

     app.root_path = args.root_path
     uvicorn.run(app,
4 changes: 3 additions & 1 deletion vllm/entrypoints/llm.py
@@ -10,6 +10,7 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import MultiModalData
+from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter


@@ -108,7 +109,8 @@ def __init__(
             disable_custom_all_reduce=disable_custom_all_reduce,
             **kwargs,
         )
-        self.llm_engine = LLMEngine.from_engine_args(engine_args)
+        self.llm_engine = LLMEngine.from_engine_args(
+            engine_args, usage_context=UsageContext.LLM_CLASS)
         self.request_counter = Counter()

     def get_tokenizer(
Expand Down
5 changes: 3 additions & 2 deletions vllm/entrypoints/openai/api_server.py
@@ -22,6 +22,7 @@
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.logger import init_logger
+from vllm.usage.usage_lib import UsageContext

 TIMEOUT_KEEP_ALIVE = 5  # seconds

@@ -151,9 +152,9 @@ async def authentication(request: Request, call_next):
         served_model = args.served_model_name
     else:
         served_model = args.model
-
     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args)
+    engine = AsyncLLMEngine.from_engine_args(
+        engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
     openai_serving_chat = OpenAIServingChat(engine, served_model,
                                             args.response_role,
                                             args.lora_modules,
13 changes: 9 additions & 4 deletions vllm/model_executor/model_loader.py
@@ -1,6 +1,6 @@
"""Utilities for selecting and loading models."""
import contextlib
from typing import Type
from typing import Tuple, Type

import torch
import torch.nn as nn
@@ -25,7 +25,8 @@ def _set_default_torch_dtype(dtype: torch.dtype):
     torch.set_default_dtype(old_dtype)


-def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]:
+def _get_model_architecture(
+        model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
     architectures = getattr(model_config.hf_config, "architectures", [])
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
@@ -36,17 +37,21 @@ def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]:
     for arch in architectures:
         model_cls = ModelRegistry.load_model_cls(arch)
         if model_cls is not None:
-            return model_cls
+            return (model_cls, arch)
     raise ValueError(
         f"Model architectures {architectures} are not supported for now. "
         f"Supported architectures: {ModelRegistry.get_supported_archs()}")


+def get_architecture_class_name(model_config: ModelConfig) -> str:
+    return _get_model_architecture(model_config)[1]
+
+
 def get_model(model_config: ModelConfig, device_config: DeviceConfig,
               **kwargs) -> nn.Module:
     lora_config = kwargs.get("lora_config", None)
     vision_language_config = kwargs.get("vision_language_config", None)
-    model_class = _get_model_architecture(model_config)
+    model_class = _get_model_architecture(model_config)[0]

     # Get the (maybe quantized) linear method.
     linear_method = None
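
The tuple return lets the usage reporter obtain the architecture name without constructing the model. A quick sketch of calling it (assuming, as in this version of vLLM, that `create_engine_configs()` returns the `ModelConfig` first):

```python
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader import get_architecture_class_name

# Build the configs the way the engine does and pull out the ModelConfig.
model_config = EngineArgs(model="facebook/opt-125m").create_engine_configs()[0]

# For an OPT checkpoint this prints "OPTForCausalLM".
print(get_architecture_class_name(model_config))
```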
Empty file added vllm/usage/__init__.py