diff --git a/docs/source/index.rst b/docs/source/index.rst
index 961373eb71c0b..d20e46b4a3656 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -86,6 +86,7 @@ Documentation
    serving/usage_stats
    serving/integrations
    serving/tensorizer
+   serving/compatibility_matrix
    serving/faq
 
 .. toctree::
diff --git a/docs/source/models/performance.rst b/docs/source/models/performance.rst
index d8750ddc34e8e..23b5ab79a7378 100644
--- a/docs/source/models/performance.rst
+++ b/docs/source/models/performance.rst
@@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo
 
 You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False.
 
+.. _chunked-prefill:
+
 Chunked Prefill
 ---------------
 vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests.
diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst
new file mode 100644
index 0000000000000..cac0605ca132b
--- /dev/null
+++ b/docs/source/serving/compatibility_matrix.rst
@@ -0,0 +1,427 @@
+.. _compatibility_matrix:
+
+Compatibility Matrix
+====================
+
+The tables below show which features are mutually exclusive and which features are supported on which hardware.
+
+.. note::
+
+   Check the ✗ entries with links to see the tracking issues for unsupported feature/hardware combinations.
+
+Feature x Feature
+-----------------
+
+.. list-table::
+   :header-rows: 1
+   :widths: auto
+
+   * - Feature
+     - :ref:`CP <chunked-prefill>`
+     - :ref:`APC <apc>`
+     - :ref:`LoRA <lora>`
+     - :abbr:`prmpt adptr (Prompt Adapter)`
+     - :ref:`SD <spec_decode>`
+     - CUDA graph
+     - :abbr:`enc-dec (Encoder-Decoder Models)`
+     - :abbr:`logP (Logprobs)`
+     - :abbr:`prmpt logP (Prompt Logprobs)`
+     - :abbr:`async output (Async Output Processing)`
+     - multi-step
+     - :abbr:`MM (Multimodal)`
+     - best-of
+     - beam-search
+     - :abbr:`guided dec (Guided Decoding)`
+   * - :ref:`CP <chunked-prefill>`
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - :ref:`APC <apc>`
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - :ref:`LoRA <lora>`
+     - `✗ `__
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - :abbr:`prmpt adptr (Prompt Adapter)`
+     - ✅
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - :ref:`SD <spec_decode>`
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - CUDA graph
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - :abbr:`enc-dec (Encoder-Decoder Models)`
+     - ✗
+     - `✗ `__
+     - ✗
+     - ✗
+     - `✗ `__
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - :abbr:`logP (Logprobs)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - :abbr:`prmpt logP (Prompt Logprobs)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ `__
+     - ✅
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - :abbr:`async output (Async Output Processing)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+   * - multi-step
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - `✗ `__
+     - ✅
+     -
+     -
+     -
+     -
+     -
+   * - :abbr:`MM (Multimodal)`
+     - `✗ `__
+     - `✗ `__
+     - `✗ `__
+     - ?
+     - ?
+     - ✅
+     - ✗
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     -
+     -
+     -
+     -
+   * - best-of
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ `__
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     - `✗ `__
+     - ✅
+     -
+     -
+     -
+   * - beam-search
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ `__
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     - `✗ `__
+     - ?
+     - ✅
+     -
+     -
+   * - :abbr:`guided dec (Guided Decoding)`
+     - ✅
+     - ✅
+     - ?
+     - ?
+     - ✅
+     - ✅
+     - ?
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ?
+     - ✅
+     - ✅
+     -
+
+
+Feature x Hardware
+------------------
+
+.. list-table::
+   :header-rows: 1
+   :widths: auto
+
+   * - Feature
+     - Volta
+     - Turing
+     - Ampere
+     - Ada
+     - Hopper
+     - CPU
+     - AMD
+   * - :ref:`CP <chunked-prefill>`
+     - `✗ `__
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+   * - :ref:`APC <apc>`
+     - `✗ `__
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+   * - :ref:`LoRA <lora>`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ `__
+     - ✅
+   * - :abbr:`prmpt adptr (Prompt Adapter)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ `__
+     - ✅
+   * - :ref:`SD <spec_decode>`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - CUDA graph
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+   * - :abbr:`enc-dec (Encoder-Decoder Models)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ `__
+     - ✗
+   * - :abbr:`logP (Logprobs)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - :abbr:`prmpt logP (Prompt Logprobs)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - :abbr:`async output (Async Output Processing)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✗
+   * - multi-step
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ `__
+     - ✅
+   * - :abbr:`MM (Multimodal)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - best-of
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - beam-search
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - :abbr:`guided dec (Guided Decoding)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 7456aab8b8d2a..03fb9193f892d 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -420,6 +420,8 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # if the feature combo becomes valid
         if attn_type != AttentionType.DECODER:
             raise NotImplementedError("Encoder self-attention and "
                                       "encoder/decoder cross-attention "
diff --git a/vllm/config.py b/vllm/config.py
index 7b3996dc90b94..869e9fb600ade 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -348,6 +348,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # if the feature combo becomes valid
         if device_config.device_type not in ("cuda", "tpu"):
             logger.warning(
                 "Async output processing is only supported for CUDA or TPU. "
" @@ -361,6 +363,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if device_config.device_type == "cuda" and self.enforce_eager: logger.warning( "To see benefits of async output processing, enable CUDA " @@ -374,6 +378,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.embedding_mode: self.use_async_output_proc = False + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" " speculative decoding currently.") @@ -1182,6 +1188,8 @@ def maybe_create_spec_config( "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " @@ -1543,6 +1551,8 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cae95d20ca23d..3ea94250a87a3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1001,6 +1001,8 @@ def create_engine_config(self) -> EngineConfig: disable_logprobs=self.disable_logprobs_during_spec_decoding, ) + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: raise ValueError("Speculative decoding is not supported with " diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 47de3656ca892..74ddb250ccd9e 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -62,6 +62,8 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache() def _log_prompt_logprob_unsupported_warning_once(): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. 
" "(e.g., speculative decode uses multi step workers).") diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 9ad240ef60820..e32993e0e452e 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -28,6 +28,8 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" # @@ -324,6 +326,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: if config.dtype == torch.float16: logger.warning("float16 is not supported on CPU, casting to bfloat16.") config.dtype = torch.bfloat16 + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if not config.enforce_eager: logger.warning( "CUDA graph is not supported on CPU, fallback to the eager " @@ -334,6 +338,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: def _verify_and_get_scheduler_config( config: SchedulerConfig) -> SchedulerConfig: + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if config.chunked_prefill_enabled: logger.warning("Chunked prefill is not supported on CPU, disable it.") config.chunked_prefill_enabled = False @@ -342,6 +348,8 @@ def _verify_and_get_scheduler_config( def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if config.enable_prefix_caching: logger.warning("Prefix caching is not supported on CPU, disable it.") config.enable_prefix_caching = False diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index d4474a10f542d..100744e0f157c 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -298,6 +298,8 @@ def _build_enc_dec_llm_inputs( encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if decoder_mm_data is not None: raise ValueError( "Multi-modality decoder inputs of encoder-decoder models are " diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a67715290a515..13d39773944fb 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -87,6 +87,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker +# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/utils.py b/vllm/utils.py index 9c6f1a347fb83..00ef7a9e580ad 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -41,6 +41,9 @@ # Exception strings for non-implemented encoder/decoder scenarios +# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# If the feature combo become valid + STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ "is not currently supported." 
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 12aa473525c13..0cd0047bebf2d 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -816,6 +816,9 @@ def _pythonize_sampler_output(
 
     for sgdx, (seq_group,
                sample_result) in enumerate(zip(seq_groups, samples_list)):
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # if the feature combo becomes valid
+        # (Check for Guided Decoding)
         if seq_group.sampling_params.logits_processors:
             assert len(seq_group.sampling_params.logits_processors) == 0, (
                 "Logits Processors are not supported in multi-step decoding")
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index a07395dfc61d8..f43635464ef00 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -13,6 +13,9 @@ def assert_enc_dec_mr_supported_scenario(
     a supported scenario.
     '''
 
+    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+    # if the feature combo becomes valid
+
     if enc_dec_mr.cache_config.enable_prefix_caching:
         raise NotImplementedError(
             STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE'])
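
A minimal sketch of how the guards behind these reminder comments surface to users, assuming the usual `enable_lora` and `enable_chunked_prefill` engine arguments and a placeholder LoRA-capable model; the error message is the one raised by LoRAConfig.verify_with_scheduler_config in the vllm/config.py hunk above.

    from vllm import LLM

    # Combinations marked ✗ in the compatibility matrix fail fast while the
    # engine configuration is being verified, before any model weights load.
    try:
        llm = LLM(
            model="meta-llama/Llama-2-7b-hf",  # placeholder model
            enable_lora=True,
            enable_chunked_prefill=True,
        )
    except ValueError as err:
        print(err)  # "LoRA is not supported with chunked prefill yet."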