From 83ea5c72b9a287b65c9f7b95fbd868b3f613e6f5 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 10 Oct 2024 21:18:58 +0400 Subject: [PATCH 01/31] [OpenVINO] Use torch 2.4.0 and newer optimim version (#9121) Co-authored-by: DarkLight1337 --- requirements-openvino.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements-openvino.txt b/requirements-openvino.txt index 800d59e2b9483..ac54cf0c3288f 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,8 +1,8 @@ # Common dependencies -r requirements-common.txt -# OpenVINO dependencies -torch >= 2.1.2 -openvino ~= 2024.4.0 -openvino-tokenizers[transformers] ~= 2024.4.0 -optimum-intel[openvino] >= 1.19.0 +torch == 2.4.0 # should be aligned with "common" vLLM torch version +openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention + +optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version +optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version From 18511aeda64b473314bb7727a97a220565e0af41 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 10 Oct 2024 13:39:56 -0400 Subject: [PATCH 02/31] [Bugfix] Fix Machete unittests failing with `NotImplementedError` (#9218) --- csrc/quantization/machete/machete_pytorch.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index a27f1e7c83df9..ff037756f55ab 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -89,6 +89,10 @@ torch::Tensor prepack_B(torch::Tensor const& B, TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("machete_prepack_B", &prepack_B); m.impl("machete_gemm", &gemm); +} + +// use CatchAll since supported_schedules has no tensor arguments +TORCH_LIBRARY_IMPL(TORCH_EXTENSION_NAME, CatchAll, m) { m.impl("machete_supported_schedules", &supported_schedules); } From 055f3270d40bbc492630d0f2c96ec8b64823ba34 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 10 Oct 2024 13:48:51 -0400 Subject: [PATCH 03/31] [Doc] Improve debugging documentation (#9204) Signed-off-by: Rafael Vasquez --- docs/source/getting_started/debugging.rst | 89 ++++++++++++++--------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 81287762d3c0a..cfd2dcb3bd5d3 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -1,32 +1,53 @@ .. _debugging: +=============== Debugging Tips =============== -Debugging hang/crash issues ---------------------------- +This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. + +.. note:: + + Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. -When an vLLM instance hangs or crashes, it is very difficult to debug the issue. 
But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time: +Hangs downloading a model +---------------------------------------- +If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. +It's recommended to download the model first using the `huggingface-cli `_ and passing the local path to the model to vLLM. This way, you can isolate the issue. -- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli `_ and then use the local path to the model. This way, you can isolate the issue. -- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory. -- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +Hangs loading a model from disk +---------------------------------------- +If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. +It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. -If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue: +Model is too large +---------------------------------------- +If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. 
-- Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. -- Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble. -- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. -- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. +Enable more logging +---------------------------------------- +If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: -With more logging, hopefully you can find the root cause of the issue. +- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. +- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem. +- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. +- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs. -If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. +Incorrect network setup +---------------------------------------- +The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one. +If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=``. -Here are some common issues that can cause hangs: +You might also need to set ``export NCCL_SOCKET_IFNAME=`` and ``export GLOO_SOCKET_IFNAME=`` to specify the network interface for the IP address. -- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address. -- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly. +Error near ``self.graph.replay()`` +---------------------------------------- +If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph. 
+To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. + +Incorrect hardware/driver +---------------------------------------- +If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. .. code-block:: python @@ -84,33 +105,29 @@ Here are some common issues that can cause hangs: dist.destroy_process_group(gloo_group) dist.destroy_process_group() -.. tip:: +If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: - Save the script as ``test.py``. - - If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use. - - If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``: - - - is the correct IP address of the master node - - is reachable from all nodes - - is set before running the script. +.. code-block:: shell - If the script runs successfully, you should see the message ``sanity check is successful!``. + NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py - Note that multi-node environment is more complicated than single-node. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: +If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: - - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. - - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. +.. code-block:: shell + + NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py - Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup. The difference is that you need to execute different commands (with different ``--node-rank``) on different nodes. +If the script runs successfully, you should see the message ``sanity check is successful!``. -If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. +.. note:: -Some known issues: + A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: -- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_ . 
+ - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. + - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. -.. warning:: + Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes. - After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. +Known Issues +---------------------------------------- +- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. From 21efb603f5f88a0d78ad11e4fbc6e18fe83916d4 Mon Sep 17 00:00:00 2001 From: jordanyono <40174853+jyono@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:18:18 -0400 Subject: [PATCH 04/31] [CI/Build] Make the `Dockerfile.cpu` file's `PIP_EXTRA_INDEX_URL` Configurable as a Build Argument (#9252) --- Dockerfile.cpu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 1803b38629002..b9134d4ae41cb 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -26,7 +26,8 @@ RUN pip install intel_extension_for_pytorch==2.4.0 WORKDIR /workspace -ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ pip install --upgrade pip && \ From 78c0b4166cb097de749993970b51cb7b8becba58 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 10 Oct 2024 12:29:24 -0700 Subject: [PATCH 05/31] Suggest codeowners for the core componenets (#9210) --- .github/CODEOWNERS | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e15f129719f8f..cd721971d01d6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,19 +1,30 @@ # See https://help.github.com/articles/about-codeowners/ # for more info about CODEOWNERS file +# This lists cover the "core" components of vLLM that require careful review +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +CMakeLists.txt @tlrmchlsmth @WoosukKwon + +# Test ownership /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/entrypoints 
@DarkLight1337 @robertgshaw2-neuralmagic @simon-mo +/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 -/tests/prefix_caching @comaniac @KuntaiDu +/tests/prefix_caching @comaniac @KuntaiDu /tests/spec_decode @njhill @LiuXiaoxuanPKU -/tests/kernels @tlrmchlsmth @WoosukKwon +/tests/kernels @tlrmchlsmth @WoosukKwon /tests/quantization @mgoin @robertgshaw2-neuralmagic -/.buildkite/lm-eval-harness @mgoin @simon-mo +/.buildkite/lm-eval-harness @mgoin @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/multi_step @alexm-neuralmagic @SolitaryThinker @comaniac +/tests/multi_step @alexm-neuralmagic @comaniac /tests/weight_loading @mgoin @youkaichao /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac From e4d652ea3ed9b2a60c1582cb2e2605695e61280f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 10 Oct 2024 12:39:36 -0700 Subject: [PATCH 06/31] [torch.compile] integration with compilation control (#9058) --- .buildkite/test-pipeline.yaml | 20 ++-- tests/compile/test_basic_correctness.py | 48 +++++++++ tests/compile/test_full_graph.py | 15 ++- tests/compile/test_full_graph_multi_gpu.py | 22 ---- tests/compile/test_full_graph_smoke.py | 13 --- tests/compile/utils.py | 24 ++--- tests/tpu/test_compilation.py | 4 +- tests/tpu/test_custom_dispatcher.py | 13 ++- vllm/compilation/backends.py | 115 ++++++++++++++++++++- vllm/compilation/compile_context.py | 23 +++++ vllm/compilation/decorators.py | 85 +++++++++++++++ vllm/compilation/levels.py | 9 ++ vllm/compilation/wrapper.py | 27 ++++- vllm/envs.py | 16 +-- vllm/model_executor/custom_op.py | 3 +- vllm/model_executor/models/gemma2.py | 2 + vllm/model_executor/models/llama.py | 2 + vllm/model_executor/models/llava.py | 8 +- vllm/platforms/tpu.py | 14 +++ vllm/plugins/__init__.py | 14 ++- vllm/sequence.py | 7 +- vllm/worker/model_runner.py | 18 +++- 22 files changed, 404 insertions(+), 98 deletions(-) create mode 100644 tests/compile/test_basic_correctness.py delete mode 100644 tests/compile/test_full_graph_multi_gpu.py delete mode 100644 tests/compile/test_full_graph_smoke.py create mode 100644 vllm/compilation/compile_context.py create mode 100644 vllm/compilation/decorators.py create mode 100644 vllm/compilation/levels.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ccc5003e66beb..ae8e03a2fdf8f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -121,7 +121,9 @@ steps: - vllm/core/ - tests/distributed - tests/spec_decode/e2e/test_integration_dist_tp4 + - tests/compile commands: + - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py @@ -231,14 +233,16 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph_smoke.py + - pytest -v -s compile/test_basic_correctness.py -- label: "PyTorch Fullgraph Test" # 18min - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/test_full_graph.py +# TODO: re-write in comparison tests, and fix symbolic shape +# for quantization ops. 
+# - label: "PyTorch Fullgraph Test" # 18min +# source_file_dependencies: +# - vllm/ +# - tests/compile +# commands: +# - pytest -v -s compile/test_full_graph.py - label: Kernels Test %N # 1h each mirror_hardwares: [amd] @@ -394,7 +398,7 @@ steps: - tests/distributed/ - vllm/compilation commands: - - pytest -v -s ./compile/test_full_graph_multi_gpu.py + - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py new file mode 100644 index 0000000000000..b6ec7413978f4 --- /dev/null +++ b/tests/compile/test_basic_correctness.py @@ -0,0 +1,48 @@ +from typing import Dict, List, Optional + +import pytest + +from vllm.compilation.levels import CompilationLevel +from vllm.utils import cuda_device_count_stateless + +from ..utils import compare_all_settings + + +# we cannot afford testing the full Catesian product +# of all models and all levels +@pytest.mark.parametrize( + "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", + [ + ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate", + True), + ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", + ["--quantization", "compressed-tensors" + ], 1, 1, "FLASH_ATTN", "generate", True), + ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True), + # TODO: add multi-modality test for llava + ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) + ]) +def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend, + method, fullgraph): + # this test is run under multiple suits, with different GPUs. + # make sure we only run the test with correct CUDA devices. + # don't use "<", as it will duplicate the tests. + if cuda_device_count_stateless() != pp_size * tp_size: + pytest.skip("Not correct CUDA devices for the test.") + import os + os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend + if not fullgraph: + os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" + all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"] + + ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3 + # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case + # inductor will change the output, so we cannot compare them. 
+ all_envs: List[Optional[Dict[str, str]]] = [{ + "VLLM_TORCH_COMPILE_LEVEL": + str(level) + } for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, + ]] + compare_all_settings(model, all_args, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 5dd65ad7236f9..f28f9145bb442 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,13 +1,20 @@ import pytest -from vllm.compilation.backends import vllm_backend +from vllm.compilation.levels import CompilationLevel +from ..utils import fork_new_process_for_each_test from .utils import TEST_MODELS, check_full_graph_support @pytest.mark.parametrize("model_info", TEST_MODELS) -@pytest.mark.parametrize("backend", ["eager", vllm_backend]) -def test_full_graph(model_info, backend): +@pytest.mark.parametrize( + "optimization_level", + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR]) +@fork_new_process_for_each_test +def test_full_graph(model_info, optimization_level): model = model_info[0] model_kwargs = model_info[1] - check_full_graph_support(model, model_kwargs, backend, tp_size=1) + check_full_graph_support(model, + model_kwargs, + optimization_level, + tp_size=1) diff --git a/tests/compile/test_full_graph_multi_gpu.py b/tests/compile/test_full_graph_multi_gpu.py deleted file mode 100644 index e9883d5254e72..0000000000000 --- a/tests/compile/test_full_graph_multi_gpu.py +++ /dev/null @@ -1,22 +0,0 @@ -import pytest - -from vllm.compilation.backends import vllm_backend -from vllm.utils import cuda_device_count_stateless - -from ..utils import fork_new_process_for_each_test -from .utils import TEST_MODELS_SMOKE, check_full_graph_support - - -@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE) -@pytest.mark.parametrize("tp_size", [2]) -@pytest.mark.parametrize("backend", ["eager", vllm_backend]) -@fork_new_process_for_each_test -def test_full_graph_multi_gpu(model_info, tp_size, backend): - model = model_info[0] - model_kwargs = model_info[1] - - # Skip the test if there are not enough CUDA devices. 
- if cuda_device_count_stateless() < tp_size: - pytest.skip("Not enough CUDA devices for the test.") - - check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size) diff --git a/tests/compile/test_full_graph_smoke.py b/tests/compile/test_full_graph_smoke.py deleted file mode 100644 index 0c5a95b4ead4c..0000000000000 --- a/tests/compile/test_full_graph_smoke.py +++ /dev/null @@ -1,13 +0,0 @@ -import pytest - -from vllm.compilation.backends import vllm_backend - -from .utils import TEST_MODELS_SMOKE, check_full_graph_support - - -@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE) -@pytest.mark.parametrize("backend", ["eager", vllm_backend]) -def test_full_graph(model_info, backend): - model = model_info[0] - model_kwargs = model_info[1] - check_full_graph_support(model, model_kwargs, backend, tp_size=1) diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 2d06a0946d911..5386eb0e3795d 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -4,16 +4,9 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.plugins import set_torch_compile_backend +from vllm.compilation.levels import CompilationLevel from vllm.utils import is_hip -TEST_MODELS_SMOKE = [ - ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { - "quantization": "compressed-tensors" - }), - ("meta-llama/Meta-Llama-3-8B", {}), -] - TEST_MODELS = [ ("facebook/opt-125m", {}), ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { @@ -68,20 +61,21 @@ })) -def check_full_graph_support(model, model_kwargs, backend, tp_size=1): +def check_full_graph_support(model, + model_kwargs, + optimization_level, + tp_size=1): # make sure these models can be captured in full graph mode - if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ: - os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1" - os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level) + os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" # Inductor doesn't support fp8/gptq_marlin_24 yet. 
quantization = model_kwargs.get("quantization") if (quantization == "fp8" or quantization == "gptq_marlin" - or quantization == "gptq_marlin_24") and backend != "eager": + or quantization == "gptq_marlin_24" + ) and optimization_level >= CompilationLevel.INDUCTOR: return - set_torch_compile_backend(backend) - prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index d8df86b2aaa14..86d9af88e49ea 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -5,9 +5,11 @@ import depyf +from vllm.compilation.levels import CompilationLevel + # disable custom dispatcher, let Dynamo takes over # all the control -os.environ['VLLM_DYNAMO_USE_CUSTOM_DISPATCHER'] = "0" +os.environ['VLLM_TORCH_COMPILE_LEVEL'] = str(CompilationLevel.DYNAMO_AS_IS) temp_dir = tempfile.mkdtemp() with depyf.prepare_debug(temp_dir): diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 69ab67abdd12b..923d0f1680802 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,5 +1,7 @@ import os +from vllm.compilation.levels import CompilationLevel + from ..utils import compare_two_settings # --enforce-eager on TPU causes graph compilation @@ -9,8 +11,9 @@ def test_custom_dispatcher(): - compare_two_settings("google/gemma-2b", - arg1=["--enforce-eager"], - arg2=["--enforce-eager"], - env1={"VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": "0"}, - env2={}) + compare_two_settings( + "google/gemma-2b", + arg1=["--enforce-eager"], + arg2=["--enforce-eager"], + env1={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_ONCE)}, + env2={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_AS_IS)}) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index de0b1d8a75757..4780358cea517 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,8 +1,17 @@ +import copy import operator +from typing import Callable, Dict, List, Optional, Tuple, Union import torch import torch.fx as fx +from vllm.logger import init_logger + +from .compile_context import get_compile_context +from .levels import CompilationLevel + +logger = init_logger(__name__) + def fix_functionalization(graph: fx.Graph): """ @@ -148,9 +157,113 @@ def fix_functionalization(graph: fx.Graph): # print(graph.python_code(root_module="self", verbose=True).src, file=f) -def vllm_backend(graph, example_inputs): +def wrap_inductor(graph, example_inputs, additional_inductor_config): from torch._inductor import config current_config = config.shallow_copy_dict() from torch._inductor.compile_fx import compile_fx + + if additional_inductor_config is not None: + current_config.update(additional_inductor_config) + if current_config['post_grad_custom_post_pass'] is not None: + logger.warning( + "post_grad_custom_post_pass is already set in the config. 
" + "Overwriting it with the fix_functionalization") current_config['post_grad_custom_post_pass'] = fix_functionalization return compile_fx(graph, example_inputs, config_patches=current_config) + + +def vllm_backend( + graph, + example_inputs, + additional_inductor_config: Optional[Dict] = None) -> Callable: + + context = get_compile_context() + context = copy.deepcopy(context) if context is not None else [] + sizes_to_specialize: List[int] = context + + # flags for all the seen shapes, whether we need to specialize + runtime_shapes_to_compile_flags: Dict[Tuple[int, ...], bool] = {} + + # if we need to specialize, the compiled graph for that shape + runtime_shapes_to_compiled_graph: Dict[Tuple[int, ...], Callable] = {} + + # this is the first compilation, we will compile a graph with + # dynamic shape, as the caller will mark first dimension as dynamic + logger.info("Compiling a graph for general shapes") + graph_for_symbolic_shape = wrap_inductor(graph, example_inputs, + additional_inductor_config) + + # TODO: Dynamo does not pass all dynamic shapes. + # Need to investigate why. It works now because all the dynamic + # shapes have the same value, and either of them can be used. + sym_shape_indices = [ + i for i, x in enumerate(example_inputs) if isinstance(x, torch.SymInt) + ] + + first_run = True + + # this is the function we return to Dynamo to run finally + def compiled_graph_wrapper(*args): + + runtime_shapes: Tuple[int, + ...] = tuple(args[i] for i in sym_shape_indices) + + nonlocal first_run + nonlocal runtime_shapes_to_compile_flags + nonlocal runtime_shapes_to_compiled_graph + + if first_run: + # the first compilation is for profiling, we directly run it + first_run = False + return graph_for_symbolic_shape(*args) + + if runtime_shapes not in runtime_shapes_to_compile_flags: + # we haven't seen this shape before + # query if we need to specialize for this shape + # we only specialize for the first dimension. 
+ # TODO: investigate if any model needs to specialize + # beyond the first dimension + runtime_shapes_to_compile_flags[runtime_shapes] = runtime_shapes[ + 0] in sizes_to_specialize + + if not runtime_shapes_to_compile_flags[runtime_shapes]: + # we don't need to specialize for this shape + return graph_for_symbolic_shape(*args) + + if runtime_shapes not in runtime_shapes_to_compiled_graph: + # we need to specialize for this shape, and we haven't compiled + # compile the graph for this shape + logger.info("Compiling a graph for shapes %s", runtime_shapes) + runtime_shapes_to_compiled_graph[runtime_shapes] = wrap_inductor( + graph, args, additional_inductor_config) + + return runtime_shapes_to_compiled_graph[runtime_shapes](*args) + + return compiled_graph_wrapper + + +def select_default_backend(level: int) -> Union[str, Callable]: + if level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]: + backend = "eager" + return backend + assert level in [ + CompilationLevel.INDUCTOR, CompilationLevel.INDUCTOR_MAX_AUTOTUNE + ], f"Invalid level {level}" + + from vllm.compilation.backends import vllm_backend + from vllm.plugins import get_inductor_additional_configs + additional_configs = get_inductor_additional_configs() + + if level == CompilationLevel.INDUCTOR_MAX_AUTOTUNE: + if "max_autotune" in additional_configs and not additional_configs[ + "max_autotune"]: + logger.warning( + "max_autotune is disabled, but is overridden by level %s", + CompilationLevel.INDUCTOR_MAX_AUTOTUNE) + additional_configs['max_autotune'] = True + + from functools import partial + backend = partial(vllm_backend, + additional_inductor_config=additional_configs) + + return backend diff --git a/vllm/compilation/compile_context.py b/vllm/compilation/compile_context.py new file mode 100644 index 0000000000000..29db3d4c637b9 --- /dev/null +++ b/vllm/compilation/compile_context.py @@ -0,0 +1,23 @@ +from contextlib import contextmanager +from typing import Any + +_compile_context: Any = None + + +def get_compile_context() -> Any: + """Get the current compile context.""" + return _compile_context + + +@contextmanager +def set_compile_context(context: Any): + """A context manager that stores the current compile context, + usually it is a list of sizes to specialize. + """ + global _compile_context + prev_context = _compile_context + _compile_context = context + try: + yield + finally: + _compile_context = prev_context diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py new file mode 100644 index 0000000000000..b790e5550adb7 --- /dev/null +++ b/vllm/compilation/decorators.py @@ -0,0 +1,85 @@ +from typing import List, Optional, Union + +import torch + +import vllm.envs as envs +from vllm.attention import AttentionMetadata +from vllm.compilation.levels import CompilationLevel +from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.sequence import IntermediateTensors +from vllm.utils import supports_dynamo + + +def support_compile_llama_style(cls: type): + """ + A decorator to add support for compiling the forward method of a class. + If a module's **forward signature** is compatible with llama, this + decorator can be used to enable the compilation of the forward method. + """ + + # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # will handle the compilation, so we don't need to do anything here. 
+ if envs.VLLM_TORCH_COMPILE_LEVEL in [ + CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS + ] or not supports_dynamo(): + return cls + + # take care of method resolution order + # make sure super().__init__ is called on the base class + # other than TorchCompileWrapperWithCustomDispatcher + cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, ) + + old_init = cls.__init__ + + def __init__(self, *args, **kwargs): + old_init(self, *args, **kwargs) + TorchCompileWrapperWithCustomDispatcher.__init__(self) + + cls.__init__ = __init__ + + def __call__( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + # torch.compiler.is_compiling() means we are inside the compilation + # e.g. TPU has the compilation logic in model runner, so we don't + # need to compile the model inside. + if torch.compiler.is_compiling(): + return self.forward(input_ids, positions, kv_caches, attn_metadata, + intermediate_tensors, inputs_embeds) + + # the first compilation needs to have dynamic shapes marked + if len(self.compiled_codes) < 1: + if input_ids is not None: + torch._dynamo.mark_dynamic(input_ids, 0) + torch._dynamo.mark_dynamic(positions, 0) + if inputs_embeds is not None: + torch._dynamo.mark_dynamic(inputs_embeds, 0) + if intermediate_tensors is not None: + for tensors in intermediate_tensors.tensors.values(): + torch._dynamo.mark_dynamic(tensors, 0) + + # if we don't use custom dispatcher, we can directly call the + # compiled function and let torch.compile handle the dispatching, + # with the overhead of guard evaluation and recompilation. + if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher: + return self.compiled_callable(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + + # usually, capturing the model once is enough, and then we can + # dispatch to the compiled code directly, without going through + # the Dynamo guard mechanism. + with self.dispatch_to_code(0): + model_output = self.forward(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return model_output + + cls.__call__ = __call__ + return cls diff --git a/vllm/compilation/levels.py b/vllm/compilation/levels.py new file mode 100644 index 0000000000000..162bf5ae64997 --- /dev/null +++ b/vllm/compilation/levels.py @@ -0,0 +1,9 @@ +# constants for the levels of the compilation process + + +class CompilationLevel: + NO_COMPILATION = 0 + DYNAMO_AS_IS = 1 + DYNAMO_ONCE = 2 + INDUCTOR = 3 + INDUCTOR_MAX_AUTOTUNE = 4 diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index e923bd36ccc08..1594b64a61b94 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -3,12 +3,14 @@ from abc import abstractmethod from contextlib import contextmanager from types import CodeType -from typing import Callable, List +from typing import Callable, List, Optional import torch import vllm.envs as envs +from .levels import CompilationLevel + class TorchCompileWrapperWithCustomDispatcher: """ @@ -23,7 +25,26 @@ class TorchCompileWrapperWithCustomDispatcher: `torch.compile` over the forward method. 
""" - def __init__(self, compiled_callable: Callable): + def __init__(self, compiled_callable: Optional[Callable] = None): + + if compiled_callable is None: + # default compilation settings + # compiling the forward method + + # choose the compile backend + + # if the user has set the backend, use it + from vllm.plugins import get_torch_compile_backend + backend = get_torch_compile_backend() + if backend is None: + from vllm.compilation.backends import select_default_backend + backend = select_default_backend(envs.VLLM_TORCH_COMPILE_LEVEL) + + compiled_callable = torch.compile( + self.forward, + fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + backend=backend) + self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ self.compiled_codes: List[CodeType] = [] @@ -33,7 +54,7 @@ def __init__(self, compiled_callable: Callable): # subclasses can use this to switch between the custom dispatcher # and the default Dynamo guard mechanism. self.use_custom_dispatcher: bool = \ - envs.VLLM_DYNAMO_USE_CUSTOM_DISPATCHER + envs.VLLM_TORCH_COMPILE_LEVEL >= CompilationLevel.DYNAMO_ONCE def __call__(self, *args, **kwargs): """Implement the dispatch logic here, beyond the torch.compile level. diff --git a/vllm/envs.py b/vllm/envs.py index 97767bf5b5ad9..8b541e5b78c01 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -65,6 +65,7 @@ VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False + VLLM_TORCH_COMPILE_LEVEL: int = 0 def get_default_cache_root(): @@ -198,23 +199,12 @@ def get_default_config_root(): lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")), - # Internal flag to enable Dynamo graph capture - "VLLM_TEST_DYNAMO_GRAPH_CAPTURE": - lambda: int(os.environ.get("VLLM_TEST_DYNAMO_GRAPH_CAPTURE", "0")), - "VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": - lambda: - (os.environ.get("VLLM_DYNAMO_USE_CUSTOM_DISPATCHER", "True").lower() in - ("true", "1")), - - # Internal flag to control whether we use custom op, - # or use the native pytorch implementation - "VLLM_TEST_COMPILE_NO_CUSTOM_OPS": - lambda: int(os.environ.get("VLLM_TEST_COMPILE_NO_CUSTOM_OPS", "0")), - # Internal flag to enable Dynamo fullgraph capture "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool( os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), + "VLLM_TORCH_COMPILE_LEVEL": + lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")), # local rank of the process in the distributed setting, used to determine # the GPU device id diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 9102b5e19ebec..d0e90245ad010 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,7 @@ import torch.nn as nn import vllm.envs as envs +from vllm.compilation.levels import CompilationLevel from vllm.platforms import current_platform from vllm.utils import is_cpu, is_hip, is_xpu @@ -55,7 +56,7 @@ def dispatch_forward(self): # NOTE(woosuk): Here we assume that vLLM was built for only one # specific backend. Currently, we do not support dynamic dispatching. 
- if envs.VLLM_TEST_COMPILE_NO_CUSTOM_OPS: + if envs.VLLM_TORCH_COMPILE_LEVEL >= CompilationLevel.INDUCTOR: return self.forward_native if is_hip(): diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index c442b6d2e7c96..edc71435b551f 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -21,6 +21,7 @@ from transformers import Gemma2Config from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_compile_llama_style from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -238,6 +239,7 @@ def forward( return hidden_states, residual +@support_compile_llama_style class Gemma2Model(nn.Module): def __init__( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2a79a9edf2111..3f17e9004c30f 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -28,6 +28,7 @@ from transformers import LlamaConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_compile_llama_style from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) @@ -265,6 +266,7 @@ def forward( return hidden_states, residual +@support_compile_llama_style class LlamaModel(nn.Module): def __init__( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index a3acb93dc3c11..864b9ff66a84e 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -365,6 +365,8 @@ def forward( input_ids = None inputs_embeds = None else: + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent image_input = self._parse_and_validate_image_input(**kwargs) if image_input is not None: @@ -375,10 +377,10 @@ def forward( inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, vision_embeddings, self.config.image_token_index) - - input_ids = None else: - inputs_embeds = None + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index a35777f91cac9..8ba973b28263f 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,7 +1,21 @@ +import os + import torch +import vllm.envs as envs +from vllm.compilation.levels import CompilationLevel +from vllm.plugins import set_torch_compile_backend + from .interface import Platform, PlatformEnum +if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE) + +assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR,\ + "TPU does not support Inductor." 
+ +set_torch_compile_backend("openxla") + class TpuPlatform(Platform): _enum = PlatformEnum.TPU diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 7939688ef0da3..211fedbc6e2ec 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,5 +1,5 @@ import logging -from typing import Callable, Optional, Union +from typing import Callable, Dict, Optional, Union import vllm.envs as envs @@ -42,3 +42,15 @@ def set_torch_compile_backend(backend: Union[Callable, str]): def get_torch_compile_backend() -> Optional[Union[Callable, str]]: return _torch_compile_backend + + +_inductor_additional_configs: Dict = {} + + +def set_inductor_additional_configs(configs: Dict): + global _inductor_additional_configs + _inductor_additional_configs = configs + + +def get_inductor_additional_configs() -> Dict: + return _inductor_additional_configs diff --git a/vllm/sequence.py b/vllm/sequence.py index 0c27ffca36cfd..51be9466e66be 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1137,10 +1137,9 @@ def __eq__(self, other: object) -> bool: return self.embeddings == other.embeddings -class IntermediateTensors( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] +# cannot use msgspec.Struct here because Dynamo does not support it +@dataclass +class IntermediateTensors: """For all pipeline stages except the last, we need to return the hidden states and residuals to be sent to the next stage. This data structure contains the hidden states and residuals for a request. diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0bd2958816718..5bc7100732291 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -18,6 +18,8 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.levels import CompilationLevel from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) @@ -1126,10 +1128,10 @@ def load_model(self) -> None: "provided. Defaulting to scaling factors of 1.0. 
" "This may lead to less accurate results!") - if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo(): - from vllm.compilation.backends import vllm_backend + if envs.VLLM_TORCH_COMPILE_LEVEL == CompilationLevel.DYNAMO_AS_IS \ + and supports_dynamo(): from vllm.plugins import get_torch_compile_backend - backend = get_torch_compile_backend() or vllm_backend + backend = get_torch_compile_backend() or "eager" self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, @@ -1289,7 +1291,15 @@ def profile_run(self) -> None: batch_size=batch_size, dtype=self.model_config.dtype, device=self.device) - self.execute_model(model_input, kv_caches, intermediate_tensors) + + graph_batch_size = self.max_batchsize_to_capture + batch_size_capture_list = [ + bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size + ] + if self.model_config.enforce_eager: + batch_size_capture_list = [] + with set_compile_context(batch_size_capture_list): + self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() return From 9cc811c4ff3d5200cc23f16709f540821531b77c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:30:24 -0700 Subject: [PATCH 07/31] Bump actions/github-script from 6 to 7 (#9197) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/add_label_automerge.yml | 2 +- .github/workflows/publish.yml | 2 +- .github/workflows/reminder_comment.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index 761cae8e33fbd..2e7c7f7f087af 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add label - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 4cbe32bdf33bd..30e27551fef3c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -30,7 +30,7 @@ jobs: - name: Create Release id: create_release - uses: "actions/github-script@v6" + uses: "actions/github-script@v7" env: RELEASE_TAG: ${{ env.release_tag }} with: diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 99827756d2066..d1791c3bc865a 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to run full CI on PR - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | github.rest.issues.createComment({ From 270953bafb1ccf444f2018d1c0a88c51472de22e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:30:35 -0700 Subject: [PATCH 08/31] Bump actions/checkout from 3 to 4 (#9196) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/actionlint.yml | 2 +- .github/workflows/clang-format.yml | 2 +- .github/workflows/mypy.yaml | 2 +- .github/workflows/publish.yml | 4 ++-- .github/workflows/ruff.yml | 2 +- .github/workflows/yapf.yml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/actionlint.yml 
b/.github/workflows/actionlint.yml index 38e23651eefef..2a0e3239f58da 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Checkout" - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 with: fetch-depth: 0 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 4eec72b96622d..9aa2b71367523 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -17,7 +17,7 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index d578d7c521402..60bdca56f5176 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -17,7 +17,7 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 30e27551fef3c..7d2b184d69bb5 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Extract branch info shell: bash @@ -54,7 +54,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup ccache uses: hendrikmuhs/ccache-action@v1.2 diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 73ce56e9e6a2e..520da043f74a9 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -17,7 +17,7 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 5f24b5b90b513..c82c5e3ac822b 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -16,7 +16,7 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: From fb870fd491482cfe5a41648b8c081d1bd6941205 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:30:46 -0700 Subject: [PATCH 09/31] Bump actions/setup-python from 3 to 5 (#9195) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/clang-format.yml | 2 +- .github/workflows/mypy.yaml | 2 +- .github/workflows/publish.yml | 2 +- .github/workflows/ruff.yml | 2 +- .github/workflows/yapf.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 9aa2b71367523..064af291009fa 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} 
- uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 60bdca56f5176..22e3564779ad9 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7d2b184d69bb5..96549b3f99181 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -68,7 +68,7 @@ jobs: bash -x .github/workflows/scripts/env.sh - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 520da043f74a9..be73fb85ed1fa 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index c82c5e3ac822b..eb728ae04dfc1 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies From a78c6ba7c88a7bb42b38410f9dcfa5b342b95b57 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 10 Oct 2024 15:45:09 -0700 Subject: [PATCH 10/31] [ci/build] Add placeholder command for custom models test (#9262) --- .buildkite/test-pipeline.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ae8e03a2fdf8f..4c2fe41c739b1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -347,10 +347,11 @@ steps: - pytest -v -s models/encoder_decoder/language - pytest -v -s models/encoder_decoder/vision_language +# This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test - #mirror_hardwares: [amd] optional: true commands: + - echo 'Testing custom models...' # PR authors can temporarily add commands below to test individual models # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* From e00c094f15e79c5a113fdf975df1ee9018cb65b3 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 10 Oct 2024 15:54:23 -0700 Subject: [PATCH 11/31] [torch.compile] generic decorators (#9258) --- vllm/compilation/decorators.py | 88 ++++++++++++++++++---------- vllm/model_executor/models/gemma2.py | 10 +++- vllm/model_executor/models/llama.py | 10 +++- 3 files changed, 74 insertions(+), 34 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index b790e5550adb7..655c4c4430179 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,20 +1,54 @@ -from typing import List, Optional, Union +import inspect +from typing import Dict, List, Union import torch import vllm.envs as envs -from vllm.attention import AttentionMetadata from vllm.compilation.levels import CompilationLevel from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo -def support_compile_llama_style(cls: type): +def support_torch_compile(dynamic_arg_dims: Dict[str, Union[int, List[int]]]): + """ + A decorator to add support for compiling the forward method of a class. + + `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic + dimensions of the argument. The dynamic dimensions can be either a single + integer or a list of integers. + + Depending on the value of arguments: + + - if it is a single integer, the corresponding dimension of the argument + will be marked as dynamic. + - if it is `None`, ignored. + - if it is `IntermediateTensors`, all the tensors in the intermediate + tensors will be marked as dynamic. + - otherwise, it will raise an error. + + NOTE: if an argument is `None`, it should always be passed as `None` during + the lifetime of the model, otherwise, it cannot be captured as a single + computation graph. + """ + + def cls_decorator_helper(cls: type): + # helper to pass `dynamic_arg_dims`` to `_support_torch_compile`` + # to avoid too much indentation for `_support_torch_compile`` + sig = inspect.signature(cls.forward) + for k in dynamic_arg_dims: + if k not in sig.parameters: + raise ValueError( + f"Argument {k} not found in the forward method of {cls}") + return _support_torch_compile(cls, dynamic_arg_dims) + + return cls_decorator_helper + + +def _support_torch_compile(cls: type, + dynamic_arg_dims: Dict[str, Union[int, List[int]]]): """ A decorator to add support for compiling the forward method of a class. - If a module's **forward signature** is compatible with llama, this - decorator can be used to enable the compilation of the forward method. """ # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner @@ -37,48 +71,42 @@ def __init__(self, *args, **kwargs): cls.__init__ = __init__ - def __call__( - self, - input_ids: Optional[torch.Tensor], - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: + def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation # e.g. TPU has the compilation logic in model runner, so we don't # need to compile the model inside. 
if torch.compiler.is_compiling(): - return self.forward(input_ids, positions, kv_caches, attn_metadata, - intermediate_tensors, inputs_embeds) + return self.forward(*args, **kwargs) # the first compilation needs to have dynamic shapes marked if len(self.compiled_codes) < 1: - if input_ids is not None: - torch._dynamo.mark_dynamic(input_ids, 0) - torch._dynamo.mark_dynamic(positions, 0) - if inputs_embeds is not None: - torch._dynamo.mark_dynamic(inputs_embeds, 0) - if intermediate_tensors is not None: - for tensors in intermediate_tensors.tensors.values(): - torch._dynamo.mark_dynamic(tensors, 0) + sig = inspect.signature(self.__class__.forward) + bound_args = sig.bind(self, *args, **kwargs) + bound_args.apply_defaults() + for k, dims in dynamic_arg_dims.items(): + arg = bound_args.arguments.get(k) + if arg is not None: + if isinstance(arg, torch.Tensor): + torch._dynamo.mark_dynamic(arg, dims) + elif isinstance(arg, IntermediateTensors): + for tensor in arg.tensors.values(): + torch._dynamo.mark_dynamic(tensor, dims) + else: + raise ValueError( + "Unsupported dynamic dimensions" + f" {dims} for argument {k} with type {type(arg)}.") # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, # with the overhead of guard evaluation and recompilation. if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher: - return self.compiled_callable(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) + return self.compiled_callable(*args, **kwargs) # usually, capturing the model once is enough, and then we can # dispatch to the compiled code directly, without going through # the Dynamo guard mechanism. with self.dispatch_to_code(0): - model_output = self.forward(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) + model_output = self.forward(*args, **kwargs) return model_output cls.__call__ = __call__ diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index edc71435b551f..bcb03ef55ef94 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -21,7 +21,7 @@ from transformers import Gemma2Config from vllm.attention import Attention, AttentionMetadata -from vllm.compilation.decorators import support_compile_llama_style +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -239,7 +239,13 @@ def forward( return hidden_states, residual -@support_compile_llama_style +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": 0, + "inputs_embeds": 0, + "intermediate_tensors": 0, + }) class Gemma2Model(nn.Module): def __init__( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 3f17e9004c30f..ad5cfcc44022f 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -28,7 +28,7 @@ from transformers import LlamaConfig from vllm.attention import Attention, AttentionMetadata -from vllm.compilation.decorators import support_compile_llama_style +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) @@ -266,7 +266,13 @@ def forward( return hidden_states, 
residual -@support_compile_llama_style +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": 0, + "inputs_embeds": 0, + "intermediate_tensors": 0, + }) class LlamaModel(nn.Module): def __init__( From f990bab2a4198c4de6b5b349d35fc74bf0f36f3e Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:36:32 -0700 Subject: [PATCH 12/31] [Doc][Neuron] add note to neuron documentation about resolving triton issue (#9257) Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com> --- docs/source/getting_started/neuron-installation.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index a9ed4d7fa2cd7..ec99fc013057b 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -27,6 +27,10 @@ Installation steps: .. _build_from_source_neuron: +.. note:: + + The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. + Build from source ----------------- From 94bf9ae4e9b8199636668ccbe4dabcdc3b9e5ae6 Mon Sep 17 00:00:00 2001 From: Andy Dai <76841985+Imss27@users.noreply.github.com> Date: Thu, 10 Oct 2024 17:33:16 -0700 Subject: [PATCH 13/31] [Misc] Fix sampling from sonnet for long context case (#9235) --- benchmarks/benchmark_serving.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 292d1f37fbf3e..04999518b7138 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -176,9 +176,9 @@ def sample_sonnet_requests( # Sample the rest of lines per request. sampled_requests: List[Tuple[str, int, int]] = [] for _ in range(num_requests): - sampled_lines = "".join( - prefix_lines + - random.sample(poem_lines, num_input_lines - num_prefix_lines)) + num_lines_needed = num_input_lines - num_prefix_lines + sampled_lines = "".join(prefix_lines + + random.choices(poem_lines, k=num_lines_needed)) prompt = f"{base_prompt}{sampled_lines}" message = [ @@ -536,7 +536,7 @@ def process_one_metric( # E.g., "Time to First Token" metric_header: str, ): - # This function print and add statistics of the specified + # This function prints and adds statistics of the specified # metric. 
if metric_attribute_name not in selected_percentile_metrics: return From cbc2ef55292b2af6ff742095c030e8425124c005 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 10 Oct 2024 21:30:44 -0700 Subject: [PATCH 14/31] [misc] hide best_of from engine (#9261) Co-authored-by: Brendan Wong --- tests/entrypoints/openai/test_metrics.py | 4 --- tests/metrics/test_metrics.py | 1 - tests/tracing/test_tracing.py | 4 --- vllm/core/scheduler.py | 2 +- vllm/engine/llm_engine.py | 11 ++----- vllm/engine/metrics.py | 8 ----- vllm/engine/metrics_types.py | 1 - vllm/engine/output_processor/single_step.py | 2 +- vllm/model_executor/layers/sampler.py | 17 +++++------ vllm/outputs.py | 2 +- vllm/sampling_params.py | 33 +++++++++++---------- vllm/sequence.py | 10 +++---- vllm/tracing.py | 1 - vllm/worker/tpu_model_runner.py | 23 +++++++------- 14 files changed, 46 insertions(+), 73 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 5e9a9f8ab7d4d..6cb74eb78cbf0 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -70,7 +70,6 @@ async def client(server): [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), ("_count", _NUM_REQUESTS)], "vllm:request_params_n": [("_count", _NUM_REQUESTS)], - "vllm:request_params_best_of": [("_count", _NUM_REQUESTS)], "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], "vllm:generation_tokens": @@ -151,9 +150,6 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): "vllm:request_params_n_sum", "vllm:request_params_n_bucket", "vllm:request_params_n_count", - "vllm:request_params_best_of_sum", - "vllm:request_params_best_of_bucket", - "vllm:request_params_best_of_count", "vllm:num_preemptions_total", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 23a7a85580a0a..f1003221ab518 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -326,7 +326,6 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool, "vllm:e2e_request_latency_seconds", "vllm:request_prompt_tokens", "vllm:request_generation_tokens", - "vllm:request_params_best_of", "vllm:request_params_n", ] for metric_name in request_histogram_metrics: diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 3cee3b890862a..64ed8e26f38ed 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -98,8 +98,6 @@ def test_traces(trace_service): SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) @@ -155,8 +153,6 @@ def test_traces_with_detailed_steps(trace_service): SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 
e930f807280f0..2d7a27d1377e4 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1205,7 +1205,7 @@ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: # async_output_proc is allowed only when we have a single sequence # in the sequence group no_single_seq = seq_group.sampling_params is None or ( - seq_group.sampling_params.best_of == 1) + seq_group.sampling_params.n == 1) return no_single_seq def schedule( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 510ffac6f6892..563e52a37d935 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -767,7 +767,7 @@ def add_request( Details: - Set arrival_time to the current time if it is None. - Set prompt_token_ids to the encoded prompt if it is None. - - Create `best_of` number of :class:`~vllm.Sequence` objects. + - Create `n` number of :class:`~vllm.Sequence` objects. - Create a :class:`~vllm.SequenceGroup` object from the list of :class:`~vllm.Sequence`. - Add the :class:`~vllm.SequenceGroup` object to the scheduler. @@ -1242,8 +1242,7 @@ def _advance_to_next_step( if seq_group_metadata.do_sample: assert len(sequence_group_outputs.samples) == 1, ( "Async output processor expects a single sample" - " (i.e sampling_params.n == 1 and no " - "sampling_params.best_of > 1)") + " (i.e sampling_params.n == 1)") sample = sequence_group_outputs.samples[0] assert len(seq_group.seqs) == 1 @@ -1612,7 +1611,6 @@ def _get_stats(self, # Metadata num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] - best_of_requests: List[int] = [] n_requests: List[int] = [] finished_reason_requests: List[str] = [] @@ -1683,8 +1681,6 @@ def _get_stats(self, for seq in seq_group.get_finished_seqs() ]) if seq_group.sampling_params is not None: - best_of_requests.append( - seq_group.sampling_params.best_of) n_requests.append(seq_group.sampling_params.n) finished_reason_requests.extend([ SequenceStatus.get_finished_reason(seq.status) @@ -1737,7 +1733,6 @@ def _get_stats(self, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, - best_of_requests=best_of_requests, n_requests=n_requests, finished_reason_requests=finished_reason_requests, ) @@ -1824,8 +1819,6 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: seq_group.sampling_params.top_p) seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS, seq_group.sampling_params.max_tokens) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_BEST_OF, - seq_group.sampling_params.best_of) seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N, seq_group.sampling_params.n) seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 74277cae7c8ef..42acd3ea4c94c 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -134,12 +134,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=build_1_2_5_buckets(max_model_len), ) - self.histogram_best_of_request = self._histogram_cls( - name="vllm:request_params_best_of", - documentation="Histogram of the best_of request parameter.", - labelnames=labelnames, - buckets=[1, 2, 5, 10, 20], - ) self.histogram_n_request = self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", @@ -473,8 +467,6 @@ def _log_prometheus(self, stats: Stats) -> None: self.metrics.histogram_num_generation_tokens_request, 
stats.num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) - self._log_histogram(self.metrics.histogram_best_of_request, - stats.best_of_requests) def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 1eccb23593408..bafd5fa1a8a82 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -49,7 +49,6 @@ class Stats: # Metadata num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] - best_of_requests: List[int] n_requests: List[int] finished_reason_requests: List[str] diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 00d9297e41d99..cfa84077685a0 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -112,7 +112,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput, is_async: bool) -> None: sampling_params = seq_group.sampling_params - if sampling_params.best_of == 1: + if sampling_params.n == 1: # only have one output sample sample = outputs.samples[0] # only have one sequence diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 0b959da79c3be..42a6a0e6b3229 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -508,7 +508,7 @@ def _random_sample( same as the length of selected_seq_groups. If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ - # Find the maximum best_of value of the prompt phase requests. + # Find the maximum n value of the prompt phase requests. random_samples = random_samples.cpu() sample_idx = 0 results: SampleResultType = [] @@ -523,9 +523,9 @@ def _random_sample( num_parent_seqs = len(seq_ids) if is_prompt: # Prompt phase. - parent_ids = [0] * sampling_params.best_of + parent_ids = [0] * sampling_params.n next_token_ids = random_samples[ - sample_idx, :sampling_params.best_of].tolist() + sample_idx, :sampling_params.n].tolist() else: # Generation phase. parent_ids = list(range(num_parent_seqs)) @@ -570,7 +570,7 @@ def _beam_search_sample( is_prompt = seq_group.is_prompt seq_ids, sampling_params = seq_group.seq_ids, seq_group.sampling_params num_parent_seqs = len(seq_ids) - beam_width = sampling_params.best_of + beam_width = sampling_params.n seq_group_logprobs = logprobs[sample_idx:sample_idx + num_parent_seqs] if is_prompt: # Prompt phase. 
@@ -797,12 +797,11 @@ def _sample_with_torch( greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - max_best_of_in_batch = 1 + max_n_in_batch = 1 for seq_group in seq_groups: if seq_group.is_prompt: sampling_params = seq_group.sampling_params - max_best_of_in_batch = max(max_best_of_in_batch, - sampling_params.best_of) + max_n_in_batch = max(max_n_in_batch, sampling_params.n) seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else seq_groups) @@ -812,13 +811,13 @@ def _sample_with_torch( probs[long_sample_indices], sampling_tensors.top_ks[long_sample_indices], sampling_tensors.top_ps[long_sample_indices], - max_best_of_in_batch, + max_n_in_batch, seq_groups_arg, ) else: multinomial_samples[sampling_type] = _multinomial( probs[long_sample_indices], - max_best_of_in_batch, + max_n_in_batch, seq_groups=seq_groups_arg) if sampled_token_ids_tensor is not None: diff --git a/vllm/outputs.py b/vllm/outputs.py index 4f29226aa5128..07650241cb638 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -141,7 +141,7 @@ def from_seq_group(cls, seq_group: SequenceGroup, top_n_seqs = seqs else: # Get the top-n sequences. - n = sampling_params.n + n = sampling_params._real_n or sampling_params.n sorting_key = lambda seq: seq.get_cumulative_logprob() sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) top_n_seqs = sorted_seqs[:n] diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 95345df43b57d..4f2ae75e65f3a 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -106,9 +106,8 @@ class SamplingParams( n: Number of output sequences to return for the given prompt. best_of: Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. - `best_of` must be greater than or equal to `n`. This is treated as - the beam width when `use_beam_search` is True. By default, `best_of` - is set to `n`. + `best_of` must be greater than or equal to `n`. By default, + `best_of` is set to `n`. presence_penalty: Float that penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat @@ -173,6 +172,7 @@ class SamplingParams( n: int = 1 best_of: Optional[int] = None + _real_n: Optional[int] = None presence_penalty: float = 0.0 frequency_penalty: float = 0.0 repetition_penalty: float = 1.0 @@ -282,7 +282,19 @@ def from_optional( ) def __post_init__(self) -> None: - self.best_of = self.best_of or self.n + # how we deal with `best_of``: + # if `best_of`` is not set, we default to `n`; + # if `best_of`` is set, we set `n`` to `best_of`, + # and set `_real_n`` to the original `n`. 
+ # when we return the result, we will check + # if we need to return `n` or `_real_n` results + if self.best_of: + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") + self._real_n = self.n + self.n = self.best_of if 0 < self.temperature < _MAX_TEMP: logger.warning( "temperature %s is less than %s, which may cause numerical " @@ -329,12 +341,6 @@ def _verify_args(self) -> None: f"type {type(self.n)}") if self.n < 1: raise ValueError(f"n must be at least 1, got {self.n}.") - if not isinstance(self.best_of, int): - raise ValueError(f"best_of must be an int, but is of " - f"type {type(self.best_of)}") - if self.best_of < self.n: - raise ValueError(f"best_of must be greater than or equal to n, " - f"got n={self.n} and best_of={self.best_of}.") if not -2.0 <= self.presence_penalty <= 2.0: raise ValueError("presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}.") @@ -385,7 +391,7 @@ def _verify_args(self) -> None: raise ValueError( "stop strings are only supported when detokenize is True. " "Set detokenize=True to use stop.") - if self.best_of != self.n and self.output_kind == ( + if self.best_of != self._real_n and self.output_kind == ( RequestOutputKind.DELTA): raise ValueError("best_of must equal n to use output_kind=DELTA") @@ -393,10 +399,6 @@ def _verify_greedy_sampling(self) -> None: if self.n > 1: raise ValueError("n must be 1 when using greedy sampling, " f"got {self.n}.") - assert isinstance(self.best_of, int) - if self.best_of > 1: - raise ValueError("best_of must be 1 when using greedy sampling, " - f"got {self.best_of}.") def update_from_generation_config( self, @@ -453,7 +455,6 @@ def clone(self) -> "SamplingParams": def __repr__(self) -> str: return ( f"SamplingParams(n={self.n}, " - f"best_of={self.best_of}, " f"presence_penalty={self.presence_penalty}, " f"frequency_penalty={self.frequency_penalty}, " f"repetition_penalty={self.repetition_penalty}, " diff --git a/vllm/sequence.py b/vllm/sequence.py index 51be9466e66be..3bb35ea955c8c 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -803,14 +803,14 @@ def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" if self.sampling_params: - best_of = self.sampling_params.best_of - assert isinstance(best_of, int) - if best_of > self.num_seqs(): + n = self.sampling_params.n + assert isinstance(n, int) + if n > self.num_seqs(): # At prompt stage, the sequence group is not yet filled up # and only have one sequence running. However, in the - # generation stage, we will have `best_of` sequences + # generation stage, we will have `n` sequences # running. - return best_of + return n # At sampling stages, return the number of actual sequences # that are not finished yet. return self.num_unfinished_seqs() diff --git a/vllm/tracing.py b/vllm/tracing.py index 31849e2b635aa..50068d8cf9c25 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -96,7 +96,6 @@ class SpanAttributes(BaseSpanAttributes): # The following span attribute names are added here because they are missing # from the Semantic Conventions for LLM. 
LLM_REQUEST_ID = "gen_ai.request.id" - LLM_REQUEST_BEST_OF = "gen_ai.request.best_of" LLM_REQUEST_N = "gen_ai.request.n" LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 12e4215038d74..b3ae5b4a9a0ce 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -49,7 +49,7 @@ class ModelInputForTPU(ModelRunnerInputBase): t: torch.Tensor p: torch.Tensor num_samples: int - best_of: List[int] + n: List[int] seq_groups: List[List[int]] is_first_multi_step: bool = True is_last_step: bool = True @@ -65,7 +65,7 @@ def as_broadcastable_tensor_dict( "t": self.t, "p": self.p, "num_samples": self.num_samples, - "best_of": self.best_of, + "n": self.n, "seq_groups": self.seq_groups, "is_first_multi_step": self.is_first_multi_step, "is_last_step": self.is_last_step, @@ -435,7 +435,7 @@ def _prepare_sample( assert len(seq_group_metadata_list) > 0 t = [] p = [] - best_of = [] + n = [] for seq_group_metadata in seq_group_metadata_list: sampling_params = seq_group_metadata.sampling_params t.append(sampling_params.temperature) @@ -448,11 +448,11 @@ def _prepare_sample( raise NotImplementedError( "Top-k sampling is currently disabled for the TPU backend " "due to performance issues.") - if sampling_params.best_of > _MAX_NUM_SAMPLES: + if sampling_params.n > _MAX_NUM_SAMPLES: raise NotImplementedError( f"Best of > {_MAX_NUM_SAMPLES} is not supported by the TPU " "backend.") - best_of.append(sampling_params.best_of) + n.append(sampling_params.n) if sampling_params.logprobs is not None: raise NotImplementedError( "logprobs is not currently supported by the TPU backend.") @@ -465,7 +465,7 @@ def _prepare_sample( num_seqs = len(seq_group_metadata.seq_data) t += [t[-1]] * (num_seqs - 1) p += [p[-1]] * (num_seqs - 1) - best_of += [best_of[-1]] * (num_seqs - 1) + n += [n[-1]] * (num_seqs - 1) num_paddings = padded_batch_size - len(t) t += [1.0] * num_paddings @@ -473,7 +473,7 @@ def _prepare_sample( t = torch.tensor(t, dtype=torch.float32, device="cpu") p = torch.tensor(p, dtype=torch.float32, device="cpu") - return t, p, best_of + return t, p, n def prepare_model_input( self, @@ -493,8 +493,8 @@ def prepare_model_input( inputs = self._prepare_decode(seq_group_metadata_list) input_tokens, input_positions, attn_metadata, input_lens = inputs padded_batch_size = input_tokens.shape[0] - t, p, best_of = self._prepare_sample(seq_group_metadata_list, - padded_batch_size) + t, p, n = self._prepare_sample(seq_group_metadata_list, + padded_batch_size) num_samples = _MAX_NUM_SAMPLES if is_prompt else 1 seq_groups = [ @@ -502,8 +502,7 @@ def prepare_model_input( for metadata in seq_group_metadata_list ] return ModelInputForTPU(input_tokens, input_positions, attn_metadata, - input_lens, t, p, num_samples, best_of, - seq_groups) + input_lens, t, p, num_samples, n, seq_groups) def make_model_input_from_broadcasted_tensor_dict( self, tensor_dict: Dict[str, Any]) -> ModelInputForTPU: @@ -609,7 +608,7 @@ def execute_model( assert len(seq_ids) == 1 seq_id = seq_ids[0] seq_outputs = [] - for j in range(model_input.best_of[i]): + for j in range(model_input.n[i]): next_token_id = next_token_ids[i][j] seq_outputs.append( SequenceOutput(seq_id, next_token_id, From e808156f305ce2ecfbe87eefa19ce2ae11c83d00 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 11 Oct 2024 19:08:11 +0800 Subject: [PATCH 15/31] [Misc] Collect model support info in a single process per model 
(#9233) --- docs/source/models/adding_model.rst | 2 +- vllm/engine/arg_utils.py | 2 + vllm/engine/multiprocessing/engine.py | 3 + vllm/model_executor/models/registry.py | 380 +++++++++++++++---------- 4 files changed, 228 insertions(+), 159 deletions(-) diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index fa1003874033e..ae09259c0756c 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -99,7 +99,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a 5. Register your model ---------------------- -Finally, register your :code:`*ForCausalLM` class to the :code:`_MODELS` in `vllm/model_executor/models/registry.py `_. +Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. 6. Out-of-Tree Model Integration -------------------------------------------- diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cae95d20ca23d..efdcec4ab797a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -183,6 +183,8 @@ class EngineArgs: def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model + + # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index eecca82cd2f7d..d68970e1da24c 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -130,6 +130,9 @@ def dead_error(self) -> BaseException: def from_engine_args(cls, engine_args: AsyncEngineArgs, usage_context: UsageContext, ipc_path: str): """Creates an MQLLMEngine from the engine arguments.""" + # Setup plugins for each process + from vllm.plugins import load_general_plugins + load_general_plugins() engine_config = engine_args.create_engine_config() diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f1d484521acb9..b37452877cf0c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -3,8 +3,10 @@ import subprocess import sys import tempfile -from functools import lru_cache, partial -from typing import Callable, Dict, List, Optional, Tuple, Type, Union +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from functools import lru_cache +from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union import cloudpickle import torch.nn as nn @@ -116,18 +118,13 @@ } # yapf: enable -_MODELS = { +_VLLM_MODELS = { **_TEXT_GENERATION_MODELS, **_EMBEDDING_MODELS, **_MULTIMODAL_MODELS, **_SPECULATIVE_DECODING_MODELS, } -# Architecture -> type or (module, class). -# out of tree models -_OOT_MODELS: Dict[str, Type[nn.Module]] = {} -_OOT_MODELS_LAZY: Dict[str, Tuple[str, str]] = {} - # Models not supported by ROCm. 
_ROCM_UNSUPPORTED_MODELS: List[str] = [] @@ -154,79 +151,125 @@ } -class ModelRegistry: +@dataclass(frozen=True) +class _ModelInfo: + is_text_generation_model: bool + is_embedding_model: bool + supports_multimodal: bool + supports_pp: bool @staticmethod - def _get_module_cls_name(model_arch: str) -> Tuple[str, str]: - if model_arch in _MODELS: - module_relname, cls_name = _MODELS[model_arch] - return f"vllm.model_executor.models.{module_relname}", cls_name + def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + return _ModelInfo( + is_text_generation_model=is_text_generation_model(model), + is_embedding_model=is_embedding_model(model), + supports_multimodal=supports_multimodal(model), + supports_pp=supports_pp(model), + ) - if model_arch in _OOT_MODELS_LAZY: - return _OOT_MODELS_LAZY[model_arch] - raise KeyError(model_arch) +class _BaseRegisteredModel(ABC): - @staticmethod - @lru_cache(maxsize=128) - def _try_get_model_stateful(model_arch: str) -> Optional[Type[nn.Module]]: - try: - mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) - except KeyError: - return None + @abstractmethod + def inspect_model_cls(self) -> _ModelInfo: + raise NotImplementedError - module = importlib.import_module(mod_name) - return getattr(module, cls_name, None) + @abstractmethod + def load_model_cls(self) -> Type[nn.Module]: + raise NotImplementedError - @staticmethod - def _try_get_model_stateless(model_arch: str) -> Optional[Type[nn.Module]]: - if model_arch in _OOT_MODELS: - return _OOT_MODELS[model_arch] - - if is_hip(): - if model_arch in _ROCM_UNSUPPORTED_MODELS: - raise ValueError( - f"Model architecture {model_arch} is not supported by " - "ROCm for now.") - if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: - logger.warning( - "Model architecture %s is partially supported by ROCm: %s", - model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) - return None +@dataclass(frozen=True) +class _RegisteredModel(_BaseRegisteredModel): + """ + Represents a model that has already been imported in the main process. + """ + + interfaces: _ModelInfo + model_cls: Type[nn.Module] @staticmethod - def _try_load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: - model = ModelRegistry._try_get_model_stateless(model_arch) - if model is not None: - return model + def from_model_cls(model_cls: Type[nn.Module]): + return _RegisteredModel( + interfaces=_ModelInfo.from_model_cls(model_cls), + model_cls=model_cls, + ) + + def inspect_model_cls(self) -> _ModelInfo: + return self.interfaces + + def load_model_cls(self) -> Type[nn.Module]: + return self.model_cls + + +@dataclass(frozen=True) +class _LazyRegisteredModel(_BaseRegisteredModel): + """ + Represents a model that has not been imported in the main process. 
+ """ + module_name: str + class_name: str + + # Performed in another process to avoid initializing CUDA + def inspect_model_cls(self) -> _ModelInfo: + return _run_in_subprocess( + lambda: _ModelInfo.from_model_cls(self.load_model_cls())) + + def load_model_cls(self) -> Type[nn.Module]: + mod = importlib.import_module(self.module_name) + return getattr(mod, self.class_name) + + +@lru_cache(maxsize=128) +def _try_load_model_cls( + model_arch: str, + model: _BaseRegisteredModel, +) -> Optional[Type[nn.Module]]: + if is_hip(): + if model_arch in _ROCM_UNSUPPORTED_MODELS: + raise ValueError(f"Model architecture '{model_arch}' is not " + "supported by ROCm for now.") + + if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: + msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] + logger.warning( + "Model architecture '%s' is partially " + "supported by ROCm: %s", model_arch, msg) + + try: + return model.load_model_cls() + except Exception: + logger.exception("Error in loading model architecture '%s'", + model_arch) + return None - return ModelRegistry._try_get_model_stateful(model_arch) - @staticmethod - def resolve_model_cls( - architectures: Union[str, List[str]], ) -> Tuple[Type[nn.Module], str]: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") +@lru_cache(maxsize=128) +def _try_inspect_model_cls( + model_arch: str, + model: _BaseRegisteredModel, +) -> Optional[_ModelInfo]: + try: + return model.inspect_model_cls() + except Exception: + logger.exception("Error in inspecting model architecture '%s'", + model_arch) + return None - for arch in architectures: - model_cls = ModelRegistry._try_load_model_cls(arch) - if model_cls is not None: - return (model_cls, arch) - raise ValueError( - f"Model architectures {architectures} are not supported for now. " - f"Supported architectures: {ModelRegistry.get_supported_archs()}") +@dataclass +class _ModelRegistry: + # Keyed by model_arch + models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) - @staticmethod - def get_supported_archs() -> List[str]: - return list(_MODELS.keys()) + list(_OOT_MODELS.keys()) + def get_supported_archs(self) -> List[str]: + return list(self.models.keys()) - @staticmethod - def register_model(model_arch: str, model_cls: Union[Type[nn.Module], - str]): + def register_model( + self, + model_arch: str, + model_cls: Union[Type[nn.Module], str], + ) -> None: """ Register an external model to be used in vLLM. @@ -238,7 +281,7 @@ def register_model(model_arch: str, model_cls: Union[Type[nn.Module], when importing the model and thus the related error :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`. 
""" - if model_arch in _MODELS: + if model_arch in self.models: logger.warning( "Model architecture %s is already registered, and will be " "overwritten by the new model class %s.", model_arch, @@ -250,120 +293,141 @@ def register_model(model_arch: str, model_cls: Union[Type[nn.Module], msg = "Expected a string in the format `:`" raise ValueError(msg) - module_name, cls_name = split_str - _OOT_MODELS_LAZY[model_arch] = module_name, cls_name + model = _LazyRegisteredModel(*split_str) else: - _OOT_MODELS[model_arch] = model_cls + model = _RegisteredModel.from_model_cls(model_cls) - @staticmethod - @lru_cache(maxsize=128) - def _check_stateless( - func: Callable[[Type[nn.Module]], bool], - model_arch: str, - *, - default: Optional[bool] = None, - ) -> bool: - """ - Run a boolean function against a model and return the result. + self.models[model_arch] = model - If the model is not found, returns the provided default value. + def _raise_for_unsupported(self, architectures: List[str]): + all_supported_archs = self.get_supported_archs() - If the model is not already imported, the function is run inside a - subprocess to avoid initializing CUDA for the main program. - """ - model = ModelRegistry._try_get_model_stateless(model_arch) - if model is not None: - return func(model) + raise ValueError( + f"Model architectures {architectures} are not supported for now. " + f"Supported architectures: {all_supported_archs}") - try: - mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) - except KeyError: - if default is not None: - return default - - raise - - with tempfile.NamedTemporaryFile() as output_file: - # `cloudpickle` allows pickling lambda functions directly - input_bytes = cloudpickle.dumps( - (mod_name, cls_name, func, output_file.name)) - # cannot use `sys.executable __file__` here because the script - # contains relative imports - returned = subprocess.run( - [sys.executable, "-m", "vllm.model_executor.models.registry"], - input=input_bytes, - capture_output=True) - - # check if the subprocess is successful - try: - returned.check_returncode() - except Exception as e: - # wrap raised exception to provide more information - raise RuntimeError(f"Error happened when testing " - f"model support for{mod_name}.{cls_name}:\n" - f"{returned.stderr.decode()}") from e - with open(output_file.name, "rb") as f: - result = pickle.load(f) - return result + def _try_load_model_cls(self, + model_arch: str) -> Optional[Type[nn.Module]]: + if model_arch not in self.models: + return None - @staticmethod - def is_text_generation_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + return _try_load_model_cls(model_arch, self.models[model_arch]) - is_txt_gen = partial(ModelRegistry._check_stateless, - is_text_generation_model, - default=False) + def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: + if model_arch not in self.models: + return None - return any(is_txt_gen(arch) for arch in architectures) + return _try_inspect_model_cls(model_arch, self.models[model_arch]) - @staticmethod - def is_embedding_model(architectures: Union[str, List[str]]) -> bool: + def _normalize_archs( + self, + architectures: Union[str, List[str]], + ) -> List[str]: if isinstance(architectures, str): architectures = [architectures] if not architectures: logger.warning("No model architectures are specified") - is_emb = 
partial(ModelRegistry._check_stateless, - is_embedding_model, - default=False) + return architectures - return any(is_emb(arch) for arch in architectures) + def inspect_model_cls( + self, + architectures: Union[str, List[str]], + ) -> _ModelInfo: + architectures = self._normalize_archs(architectures) - @staticmethod - def is_multimodal_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + for arch in architectures: + model_info = self._try_inspect_model_cls(arch) + if model_info is not None: + return model_info - is_mm = partial(ModelRegistry._check_stateless, - supports_multimodal, - default=False) + return self._raise_for_unsupported(architectures) - return any(is_mm(arch) for arch in architectures) + def resolve_model_cls( + self, + architectures: Union[str, List[str]], + ) -> Tuple[Type[nn.Module], str]: + architectures = self._normalize_archs(architectures) - @staticmethod - def is_pp_supported_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + for arch in architectures: + model_cls = self._try_load_model_cls(arch) + if model_cls is not None: + return (model_cls, arch) - is_pp = partial(ModelRegistry._check_stateless, - supports_pp, - default=False) + return self._raise_for_unsupported(architectures) - return any(is_pp(arch) for arch in architectures) + def is_text_generation_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).is_text_generation_model + def is_embedding_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).is_embedding_model + + def is_multimodal_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).supports_multimodal + + def is_pp_supported_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).supports_pp + + +ModelRegistry = _ModelRegistry({ + model_arch: _LazyRegisteredModel( + module_name=f"vllm.model_executor.models.{mod_relname}", + class_name=cls_name, + ) + for model_arch, (mod_relname, cls_name) in _VLLM_MODELS.items() +}) + +_T = TypeVar("_T") + + +def _run_in_subprocess(fn: Callable[[], _T]) -> _T: + with tempfile.NamedTemporaryFile() as output_file: + # `cloudpickle` allows pickling lambda functions directly + input_bytes = cloudpickle.dumps((fn, output_file.name)) + + # cannot use `sys.executable __file__` here because the script + # contains relative imports + returned = subprocess.run( + [sys.executable, "-m", "vllm.model_executor.models.registry"], + input=input_bytes, + capture_output=True) + + # check if the subprocess is successful + try: + returned.check_returncode() + except Exception as e: + # wrap raised exception to provide more information + raise RuntimeError(f"Error raised in subprocess:\n" + f"{returned.stderr.decode()}") from e + + with open(output_file.name, "rb") as f: + return pickle.load(f) + + +def _run() -> None: + # Setup plugins + from vllm.plugins import load_general_plugins + load_general_plugins() + + fn, output_file = pickle.loads(sys.stdin.buffer.read()) + + result = fn() -if __name__ == "__main__": - (mod_name, cls_name, func, - output_file) = pickle.loads(sys.stdin.buffer.read()) 
- mod = importlib.import_module(mod_name) - klass = getattr(mod, cls_name) - result = func(klass) with open(output_file, "wb") as f: f.write(pickle.dumps(result)) + + +if __name__ == "__main__": + _run() From 36ea79079bc499cd8fb07d3fe82fe069564e5570 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 11 Oct 2024 20:31:21 +0800 Subject: [PATCH 16/31] [Misc][LoRA] Support loading LoRA weights for target_modules in reg format (#9275) --- tests/lora/conftest.py | 5 +++++ tests/lora/test_lora_checkpoints.py | 17 ++++++++++++-- vllm/lora/models.py | 7 ++++-- vllm/lora/utils.py | 35 ++++++++++++++++++++++++++++- 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index da98fac99cf22..405c0d0efad65 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -199,6 +199,11 @@ def baichuan_zero_lora_files(): return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init") +@pytest.fixture(scope="session") +def baichuan_regex_lora_files(): + return snapshot_download(repo_id="jeeejeee/baichuan-7b-lora-zero-regex") + + @pytest.fixture(scope="session") def minicpmv_lora_files(): return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon") diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 3514dcb7aedf4..9a529e27b4cd8 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -5,7 +5,9 @@ from vllm.lora.models import LoRAModel from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM -lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"] +lora_lst = [ + "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b" +] @pytest.mark.parametrize("lora_name", lora_lst) @@ -13,6 +15,7 @@ def test_load_checkpoints( lora_name, baichuan_lora_files, baichuan_zero_lora_files, + baichuan_regex_lora_files, chatglm3_lora_files, ): supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules @@ -36,7 +39,7 @@ def test_load_checkpoints( embedding_modules=embedding_modules, embedding_padding_modules=embed_padding_modules) elif lora_name == "baichuan7B-zero": - #Test that the target_modules contain prefix + # Test that the target_modules contain prefix # such as "model.layers.0.self_atten.W_pack", and # the test should pass. LoRAModel.from_local_checkpoint( @@ -46,6 +49,16 @@ def test_load_checkpoints( device="cpu", embedding_modules=embedding_modules, embedding_padding_modules=embed_padding_modules) + elif lora_name == "baichuan7B-zero-regex": + # Test that the `target_modules` in the form of regular expressions, + # such as `model\\..*(W_pack|o_proj)`, and the test should pass. + LoRAModel.from_local_checkpoint( + baichuan_regex_lora_files, + expected_lora_modules, + lora_model_id=1, + device="cpu", + embedding_modules=embedding_modules, + embedding_padding_modules=embed_padding_modules) else: # For the baichuan7B model, load chatglm3-6b's LoRA, # and the test should raise the following error. 
diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 91e9f55e82433..0dc54516f8671 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -23,6 +23,7 @@ from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.punica import PunicaWrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, + is_regex_target_modules, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -233,6 +234,8 @@ def from_local_checkpoint( # modules. unexpected_modules = [] target_modules = config["target_modules"] + if not isinstance(target_modules, list): + target_modules = [target_modules] for module in target_modules: # Compatible with more modules, # such as:layers.11.self_attn.k_proj @@ -243,8 +246,8 @@ def from_local_checkpoint( # expected_lora_modules. It is not reliable. See # https://github.com/vllm-project/vllm/pull/5909. But there's no # other better mechanism. - if unexpected_modules: - print(unexpected_modules, "modules") + if unexpected_modules and not is_regex_target_modules( + config["target_modules"], expected_lora_modules): raise ValueError( f"While loading {lora_dir}, expected" f" target modules in {expected_lora_modules}" diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index ee983328e2c5b..a780429f413d3 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,5 +1,6 @@ import os -from typing import List, Optional, Set, Tuple, Type +import re +from typing import List, Optional, Set, Tuple, Type, Union import huggingface_hub from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, @@ -113,6 +114,38 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: raise ValueError(f"{name} is unsupported LoRA weight") +def is_regex_target_modules(load_modules: Union[str, List[str]], + expected_lora_modules: List[str]) -> bool: + """ + PEFT supports passing `target_modules` in the form of regular expressions, + such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to + determine whether the suffix in the regular expression is present in the + `expected_lora_modules`. + """ + + def is_valid_regex(pattern): + try: + re.compile(pattern) + return True + except re.error: + return False + + def is_subset(sub_list, full_list): + return set(sub_list).issubset(set(full_list)) + + # Similar to PEFT's processing logic, regex-related operations are only + # executed when the load_modules is a `str`. + if not isinstance(load_modules, str): + return False + + if is_valid_regex(load_modules): + match = re.search(r"\((.*?)\)\$?$", load_modules) + if match: + suffix = match.group(1).split("|") + return is_subset(suffix, expected_lora_modules) + return False + + def get_adapter_absolute_path(lora_path: str) -> str: """ Resolves the given lora_path to an absolute local path. 
From df3dcdf49dccfa4914d825fa08b74de8ae050e1e Mon Sep 17 00:00:00 2001 From: Sebastian Schoennenbeck Date: Fri, 11 Oct 2024 17:35:35 +0200 Subject: [PATCH 17/31] [Bugfix] Fix priority in multiprocessing engine (#9277) --- vllm/engine/multiprocessing/engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index d68970e1da24c..2bf0ce83c7607 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -285,7 +285,8 @@ def _handle_process_request(self, request: RPCProcessRequest): params=request.params, lora_request=request.lora_request, trace_headers=request.trace_headers, - prompt_adapter_request=request.prompt_adapter_request) + prompt_adapter_request=request.prompt_adapter_request, + priority=request.priority) if self.log_requests: logger.info("Added request %s.", request.request_id) From 7342a7d7f87ea3f4e03ec0775093a0f1ce56e2a1 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 11 Oct 2024 11:40:06 -0400 Subject: [PATCH 18/31] [Model] Support Mamba (#6484) --- .buildkite/run-cpu-test-ppc64le.sh | 8 +- .buildkite/run-cpu-test.sh | 1 + docs/source/models/supported_models.rst | 5 + tests/kernels/test_attention_selector.py | 37 +- .../decoder_only/language/test_mamba.py | 295 +++++++++++ vllm/attention/backends/placeholder_attn.py | 324 ++++++++++++ vllm/attention/layer.py | 8 +- vllm/attention/selector.py | 21 +- vllm/config.py | 50 +- vllm/core/interfaces.py | 8 +- ....py => placeholder_block_space_manager.py} | 9 +- vllm/core/scheduler.py | 5 +- vllm/engine/arg_utils.py | 7 +- .../model_loader/weight_utils.py | 35 +- vllm/model_executor/models/interfaces.py | 45 +- vllm/model_executor/models/jamba.py | 261 ++------- vllm/model_executor/models/mamba.py | 499 ++++++++++++++++++ vllm/model_executor/models/mamba_cache.py | 222 ++++++++ vllm/model_executor/models/registry.py | 16 +- vllm/worker/cache_engine.py | 15 +- vllm/worker/cpu_model_runner.py | 3 +- vllm/worker/cpu_worker.py | 3 +- vllm/worker/enc_dec_model_runner.py | 2 +- vllm/worker/model_runner.py | 30 +- vllm/worker/openvino_model_runner.py | 3 +- vllm/worker/openvino_worker.py | 3 +- vllm/worker/tpu_model_runner.py | 3 +- vllm/worker/worker.py | 25 +- vllm/worker/xpu_model_runner.py | 3 +- 29 files changed, 1603 insertions(+), 343 deletions(-) create mode 100644 tests/models/decoder_only/language/test_mamba.py create mode 100644 vllm/attention/backends/placeholder_attn.py rename vllm/core/{embedding_model_block_manager.py => placeholder_block_space_manager.py} (90%) create mode 100644 vllm/model_executor/models/mamba.py create mode 100644 vllm/model_executor/models/mamba_cache.py diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 49ae838cf0690..fd60f5b6afeca 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" \ + --ignore=tests/models/test_embedding.py \ + 
--ignore=tests/models/test_oot_registration.py \ + --ignore=tests/models/test_registry.py \ + --ignore=tests/models/test_jamba.py \ + --ignore=tests/models/test_mamba.py \ + --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 62d3afb0212fd..c2818c38965ea 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -27,6 +27,7 @@ docker exec cpu-test bash -c " pytest -v -s tests/models/decoder_only/language \ --ignore=tests/models/test_fp8.py \ --ignore=tests/models/decoder_only/language/test_jamba.py \ + --ignore=tests/models/decoder_only/language/test_mamba.py \ --ignore=tests/models/decoder_only/language/test_granitemoe.py \ --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ec64a82de84d4..f5d53edcebd35 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -152,6 +152,11 @@ Text Generation - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. - ✅︎ - ✅︎ + * - :code:`MambaForCausalLM` + - Mamba + - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. + - ✅︎ + - * - :code:`MiniCPMForCausalLM` - MiniCPM - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc. diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index c1fb45955a0e5..f471dcee938be 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -20,22 +20,22 @@ def test_env(name: str, device: str, monkeypatch): if device == "cpu": with patch("vllm.attention.selector.is_cpu", return_value=True): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, - torch.float16, 16) + backend = which_attn_to_use(16, None, torch.float16, torch.float16, + 16, False) assert backend.name == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.is_hip", return_value=True): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, - torch.float16, 16) + backend = which_attn_to_use(16, None, torch.float16, torch.float16, + 16, False) assert backend.name == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.is_openvino", return_value=True): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, - torch.float16, 16) + backend = which_attn_to_use(16, None, torch.float16, torch.float16, + 16, False) assert backend.name == "OPENVINO" else: - backend = which_attn_to_use(8, 16, 8, None, torch.float16, - torch.float16, 16) + backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, + False) assert backend.name == name @@ -46,32 +46,37 @@ def test_flash_attn(monkeypatch): # Unsupported CUDA arch with patch("torch.cuda.get_device_capability", return_value=(7, 5)): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + backend = which_attn_to_use(16, None, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported data type - backend = which_attn_to_use(8, 16, 8, None, torch.float8_e4m3fn, None, 16) + backend = which_attn_to_use(16, None, 
torch.float8_e4m3fn, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = which_attn_to_use(8, 16, 8, None, torch.float16, "fp8", 16) + backend = which_attn_to_use(16, None, torch.float16, "fp8", 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported block size - backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 8) + backend = which_attn_to_use(16, None, torch.float16, None, 8, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported sliding window - backend = which_attn_to_use(8, 16, 8, 1, torch.float16, None, 16) + backend = which_attn_to_use(16, 1, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # flash-attn is not installed with patch.dict('sys.modules', {'vllm_flash_attn': None}): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + backend = which_attn_to_use(16, None, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported head size - backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16) + backend = which_attn_to_use(17, None, torch.float16, None, 16, False) + assert backend.name != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, + True) assert backend.name != STR_FLASH_ATTN_VAL @@ -79,4 +84,4 @@ def test_invalid_env(monkeypatch): """Throw an exception if the backend name is invalid.""" override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with pytest.raises(ValueError): - which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + which_attn_to_use(16, None, torch.float16, None, 16, False) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py new file mode 100644 index 0000000000000..c27bf6a60a4f4 --- /dev/null +++ b/tests/models/decoder_only/language/test_mamba.py @@ -0,0 +1,295 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba. + +Run `pytest tests/models/test_mamba.py`. +""" +import pytest +from transformers import AutoModelForCausalLM, AutoTokenizer + +from vllm.sampling_params import SamplingParams +from vllm.worker.model_runner import _get_graph_batch_size + +from ...utils import check_outputs_equal + +MODELS = ["state-spaces/mamba-130m-hf"] + + +# Use lower-level interfaces to create this greedy generator, as mamba will +# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used. 
+def generate_greedy(model_name, example_prompts, max_tokens): + # Create a text generation pipeline + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name) + + # Generate texts from the prompts + outputs = [] + for prompt in example_prompts: + # Tokenize the input prompt with truncation + inputs = tokenizer(prompt, return_tensors="pt", truncation=True) + input_ids = inputs["input_ids"].to(model.device) + + # Generate text using the model's generate method directly + generated_ids = model.generate(input_ids, max_new_tokens=max_tokens) + generated_text = tokenizer.decode(generated_ids[0], + skip_special_tokens=True) + + outputs.append((generated_ids[0].tolist(), generated_text)) + + return outputs + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [96]) +def test_models( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + hf_outputs = generate_greedy(model, example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + for i in range(len(example_prompts)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_outputs[i] + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [96]) +def test_batching( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # To pass the small model tests, we need full precision. + for_loop_outputs = [] + with vllm_runner(model, dtype=dtype) as vllm_model: + for prompt in example_prompts: + for_loop_outputs.append( + vllm_model.generate_greedy([prompt], max_tokens)[0]) + + batched_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) + + check_outputs_equal( + outputs_0_lst=for_loop_outputs, + outputs_1_lst=batched_outputs, + name_0="for_loop_vllm", + name_1="batched_vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [10]) +def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts, + model: str, dtype: str, + max_tokens: int) -> None: + # Tests chunked prefill in conjunction with n>1. In this case, prefill is + # populated with decoding tokens and we test that it doesn't fail. 
+ # This test might fail if cache is not allocated correctly for n > 1 + # decoding steps inside a chunked prefill forward pass (where we have both + # prefill and decode together ) + sampling_params = SamplingParams(n=3, + temperature=1, + seed=0, + max_tokens=max_tokens) + with vllm_runner( + model, + dtype=dtype, + enable_chunked_prefill=True, + max_num_batched_tokens=30, + max_num_seqs=10 # forces prefill chunks with decoding + ) as vllm_model: + vllm_model.generate(example_prompts, sampling_params) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) +def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str, + max_tokens: int, + chunked_prefill_token_size: int) -> None: + """ + Checks exact match decode between huggingface model and vllm runner with + chunked prefill. + """ + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size + + non_chunked = generate_greedy(model, example_prompts, max_tokens) + + with vllm_runner(model, + dtype=dtype, + enable_chunked_prefill=True, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs) as vllm_model: + chunked = vllm_model.generate_greedy(example_prompts, + max_tokens=max_tokens) + + check_outputs_equal( + outputs_0_lst=chunked, + outputs_1_lst=non_chunked, + name_0="chunked", + name_1="non_chunked", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [15]) +def test_parallel_sampling( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + + with vllm_runner(model, dtype=dtype) as vllm_model: + for_loop_outputs = [] + for _ in range(10): + for_loop_outputs.append( + # using example_prompts index 1 instead of 0 since with 0 the + # logprobs get really close and the test doesn't pass + vllm_model.generate_greedy([example_prompts[1]], max_tokens) + [0]) + sampling_params = SamplingParams(n=10, + temperature=0.001, + seed=0, + max_tokens=max_tokens) + n_lt_1_outputs = vllm_model.generate([example_prompts[1]], + sampling_params) + token_ids, texts = n_lt_1_outputs[0] + n_lt_1_outputs = [(token_id, text) + for token_id, text in zip(token_ids, texts)] + + check_outputs_equal( + outputs_0_lst=n_lt_1_outputs, + outputs_1_lst=for_loop_outputs, + name_0="vllm_n_lt_1_outputs", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [20]) +def test_mamba_cache_cg_padding( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # This test is for verifying that mamba cache is padded to CG captured + # batch size. If it's not, a torch RuntimeError will be raised because + # tensor dimensions aren't compatible + while len(example_prompts) == _get_graph_batch_size(len(example_prompts)): + example_prompts.append(example_prompts[0]) + + try: + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + except RuntimeError: + pytest.fail( + "Couldn't run batch size which is not equal to a Cuda Graph " + "captured batch size. 
" + "Could be related to mamba cache not padded correctly") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [20]) +def test_models_preemption_recompute( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # Tests that outputs are identical with and w/o preemtions (recompute) + assert dtype == "float" + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_model.model.llm_engine.scheduler[ + 0].ENABLE_ARTIFICIAL_PREEMPT = True + preempt_vllm_outputs = vllm_model.generate_greedy( + example_prompts, max_tokens) + + vllm_model.model.llm_engine.scheduler[ + 0].ENABLE_ARTIFICIAL_PREEMPT = False + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=preempt_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="vllm_preepmtions", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + # This test is for verifying that the Mamba inner state management doesn't + # collapse in case where the number of incoming requests and + # finished_requests_ids is larger than the maximum Mamba block capacity. + # This could generally happen due to the fact that Mamba does support + # statelessness mechanism where it can cleanup new incoming requests in + # a single step. + try: + with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 100, 10) + except ValueError: + pytest.fail("Mamba inner state wasn't cleaned up properly between" + "steps finished requests registered unnecessarily ") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_state_cleanup( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + # This test is for verifying that the Mamba state is cleaned up between + # steps, If its not cleaned, an error would be expected. + try: + with vllm_runner(model, dtype=dtype) as vllm_model: + for _ in range(10): + vllm_model.generate_greedy([example_prompts[0]] * 100, 1) + except ValueError: + pytest.fail("Mamba inner state wasn't cleaned up between states, " + "could be related to finished_requests_ids") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py new file mode 100644 index 0000000000000..99c68a863f599 --- /dev/null +++ b/vllm/attention/backends/placeholder_attn.py @@ -0,0 +1,324 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, List, Optional, Tuple, Type + +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, + AttentionMetadataBuilder) +from vllm.attention.backends.utils import CommonAttentionState + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUBuilder + +# Placeholder attention backend for models like Mamba and embedding models that +# lack attention. + + +class PlaceholderAttentionBackend(AttentionBackend): + """Placeholder backend for when no attention is needed.""" + + @staticmethod + def get_name() -> str: + return "placeholder-attn" + + @staticmethod + def get_impl_cls() -> Type["PlaceholderAttentionImpl"]: + return PlaceholderAttentionImpl + + @staticmethod + def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]: + return PlaceholderAttentionMetadataBuilder + + @staticmethod + def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]: + return PlaceholderAttentionMetadata + + @staticmethod + def get_state_cls() -> Type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (1, 1, 1, 1, 1) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + return + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + return + + +@dataclass +class PlaceholderAttentionMetadata(AttentionMetadata): + """Attention metadata for prefill and decode batched together.""" + # (batch_size,). The sequence length per sequence. Sequence length means + # the computed tokens + new tokens None if it is a decoding. + seq_lens: Optional[List[int]] + # seq_lens stored as a tensor. + seq_lens_tensor: Optional[torch.Tensor] + + # Maximum query length in the batch. + max_query_len: Optional[int] + + # Number of query tokens for each request in the batch. + # Currently, we require that all requests have the same number of query + # tokens during the decoding phase. When speculavie decoding is enabled, + # decode_query_len might be greater than 1. In all other cases, it is 1. + decode_query_len: Optional[int] + + # Maximum sequence length among prefill batch. 0 if there are decoding + # requests only. + max_prefill_seq_len: int + # Maximum sequence length among decode batch. 0 if there are prefill + # requests only. + max_decode_seq_len: int + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + query_start_loc: Optional[torch.Tensor] + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] + # (batch_size,) A tensor of context lengths (tokens that are computed + # so far). + context_lens_tensor: Optional[torch.Tensor] + + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. 
(Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + + # Whether or not if cuda graph is enabled. + # Cuda-graph is currently enabled for decoding only. + # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. + use_cuda_graph: bool + + _cached_prefill_metadata: Optional["PlaceholderAttentionMetadata"] = None + _cached_decode_metadata: Optional["PlaceholderAttentionMetadata"] = None + + @property + def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: + if self.num_prefills == 0: + return None + + if self._cached_prefill_metadata is not None: + return self._cached_prefill_metadata + + assert self.seq_lens is not None + assert self.seq_lens_tensor is not None + assert self.query_start_loc is not None + assert self.context_lens_tensor is not None + assert self.seq_start_loc is not None + + # Placeholders + slot_mapping = torch.empty(0) + block_tables = torch.empty(0) + + self._cached_prefill_metadata = PlaceholderAttentionMetadata( + num_prefills=self.num_prefills, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=0, + slot_mapping=slot_mapping, + seq_lens=self.seq_lens[:self.num_prefills], + seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], + decode_query_len=0, + max_query_len=self.max_query_len, + max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_seq_len=0, + query_start_loc=self.query_start_loc[:self.num_prefills + 1], + seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], + context_lens_tensor=self.context_lens_tensor[:self.num_prefills], + block_tables=block_tables, + use_cuda_graph=False, + ) + return self._cached_prefill_metadata + + @property + def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: + if self.num_decode_tokens == 0: + return None + + if self._cached_decode_metadata is not None: + return self._cached_decode_metadata + assert self.seq_lens_tensor is not None + + # Placeholders + slot_mapping = torch.empty(0) + block_tables = torch.empty(0) + + self._cached_decode_metadata = PlaceholderAttentionMetadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=self.num_decode_tokens, + slot_mapping=slot_mapping, + seq_lens=None, + seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], + decode_query_len=self.decode_query_len, + max_query_len=None, + max_prefill_seq_len=0, + max_decode_seq_len=self.max_decode_seq_len, + query_start_loc=None, + seq_start_loc=None, + context_lens_tensor=None, + block_tables=block_tables, + use_cuda_graph=self.use_cuda_graph, + ) + return self._cached_decode_metadata + + +class PlaceholderAttentionMetadataBuilder( + AttentionMetadataBuilder[PlaceholderAttentionMetadata]): + + def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.prefill_seq_lens: List[int] = [] + self.context_lens: List[int] = [] + self.curr_seq_lens: List[int] = [] + self.num_prefills = 0 + self.num_prefill_tokens = 0 + self.num_decode_tokens = 0 + + self.input_builder = input_builder + self.runner = input_builder.runner + + def _add_seq_group( + self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", + chunked_prefill_enabled: bool): + """Add a sequence group to the metadata. Specifically update/append + 1. context length. 
+ """ + is_prompt = inter_data.is_prompt + + for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, + curr_sliding_window_block) in zip( + inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], + inter_data.orig_seq_lens, inter_data.seq_lens, + inter_data.query_lens, inter_data.context_lens, + inter_data.curr_sliding_window_blocks): + self.context_lens.append(context_len) + + if is_prompt: + self.num_prefills += 1 + self.num_prefill_tokens += token_len + self.prefill_seq_lens.append(seq_len) + else: + assert query_len == 1, ( + "seq_len: {}, context_len: {}, query_len: {}".format( + seq_len, context_len, query_len)) + self.num_decode_tokens += query_len + self.curr_seq_lens.append(curr_seq_len) + + def build(self, seq_lens: List[int], query_lens: List[int], + cuda_graph_pad_size: int, batch_size: int): + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ + for inter_data in self.input_builder.inter_data_list: + self._add_seq_group(inter_data, + self.input_builder.chunked_prefill_enabled) + + device = self.runner.device + use_captured_graph = cuda_graph_pad_size != -1 + + logits_soft_cap = getattr(self.runner.model_config.hf_config, + "attn_logit_softcapping", None) + if logits_soft_cap is not None: + raise ValueError( + "Please use Flashinfer backend for models with logits_soft_cap" + " (i.e., Gemma-2). Otherwise, the output might be wrong." + " Set Flashinfer backend by " + "export VLLM_ATTENTION_BACKEND=FLASHINFER.") + + max_query_len = max(query_lens) + decode_query_lens = query_lens[self.num_prefills:] + if len(decode_query_lens) > 0: + decode_query_len = max(decode_query_lens) + else: + decode_query_len = 1 + max_prefill_seq_len = max(self.prefill_seq_lens, default=0) + max_decode_seq_len = max(self.curr_seq_lens, default=0) + num_decode_tokens = self.num_decode_tokens + + if use_captured_graph: + num_decode_tokens = batch_size + + assert max_query_len > 0, ("query_lens: {}".format(query_lens)) + + context_lens_tensor = torch.tensor(self.context_lens, + dtype=torch.int, + device=device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=device) + query_lens_tensor = torch.tensor(query_lens, + dtype=torch.long, + device=device) + query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=device) + seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=device) + torch.cumsum(seq_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + torch.cumsum(query_lens_tensor, + dim=0, + dtype=query_start_loc.dtype, + out=query_start_loc[1:]) + + # Placeholders + slot_mapping = torch.empty(0) + block_tables = torch.empty(0) + + return PlaceholderAttentionMetadata( + num_prefills=self.num_prefills, + slot_mapping=slot_mapping, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=max_query_len, + decode_query_len=decode_query_len, + max_prefill_seq_len=max_prefill_seq_len, + max_decode_seq_len=max_decode_seq_len, + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=use_captured_graph, + ) + + +class 
PlaceholderAttentionImpl(AttentionImpl): + + def __init__(self, *args, **kwargs) -> None: + return + + def forward(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index ecf964fa49d9b..0112f49876996 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -42,10 +42,12 @@ def __init__( kv_cache_dtype = cache_config.cache_dtype block_size = cache_config.block_size sliding_window = cache_config.sliding_window + is_attention_free = cache_config.is_attention_free else: kv_cache_dtype = "auto" block_size = 16 sliding_window = None + is_attention_free = False if num_kv_heads is None: num_kv_heads = num_heads @@ -76,9 +78,9 @@ def __init__( # During model initialization, the default dtype is set as the model # weight and activation dtype. dtype = torch.get_default_dtype() - attn_backend = get_attn_backend(num_heads, head_size, num_kv_heads, - sliding_window, dtype, kv_cache_dtype, - block_size, blocksparse_params + attn_backend = get_attn_backend(head_size, sliding_window, dtype, + kv_cache_dtype, block_size, + is_attention_free, blocksparse_params is not None) impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 30aa7cb311afb..7edb7676ea2cd 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -24,6 +24,7 @@ class _Backend(enum.Enum): FLASHINFER = enum.auto() PALLAS = enum.auto() IPEX = enum.auto() + NO_ATTENTION = enum.auto() def backend_name_to_enum(backend_name: str) -> _Backend: @@ -88,13 +89,12 @@ def get_global_forced_attn_backend() -> Optional[_Backend]: @lru_cache(maxsize=None) def get_attn_backend( - num_heads: int, head_size: int, - num_kv_heads: int, sliding_window: Optional[int], dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, + is_attention_free: bool, is_blocksparse: bool = False, ) -> Type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" @@ -105,9 +105,8 @@ def get_attn_backend( BlocksparseFlashAttentionBackend) return BlocksparseFlashAttentionBackend - backend = which_attn_to_use(num_heads, head_size, num_kv_heads, - sliding_window, dtype, kv_cache_dtype, - block_size) + backend = which_attn_to_use(head_size, sliding_window, dtype, + kv_cache_dtype, block_size, is_attention_free) if backend == _Backend.FLASH_ATTN: from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) @@ -146,23 +145,31 @@ def get_attn_backend( logger.info("Using Pallas backend.") from vllm.attention.backends.pallas import PallasAttentionBackend return PallasAttentionBackend + elif backend == _Backend.NO_ATTENTION: + from vllm.attention.backends.placeholder_attn import ( + PlaceholderAttentionBackend) + return PlaceholderAttentionBackend else: raise ValueError("Invalid attention backend.") def which_attn_to_use( - num_heads: int, head_size: int, - num_kv_heads: int, sliding_window: Optional[int], dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, + is_attention_free: bool, ) -> _Backend: """Returns which flash attention backend to use.""" # Default case. selected_backend = _Backend.FLASH_ATTN + # If there are no attention layers (e.g. we are running Mamba), + # use the placeholder NO_ATTENTION + if is_attention_free: + return _Backend.NO_ATTENTION + # Check whether a particular choice of backend was # previously forced. 
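    # Illustrative example of the new attention-free path (a sketch that
    # mirrors the case added in tests/kernels/test_attention_selector.py):
    #
    #   backend = which_attn_to_use(16, None, torch.float16, torch.float16,
    #                               16, True)
    #   assert backend == _Backend.NO_ATTENTION
    #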
# diff --git a/vllm/config.py b/vllm/config.py index 91ba45798b4ba..f964928aa0a68 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -196,6 +196,9 @@ def __init__(self, if not self.skip_tokenizer_init: self._verify_tokenizer_mode() + self.is_attention_free = self._init_attention_free() + self.has_inner_state = self._init_has_inner_state() + self.override_neuron_config = override_neuron_config if is_neuron( ) else None self._verify_embedding_mode() @@ -216,6 +219,14 @@ def _init_multimodal_config( return None + def _init_attention_free(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.is_attention_free_model(architectures) + + def _init_has_inner_state(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.model_has_inner_state(architectures) + def _verify_tokenizer_mode(self) -> None: tokenizer_mode = self.tokenizer_mode.lower() if tokenizer_mode not in ["auto", "slow", "mistral"]: @@ -438,6 +449,10 @@ def get_head_size(self) -> int: # FlashAttention supports only head_size 32, 64, 128, 256, # we need to pad head_size 192 to 256 return 256 + + if self.is_attention_free: + return 0 + if hasattr(self.hf_text_config, "head_dim"): return self.hf_text_config.head_dim # FIXME(woosuk): This may not be true for all models. @@ -469,6 +484,9 @@ def get_total_num_kv_heads(self) -> int: return getattr(self.hf_config.attn_config, "kv_n_heads", self.hf_config.num_attention_heads) + if self.is_attention_free: + return 0 + attributes = [ # For Falcon: "n_head_kv", @@ -511,31 +529,17 @@ def get_num_layers(self, parallel_config: "ParallelConfig") -> int: start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) return end - start - def contains_seqlen_agnostic_layers( - self, parallel_config: "ParallelConfig") -> bool: - """True for Mamba/SSM models (Jamba)""" - return self._get_num_seqlen_agnostic_layers(parallel_config) > 0 + def get_num_attention_layers(self, + parallel_config: "ParallelConfig") -> int: + if self.is_attention_free: + return 0 - def get_layers_block_type(self, - parallel_config: "ParallelConfig") -> List[str]: num_layers = self.get_num_layers(parallel_config) - # Transformers supports layers_block_type @property - return getattr(self.hf_config, "layers_block_type", - ["attention"] * num_layers) - def get_num_attention_layers(self, - parallel_config: "ParallelConfig") -> int: - return len([ - t for t in self.get_layers_block_type(parallel_config) - if t == "attention" - ]) - - def _get_num_seqlen_agnostic_layers( - self, parallel_config: "ParallelConfig") -> int: - return len([ - t for t in self.get_layers_block_type(parallel_config) - if t != "attention" - ]) + # Transformers supports layers_block_type @property + layers = getattr(self.hf_config, "layers_block_type", + ["attention"] * num_layers) + return len([t for t in layers if t == "attention"]) def get_multimodal_config(self) -> "MultiModalConfig": """ @@ -585,6 +589,7 @@ def __init__( gpu_memory_utilization: float, swap_space: float, cache_dtype: str, + is_attention_free: bool = False, num_gpu_blocks_override: Optional[int] = None, sliding_window: Optional[int] = None, enable_prefix_caching: bool = False, @@ -595,6 +600,7 @@ def __init__( self.swap_space_bytes = swap_space * GiB_bytes self.num_gpu_blocks_override = num_gpu_blocks_override self.cache_dtype = cache_dtype + self.is_attention_free = is_attention_free self.sliding_window = sliding_window self.enable_prefix_caching = enable_prefix_caching self.cpu_offload_gb = 
cpu_offload_gb diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 6346711587301..9e1d1b02f6805 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -36,10 +36,10 @@ def get_block_space_manager_class(version: str): from vllm.core.block_manager_v2 import BlockSpaceManagerV2 return BlockSpaceManagerV2 - if version == "embedding": - from vllm.core.embedding_model_block_manager import ( - EmbeddingModelBlockSpaceManager) - return EmbeddingModelBlockSpaceManager + if version == "placeholder": + from vllm.core.placeholder_block_space_manager import ( + PlaceholderBlockSpaceManager) + return PlaceholderBlockSpaceManager raise ValueError(f"Unknown version {version=}") diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/placeholder_block_space_manager.py similarity index 90% rename from vllm/core/embedding_model_block_manager.py rename to vllm/core/placeholder_block_space_manager.py index 476e043ecc52d..a337392bbed53 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -5,9 +5,10 @@ from vllm.utils import Device -class EmbeddingModelBlockSpaceManager(BlockSpaceManager): - """An embedding version of BlockSpaceManager for use in environments - with embedding models where block management is not required. +class PlaceholderBlockSpaceManager(BlockSpaceManager): + """A version of BlockSpaceManager for use in environments + where block management is not required. + For example: embedding models or attention-free models like Mamba. This class provides the same interface as BlockSpaceManager, but its methods perform no actions or return simple values like True in specific @@ -40,7 +41,7 @@ def append_slots( seq: Sequence, num_lookahead_slots: int, ) -> List[Tuple[int, int]]: - return None # type: ignore + return [] def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: pass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 2d7a27d1377e4..1f0a121711db5 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -314,8 +314,9 @@ def __init__( version = "v1" if self.scheduler_config.use_v2_block_manager: version = "v2" - if self.scheduler_config.embedding_mode: - version = "embedding" + if (self.scheduler_config.embedding_mode + or self.cache_config.is_attention_free): + version = "placeholder" BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class( version) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index efdcec4ab797a..bdfecabf96f2c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -912,6 +912,7 @@ def create_engine_config(self) -> EngineConfig: gpu_memory_utilization=self.gpu_memory_utilization, swap_space=self.swap_space, cache_dtype=self.kv_cache_dtype, + is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, sliding_window=model_config.get_sliding_window(), enable_prefix_caching=self.enable_prefix_caching, @@ -945,13 +946,9 @@ def create_engine_config(self) -> EngineConfig: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None - has_seqlen_agnostic_layers = ( - model_config.contains_seqlen_agnostic_layers( - parallel_config)) if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora - and not self.enable_prompt_adapter - and not has_seqlen_agnostic_layers): + and not self.enable_prompt_adapter): self.enable_chunked_prefill = True logger.warning( "Chunked 
prefill is enabled by default for models with " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 5051d45dd1154..1e2857ee28cbf 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -6,7 +6,8 @@ import os import tempfile from collections import defaultdict -from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Union +from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional, + Tuple, Union) import filelock import gguf @@ -559,6 +560,38 @@ def row_parallel_weight_loader(param: torch.Tensor, return default_weight_loader(param, loaded_weight) +LoaderFunction = Callable[[torch.Tensor, torch.Tensor], torch.Tensor] + + +def sharded_weight_loader(shard_axis: int) -> LoaderFunction: + """Create a weight loader that shards the weights along the given axis""" + + def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + tp_rank = get_tensor_model_parallel_rank() + + shard_size = param.data.shape[shard_axis] + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(shard_axis, start_idx, shard_size) + + return default_weight_loader(param, loaded_weight) + + return loader + + +def composed_weight_loader( + loader: LoaderFunction, fn: Callable[[torch.Tensor], + torch.Tensor]) -> LoaderFunction: + """Create a weight loader that post-processes the weights after loading""" + + def composed_loader(param: torch.Tensor, + loaded_weight: torch.Tensor) -> None: + loader(param, loaded_weight) + param.data.copy_(fn(param)) + return + + return composed_loader + + def initialize_dummy_weights( model: torch.nn.Module, low: float = -1e-3, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 278dfc52078ef..dcead65115132 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -271,7 +271,7 @@ class HasInnerState(Protocol): """ A flag that indicates this model has inner state. Models that has inner state usually need access to the scheduler_config - for max_num_seqs ,etc... (Currently only used by Jamba) + for max_num_seqs, etc. True for e.g. both Mamba and Jamba. """ def __init__(self, @@ -307,3 +307,46 @@ def has_inner_state( return isinstance(model, _HasInnerStateType) return isinstance(model, HasInnerState) + + +@runtime_checkable +class IsAttentionFree(Protocol): + """The interface required for all models like Mamba that lack attention, + but do have state whose size is constant wrt the number of tokens.""" + + is_attention_free: ClassVar[Literal[True]] = True + """ + A flag that indicates this model has no attention. + Used for block manager and attention backend selection. + True for Mamba but not Jamba. + """ + + def __init__(self) -> None: + ... + + +@runtime_checkable +class _IsAttentionFreeType(Protocol): + is_attention_free: ClassVar[Literal[True]] + + def __init__(self) -> None: + ... + + +@overload +def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: + ... + + +@overload +def is_attention_free(model: Type[object]) -> TypeIs[Type[IsAttentionFree]]: + ... 
+ + +def is_attention_free( + model: Union[Type[object], object] +) -> Union[TypeIs[Type[IsAttentionFree]], TypeIs[IsAttentionFree]]: + if isinstance(model, type): + return isinstance(model, _IsAttentionFreeType) + + return isinstance(model, IsAttentionFree) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 06ec324b3e108..ac251b88e872c 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,18 +1,16 @@ # coding=utf-8 """Inference-only Jamba model.""" from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Tuple import torch from torch import nn -from torch.nn.parameter import Parameter from transformers import JambaConfig from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -29,7 +27,9 @@ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + composed_weight_loader, default_weight_loader, sharded_weight_loader) +from vllm.model_executor.models.mamba_cache import MambaCacheManager from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import IntermediateTensors @@ -99,16 +99,6 @@ def __init__(self, config: JambaConfig, layer_idx): bias=True, skip_bias_add=True) - def weight_loader(param: Parameter, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - param.data.copy_( - loaded_weight.data.split(loaded_weight.shape[0] // tp_size, - dim=0)[tp_rank]) - - def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): - weight_loader(param, -torch.exp(loaded_weight.float())) - tp_size = get_tensor_model_parallel_world_size() self.A = nn.Parameter( torch.empty( @@ -118,8 +108,10 @@ def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): )) self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size)) - set_weight_attrs(self.D, {"weight_loader": weight_loader}) - set_weight_attrs(self.A, {"weight_loader": A_weight_loader}) + set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) + a_weight_loader = composed_weight_loader( + sharded_weight_loader(0), lambda x: -torch.exp(x.float())) + set_weight_attrs(self.A, {"weight_loader": a_weight_loader}) self.out_proj = RowParallelLinear( self.intermediate_size, @@ -571,10 +563,8 @@ def __init__( if not lora_config else lora_config.lora_vocab_padding_size, ) # Used to track and store by the Mamba cache between steps. 
- self.mamba_cache: Tuple[torch.Tensor, torch.Tensor] = tuple() - # Maps between the request id and a dict that maps between the seq_id - # and its index inside the self.mamba_cache - self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {} + self.mamba_cache: Optional[MambaCacheManager] = None + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) self.sampler = Sampler() @@ -586,203 +576,36 @@ def forward(self, attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, **kwargs): - if not self.mamba_cache: - self._prepare_mamba_cache() - - if "seqlen_agnostic_capture_inputs" not in kwargs: - # We get here only on Prefill/Eager mode runs - request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] - finished_requests_ids = kwargs["finished_requests_ids"] - mamba_cache = self._release_finished_and_prepare_mamba_cache( - finished_requests_ids, request_ids_to_seq_ids) - else: - # CUDA graph capturing runs - mamba_cache = kwargs["seqlen_agnostic_capture_inputs"] - - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, mamba_cache[0], - mamba_cache[1]) - return hidden_states - - def _swap_mamba_cache(self, from_index: int, to_index: int): - assert len(self.mamba_cache) > 0 - for cache_t in self.mamba_cache: - cache_t[:, [to_index,from_index]] = \ - cache_t[:, [from_index,to_index]] - - def _copy_mamba_cache(self, from_index: int, to_index: int): - assert len(self.mamba_cache) > 0 - for cache_t in self.mamba_cache: - cache_t[:, to_index].copy_(cache_t[:, from_index], - non_blocking=True) - - def _move_out_if_already_occupied(self, index: int, - all_occupied_indices: List[int]): - if index in all_occupied_indices: - first_free_index = self._first_free_index_in_mamba_cache() - # In case occupied, move the occupied to a new empty block - self._move_cache_index_and_mappings(from_index=index, - to_index=first_free_index) - - def _assign_seq_id_to_mamba_cache_in_specific_dest(self, cur_rid: str, - seq_id: int, - destination_index: int): - """ - Assign (req_id,seq_id) pair to a `destination_index` index, if - already occupied, move the occupying index to a free index. 
- """ - all_occupied_indices = self._get_all_occupied_indices() - if cur_rid not in self.mamba_cache_indices_mapping: - self._move_out_if_already_occupied( - index=destination_index, - all_occupied_indices=all_occupied_indices) - self.mamba_cache_indices_mapping[cur_rid] = { - seq_id: destination_index - } - elif seq_id not in (seq_ids2indices := - self.mamba_cache_indices_mapping[cur_rid]): - # parallel sampling , where n > 1, assume prefill have - # already happened now we only need to copy the already - # existing cache into the siblings seq_ids caches - self._move_out_if_already_occupied( - index=destination_index, - all_occupied_indices=all_occupied_indices) - index_exists = list(seq_ids2indices.values())[0] - # case of decoding n>1, copy prefill cache to decoding indices - self._copy_mamba_cache(from_index=index_exists, - to_index=destination_index) - self.mamba_cache_indices_mapping[cur_rid][ - seq_id] = destination_index - else: - # already exists - cache_index_already_exists = self.mamba_cache_indices_mapping[ - cur_rid][seq_id] - if cache_index_already_exists != destination_index: - # In case the seq id already exists but not in - # the right destination, swap it with what's occupying it - self._swap_pair_indices_and_mappings( - from_index=cache_index_already_exists, - to_index=destination_index) - - def _prepare_current_run_mamba_cache( - self, request_ids_to_seq_ids: Dict[str, list[int]], - finished_requests_ids: List[str] - ) -> Tuple[torch.Tensor, torch.Tensor]: - running_indices = [] - request_ids_to_seq_ids_flatten = [ - (req_id, seq_id) - for req_id, seq_ids in request_ids_to_seq_ids.items() - for seq_id in seq_ids - ] - batch_size = len(request_ids_to_seq_ids_flatten) - for dest_index, (request_id, - seq_id) in enumerate(request_ids_to_seq_ids_flatten): - if request_id in finished_requests_ids: - # Do not allocate cache index for requests that run - # and finish right after - continue - self._assign_seq_id_to_mamba_cache_in_specific_dest( - request_id, seq_id, dest_index) - running_indices.append(dest_index) + if self.mamba_cache is None: + max_batch_size = (_get_graph_batch_size( + self.scheduler_config.max_num_seqs) if self.scheduler_config + else max(_BATCH_SIZES_TO_CAPTURE) + 2) - self._clean_up_first_bs_blocks(batch_size, running_indices) - conv_state = self.mamba_cache[0][:, :batch_size] - temporal_state = self.mamba_cache[1][:, :batch_size] + layers_type = self.config.layers_block_type + num_mamba_layers = sum( + [layer_type == "mamba" for layer_type in layers_type]) - return (conv_state, temporal_state) + self.mamba_cache = MambaCacheManager( + self.lm_head.weight.dtype, num_mamba_layers, max_batch_size, + *self._get_mamba_cache_shape()) - def _get_all_occupied_indices(self): - return [ - cache_idx - for seq_ids2indices in self.mamba_cache_indices_mapping.values() - for cache_idx in seq_ids2indices.values() - ] + mamba_cache_tensors = self.mamba_cache.current_run_tensors( + input_ids, attn_metadata, **kwargs) - def _clean_up_first_bs_blocks(self, batch_size: int, - indices_for_current_run: List[int]): - # move out all of the occupied but currently not running blocks - # outside of the first n blocks - destination_indices = range(batch_size) - max_possible_batch_size = self.mamba_cache[0].shape[1] - for destination_index in destination_indices: - if destination_index in self._get_all_occupied_indices() and \ - destination_index not in indices_for_current_run: - # move not running indices outside of the batch - all_other_indices = list( - range(batch_size, 
max_possible_batch_size)) - first_avail_index = self._first_free_index_in_mamba_cache( - all_other_indices) - self._swap_indices(from_index=destination_index, - to_index=first_avail_index) - - def _move_cache_index_and_mappings(self, from_index: int, to_index: int): - self._copy_mamba_cache(from_index=from_index, to_index=to_index) - self._update_mapping_index(from_index=from_index, to_index=to_index) - - def _swap_pair_indices_and_mappings(self, from_index: int, to_index: int): - self._swap_mamba_cache(from_index=from_index, to_index=to_index) - self._swap_mapping_index(from_index=from_index, to_index=to_index) - - def _swap_mapping_index(self, from_index: int, to_index: int): - for seq_ids2index in self.mamba_cache_indices_mapping.values(): - for seq_id, index in seq_ids2index.items(): - if from_index == index: - seq_ids2index.update({seq_id: to_index}) - elif to_index == index: - seq_ids2index.update({seq_id: from_index}) - - def _update_mapping_index(self, from_index: int, to_index: int): - for seq_ids2index in self.mamba_cache_indices_mapping.values(): - for seq_id, index in seq_ids2index.items(): - if from_index == index: - seq_ids2index.update({seq_id: to_index}) - return - - def _release_finished_and_prepare_mamba_cache( - self, finished_requests_ids, - request_ids_to_seq_ids) -> Tuple[torch.Tensor, torch.Tensor]: - self._release_mamba_cache(finished_requests_ids) - return self._prepare_current_run_mamba_cache(request_ids_to_seq_ids, - finished_requests_ids) + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, mamba_cache_tensors[0], + mamba_cache_tensors[1]) + return hidden_states def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - """ - Copy the relevant Mamba cache into the CUDA graph input buffer - that was provided during the capture runs - (JambaForCausalLM.mamba_gc_cache_buffer). - """ - self._release_finished_and_prepare_mamba_cache( - kwargs["finished_requests_ids"], kwargs["request_ids_to_seq_ids"]) + return self.mamba_cache.copy_inputs_before_cuda_graphs( + input_buffers, **kwargs) def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - """ - Provide the CUDA graph capture runs with a buffer in adjusted size. - The buffer is used to maintain the Mamba Cache during the CUDA graph - replay runs. - """ - return tuple(buffer[:, :batch_size] for buffer in self.mamba_cache) - - def _release_mamba_cache(self, finished_seq_groups_req_ids: List[str]): - for req_id in finished_seq_groups_req_ids: - if req_id in self.mamba_cache_indices_mapping: - self.mamba_cache_indices_mapping.pop(req_id) - - def _first_free_index_in_mamba_cache( - self, indices_range: Optional[List[int]] = None) -> int: - assert self.mamba_cache is not None - if indices_range is None: - max_possible_batch_size = self.mamba_cache[0].shape[1] - indices_range = list(range(max_possible_batch_size)) - all_occupied_indices = self._get_all_occupied_indices() - for i in indices_range: - if i not in all_occupied_indices: - return i - raise Exception("Couldn't find a free spot in the mamba cache! 
This" - "should never happen") + return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self - ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]: + self) -> Tuple[Tuple[int, int], Tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size conv_state_shape = ( @@ -790,31 +613,11 @@ def _get_mamba_cache_shape( self.config.mamba_d_conv - 1, ) temporal_state_shape = ( - self.config.mamba_expand * self.config.hidden_size // world_size, + self.config.mamba_expand * hidden_size // world_size, self.config.mamba_d_state, ) return conv_state_shape, temporal_state_shape - def _prepare_mamba_cache(self): - dtype = self.lm_head.weight.dtype - layers_type = self.config.layers_block_type - mamba_layers = sum( - [layer_type == "mamba" for layer_type in layers_type]) - max_batch_size = (_get_graph_batch_size( - self.scheduler_config.max_num_seqs) if self.scheduler_config else - max(_BATCH_SIZES_TO_CAPTURE) + 2) - conv_state_shape, temporal_state_shape = self._get_mamba_cache_shape() - assert conv_state_shape is not None and temporal_state_shape is not None - - self.mamba_cache = (torch.empty(size=(mamba_layers, max_batch_size) + - conv_state_shape, - dtype=dtype, - device="cuda"), - torch.empty(size=(mamba_layers, max_batch_size) + - temporal_state_shape, - dtype=dtype, - device="cuda")) - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py new file mode 100644 index 0000000000000..1112a2181135a --- /dev/null +++ b/vllm/model_executor/models/mamba.py @@ -0,0 +1,499 @@ +# coding=utf-8 +"""PyTorch MAMBA model.""" +from dataclasses import dataclass +from typing import Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers import MambaConfig + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( + causal_conv1d_fn, causal_conv1d_update) +from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( + selective_scan_fn, selective_state_update) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + composed_weight_loader, default_weight_loader, sharded_weight_loader) +from vllm.model_executor.models.interfaces import (HasInnerState, + IsAttentionFree) +from vllm.model_executor.models.mamba_cache import MambaCacheManager +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_weight_attrs +from vllm.sequence import IntermediateTensors +from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, + _get_graph_batch_size) + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +@dataclass +class MambaCacheParams: + is_prompt: bool = False + conv_state: torch.Tensor 
= torch.Tensor() + ssm_state: torch.Tensor = torch.Tensor() + + +# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +class MambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute + the `contextualized_states`. A, D are input independent + (see Mamba paper [1] Section 3.5.2 "Interpretation of A" + for why A isn't selective) ∆, B, C are input-dependent + (this is a key difference between Mamba and the linear time + invariant S4, and is why Mamba is called + **selective** state spaces) + """ + + def __init__(self, config: MambaConfig, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.ssm_state_size = config.state_size + self.conv_kernel_size = config.conv_kernel + self.intermediate_size = config.intermediate_size + self.time_step_rank = int(config.time_step_rank) + + self.conv1d = ColumnParallelLinear( + input_size=self.conv_kernel_size, + output_size=self.intermediate_size, + bias=config.use_conv_bias, + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. + # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + self.in_proj = MergedColumnParallelLinear(self.hidden_size, + [self.intermediate_size] * 2, + bias=config.use_bias) + # selective projection used to make dt, B and C input dependent + self.x_proj = RowParallelLinear( + self.intermediate_size, + self.time_step_rank + self.ssm_state_size * 2, + bias=False, + ) + # time step projection (discretization) - + # In the forward we need to apply dt_proj without the bias, + # as the bias is added in the selective scan kernel. + self.dt_proj = ColumnParallelLinear(self.time_step_rank, + self.intermediate_size, + bias=True, + skip_bias_add=True) + + tp_size = get_tensor_model_parallel_world_size() + self.A = nn.Parameter( + torch.empty( + self.intermediate_size // tp_size, + self.ssm_state_size, + dtype=torch.float32, + )) + self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size)) + + set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) + a_weight_loader = composed_weight_loader( + sharded_weight_loader(0), lambda x: -torch.exp(x.float())) + set_weight_attrs(self.A, {"weight_loader": a_weight_loader}) + + self.out_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=config.use_bias, + input_is_parallel=True, + ) + self.activation = config.hidden_act + + def forward(self, hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, conv_state: torch.Tensor, + ssm_state: torch.Tensor): + + # 1. Gated MLP's linear projection + projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) + hidden_states, gate = projected_states.chunk(2, dim=-2) + + # 2. 
Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), + self.conv1d.weight.size(2)) + + if attn_metadata.query_start_loc is not None \ + and attn_metadata.context_lens_tensor is not None: + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + hidden_states = causal_conv1d_fn( + hidden_states, + conv_weights, + self.conv1d.bias, + activation=self.activation, + conv_states=conv_state, + has_initial_state=attn_metadata.context_lens_tensor > 0, + query_start_loc=attn_metadata.query_start_loc) + else: + hidden_states = causal_conv1d_update( + hidden_states.transpose(0, 1), + conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + ) + hidden_states = hidden_states.transpose(0, 1) + + # 3. State Space Model sequence transformation + # 3.a. input varying initialization of time_step, B and C + ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] + + time_step, B, C = torch.split( + ssm_parameters, + [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], + dim=-1, + ) + + # Note that Jamba normalizes B, C, and time_step here but Mamba doesn't. + + discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = (self.dt_proj.bias.float() if hasattr( + self.dt_proj, "bias") else None) + + if attn_metadata.query_start_loc is not None \ + and attn_metadata.context_lens_tensor is not None: + scan_outputs = selective_scan_fn( + hidden_states, + ssm_state, + discrete_time_step, + self.A, + B.transpose(-2, -1), + C.transpose(-2, -1), + self.D.float(), + gate, + time_proj_bias, + delta_softplus=True, + has_initial_state=attn_metadata.context_lens_tensor > 0, + query_start_loc=attn_metadata.query_start_loc) + else: + scan_outputs = selective_state_update( + ssm_state, + hidden_states.transpose(0, 1), + discrete_time_step.transpose(0, 1), + self.A, + B, + C, + self.D, + gate.transpose(0, 1), + time_proj_bias, + dt_softplus=True, + ) + scan_outputs = scan_outputs.transpose(0, 1) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose(-2, + -1))[0] + return contextualized_states + + +class MambaMLP(nn.Module): + + def __init__( + self, + config: MambaConfig, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + hidden_size = config.hidden_size + intermediate_size = config.intermediate_size + hidden_act = config.hidden_act + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class MambaDecoderLayer(nn.Module): + + def __init__(self, + config: MambaConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + self.layer_idx = layer_idx + self.config = config + self.mixer = MambaMixer(config, layer_idx) + + self.feed_forward = MambaMLP(config, quant_config=quant_config) + self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.pre_ff_layernorm = RMSNorm(config.hidden_size, + eps=config.layer_norm_epsilon) + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + conv_state: torch.Tensor, + ssm_state: torch.Tensor, + **kwargs, + ): + if residual is None: + residual = hidden_states + hidden_states = self.norm(hidden_states) + else: + hidden_states, residual = self.norm(hidden_states, residual) + + hidden_states = self.mixer(hidden_states, attn_metadata, conv_state, + ssm_state) + # Fully Connected + hidden_states, residual = self.pre_ff_layernorm( + hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +class MambaModel(nn.Module): + + def __init__( + self, + config: MambaConfig, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.embeddings = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + + decoder_layers = [] + for i in range(config.num_hidden_layers): + decoder_layers.append( + MambaDecoderLayer(config, + layer_idx=i, + cache_config=cache_config, + quant_config=quant_config)) + self.layers = nn.ModuleList(decoder_layers) + self.norm_f = RMSNorm(config.hidden_size, + eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + conv_state: torch.Tensor, + ssm_state: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.embeddings(input_ids) + residual = None + + for i in range(len(self.layers)): + layer = self.layers[i] + current_ssm_state = ssm_state[i] + current_conv_state = conv_state[i] + + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + residual=residual, + conv_state=current_conv_state, + ssm_state=current_ssm_state, + ) + hidden_states, _ = self.norm_f(hidden_states, residual) + + return hidden_states + + +class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embeddings": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__( + self, + config: MambaConfig, + 
cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + scheduler_config: Optional[SchedulerConfig] = None, + ) -> None: + assert not cache_config.enable_prefix_caching, \ + "Mamba does not support prefix caching" + + super().__init__() + self.config = config + self.scheduler_config = scheduler_config + self.backbone = MambaModel(config, + cache_config=cache_config, + quant_config=quant_config, + lora_config=lora_config) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + + self.lm_head = self.backbone.embeddings + + # Used to track and store by the Mamba cache between steps. + self.mamba_cache: Optional[MambaCacheManager] = None + + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + self.sampler = Sampler() + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs): + if self.mamba_cache is None: + max_batch_size = (_get_graph_batch_size( + self.scheduler_config.max_num_seqs) if self.scheduler_config + else max(_BATCH_SIZES_TO_CAPTURE) + 2) + self.mamba_cache = MambaCacheManager( + self.lm_head.weight.dtype, self.config.num_hidden_layers, + max_batch_size, *self._get_mamba_cache_shape()) + + mamba_cache_tensors = self.mamba_cache.current_run_tensors( + input_ids, attn_metadata, **kwargs) + + hidden_states = self.backbone(input_ids, positions, kv_caches, + attn_metadata, mamba_cache_tensors[0], + mamba_cache_tensors[1]) + + return hidden_states + + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): + return self.mamba_cache.copy_inputs_before_cuda_graphs( + input_buffers, **kwargs) + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) + + def _get_mamba_cache_shape( + self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + world_size = get_tensor_model_parallel_world_size() + conv_state_shape = ( + self.config.intermediate_size // world_size, + self.config.conv_kernel - 1, + ) + temporal_state_shape = ( + self.config.intermediate_size // world_size, + self.config.state_size, + ) + return conv_state_shape, temporal_state_shape + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + if "A_log" in name: + name = name.replace("A_log", "A") + + if ".self_attn." 
in name: + name = name.replace(".self_attn", "") + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py new file mode 100644 index 0000000000000..8d1ba3737d4a5 --- /dev/null +++ b/vllm/model_executor/models/mamba_cache.py @@ -0,0 +1,222 @@ +from typing import Dict, List, Optional + +import torch + +from vllm.attention.backends.abstract import AttentionMetadata + + +class MambaCacheManager: + + def __init__(self, dtype, num_mamba_layers, max_batch_size, + conv_state_shape, temporal_state_shape): + + conv_state = torch.empty(size=(num_mamba_layers, max_batch_size) + + conv_state_shape, + dtype=dtype, + device="cuda") + temporal_state = torch.empty(size=(num_mamba_layers, max_batch_size) + + temporal_state_shape, + dtype=dtype, + device="cuda") + + self.mamba_cache = (conv_state, temporal_state) + + # Maps between the request id and a dict that maps between the seq_id + # and its index inside the self.mamba_cache + self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {} + + def current_run_tensors(self, input_ids: torch.Tensor, + attn_metadata: AttentionMetadata, **kwargs): + """ + Return the tensors for the current run's conv and ssm state. + """ + if "seqlen_agnostic_capture_inputs" not in kwargs: + # We get here only on Prefill/Eager mode runs + request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] + finished_requests_ids = kwargs["finished_requests_ids"] + + self._release_finished_requests(finished_requests_ids) + mamba_cache_tensors = self._prepare_current_run_mamba_cache( + request_ids_to_seq_ids, finished_requests_ids) + + else: + # CUDA graph capturing runs + mamba_cache_tensors = kwargs["seqlen_agnostic_capture_inputs"] + + return mamba_cache_tensors + + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): + """ + Copy the relevant Mamba cache into the CUDA graph input buffer + that was provided during the capture runs + (JambaForCausalLM.mamba_gc_cache_buffer). + """ + assert all( + key in kwargs + for key in ["request_ids_to_seq_ids", "finished_requests_ids"]) + finished_requests_ids = kwargs["finished_requests_ids"] + request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] + + self._release_finished_requests(finished_requests_ids) + self._prepare_current_run_mamba_cache(request_ids_to_seq_ids, + finished_requests_ids) + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + """ + Provide the CUDA graph capture runs with a buffer in adjusted size. + The buffer is used to maintain the Mamba Cache during the CUDA graph + replay runs. 
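+        Each returned tensor is a view of the preallocated cache, sliced to
+        batch_size along the batch dimension, so any state written during
+        graph replay lands directly in the shared Mamba cache. For example,
+        capturing at batch_size 2 yields views over the first two batch
+        slots of each cache tensor.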
+ """ + return tuple(buffer[:, :batch_size] for buffer in self.mamba_cache) + + def _swap_mamba_cache(self, from_index: int, to_index: int): + assert len(self.mamba_cache) > 0 + for cache_t in self.mamba_cache: + cache_t[:, [to_index,from_index]] = \ + cache_t[:, [from_index,to_index]] + + def _copy_mamba_cache(self, from_index: int, to_index: int): + assert len(self.mamba_cache) > 0 + for cache_t in self.mamba_cache: + cache_t[:, to_index].copy_(cache_t[:, from_index], + non_blocking=True) + + def _move_out_if_already_occupied(self, index: int, + all_occupied_indices: List[int]): + if index in all_occupied_indices: + first_free_index = self._first_free_index_in_mamba_cache() + # In case occupied, move the occupied to a new empty block + self._move_cache_index_and_mappings(from_index=index, + to_index=first_free_index) + + def _assign_seq_id_to_mamba_cache_in_specific_dest(self, cur_rid: str, + seq_id: int, + destination_index: int): + """ + Assign (req_id,seq_id) pair to a `destination_index` index, if + already occupied, move the occupying index to a free index. + """ + all_occupied_indices = self._get_all_occupied_indices() + if cur_rid not in self.mamba_cache_indices_mapping: + self._move_out_if_already_occupied( + index=destination_index, + all_occupied_indices=all_occupied_indices) + self.mamba_cache_indices_mapping[cur_rid] = { + seq_id: destination_index + } + elif seq_id not in (seq_ids2indices := + self.mamba_cache_indices_mapping[cur_rid]): + # parallel sampling , where n > 1, assume prefill have + # already happened now we only need to copy the already + # existing cache into the siblings seq_ids caches + self._move_out_if_already_occupied( + index=destination_index, + all_occupied_indices=all_occupied_indices) + index_exists = list(seq_ids2indices.values())[0] + # case of decoding n>1, copy prefill cache to decoding indices + self._copy_mamba_cache(from_index=index_exists, + to_index=destination_index) + self.mamba_cache_indices_mapping[cur_rid][ + seq_id] = destination_index + else: + # already exists + cache_index_already_exists = self.mamba_cache_indices_mapping[ + cur_rid][seq_id] + if cache_index_already_exists != destination_index: + # In case the seq id already exists but not in + # the right destination, swap it with what's occupying it + self._swap_pair_indices_and_mappings( + from_index=cache_index_already_exists, + to_index=destination_index) + + def _prepare_current_run_mamba_cache( + self, request_ids_to_seq_ids: Dict[str, list[int]], + finished_requests_ids: List[str]): + running_indices = [] + request_ids_to_seq_ids_flatten = [ + (req_id, seq_id) + for req_id, seq_ids in request_ids_to_seq_ids.items() + for seq_id in seq_ids + ] + batch_size = len(request_ids_to_seq_ids_flatten) + for dest_index, (request_id, + seq_id) in enumerate(request_ids_to_seq_ids_flatten): + if request_id in finished_requests_ids: + # Do not allocate cache index for requests that run + # and finish right after + continue + self._assign_seq_id_to_mamba_cache_in_specific_dest( + request_id, seq_id, dest_index) + running_indices.append(dest_index) + + self._clean_up_first_bs_blocks(batch_size, running_indices) + conv_state = self.mamba_cache[0][:, :batch_size] + temporal_state = self.mamba_cache[1][:, :batch_size] + + return (conv_state, temporal_state) + + def _get_all_occupied_indices(self): + return [ + cache_idx + for seq_ids2indices in self.mamba_cache_indices_mapping.values() + for cache_idx in seq_ids2indices.values() + ] + + def _clean_up_first_bs_blocks(self, batch_size: int, + 
indices_for_current_run: List[int]): + # move out all of the occupied but currently not running blocks + # outside of the first n blocks + destination_indices = range(batch_size) + max_possible_batch_size = self.mamba_cache[0].shape[1] + for destination_index in destination_indices: + if destination_index in self._get_all_occupied_indices() and \ + destination_index not in indices_for_current_run: + # move not running indices outside of the batch + all_other_indices = list( + range(batch_size, max_possible_batch_size)) + first_avail_index = self._first_free_index_in_mamba_cache( + all_other_indices) + self._swap_indices(from_index=destination_index, + to_index=first_avail_index) + + def _move_cache_index_and_mappings(self, from_index: int, to_index: int): + self._copy_mamba_cache(from_index=from_index, to_index=to_index) + self._update_mapping_index(from_index=from_index, to_index=to_index) + + def _swap_pair_indices_and_mappings(self, from_index: int, to_index: int): + self._swap_mamba_cache(from_index=from_index, to_index=to_index) + self._swap_mapping_index(from_index=from_index, to_index=to_index) + + def _swap_mapping_index(self, from_index: int, to_index: int): + for seq_ids2index in self.mamba_cache_indices_mapping.values(): + for seq_id, index in seq_ids2index.items(): + if from_index == index: + seq_ids2index.update({seq_id: to_index}) + elif to_index == index: + seq_ids2index.update({seq_id: from_index}) + + def _update_mapping_index(self, from_index: int, to_index: int): + for seq_ids2index in self.mamba_cache_indices_mapping.values(): + for seq_id, index in seq_ids2index.items(): + if from_index == index: + seq_ids2index.update({seq_id: to_index}) + return + + def _release_finished_requests(self, + finished_seq_groups_req_ids: List[str]): + for req_id in finished_seq_groups_req_ids: + if req_id in self.mamba_cache_indices_mapping: + self.mamba_cache_indices_mapping.pop(req_id) + + def _first_free_index_in_mamba_cache( + self, indices_range: Optional[List[int]] = None) -> int: + assert self.mamba_cache is not None + if indices_range is None: + max_possible_batch_size = self.mamba_cache[0].shape[1] + indices_range = list(range(max_possible_batch_size)) + all_occupied_indices = self._get_all_occupied_indices() + for i in indices_range: + if i not in all_occupied_indices: + return i + raise Exception("Couldn't find a free spot in the mamba cache! 
This" + "should never happen") diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b37452877cf0c..3c8c600c2c026 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -14,7 +14,8 @@ from vllm.logger import init_logger from vllm.utils import is_hip -from .interfaces import supports_multimodal, supports_pp +from .interfaces import (has_inner_state, is_attention_free, + supports_multimodal, supports_pp) from .interfaces_base import is_embedding_model, is_text_generation_model logger = init_logger(__name__) @@ -52,6 +53,7 @@ "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), + "MambaForCausalLM": ("mamba", "MambaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), @@ -157,6 +159,8 @@ class _ModelInfo: is_embedding_model: bool supports_multimodal: bool supports_pp: bool + has_inner_state: bool + is_attention_free: bool @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": @@ -165,6 +169,8 @@ def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": is_embedding_model=is_embedding_model(model), supports_multimodal=supports_multimodal(model), supports_pp=supports_pp(model), + has_inner_state=has_inner_state(model), + is_attention_free=is_attention_free(model), ) @@ -380,6 +386,14 @@ def is_pp_supported_model( ) -> bool: return self.inspect_model_cls(architectures).supports_pp + def model_has_inner_state(self, architectures: Union[str, + List[str]]) -> bool: + return self.inspect_model_cls(architectures).has_inner_state + + def is_attention_free_model(self, architectures: Union[str, + List[str]]) -> bool: + return self.inspect_model_cls(architectures).is_attention_free + ModelRegistry = _ModelRegistry({ model_arch: _LazyRegisteredModel( diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 252440c7b7e08..090f95e6e892c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -52,15 +52,12 @@ def __init__( self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] # Get attention backend. - self.attn_backend = get_attn_backend( - model_config.get_num_attention_heads(parallel_config), - self.head_size, - self.num_kv_heads, - model_config.get_sliding_window(), - model_config.dtype, - cache_config.cache_dtype, - self.block_size, - ) + self.attn_backend = get_attn_backend(self.head_size, + model_config.get_sliding_window(), + model_config.dtype, + cache_config.cache_dtype, + self.block_size, + model_config.is_attention_free) # Initialize the cache. 
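+        # For attention-free models (e.g. Mamba) the worker computes a zero
+        # cache-block count, so the KV cache allocated below is effectively
+        # empty and per-request state is kept in the model's
+        # MambaCacheManager instead.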
self.gpu_cache = self._allocate_kv_cache( diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index f67b086796411..795511aea6754 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -418,13 +418,12 @@ def __init__( self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Multi-modal data support diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index d6e3670e304d5..b84562851f0f8 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -56,13 +56,12 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig, # Get attention backend. self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, cache_config.cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Initialize the cache. diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 59b4b8c4ddf38..6a00444f5098b 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -196,7 +196,7 @@ def execute_model( seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_seqlen_agnostic else {} + } if self.has_inner_state else {} multi_modal_kwargs = model_input.multi_modal_kwargs or {} with set_forward_context(model_input.attn_metadata): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5bc7100732291..9db3261b8ac36 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -17,7 +17,6 @@ import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState -from vllm.attention.backends.utils import CommonAttentionState from vllm.compilation.compile_context import set_compile_context from vllm.compilation.levels import CompilationLevel from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, @@ -991,8 +990,7 @@ def __init__( self.graph_memory_pool: Optional[Tuple[ int, int]] = None # Set during graph capture. - self.has_seqlen_agnostic = model_config.contains_seqlen_agnostic_layers( - parallel_config) + self.has_inner_state = model_config.has_inner_state # When using CUDA graph, the input block tables must be padded to # max_seq_len_to_capture. 
However, creating the block table in @@ -1003,22 +1001,16 @@ def __init__( self.graph_block_tables = np.zeros( (self.max_batchsize_to_capture, self.get_max_block_per_batch()), dtype=np.int32) - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) self.attn_backend = get_attn_backend( - num_attn_heads, self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, - ) if num_attn_heads else None - if self.attn_backend: - self.attn_state = self.attn_backend.get_state_cls()( - weakref.proxy(self)) - else: - self.attn_state = CommonAttentionState(weakref.proxy(self)) + self.model_config.is_attention_free, + ) + self.attn_state = self.attn_backend.get_state_cls()( + weakref.proxy(self)) # Multi-modal data support self.input_registry = input_registry @@ -1498,7 +1490,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: "previous_hidden_states"] = previous_hidden_states[: batch_size] - if self.has_seqlen_agnostic: + if self.has_inner_state: # Only used by Mamba-based models CUDA graph atm (Jamba) capture_inputs.update({ "seqlen_agnostic_capture_inputs": @@ -1647,7 +1639,7 @@ def execute_model( seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_seqlen_agnostic else {} + } if self.has_inner_state else {} if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_start = torch.cuda.Event(enable_timing=True) @@ -1852,10 +1844,14 @@ def forward( # Copy the input tensors to the input buffers. self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) self.input_buffers["positions"].copy_(positions, non_blocking=True) - self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, - non_blocking=True) + + if self.backend_name != "placeholder-attn": + self.input_buffers["slot_mapping"].copy_( + attn_metadata.slot_mapping, non_blocking=True) + self.attn_state.prepare_graph_input_buffers( self.input_buffers, attn_metadata, self._is_encoder_decoder_model) + if "seqlen_agnostic_capture_inputs" in self.input_buffers: self.model.copy_inputs_before_cuda_graphs(self.input_buffers, **kwargs) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index de3088695dfef..760b18427e22b 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -74,13 +74,12 @@ def __init__( self.block_size = cache_config.block_size self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Multi-modal data support diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 6b818186779b6..24425fece850f 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -70,13 +70,12 @@ def __init__( # Get attention backend. 
self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.head_size, - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Initialize the cache. diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index b3ae5b4a9a0ce..f26d1c8cf7dff 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -113,13 +113,12 @@ def __init__( (self.scheduler_config.max_num_seqs, self.max_num_blocks_per_seq), dtype=np.int32) self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, + self.model_config.is_attention_free, False, ) self.cached_step_outputs: List[torch.Tensor] = [] diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 3851843afc960..ab61e4377f900 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -236,11 +236,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: "not properly cleaned up before initializing the vLLM instance.") cache_block_size = self.get_cache_block_size_bytes() - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) + if cache_block_size == 0: + num_gpu_blocks = 0 + num_cpu_blocks = 0 + else: + num_gpu_blocks = int( + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) if self.model_runner.lora_manager: @@ -257,6 +261,7 @@ def initialize_cache(self, num_gpu_blocks: int, """ raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, + self.cache_config.is_attention_free, self.model_config.max_model_len) self.cache_config.num_gpu_blocks = num_gpu_blocks @@ -472,14 +477,18 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): "`dtype` flag in CLI, for example: --dtype=half.") -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, max_model_len) -> None: - if num_gpu_blocks <= 0: + if is_attention_free and num_gpu_blocks != 0: + raise ValueError("No memory should be allocated for the cache blocks " + f"for an attention-free model, but {num_gpu_blocks}" + "blocks are allocated.") + if not is_attention_free and num_gpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. 
" "Try increasing `gpu_memory_utilization` when " "initializing the engine.") max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: + if not is_attention_free and max_model_len > max_seq_len: raise ValueError( f"The model's max seq len ({max_model_len}) " "is larger than the maximum number of tokens that can be " diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 612428180226a..20dceee849ae5 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -372,13 +372,12 @@ def __init__( self.block_size = cache_config.block_size self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Multi-modal data support From f710090d8e40451879690b6a27b7d3b1a41b53ec Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 11 Oct 2024 08:54:22 -0700 Subject: [PATCH 19/31] [Kernel] adding fused moe kernel config for L40S TP4 (#9245) --- .../E=8,N=3584,device_name=NVIDIA_L40S.json | 173 ++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json new file mode 100644 index 0000000000000..d720deb4bdd73 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json @@ -0,0 +1,173 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 7 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "192": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 8 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "6144": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + } +} \ No newline at end of file From 6cf1167c1a82296d1ad6b841138c91698b8f84b0 Mon Sep 17 00:00:00 2001 From: sixgod Date: Sat, 12 Oct 2024 01:36:13 +0800 Subject: [PATCH 20/31] [Model] Add GLM-4v support and meet vllm==0.6.2 (#9242) --- docs/source/models/supported_models.rst | 6 + examples/offline_inference_vision_language.py | 16 + .../decoder_only/vision_language/test_glm4.py | 133 +++++++ vllm/model_executor/models/chatglm.py | 350 +++++++++++++++--- .../models/glm4_vision_encoder.py | 298 +++++++++++++++ vllm/model_executor/models/registry.py | 6 +- vllm/transformers_utils/tokenizer.py | 39 +- 7 files changed, 776 insertions(+), 72 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_glm4.py create mode 100644 vllm/model_executor/models/glm4_vision_encoder.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index f5d53edcebd35..bf86a72e20b57 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -351,6 +351,12 @@ Text Generation - :code:`adept/fuyu-8b` etc. - - ✅︎ + * - :code:`ChatGLMModel` + - GLM-4V + - Image + - :code:`THUDM/glm-4v-9b` etc. 
+ - + - ✅︎ * - :code:`InternVLChatModel` - InternVL2 - Image\ :sup:`E+` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 5dd539c3d5ee4..8d6818e7dfd3e 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -300,6 +300,21 @@ def run_mllama(question: str, modality: str): return llm, prompt, stop_token_ids +# GLM-4v +def run_glm4v(question: str, modality: str): + assert modality == "image" + model_name = "THUDM/glm-4v-9b" + + llm = LLM(model=model_name, + max_model_len=2048, + max_num_seqs=2, + trust_remote_code=True, + enforce_eager=True) + prompt = question + stop_token_ids = [151329, 151336, 151338] + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -316,6 +331,7 @@ def run_mllama(question: str, modality: str): "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, "mllama": run_mllama, + "glm4v": run_glm4v, } diff --git a/tests/models/decoder_only/vision_language/test_glm4.py b/tests/models/decoder_only/vision_language/test_glm4.py new file mode 100644 index 0000000000000..47922a57f680b --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_glm4.py @@ -0,0 +1,133 @@ +from typing import List, Optional, Tuple, Type + +import pytest + +from vllm.multimodal.utils import rescale_image_size +from vllm.transformers_utils.tokenizer import patch_padding_side + +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test +from ...utils import check_logprobs_close + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "What's the content of the image?", + "cherry_blossom": + "What is the season?", +}) + +models = ["THUDM/glm-4v-9b"] +target_dtype = "bfloat16" + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=2048, + max_num_seqs=2, + dtype=dtype, + limit_mm_per_prompt={"image": mm_limit}, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + stop_token_ids = [151329, 151336, 151338] + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + stop_token_ids=stop_token_ids) + for prompts, images in inputs + ] + + with hf_runner(model, dtype=dtype) as hf_model: + hf_processor = hf_model.processor + patch_padding_side(hf_processor) + + def processor(*args, text="", images=None, **kwargs): + if images is None: + return hf_processor(*args, **kwargs) + + return hf_processor.apply_chat_template( + [{ + "role": "user", + "image": images, + "content": text + }], + add_generation_prompt=True, + tokenize=True, + return_dict=True, + **kwargs, + ) + + hf_model.processor = processor + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.transformer.output_layer + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + ) for prompts, images in inputs + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + 
check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + run_test( + hf_runner, + vllm_runner, + inputs_per_image, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 879795c0d5955..f26c9f950dd36 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,42 +1,229 @@ # coding=utf-8 # Adapted from -# https://github.com/THUDM/ChatGLM2-6B +# https://github.com/THUDM/GLM-4 """Inference-only ChatGLM model compatible with THUDM weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from argparse import Namespace +from array import array +from typing import Dict, Iterable, List, Mapping, Optional, Tuple, TypedDict import torch +from PIL import Image from torch import nn from torch.nn import LayerNorm from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalInputs) +from vllm.multimodal.base import MultiModalData +from vllm.multimodal.utils import cached_get_tokenizer +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, + SequenceData) from vllm.transformers_utils.configs import ChatGLMConfig -from .interfaces import SupportsLoRA, SupportsPP 
-from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) +from .interfaces import SupportsLoRA, SupportsMultiModal + +logger = init_logger(__name__) + + +def calculate_image_placeholder(vision_config): + return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 + + +def mm_input_mapper_for_glmv( + ctx: InputContext, + data: MultiModalData[object], +) -> Dict: + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + if tokenizer is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + try: + raw_batch_data = tokenizer.apply_chat_template( + conversation=[{ + "role": "user", + "image": data + }], + add_generation_prompt=True, + tokenize=True, + return_tensors="pt", + return_dict=True).data + except Exception: + logger.error("Failed to process image (%s)", data) + raise + pixel_values = raw_batch_data['images'] + + return MultiModalInputs({'pixel_values': pixel_values}) + + +def merge_glm_vision_embeddings( + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + vision_embeddings: torch.Tensor, + boi_token_id: int, + eoi_token_id: int, +) -> torch.Tensor: + + boi_positions = (input_ids == boi_token_id).nonzero(as_tuple=True)[0] + eoi_positions = (input_ids == eoi_token_id).nonzero(as_tuple=True)[0] + + mask = torch.zeros_like(input_ids, dtype=torch.bool) + + for boi_pos, eoi_pos in zip(boi_positions, eoi_positions): + assert boi_pos < eoi_pos + mask[boi_pos:eoi_pos + 1] = True + inputs_embeds[mask] = vision_embeddings.view(-1, + vision_embeddings.shape[-1]) + return inputs_embeds + + +class GLMImagePixelInputs(TypedDict): + pixel_values: torch.Tensor + """Shape: `(batch_size, num_channels, height, width)`""" + + +def get_max_glmv_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(ChatGLMConfig) + + vision_config = getattr(hf_config, 'vision_config', None) + if vision_config is None: + return 1 + elif isinstance(vision_config, dict): + return calculate_image_placeholder(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def dummy_data_for_glmv( + ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int] +) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: + hf_config = ctx.get_hf_config(ChatGLMConfig) + vision_config = getattr(hf_config, 'vision_config', None) + + if vision_config is None: + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len) + seq_data = SequenceData(token_ids) + return seq_data, None + elif isinstance(vision_config, dict): + image_size = vision_config["image_size"] + image_placeholder_length = calculate_image_placeholder(vision_config) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.boi_token_id] + + [0] * image_placeholder_length + + [hf_config.eoi_token_id]) + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0] * (seq_len - image_placeholder_length - 2)) + seq_data = SequenceData(token_ids) + + mm_data = { + "image": Image.new("RGB", (image_size, image_size), color=0) + } + + return seq_data, mm_data + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def find_all_positions(input_ids: List[int], target: int) -> List[int]: + return [index for index, value in enumerate(input_ids) if value == target] + + +def input_processor_for_glmv(ctx: InputContext, llm_inputs: LLMInputs): + hf_config = ctx.get_hf_config(ChatGLMConfig) + vision_config = 
getattr(hf_config, 'vision_config', None) + + if vision_config is None: + return llm_inputs + elif isinstance(vision_config, dict): + image_placeholder_length = calculate_image_placeholder(vision_config) + else: + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + input_ids = llm_inputs.get("prompt_token_ids") + position_ids = llm_inputs.get("position_ids") + tokenizer = cached_get_tokenizer( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code) + + try: + raw_batch_data = tokenizer.apply_chat_template( + conversation=[{ + "role": "user", + "image": llm_inputs['multi_modal_data']["image"], + "content": llm_inputs['prompt'] + }], + add_generation_prompt=True, + tokenize=True, + return_tensors="pt", + return_dict=True).data + except Exception: + logger.error("Failed to process content (%s)", llm_inputs['prompt']) + raise + input_ids = raw_batch_data['input_ids'][0].tolist() + + if position_ids is None: + position_ids = list(range(len(input_ids))) + boi_token_id = hf_config.boi_token_id + eoi_token_id = hf_config.eoi_token_id + boi_positions = find_all_positions(input_ids, boi_token_id) + eoi_positions = find_all_positions(input_ids, eoi_token_id) + + assert len(boi_positions) == len(eoi_positions) + + new_input_ids = [] + new_position_ids = [] + final_processed_position = 0 + final_processed_position = 0 + + for boi_position, eoi_position in zip(boi_positions, eoi_positions): + assert boi_position < eoi_position + new_input_ids.extend(input_ids[final_processed_position:boi_position + + 1]) + new_position_ids.extend( + list(range(final_processed_position, boi_position + 1))) + new_input_ids.extend([input_ids[boi_position + 1]] * + image_placeholder_length) + new_position_ids.extend([boi_position + 1] * image_placeholder_length) + final_processed_position = eoi_position + + new_input_ids.extend(input_ids[final_processed_position:]) + new_position_ids.extend( + list(range(final_processed_position, len(input_ids)))) + + assert len(new_input_ids) == len(new_position_ids) + + llm_inputs["prompt_token_ids"] = new_input_ids + llm_inputs["position_ids"] = new_position_ids + return llm_inputs class GLMAttention(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): @@ -127,7 +314,7 @@ class GLMMLP(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -170,7 +357,7 @@ class GLMBlock(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): @@ -241,10 +428,9 @@ class GLMTransformer(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", ): super().__init__() self.post_layer_norm = config.post_layer_norm @@ -253,11 +439,10 @@ def __init__( self.num_layers = config.num_layers # Transformer layers. 
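+        # All GLMBlock layers are instantiated on every rank as a plain
+        # ModuleList; pipeline-parallel partitioning via make_layers is not
+        # used for this model.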
- self.start_layer, self.end_layer, self.layers = make_layers( - self.num_layers, - lambda prefix: GLMBlock(config, cache_config, quant_config), - prefix=f"{prefix}.layers", - ) + self.layers = nn.ModuleList([ + GLMBlock(config, cache_config, quant_config) + for i in range(self.num_layers) + ]) if self.post_layer_norm: layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm @@ -272,16 +457,16 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, ) -> torch.Tensor: - for i in range(self.start_layer, self.end_layer): + for i in range(self.num_layers): layer = self.layers[i] hidden_states = layer( hidden_states=hidden_states, position_ids=position_ids, - kv_cache=kv_caches[i - self.start_layer], + kv_cache=kv_caches[i], attn_metadata=attn_metadata, ) # Final layer norm. - if get_pp_group().is_last_rank and self.post_layer_norm: + if self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) return hidden_states @@ -291,14 +476,17 @@ class ChatGLMModel(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() + self.config = config + self.embedding = VocabParallelEmbedding(config.padded_vocab_size, - config.hidden_size) + config.hidden_size, + quant_config=quant_config) self.num_layers = config.num_layers self.multi_query_group_num = config.multi_query_group_num @@ -308,37 +496,73 @@ def __init__( self.output_layer = ParallelLMHead(config.padded_vocab_size, config.hidden_size, quant_config=quant_config) - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory(["hidden_states"], - config.hidden_size)) + + vision_config_flag = getattr(config, 'vision_config', None) + if vision_config_flag is not None: + self.vision_config = Namespace(**config.vision_config) + self.vision = EVA2CLIPModel(self.config, quant_config) + else: + self.vision = None + + def _parse_and_validate_image_input( + self, **kwargs: object) -> GLMImagePixelInputs: + + pixel_values = kwargs.pop("pixel_values", None) + if pixel_values is not None and self.vision is not None: + if isinstance(pixel_values, torch.Tensor): + if pixel_values.ndim > 2: + pixel_values = torch.concat(list(pixel_values)) + elif isinstance(pixel_values, list): + return torch.concat(pixel_values) + else: + raise TypeError("""pixel_values must be a torch.Tensor + or a list of torch.Tensor + """) + return GLMImagePixelInputs(pixel_values=pixel_values) def forward( self, input_ids: torch.Tensor, - position_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - inputs_embeds = self.embedding(input_ids) - else: - inputs_embeds = intermediate_tensors["hidden_states"] + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> torch.Tensor: + + inputs_embeds = self.embedding(input_ids) + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input["pixel_values"] is not None: + pixel_values = image_input["pixel_values"].to( + dtype=inputs_embeds.dtype) + image_embeds = self.vision(pixel_values) + + boi_token_id = self.config.boi_token_id + eoi_token_id = self.config.eoi_token_id + + inputs_embeds = merge_glm_vision_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + vision_embeddings=image_embeds, + 
boi_token_id=boi_token_id, + eoi_token_id=eoi_token_id) # Run encoder. hidden_states = self.encoder( hidden_states=inputs_embeds, - position_ids=position_ids, + position_ids=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, ) - - if not get_pp_group().is_last_rank: - return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states -class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +@MULTIMODAL_REGISTRY.register_image_input_mapper(mm_input_mapper_for_glmv) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glmv_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_glmv) +@INPUT_REGISTRY.register_input_processor(input_processor_for_glmv) +class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"] @@ -356,6 +580,7 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, config: ChatGLMConfig, + multimodal_config: MultiModalConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, @@ -364,6 +589,7 @@ def __init__( self.config = config self.lora_config = lora_config + self.multimodal_config = multimodal_config self.quant_config = quant_config self.max_position_embeddings = getattr(config, "max_sequence_length", @@ -375,19 +601,16 @@ def __init__( self.lm_head = self.transformer.output_layer self.logits_processor = LogitsProcessor(config.padded_vocab_size) self.sampler = Sampler() - self.make_empty_intermediate_tensors = ( - self.transformer.make_empty_intermediate_tensors) - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, **kwargs) return hidden_states def compute_logits( @@ -408,8 +631,24 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # Merge two ColumnParallelLinear into one MergedColumnParallelLinear + merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { + "transformer.vision.linear_proj.merged_proj.weight": { + "transformer.vision.linear_proj.gate_proj.weight": None, + "transformer.vision.linear_proj.dense_h_to_4h.weight": None, + } + } + params_dict = dict(self.named_parameters(remove_duplicate=False)) for name, loaded_weight in weights: + is_weight_to_be_merge = False + for _, merged_weight_dict in merged_weights_dict.items(): + if name in merged_weight_dict: + assert merged_weight_dict[name] is None + merged_weight_dict[name] = loaded_weight + is_weight_to_be_merge = True + if is_weight_to_be_merge: + continue if "rotary_pos_emb.inv_freq" in name: continue if "word_embeddings" in name: @@ -417,9 +656,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue - if is_pp_missing_parameter(name, self): - continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + + for combined_name, merged_weight_dict in merged_weights_dict.items(): + if combined_name in params_dict: + param = params_dict[combined_name] + combined_weight = torch.cat(list(merged_weight_dict.values()), + dim=0) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, combined_weight) diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py new file mode 100644 index 0000000000000..3213a8b29a104 --- /dev/null +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -0,0 +1,298 @@ +# coding=utf-8 +# Adapted from +# https://github.com/THUDM/GLM-4 +"""Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" +from argparse import Namespace +from typing import Optional + +import torch +from torch import nn +from torch.nn import LayerNorm + +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +class PatchEmbedding(nn.Module): + + def __init__(self, config): + super().__init__() + self.proj = nn.Conv2d(config.in_channels, + config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size) + self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.position_embedding = nn.Embedding(config.num_positions, + config.hidden_size) + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """ + Parameters: + images : torch.Tensor + Input image tensor with shape (B, C, H, W) + + Returns: + torch.Tensor + Transformed tensor with shape (B, L, D) + """ + images = images.to(self.proj.weight.device) + x = self.proj(images) + x = x.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), dim=1) + x += self.position_embedding.weight.unsqueeze(0) + return x + + +class Attention(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_rank = config.num_heads // self.tp_size + self.head_dim = config.hidden_size // config.num_heads + self.scale = self.head_dim**-0.5 + + self.query_key_value = QKVParallelLinear( + config.hidden_size, + self.head_dim, + config.num_heads, + quant_config=quant_config, + ) + self.dense = RowParallelLinear( + config.hidden_size, + config.hidden_size, + quant_config=quant_config, + ) + + self.output_dropout = torch.nn.Dropout(config.dropout_prob) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, L, _ = x.shape + qkv, _ = self.query_key_value(x) # B, L, 3 * H * D + q, k, v = qkv.chunk(3, dim=-1) + q = q.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + k = k.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + v = v.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + + out 
= torch.nn.functional.scaled_dot_product_attention(q, + k, + v, + attn_mask=None, + dropout_p=0., + is_causal=False) + + output, _ = self.dense(out.transpose(1, 2).view(B, L, -1)) + output = self.output_dropout(output) + return output + + +class MLP(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + quant_config=quant_config, + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + quant_config=quant_config, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc1(x) + x = self.activation_fn(x) + x, _ = self.fc2(x) + return x + + +class TransformerLayer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.input_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.attention = Attention(config, quant_config=quant_config) + self.mlp = MLP(config, quant_config=quant_config) + self.post_attention_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states): + attention_input = hidden_states + attention_output = self.input_layernorm( + self.attention(attention_input)) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + +class Transformer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.layers = nn.ModuleList([ + TransformerLayer(config, quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, hidden_states): + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class GLU(nn.Module): + + def __init__( + self, + config, + in_features, + quant_config: Optional[QuantizationConfig] = None, + ): + """ + The original implementation is the same as: + ```python + self.dense_h_to_4h = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + + self.gate_proj = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + ``` + ``` + gate_proj_output, _ = self.gate_proj(x) + dense_h_to_4h_output, _ = self.dense_h_to_4h(x) + x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1) + ``` + + We merge two ColumnParallelLinear into one MergedColumnParallelLinear: + ``` + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, + [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config + ) + ``` + ``` + x, _ = self.merged_proj(x) + ``` + """ + super().__init__() + self.linear_proj = ReplicatedLinear(in_features, + config.hidden_size, + bias=False, + quant_config=quant_config) + self.norm1 = nn.LayerNorm(config.hidden_size) + self.act1 = nn.GELU() + self.act2 = SiluAndMul() + + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config) + + self.dense_4h_to_h = RowParallelLinear(config.ffn_hidden_size, + config.hidden_size, + bias=False, + quant_config=quant_config) + + def forward(self, x): + x, _ = self.linear_proj(x) + x = 
self.act1(self.norm1(x)) + x, _ = self.merged_proj(x) + x = self.act2(x) + x, _ = self.dense_4h_to_h(x) + return x + + +class EVA2CLIPModel(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + vision_config = Namespace(**config.vision_config) + self.patch_embedding = PatchEmbedding(vision_config) + self.transformer = Transformer(vision_config, + quant_config=quant_config) + self.linear_proj = GLU(config, + in_features=config.hidden_size, + quant_config=quant_config) + self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, + out_channels=config.hidden_size, + kernel_size=2, + stride=2) + self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.scaling_factor = vision_config.scaling_factor + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """ + Parameters: + images : torch.Tensor + Input image tensor with shape (B, C, H, W) + + Returns: + torch.Tensor + Transformed tensor with shape (B, L, D) + """ + x = self.patch_embedding(images) + x = self.transformer(x) + x = x[:, 1:] + + b, s, h = x.shape + grid_size = int(s**0.5) + x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2) + x = self.conv(x) + + x = x.flatten(2).transpose(1, 2) + x = self.linear_proj(x) + boi = self.boi.expand(x.shape[0], -1, -1) + eoi = self.eoi.expand(x.shape[0], -1, -1) + x = torch.cat((boi, x, eoi), dim=1) + x = x / self.scaling_factor + return x diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3c8c600c2c026..8caaab9974666 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -29,8 +29,7 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), - "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), + # ChatGLMModel supports multimodal "CohereForCausalLM": ("commandr", "CohereForCausalLM"), "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), @@ -72,6 +71,7 @@ "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), + # QWenLMHeadModel supports multimodal "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), @@ -95,6 +95,8 @@ # [Decoder-only] "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 + "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), + "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "InternVLChatModel": ("internvl", "InternVLChatModel"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 85c339df4a76c..94af2388d79db 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -59,6 +59,26 @@ def __len__(self): return tokenizer +def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None: + """Patch _pad method to 
accept `padding_side` for older tokenizers.""" + orig_pad = tokenizer._pad + + def _pad( + self: PreTrainedTokenizer, + *args, + padding_side: Optional[str] = None, + **kwargs, + ): + if padding_side is not None and padding_side != self.padding_side: + msg = ("`padding_side` argument is not supported by " + f"{type(tokenizer).__name__} and will be ignored.") + warnings.warn(msg, stacklevel=2) + + return orig_pad(*args, **kwargs) + + tokenizer._pad = MethodType(_pad, tokenizer) + + def get_tokenizer( tokenizer_name: Union[str, Path], *args, @@ -143,24 +163,7 @@ def get_tokenizer( if type(tokenizer).__name__ in ("ChatGLMTokenizer", "ChatGLM4Tokenizer"): assert isinstance(tokenizer, PreTrainedTokenizer) - orig_pad = tokenizer._pad - - # Patch _pad method to accept `padding_side` - def _pad( - self: PreTrainedTokenizer, - *args, - padding_side: Optional[str] = None, - **kwargs, - ): - if (padding_side is not None - and padding_side != self.padding_side): - msg = ("`padding_side` argument is not supported by " - "ChatGLMTokenizer and will be ignored.") - warnings.warn(msg, stacklevel=2) - - return orig_pad(*args, **kwargs) - - tokenizer._pad = MethodType(_pad, tokenizer) + patch_padding_side(tokenizer) if not isinstance(tokenizer, PreTrainedTokenizerFast): logger.warning( From 1a1823871d76b9ce54f1c5bc0b61257aa9c53295 Mon Sep 17 00:00:00 2001 From: homeffjy <74026382+homeffjy@users.noreply.github.com> Date: Sat, 12 Oct 2024 02:02:03 +0800 Subject: [PATCH 21/31] [Doc] Remove outdated comment to avoid misunderstanding (#9287) --- vllm/core/block_manager_v2.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index c7ee6609306d7..cb047c832e6cb 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -24,9 +24,8 @@ class BlockSpaceManagerV2(BlockSpaceManager): autoregressively-generated tokens, and other advanced features such as prefix caching, forking/copy-on-write, and sliding-window memory allocation. - The current implementation is partial; in particular prefix caching and - sliding-window are not feature complete. This class implements the design - described in https://github.com/vllm-project/vllm/pull/3492. + This class implements the design described in + https://github.com/vllm-project/vllm/pull/3492. Lookahead slots The block manager has the notion of a "lookahead slot". 
These are slots @@ -190,7 +189,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: assert (request_id not in self.cross_block_tables), \ - "block table already exists" + "block table already exists" check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) From 8baf85e4e9355611532e361a5cd4d458bc8fe1fe Mon Sep 17 00:00:00 2001 From: Wallas Henrique Date: Fri, 11 Oct 2024 15:18:50 -0300 Subject: [PATCH 22/31] [Doc] Compatibility matrix for mutual exclusive features (#8512) Signed-off-by: Wallas Santos --- docs/source/index.rst | 1 + docs/source/models/performance.rst | 2 + docs/source/serving/compatibility_matrix.rst | 427 +++++++++++++++++++ vllm/attention/backends/rocm_flash_attn.py | 2 + vllm/config.py | 10 + vllm/engine/arg_utils.py | 2 + vllm/engine/output_processor/multi_step.py | 2 + vllm/executor/cpu_executor.py | 8 + vllm/inputs/preprocess.py | 2 + vllm/spec_decode/spec_decode_worker.py | 2 + vllm/utils.py | 3 + vllm/worker/multi_step_model_runner.py | 3 + vllm/worker/utils.py | 3 + 13 files changed, 467 insertions(+) create mode 100644 docs/source/serving/compatibility_matrix.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 961373eb71c0b..d20e46b4a3656 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -86,6 +86,7 @@ Documentation serving/usage_stats serving/integrations serving/tensorizer + serving/compatibility_matrix serving/faq .. toctree:: diff --git a/docs/source/models/performance.rst b/docs/source/models/performance.rst index d8750ddc34e8e..23b5ab79a7378 100644 --- a/docs/source/models/performance.rst +++ b/docs/source/models/performance.rst @@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False. +.. _chunked-prefill: + Chunked Prefill --------------- vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst new file mode 100644 index 0000000000000..cac0605ca132b --- /dev/null +++ b/docs/source/serving/compatibility_matrix.rst @@ -0,0 +1,427 @@ +.. _compatibility_matrix: + +Compatibility Matrix +==================== + +The tables below show mutually exclusive features and the support on some hardware. + +.. note:: + + Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. + +Feature x Feature +----------------- + + +.. raw:: html + + + +.. 
list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - :ref:`CP ` + - :ref:`APC ` + - :ref:`LoRA ` + - :abbr:`prmpt adptr (Prompt Adapter)` + - :ref:`SD ` + - CUDA graph + - :abbr:`enc-dec (Encoder-Decoder Models)` + - :abbr:`logP (Logprobs)` + - :abbr:`prmpt logP (Prompt Logprobs)` + - :abbr:`async output (Async Output Processing)` + - multi-step + - :abbr:`MM (Multimodal)` + - best-of + - beam-search + - :abbr:`guided dec (Guided Decoding)` + * - :ref:`CP ` + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`APC ` + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`LoRA ` + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`SD ` + - ✗ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✗ + - `✗ `__ + - ✗ + - ✗ + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + * - multi-step + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - `✗ `__ + - ✅ + - + - + - + - + - + * - :abbr:`MM (Multimodal)` + - `✗ `__ + - `✗ `__ + - `✗ `__ + - ? + - ? + - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? + - + - + - + - + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ✅ + - + - + - + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ? + - ✅ + - + - + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ? + - ? + - ✅ + - ✅ + - ? + - ✅ + - ✅ + - ✅ + - ✗ + - ? + - ✅ + - ✅ + - + + +Feature x Hardware +^^^^^^^^^^^^^^^^^^ + +.. 
list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - Volta + - Turing + - Ampere + - Ada + - Hopper + - CPU + - AMD + * - :ref:`CP ` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`APC ` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`LoRA ` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :ref:`SD ` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✗ + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✗ + * - multi-step + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`MM (Multimodal)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 7456aab8b8d2a..03fb9193f892d 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -420,6 +420,8 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " diff --git a/vllm/config.py b/vllm/config.py index f964928aa0a68..b0761ae0ee869 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -359,6 +359,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if device_config.device_type not in ("cuda", "tpu"): logger.warning( "Async output processing is only supported for CUDA or TPU. 
" @@ -372,6 +374,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if device_config.device_type == "cuda" and self.enforce_eager: logger.warning( "To see benefits of async output processing, enable CUDA " @@ -385,6 +389,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.embedding_mode: self.use_async_output_proc = False + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" " speculative decoding currently.") @@ -1200,6 +1206,8 @@ def maybe_create_spec_config( "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " @@ -1561,6 +1569,8 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bdfecabf96f2c..1b132cf76a10d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1000,6 +1000,8 @@ def create_engine_config(self) -> EngineConfig: disable_logprobs=self.disable_logprobs_during_spec_decoding, ) + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: raise ValueError("Speculative decoding is not supported with " diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 47de3656ca892..74ddb250ccd9e 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -62,6 +62,8 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache() def _log_prompt_logprob_unsupported_warning_once(): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. 
" "(e.g., speculative decode uses multi step workers).") diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 9ad240ef60820..e32993e0e452e 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -28,6 +28,8 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" # @@ -324,6 +326,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: if config.dtype == torch.float16: logger.warning("float16 is not supported on CPU, casting to bfloat16.") config.dtype = torch.bfloat16 + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if not config.enforce_eager: logger.warning( "CUDA graph is not supported on CPU, fallback to the eager " @@ -334,6 +338,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: def _verify_and_get_scheduler_config( config: SchedulerConfig) -> SchedulerConfig: + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if config.chunked_prefill_enabled: logger.warning("Chunked prefill is not supported on CPU, disable it.") config.chunked_prefill_enabled = False @@ -342,6 +348,8 @@ def _verify_and_get_scheduler_config( def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if config.enable_prefix_caching: logger.warning("Prefix caching is not supported on CPU, disable it.") config.enable_prefix_caching = False diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 22adb1631d410..64387fd2fa47d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -310,6 +310,8 @@ def _build_enc_dec_llm_inputs( encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if decoder_mm_data is not None: raise ValueError( "Multi-modality decoder inputs of encoder-decoder models are " diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a67715290a515..13d39773944fb 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -87,6 +87,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker +# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/utils.py b/vllm/utils.py index 314fec0a65c7b..8debae52b288c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -41,6 +41,9 @@ # Exception strings for non-implemented encoder/decoder scenarios +# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# If the feature combo become valid + STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ "is not currently supported." 
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 12aa473525c13..0cd0047bebf2d 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -816,6 +816,9 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid + # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: assert len(seq_group.sampling_params.logits_processors) == 0, ( "Logits Processors are not supported in multi-step decoding") diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index a07395dfc61d8..f43635464ef00 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,6 +13,9 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid + if enc_dec_mr.cache_config.enable_prefix_caching: raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE']) From de9fb4bef8bb1f62d425dd44533810d838908df6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 11 Oct 2024 15:57:39 -0400 Subject: [PATCH 23/31] [Bugfix][CI/Build] Fix docker build where CUDA archs < 7.0 are being detected (#9254) --- CMakeLists.txt | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4be524808a23a..3a424ad7b110f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,27 +144,32 @@ else() endif() -# -# For cuda we want to be able to control which architectures we compile for on -# a per-file basis in order to cut down on compile time. So here we extract -# the set of architectures we want to compile for and remove the from the -# CMAKE_CUDA_FLAGS so that they are not applied globally. -# if(VLLM_GPU_LANG STREQUAL "CUDA") + # + # For cuda we want to be able to control which architectures we compile for on + # a per-file basis in order to cut down on compile time. So here we extract + # the set of architectures we want to compile for and remove the from the + # CMAKE_CUDA_FLAGS so that they are not applied globally. + # clear_cuda_arches(CUDA_ARCH_FLAGS) extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") + # Filter the target architectures by the supported supported archs + # since for some files we will build for all CUDA_ARCHS. + cuda_archs_loose_intersection(CUDA_ARCHS + "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") + message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") +else() + # + # For other GPU targets override the GPU architectures detected by cmake/torch + # and filter them by the supported versions for the current language. + # The final set of arches is stored in `VLLM_GPU_ARCHES`. + # + override_gpu_arches(VLLM_GPU_ARCHES + ${VLLM_GPU_LANG} + "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") endif() -# -# Override the GPU architectures detected by cmake/torch and filter them by -# the supported versions for the current language. -# The final set of arches is stored in `VLLM_GPU_ARCHES`. -# -override_gpu_arches(VLLM_GPU_ARCHES - ${VLLM_GPU_LANG} - "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") - # # Query torch for additional GPU compilation flags for the given # `VLLM_GPU_LANG`. 
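
The CMake comments in the patch above describe the approach: extract the set of architectures to compile for, then intersect it with the supported set so that archs below the minimum are dropped before any targets are built. As a hedged illustration of the same intersection idea, the Python sketch below reports which archs from an example allow-list a local PyTorch build actually targets; the `SUPPORTED_ARCHS` set is an example, not vLLM's `CUDA_SUPPORTED_ARCHS`, and the CMake logic itself is what the patch changes.

```python
# Illustrative sketch only: the "intersect detected archs with a supported
# allow-list" idea from the CMake change above, expressed in Python.
# SUPPORTED_ARCHS is an example set, not vLLM's actual CUDA_SUPPORTED_ARCHS.
import torch

SUPPORTED_ARCHS = {"sm_70", "sm_75", "sm_80", "sm_86", "sm_89", "sm_90"}


def filtered_cuda_archs() -> list:
    """Intersect the archs this torch build was compiled for with the
    allow-list, implicitly dropping anything older than sm_70."""
    detected = set(torch.cuda.get_arch_list()) if torch.cuda.is_available() else set()
    return sorted(detected & SUPPORTED_ARCHS)


if __name__ == "__main__":
    print("CUDA target architectures:", filtered_cuda_archs())
```
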
From c6cf9295e1dad2aeffbce1d92682971df9f71ddf Mon Sep 17 00:00:00 2001 From: Allen Wang Date: Fri, 11 Oct 2024 15:28:10 -0500 Subject: [PATCH 24/31] [Bugfix] Sets `is_first_step_output` for TPUModelRunner (#9202) --- vllm/worker/tpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index f26d1c8cf7dff..c13e95f60af58 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -541,7 +541,8 @@ def execute_model( seq_group_metadata_list=ctx.seq_group_metadata_list, scheduler_outputs=ctx.scheduler_outputs, is_async=False, - is_last_step=False) + is_last_step=False, + is_first_step_output=i == 0) model_input.async_callback() if use_async_out_proc: return [sampler_outputs[-1]] From d11b46f3a5aba3371456bf7ae7b1332aa14501d8 Mon Sep 17 00:00:00 2001 From: Prashant Gupta Date: Fri, 11 Oct 2024 17:03:48 -0700 Subject: [PATCH 25/31] [bugfix] fix f-string for error (#9295) Signed-off-by: Prashant Gupta --- vllm/transformers_utils/tokenizers/mistral.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 788133059f12d..aae10d3ee25fd 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -30,12 +30,12 @@ def find_tokenizer_file(files: List[str]): matched_files = [file for file in files if file_pattern.match(file)] if len(matched_files) > 1: raise OSError(f"Found {len(matched_files)} files matching the " - "pattern: {matched_files}. Make sure only one Mistral " - "tokenizer is present in {tokenizer_name}.") + f"pattern: {file_pattern}. Make sure only one Mistral " + f"tokenizer is present in {files}.") elif len(matched_files) == 0: raise OSError(f"Found {len(matched_files)} files matching the " - "pattern: {matched_files}. Make sure that a Mistral " - "tokenizer is present in {tokenizer_name}.") + f"pattern: {file_pattern}. 
Make sure that a Mistral " + f"tokenizer is present in {files}.") return matched_files[0] From ec10cb8511b7e30b8ff86caab2e4272ff3ceddca Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Fri, 11 Oct 2024 22:24:26 -0300 Subject: [PATCH 26/31] [BugFix] Fix tool call finish reason in streaming case (#9209) Signed-off-by: Max de Bayser --- vllm/entrypoints/openai/serving_chat.py | 26 ++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 1e85167ea7619..4931195ae0e02 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -538,10 +538,12 @@ async def chat_completion_stream_generator( # any tokens that were generated but previously # matched by partial json parsing # only happens if we are NOT using guided decoding + auto_tools_called = False if tool_parser: - index = len( - tool_parser.prev_tool_call_arr) - 1 if len( - tool_parser.prev_tool_call_arr) > 0 else 0 + auto_tools_called = len( + tool_parser.prev_tool_call_arr) > 0 + index = len(tool_parser.prev_tool_call_arr + ) - 1 if auto_tools_called else 0 else: index = 0 @@ -576,9 +578,7 @@ async def chat_completion_stream_generator( delta=delta_message, logprobs=logprobs, finish_reason=output.finish_reason - if not (tool_parser - and len(tool_parser.prev_tool_call_arr)) - else "tool_calls", + if not auto_tools_called else "tool_calls", stop_reason=output.stop_reason) chunk = ChatCompletionStreamResponse( id=request_id, @@ -680,8 +680,10 @@ async def chat_completion_full_generator( else: logprobs = None - # by default, tools are not used. - tools_called = False + # In the OpenAI API the finish_reason is "tools_called" + # if the tool choice is auto and the model produced a tool + # call. The same is not true for named function calls + auto_tools_called = False # if auto tools are not enabled, and a named tool choice using # outlines is not being used @@ -703,7 +705,6 @@ async def chat_completion_full_generator( name=request.tool_choice.function.name, arguments=output.text)) ]) - tools_called = True # if the request doesn't use tool choice # OR specifies to not use a tool @@ -725,7 +726,10 @@ async def chat_completion_full_generator( tool_call_info = tool_parser.extract_tool_calls( output.text, request=request) - tools_called = tool_call_info.tools_called + # In the OpenAI API the finish_reason is "tools_called" + # if the tool choice is auto and the model produced a tool + # call. 
The same is not true for named function calls + auto_tools_called = tool_call_info.tools_called if tool_call_info.tools_called: message = ChatMessage(role=role, content=tool_call_info.content, @@ -748,7 +752,7 @@ async def chat_completion_full_generator( index=output.index, message=message, logprobs=logprobs, - finish_reason="tool_calls" if tools_called else + finish_reason="tool_calls" if auto_tools_called else output.finish_reason if output.finish_reason else "stop", stop_reason=output.stop_reason) choices.append(choice_data) From 89feb4c84dc8938738ef5d7b613f0d351cc2dc11 Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Fri, 11 Oct 2024 22:13:37 -0700 Subject: [PATCH 27/31] [SpecDec] Remove Batch Expansion (2/3) (#9298) --- tests/spec_decode/test_scorer.py | 52 ++++++++++++---- vllm/attention/backends/blocksparse_attn.py | 7 +-- vllm/attention/backends/flash_attn.py | 69 +++++++++++++-------- vllm/attention/backends/rocm_flash_attn.py | 7 +-- vllm/attention/backends/utils.py | 2 +- vllm/attention/backends/xformers.py | 7 +-- vllm/spec_decode/mqa_scorer.py | 42 ++++++++++--- vllm/spec_decode/spec_decode_worker.py | 6 -- 8 files changed, 122 insertions(+), 70 deletions(-) diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 5f703b03ab7fe..e579c8b38db91 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -1,3 +1,6 @@ +import random +from typing import List + import pytest import torch @@ -10,31 +13,45 @@ from .utils import create_batch, create_worker -def create_proposal(batch_size: int, propose_len: int, vocab_size: int, +def create_proposal(propose_lens: List[int], vocab_size: int, device: str) -> SpeculativeProposals: - proposal_probs = torch.rand((batch_size, propose_len, vocab_size), + batch_size = len(propose_lens) + max_propose_len = max(propose_lens) + proposal_probs = torch.rand((batch_size, max_propose_len, vocab_size), device=device) - proposal_token_ids = torch.argmax(proposal_probs, dim=-1) - proposal_lens = torch.tensor([propose_len] * batch_size, device=device) + + proposal_token_ids = torch.full((batch_size, max_propose_len), + fill_value=-1, + device=device) + for i in range(batch_size): + proposal_token_ids[i][:propose_lens[i]] = torch.argmax( + proposal_probs[i][:propose_lens[i]], dim=-1) + + propose_lens = torch.tensor(propose_lens, device=device) return SpeculativeProposals(proposal_token_ids, proposal_probs, - proposal_lens) + propose_lens) def assert_score_equal(score1: SpeculativeScores, score2: SpeculativeScores) -> None: assert torch.allclose(score1.probs, score2.probs) assert torch.allclose(score1.logprobs, score2.logprobs) - assert torch.equal(score1.token_ids, score2.token_ids) + assert torch.equal( + score1.token_ids, + score2.token_ids), f"{score1.token_ids}, {score2.token_ids}" @pytest.mark.parametrize('model_name', ['facebook/opt-125m']) @pytest.mark.parametrize('batch_size', [1, 2, 4, 8, 16]) -@pytest.mark.parametrize('propose_len', [1, 3, 5]) +@pytest.mark.parametrize('max_propose_len', [1, 3, 5]) +@pytest.mark.parametrize('mixed_propose_len', [True]) @pytest.mark.parametrize('device', ['cuda']) -def test_scoroer(model_name: str, batch_size: int, propose_len: int, - device: str) -> None: +def test_scorer(model_name: str, batch_size: int, max_propose_len: int, + mixed_propose_len: bool, device: str) -> None: """ - Compare the batch expansion scorer and mqa scorer return the same score + Compare the batch expansion scorer and mqa scorer return the same score. 
+ We test for both queries with the same propose length and different + propose length. """ seed = 0 block_size = 32 @@ -46,13 +63,22 @@ def test_scoroer(model_name: str, batch_size: int, propose_len: int, should_modify_greedy_probs_inplace = True vocab_size = scorer_worker.vocab_size - proposals = create_proposal(batch_size, propose_len, vocab_size, device) + + if not mixed_propose_len: + propose_lens = [max_propose_len] * batch_size + else: + non_zero_cnt = random.randint(0, batch_size) + propose_lens = [max_propose_len + ] * non_zero_cnt + [0] * (batch_size - non_zero_cnt) + random.shuffle(propose_lens) + + proposals = create_proposal(propose_lens, vocab_size, device) seq_group_metadatalist, _, _ = create_batch(batch_size, - propose_len, + max_propose_len, block_size=block_size, num_gpu_blocks=num_gpu_blocks) requests = ExecuteModelRequest(seq_group_metadatalist, - num_lookahead_slots=propose_len) + num_lookahead_slots=max_propose_len) batch_expansion_scorer = BatchExpansionTop1Scorer(scorer_worker, device, vocab_size) diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 57ac152d9edb6..c216d195c9e7e 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -186,11 +186,8 @@ class BlocksparseFlashAttentionMetadata(AttentionMetadata): # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. use_cuda_graph: bool - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] = None + # Max number of query tokens for among request in the batch. + max_decode_query_len: Optional[int] = None _cached_prefill_metadata: Optional[ "BlocksparseFlashAttentionMetadata"] = None diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index bba80262e52d3..8457bde066eb7 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -111,11 +111,8 @@ class FlashAttentionMetadata(AttentionMetadata): # Maximum query length in the batch. max_query_len: Optional[int] - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] # Maximum sequence length among prefill batch. 0 if there are decoding # requests only. 
@@ -173,9 +170,9 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: slot_mapping=self.slot_mapping[:self.num_prefill_tokens], seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], - decode_query_len=0, max_query_len=self.max_query_len, max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_query_len=0, max_decode_seq_len=0, query_start_loc=self.query_start_loc[:self.num_prefills + 1], seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], @@ -202,12 +199,14 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: slot_mapping=self.slot_mapping[self.num_prefill_tokens:], seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], - decode_query_len=self.decode_query_len, + max_decode_query_len=self.max_decode_query_len, max_query_len=self.max_query_len, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=None, - seq_start_loc=None, + query_start_loc=self.query_start_loc[self.num_prefills:] + if self.query_start_loc is not None else None, + seq_start_loc=self.seq_start_loc[self.num_prefills:] + if self.seq_start_loc is not None else None, context_lens_tensor=None, block_tables=self.block_tables[self.num_prefills:], use_cuda_graph=self.use_cuda_graph, @@ -413,9 +412,9 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_query_len = max(query_lens) decode_query_lens = query_lens[self.num_prefills:] if len(decode_query_lens) > 0: - decode_query_len = max(decode_query_lens) + max_decode_query_len = max(decode_query_lens) else: - decode_query_len = 1 + max_decode_query_len = 1 max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens @@ -468,7 +467,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, - decode_query_len=decode_query_len, + max_decode_query_len=max_decode_query_len, max_prefill_seq_len=max_prefill_seq_len, max_decode_seq_len=max_decode_seq_len, query_start_loc=query_start_loc, @@ -714,20 +713,37 @@ def unified_flash_attention( if decode_meta := attn_metadata.decode_metadata: # Decoding run. - _, num_head, head_dim = decode_query.shape - decode_query = decode_query.reshape(-1, decode_meta.decode_query_len, - num_head, head_dim) - decode_output = flash_attn_with_kvcache( - q=decode_query, - k_cache=key_cache, - v_cache=value_cache, - block_table=decode_meta.block_tables, - cache_seqlens=decode_meta.seq_lens_tensor, - softmax_scale=softmax_scale, - causal=True, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - ).squeeze(1) + # Use flash_attn_varlen_func kernel for speculative decoding + # because different queries might have different lengths. + assert decode_meta.max_decode_query_len is not None + if decode_meta.max_decode_query_len > 1: + decode_output = flash_attn_varlen_func( + q=decode_query, + k=key_cache, + v=value_cache, + cu_seqlens_q=decode_meta.query_start_loc, + max_seqlen_q=decode_meta.max_decode_query_len, + cu_seqlens_k=decode_meta.seq_start_loc, + max_seqlen_k=decode_meta.max_decode_seq_len, + softmax_scale=softmax_scale, + causal=True, + alibi_slopes=alibi_slopes, + softcap=logits_soft_cap, + block_table=decode_meta.block_tables, + ) + else: + # Use flash_attn_with_kvcache for normal decoding. 
+ decode_output = flash_attn_with_kvcache( + q=decode_query.unsqueeze(1), + k_cache=key_cache, + v_cache=value_cache, + block_table=decode_meta.block_tables, + cache_seqlens=decode_meta.seq_lens_tensor, + softmax_scale=softmax_scale, + causal=True, + alibi_slopes=alibi_slopes, + softcap=logits_soft_cap, + ).squeeze(1) if prefill_output is None: assert decode_output is not None @@ -739,7 +755,6 @@ def unified_flash_attention( # Chunked prefill does not work with speculative decoding. # Therefore, the query length for decode should be 1 in chunked prefill. assert decode_meta is not None - assert decode_meta.decode_query_len == 1 decode_output = decode_output.squeeze(1) output = torch.cat([prefill_output, decode_output], dim=0) return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 03fb9193f892d..682eac50126ad 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -121,11 +121,8 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): # so far). context_lens_tensor: Optional[torch.Tensor] - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] = None + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] = None _cached_prefill_metadata: Optional["ROCmFlashAttentionMetadata"] = None _cached_decode_metadata: Optional["ROCmFlashAttentionMetadata"] = None diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 2b8c373178ab3..53e3a53badeae 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -313,7 +313,7 @@ def graph_capture_get_metadata_for_batch( seq_lens=None, seq_lens_tensor=self._graph_seq_lens[:batch_size], max_query_len=1, - decode_query_len=1, + max_decode_query_len=1, max_prefill_seq_len=0, max_decode_seq_len=self.runner.max_seq_len_to_capture, query_start_loc=None, diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index a3f9ff64f8b8b..9ad7c41e48b68 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -118,11 +118,8 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): # Maximum query length in the batch. None for decoding. max_query_len: Optional[int] = None - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] = None + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] = None # (batch_size + 1,). The cumulative subquery lengths of the sequences in # the batch, used to index into subquery. 
E.g., if the subquery length diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index 59f2a4191a8b2..f35a8a0ab8be3 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -18,6 +18,7 @@ def score_proposals( target_seq_id_start = max( get_all_seq_ids(execute_model_req.seq_group_metadata_list)) + 1 all_proposal_tokens = proposals.proposal_token_ids.tolist() + all_proposal_lengths = proposals.proposal_lens.tolist() for i, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): seq_data_dict = seq_group_metadata.seq_data @@ -27,7 +28,8 @@ def score_proposals( seq_data: SequenceData = seq_data_dict[seq_id] prompt_token_ids = seq_data.get_prompt_token_ids() output_token_ids = seq_data.get_output_token_ids() - proposal_token_ids = all_proposal_tokens[i] + proposal_token_ids = all_proposal_tokens[ + i][:all_proposal_lengths[i]] new_output_token_ids = [*output_token_ids, *proposal_token_ids] target_seq_id = target_seq_id_start + i @@ -62,18 +64,42 @@ def score_proposals( target_sampler_output = target_sampler_output[0] - bs, k = proposals.proposal_token_ids.shape - all_tokens = target_sampler_output.sampled_token_ids.reshape(bs, k + 1) - - all_probs = target_sampler_output.sampled_token_probs.reshape( - bs, k + 1, self._vocab_size) - all_logprobs = target_sampler_output.logprobs.reshape( - bs, k + 1, self._vocab_size) + k = execute_model_req.num_lookahead_slots + bs = len(execute_model_req.seq_group_metadata_list) + target_token_ids = target_sampler_output.sampled_token_ids + target_probs = target_sampler_output.sampled_token_probs + target_logprobs = target_sampler_output.logprobs + # If all requests have the same number of query tokens, we can avoid + # the for loop to build output for better performance. 
+ if min(all_proposal_lengths) == k: + bs, _ = proposals.proposal_token_ids.shape + all_tokens = target_token_ids.reshape(bs, k + 1) + all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) + all_logprobs = target_logprobs.reshape(bs, k + 1, self._vocab_size) + else: + all_tokens = target_token_ids.new_full(size=(bs, k + 1), + fill_value=-1) + all_probs = target_probs.new_zeros(*all_tokens.shape, + self._vocab_size) + all_logprobs = target_logprobs.new_full(size=all_probs.shape, + fill_value=-float("inf")) + target_token_ids = target_token_ids.flatten() + start_loc = 0 + for i, proposed_len in enumerate(all_proposal_lengths): + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc hidden_states = None if target_sampler_output.hidden_states is not None: hidden_states = target_sampler_output.hidden_states.reshape( bs, (k + 1), -1) + return SpeculativeScores(probs=all_probs, token_ids=all_tokens, logprobs=all_logprobs, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 13d39773944fb..50d2767a03752 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -190,12 +190,6 @@ def create_worker( "[Speculative Decoding] Disabling MQA scorer as the " "MQA is only available with flash attn backend.") - if ngram_prompt_lookup_max > 0: - disable_mqa_scorer = True - logger.info( - "[Speculative Decoding] Disabling MQA scorer as the " - "NGramWorker does not support MQA scorer.") - if "model_config" in draft_worker_kwargs and \ draft_worker_kwargs["model_config"].max_model_len < \ scorer_worker.model_config.max_model_len: From 00298e092c38eb9819f6548a6a246fa207c20c36 Mon Sep 17 00:00:00 2001 From: Xiang Xu <117880274+xiangxu-google@users.noreply.github.com> Date: Sat, 12 Oct 2024 00:00:43 -0700 Subject: [PATCH 28/31] [Bugfix] Fix bug of xformer prefill for encoder-decoder (#9026) --- vllm/attention/backends/xformers.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 9ad7c41e48b68..25b86176f630e 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -559,25 +559,32 @@ def forward( self.kv_cache_dtype, k_scale, v_scale) - if attn_type != AttentionType.ENCODER: - # Decoder self-attention supports chunked prefill. - # Encoder/decoder cross-attention requires no chunked - # prefill (100% prefill or 100% decode tokens, no mix) - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - else: + if attn_type == AttentionType.ENCODER: # Encoder attention - chunked prefill is not applicable; # derive token-count from query shape & and treat them # as 100% prefill tokens assert attn_metadata.num_encoder_tokens is not None num_prefill_tokens = attn_metadata.num_encoder_tokens + num_encoder_tokens = attn_metadata.num_encoder_tokens num_decode_tokens = 0 - - if attn_type == AttentionType.DECODER: + elif attn_type == AttentionType.DECODER: + # Decoder self-attention supports chunked prefill. 
+ num_prefill_tokens = attn_metadata.num_prefill_tokens + num_encoder_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens # Only enforce this shape-constraint for decoder # self-attention assert key.shape[0] == num_prefill_tokens + num_decode_tokens assert value.shape[0] == num_prefill_tokens + num_decode_tokens + else: # attn_type == AttentionType.ENCODER_DECODER + # Encoder/decoder cross-attention requires no chunked + # prefill (100% prefill or 100% decode tokens, no mix) + num_prefill_tokens = attn_metadata.num_prefill_tokens + if attn_metadata.num_encoder_tokens is not None: + num_encoder_tokens = attn_metadata.num_encoder_tokens + else: + num_encoder_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens output = torch.empty_like(query) # Query for decode. KV is not needed because it is already cached. @@ -585,8 +592,8 @@ def forward( # QKV for prefill. query = query[:num_prefill_tokens] if key is not None and value is not None: - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] + key = key[:num_encoder_tokens] + value = value[:num_encoder_tokens] assert query.shape[0] == num_prefill_tokens assert decode_query.shape[0] == num_decode_tokens From 2b184ddd4f9e4ff5305af87327410b9845a06baf Mon Sep 17 00:00:00 2001 From: Yunmeng Date: Sun, 13 Oct 2024 00:36:40 +0800 Subject: [PATCH 29/31] [Misc][Installation] Improve source installation script and doc (#9309) Co-authored-by: youkaichao --- docs/source/getting_started/installation.rst | 19 ++++++ python_only_dev.py | 62 ++++++++++++++++---- 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 2e6f6cdd163ce..99c695ac4ddb1 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -84,6 +84,8 @@ Latest code can contain bugs and may not be stable. Please use it with caution. Build from source ================== +.. _python-only-build: + Python-only build (without compilation) ---------------------------------------- @@ -114,6 +116,23 @@ The script will: Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. +Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev``(or ``-q`` for short) flag: + +.. code-block:: console + + $ python python_only_dev.py --quit-dev + +The script with ``--quit-dev`` flag will: + +* Remove the symbolic link from the current directory to the vLLM package. +* Restore the original vLLM package from the backup. + +If you update the vLLM wheel and want to rebuild from the source and make further edits, you will need to start `all above <#python-only-build>`_ over again. + +.. note:: + + There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. + It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the above section <#install-the-latest-code>`_ for instructions on how to install a specified wheel. 
Full build (with compilation) --------------------------------- diff --git a/python_only_dev.py b/python_only_dev.py index d84122280a3c2..72d4e78ee14f6 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -1,10 +1,20 @@ # enable python only development # copy compiled files to the current directory directly +import argparse import os import shutil import subprocess import sys +import warnings + +parser = argparse.ArgumentParser( + description="Development mode for python-only code") +parser.add_argument('-q', + '--quit-dev', + action='store_true', + help='Set the flag to quit development mode') +args = parser.parse_args() # cannot directly `import vllm` , because it will try to # import from the current directory @@ -37,18 +47,46 @@ # "vllm/_version.py", # not available in nightly wheels yet ] -for file in files_to_copy: - src = os.path.join(package_path, file) - dst = file - print(f"Copying {src} to {dst}") - shutil.copyfile(src, dst) +# Try to create _version.py to avoid version related warning +# Refer to https://github.com/vllm-project/vllm/pull/8771 +try: + from setuptools_scm import get_version + get_version(write_to="vllm/_version.py") +except ImportError: + warnings.warn( + "To avoid warnings related to vllm._version, " + "you should install setuptools-scm by `pip install setuptools-scm`", + stacklevel=2) + +if not args.quit_dev: + for file in files_to_copy: + src = os.path.join(package_path, file) + dst = file + print(f"Copying {src} to {dst}") + shutil.copyfile(src, dst) + + pre_built_vllm_path = os.path.join(package_path, "vllm") + tmp_path = os.path.join(package_path, "vllm_pre_built") + current_vllm_path = os.path.join(cwd, "vllm") + + print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup") + os.rename(pre_built_vllm_path, tmp_path) -pre_built_vllm_path = os.path.join(package_path, "vllm") -tmp_path = os.path.join(package_path, "vllm_pre_built") -current_vllm_path = os.path.join(cwd, "vllm") + print(f"Linking {current_vllm_path} to {pre_built_vllm_path}") + os.symlink(current_vllm_path, pre_built_vllm_path) +else: + vllm_symlink_path = os.path.join(package_path, "vllm") + vllm_backup_path = os.path.join(package_path, "vllm_pre_built") + current_vllm_path = os.path.join(cwd, "vllm") -print(f"Renaming {pre_built_vllm_path} to {tmp_path}") -os.rename(pre_built_vllm_path, tmp_path) + print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}") + assert os.path.islink( + vllm_symlink_path + ), f"not in dev mode: {vllm_symlink_path} is not a symbolic link" + assert current_vllm_path == os.readlink( + vllm_symlink_path + ), "current directory is not the source code of package" + os.unlink(vllm_symlink_path) -print(f"linking {current_vllm_path} to {pre_built_vllm_path}") -os.symlink(current_vllm_path, pre_built_vllm_path) + print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}") + os.rename(vllm_backup_path, vllm_symlink_path) From 250e26a63e241076d8182155b9c7ea4f9f157ea3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 13 Oct 2024 00:36:47 +0800 Subject: [PATCH 30/31] [Bugfix]Fix MiniCPM's LoRA bug (#9286) --- vllm/lora/models.py | 6 +++++- vllm/model_executor/models/minicpm.py | 29 ++++++++++++-------------- vllm/model_executor/models/minicpm3.py | 22 +++++++++++++++++++ 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 0dc54516f8671..aaadca9a4d16d 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -337,7 +337,11 @@ def __init__( self.packed_modules_mapping = 
copy.deepcopy( self.model.packed_modules_mapping) # Used to indicate whether the model is a multimodal model - self.supports_mm: bool = supports_multimodal(self.model) + self.supports_mm: bool = ( + supports_multimodal(self.model) + # In case the model only supports LoRA for + # text modules (e.g. ChatGLM) + and hasattr(self.model, "get_mm_mapping")) self.packed_modules: Dict[str, List[str]] = {} self.modules: Dict[str, "BaseLayerWithLoRA"] = {} # Dict instead of a Set for compatibility with LRUCache. diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 6bba1594c270f..41c2877194bb2 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -474,17 +474,18 @@ def __init__( unpadded_vocab_size = config.vocab_size if lora_config: unpadded_vocab_size += lora_config.lora_extra_vocab_size - if not self.config.tie_word_embeddings: - self.lm_head = ParallelLMHead( - unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, - quant_config=quant_config, - ) + self.lm_head = ParallelLMHead( + unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) self.scale_width = self.config.hidden_size / self.config.dim_model_base self.logits_processor = LogitsProcessor(unpadded_vocab_size, @@ -517,11 +518,7 @@ def compute_logits( sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: hidden_states = hidden_states / self.scale_width - if self.config.tie_word_embeddings: - lm_head = self.model.embed_tokens - else: - lm_head = self.lm_head - logits = self.logits_processor(lm_head, hidden_states, + logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index c37bc5ad7c38f..3b5fd95328d74 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -216,6 +216,28 @@ def _init_layers( class MiniCPM3ForCausalLM(MiniCPMForCausalLM): + packed_modules_mapping = { + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "kv_a_proj_with_mqa", + "q_a_proj", + "q_b_proj", + "kv_b_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + + # `embedding_modules` and `embedding_padding_modules` + # are inherited from MiniCPMForCausalLM def _init_model(self): self.model = MiniCPM3Model(config=self.config, From f519902c52cfd61da9026ab714fad9d95502d2f1 Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Sat, 12 Oct 2024 23:41:23 -0700 Subject: [PATCH 31/31] [CI] Fix merge conflict (#9317) --- vllm/attention/backends/placeholder_attn.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 99c68a863f599..3987986f1786b 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -75,11 +75,8 
@@ class PlaceholderAttentionMetadata(AttentionMetadata): # Maximum query length in the batch. max_query_len: Optional[int] - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] # Maximum sequence length among prefill batch. 0 if there are decoding # requests only. @@ -140,7 +137,7 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: slot_mapping=slot_mapping, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], - decode_query_len=0, + max_decode_query_len=0, max_query_len=self.max_query_len, max_prefill_seq_len=self.max_prefill_seq_len, max_decode_seq_len=0, @@ -172,7 +169,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: slot_mapping=slot_mapping, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], - decode_query_len=self.decode_query_len, + max_decode_query_len=self.max_decode_query_len, max_query_len=None, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, @@ -256,9 +253,9 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_query_len = max(query_lens) decode_query_lens = query_lens[self.num_prefills:] if len(decode_query_lens) > 0: - decode_query_len = max(decode_query_lens) + max_decode_query_len = max(decode_query_lens) else: - decode_query_len = 1 + max_decode_query_len = 1 max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens @@ -304,7 +301,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, - decode_query_len=decode_query_len, + max_decode_query_len=max_decode_query_len, max_prefill_seq_len=max_prefill_seq_len, max_decode_seq_len=max_decode_seq_len, query_start_loc=query_start_loc,
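
Taken together, the speculative-decoding patches above replace the assumption that every decode request carries the same number of query tokens with a per-batch maximum. The metadata-builder logic they touch reduces to the small helper sketched below, written as a standalone restatement for clarity rather than as code taken from the repository.

```python
# Standalone restatement of the builder logic above: split the per-request
# query lengths into prefill and decode parts and take the max of the decode
# part, falling back to 1 when there are no decode requests.
from typing import List


def compute_max_decode_query_len(query_lens: List[int],
                                 num_prefills: int) -> int:
    decode_query_lens = query_lens[num_prefills:]
    return max(decode_query_lens) if decode_query_lens else 1


# Two prefill requests followed by three decode requests, where speculative
# decoding gives the decode requests different numbers of query tokens.
assert compute_max_decode_query_len([17, 5, 1, 4, 2], num_prefills=2) == 4
# Prefill-only batch: the decode maximum defaults to 1.
assert compute_max_decode_query_len([17, 5], num_prefills=2) == 1
```
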