From 421e218b37bd98b52bb3737c5aacc5a60fd460c0 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 24 Jul 2024 16:22:16 -0400 Subject: [PATCH 01/66] [Bugfix] Bump transformers to 4.43.2 (#6752) --- requirements-common.txt | 2 +- tests/test_config.py | 55 +++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 940740722c7e6..3b8d473c1fe7a 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -6,7 +6,7 @@ numpy < 2.0.0 requests tqdm py-cpuinfo -transformers >= 4.43.1 # Required for Chameleon and Llama 3.1 hotfox. +transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox. tokenizers >= 0.19.1 # Required for Llama 3. fastapi aiohttp diff --git a/tests/test_config.py b/tests/test_config.py index 9f7d85e39ad67..225d71c0bc0ea 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -64,8 +64,9 @@ def test_get_sliding_window(): def test_rope_customization(): - TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} + TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 + LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0} llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", @@ -95,29 +96,29 @@ def test_rope_customization(): None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 - # TODO: add these back when the rope configs are fixed - # LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} - # longchat_model_config = ModelConfig( - # "lmsys/longchat-13b-16k", - # "lmsys/longchat-13b-16k", - # tokenizer_mode="auto", - # trust_remote_code=False, - # dtype="float16", - # seed=0, - # ) - # assert getattr(longchat_model_config.hf_config, "rope_scaling", - # None) == LONGCHAT_ROPE_SCALING - # assert longchat_model_config.max_model_len == 16384 - - # longchat_model_config = ModelConfig( - # "lmsys/longchat-13b-16k", - # "lmsys/longchat-13b-16k", - # tokenizer_mode="auto", - # trust_remote_code=False, - # dtype="float16", - # seed=0, - # rope_scaling=TEST_ROPE_SCALING, - # ) - # assert getattr(longchat_model_config.hf_config, "rope_scaling", - # None) == TEST_ROPE_SCALING - # assert longchat_model_config.max_model_len == 4096 + longchat_model_config = ModelConfig( + "lmsys/longchat-13b-16k", + "lmsys/longchat-13b-16k", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config + assert all( + longchat_model_config.hf_config.rope_scaling.get(key) == value + for key, value in LONGCHAT_ROPE_SCALING.items()) + assert longchat_model_config.max_model_len == 16384 + + longchat_model_config = ModelConfig( + "lmsys/longchat-13b-16k", + "lmsys/longchat-13b-16k", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + rope_scaling=TEST_ROPE_SCALING, + ) + assert getattr(longchat_model_config.hf_config, "rope_scaling", + None) == TEST_ROPE_SCALING + assert longchat_model_config.max_model_len == 4096 From d88c458f44f5bc0d01215310f8abb5d63fa106d4 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Wed, 24 Jul 2024 17:32:57 -0400 Subject: [PATCH 02/66] [Doc][AMD][ROCm]Added tips to refer to mi300x tuning guide for mi300x users (#6754) --- docs/source/getting_started/amd-installation.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/getting_started/amd-installation.rst 
b/docs/source/getting_started/amd-installation.rst index 71d7527a3e706..1c7d274b7c47e 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -142,3 +142,10 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + + +.. tip:: + - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide `_ for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to `vLLM performance optimization `_. + + From 740374d456a638df98ffbc7d9dab328752330e62 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 24 Jul 2024 17:37:12 -0700 Subject: [PATCH 03/66] [core][distributed] fix zmq hang (#6759) --- vllm/connections.py | 4 +- .../device_communicators/shm_broadcast.py | 60 +++++++------------ 2 files changed, 23 insertions(+), 41 deletions(-) diff --git a/vllm/connections.py b/vllm/connections.py index 65d44176e2464..e785a0b3ebd74 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Mapping, Optional +from typing import Mapping, MutableMapping, Optional from urllib.parse import urlparse import aiohttp @@ -40,7 +40,7 @@ def _validate_http_url(self, url: str): raise ValueError("Invalid HTTP URL: A valid HTTP URL " "must have scheme 'http' or 'https'.") - def _headers(self, **extras: str) -> Mapping[str, str]: + def _headers(self, **extras: str) -> MutableMapping[str, str]: return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras} def get_response( diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 75d84c7a71bc3..d4847542688c0 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -9,7 +9,7 @@ import torch import torch.distributed as dist from torch.distributed import ProcessGroup -from zmq import PUB, REP, REQ, SUB, SUBSCRIBE, Context # type: ignore +from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context # type: ignore import vllm.envs as envs from vllm.logger import init_logger @@ -153,9 +153,7 @@ class Handle: buffer: Optional[ShmRingBuffer] = None local_subscribe_port: Optional[int] = None - local_sync_port: Optional[int] = None remote_subscribe_port: Optional[int] = None - remote_sync_port: Optional[int] = None class MessageQueue: @@ -189,38 +187,36 @@ def __init__( self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes, max_chunks) - self.local_socket = context.socket(PUB) + # XPUB is very similar to PUB, + # except that it can receive subscription messages + # to confirm the number of subscribers + self.local_socket = context.socket(XPUB) + # set the verbose option so that we can receive every subscription + # message. 
otherwise, we will only receive the first subscription + # see http://api.zeromq.org/3-3:zmq-setsockopt for more details + self.local_socket.setsockopt(XPUB_VERBOSE, True) local_subscribe_port = get_open_port() self.local_socket.bind(f"tcp://*:{local_subscribe_port}") - self.local_sync_socket = context.socket(REP) - local_sync_port = get_open_port() - self.local_sync_socket.bind(f"tcp://*:{local_sync_port}") self.current_idx = 0 else: self.buffer = None # type: ignore local_subscribe_port = None - local_sync_port = None self.local_socket = None - self.local_sync_socket = None self.current_idx = -1 if n_remote_reader > 0: # for remote readers, we will: # create a publish-subscribe socket to communicate large data - self.remote_socket = context.socket(PUB) + self.remote_socket = context.socket(XPUB) + self.remote_socket.setsockopt(XPUB_VERBOSE, True) remote_subscribe_port = get_open_port() self.remote_socket.bind(f"tcp://*:{remote_subscribe_port}") - self.remote_sync_socket = context.socket(REP) - remote_sync_port = get_open_port() - self.remote_sync_socket.bind(f"tcp://*:{remote_sync_port}") else: remote_subscribe_port = None - remote_sync_port = None self.remote_socket = None - self.remote_sync_socket = None self._is_writer = True self._is_local_reader = False @@ -233,9 +229,7 @@ def __init__( local_reader_ranks=local_reader_ranks, buffer=self.buffer, local_subscribe_port=local_subscribe_port, - local_sync_port=local_sync_port, remote_subscribe_port=remote_subscribe_port, - remote_sync_port=remote_sync_port, ) logger.info("vLLM message queue communication handle: %s", self.handle) @@ -264,12 +258,7 @@ def create_from_handle(handle: Handle, rank) -> "MessageQueue": self.local_socket.connect( f"tcp://{handle.connect_ip}:{handle.local_subscribe_port}") - self.local_sync_socket = context.socket(REQ) - self.local_sync_socket.connect( - f"tcp://{handle.connect_ip}:{handle.local_sync_port}") - self.remote_socket = None - self.remote_sync_socket = None else: self.buffer = None # type: ignore self.current_idx = -1 @@ -278,17 +267,12 @@ def create_from_handle(handle: Handle, rank) -> "MessageQueue": self._is_remote_reader = True self.local_socket = None - self.local_sync_socket = None self.remote_socket = context.socket(SUB) self.remote_socket.setsockopt_string(SUBSCRIBE, "") self.remote_socket.connect( f"tcp://{handle.connect_ip}:{handle.remote_subscribe_port}") - self.remote_sync_socket = context.socket(REQ) - self.remote_sync_socket.connect( - f"tcp://{handle.connect_ip}:{handle.remote_sync_port}") - return self def wait_until_ready(self): @@ -300,29 +284,27 @@ def wait_until_ready(self): # local readers for i in range(self.n_local_reader): - recv = self.local_sync_socket.recv() - assert recv == b"READY" - self.local_sync_socket.send(b"READY") + # wait for subscription messages from all local readers + self.local_socket.recv() if self.n_local_reader > 0: + # send a message to all local readers + # to make sure the publish channel is working self.local_socket.send(b"READY") # remote readers for i in range(self.n_remote_reader): - recv = self.remote_sync_socket.recv() - assert recv == b"READY" - self.remote_sync_socket.send(b"READY") + # wait for subscription messages from all remote readers + self.remote_socket.recv() if self.n_remote_reader > 0: + # send a message to all remote readers + # to make sure the publish channel is working self.remote_socket.send(b"READY") elif self._is_local_reader: - self.local_sync_socket.send(b"READY") - recv = self.local_sync_socket.recv() - assert recv == b"READY" 
+ # wait for the writer to send a message recv = self.local_socket.recv() assert recv == b"READY" elif self._is_remote_reader: - self.remote_sync_socket.send(b"READY") - recv = self.remote_sync_socket.recv() - assert recv == b"READY" + # wait for the writer to send a message recv = self.remote_socket.recv() assert recv == b"READY" From 5689e256baf0c45148a01ad147abf11ad82c9690 Mon Sep 17 00:00:00 2001 From: "Evan Z. Liu" Date: Wed, 24 Jul 2024 18:51:00 -0700 Subject: [PATCH 04/66] [Frontend] Represent tokens with identifiable strings (#6626) --- tests/entrypoints/openai/test_completion.py | 10 ++- .../openai/test_return_tokens_as_ids.py | 83 +++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 + vllm/entrypoints/openai/cli_args.py | 6 ++ vllm/entrypoints/openai/serving_chat.py | 23 +++-- vllm/entrypoints/openai/serving_completion.py | 19 ++++- vllm/entrypoints/openai/serving_engine.py | 14 ++-- 7 files changed, 138 insertions(+), 19 deletions(-) create mode 100644 tests/entrypoints/openai/test_return_tokens_as_ids.py diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 0896e337b5d24..fe00640c0021e 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -55,8 +55,9 @@ def zephyr_pa_files(): @pytest.fixture(scope="module") -def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files): - args = [ +def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, + zephyr_pa_files): + return [ # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -85,7 +86,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files): "128", ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + +@pytest.fixture(scope="module") +def server(default_server_args): + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py new file mode 100644 index 0000000000000..abe413978e0e5 --- /dev/null +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -0,0 +1,83 @@ +# Separate these tests out from test_completion and test_chat, because they +# require launching a second server with a different flag. Running both servers +# at the same time on a single node will OOM. 
+ +import pytest + +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import RemoteOpenAIServer +from .test_completion import default_server_args # noqa: F401 +from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 +from .test_completion import zephyr_lora_files # noqa: F401 +from .test_completion import zephyr_pa_files # noqa: F401 +from .test_completion import MODEL_NAME + + +@pytest.fixture(scope="module") +def server_with_return_tokens_as_token_ids_flag( + default_server_args): # noqa: F811 + args_with_flag = default_server_args + ["--return-tokens-as-token-ids"] + with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +async def test_completion_return_tokens_as_token_ids_completion( + server_with_return_tokens_as_token_ids_flag): + client = server_with_return_tokens_as_token_ids_flag.get_async_client() + + completion = await client.completions.create( + model=MODEL_NAME, + # Include Unicode characters to test for dividing a single + # character across multiple tokens: πŸŽ‰ is [28705, 31862] for the + # Zephyr tokenizer + prompt="Say 'Hello, world! πŸŽ‰'", + echo=True, + temperature=0, + max_tokens=10, + logprobs=1) + + text = completion.choices[0].text + token_strs = completion.choices[0].logprobs.tokens + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + # Check that the token representations are consistent between raw tokens + # and top_logprobs + # Slice off the first one, because there's no scoring associated with BOS + top_logprobs = completion.choices[0].logprobs.top_logprobs[1:] + top_logprob_keys = [ + next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs + ] + assert token_strs[1:] == top_logprob_keys + + # Check that decoding the tokens gives the expected text + tokens = [int(token.removeprefix("token_id:")) for token in token_strs] + assert text == tokenizer.decode(tokens, skip_special_tokens=True) + + +@pytest.mark.asyncio +async def test_chat_return_tokens_as_token_ids_completion( + server_with_return_tokens_as_token_ids_flag): + client = server_with_return_tokens_as_token_ids_flag.get_async_client() + response = await client.chat.completions.create( + model=MODEL_NAME, + # Include Unicode characters to test for dividing a single + # character across multiple tokens: πŸŽ‰ is [28705, 31862] for the + # Zephyr tokenizer + messages=[{ + "role": "system", + "content": "You like to respond in only emojis, like πŸŽ‰" + }, { + "role": "user", + "content": "Please write some emojis: πŸ±πŸΆπŸŽ‰" + }], + temperature=0, + max_tokens=8, + logprobs=True) + + text = response.choices[0].message.content + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + token_ids = [] + for logprob_content in response.choices[0].logprobs.content: + token_ids.append(int(logprob_content.token.removeprefix("token_id:"))) + assert tokenizer.decode(token_ids, skip_special_tokens=True) == text diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index add5c91900b23..0fe4dd245b5e6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -254,6 +254,7 @@ async def build_server( prompt_adapters=args.prompt_adapters, request_logger=request_logger, chat_template=args.chat_template, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) openai_serving_completion = OpenAIServingCompletion( engine, @@ -262,6 +263,7 @@ async def build_server( lora_modules=args.lora_modules, 
prompt_adapters=args.prompt_adapters, request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) openai_serving_embedding = OpenAIServingEmbedding( engine, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 64919c8be8642..a4192937980f7 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -128,6 +128,12 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "using @app.middleware('http'). " "If a class is provided, vLLM will add it to the server " "using app.add_middleware(). ") + parser.add_argument( + "--return-tokens-as-token-ids", + action="store_true", + help="When --max-logprobs is specified, represents single tokens as" + "strings of the form 'token_id:{token_id}' so that tokens that" + "are not JSON-encodable can be identified.") parser = AsyncEngineArgs.add_cli_args(parser) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 3899509ef3ff4..012f70e661100 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -50,13 +50,15 @@ def __init__( prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], + return_tokens_as_token_ids: bool = False, ): super().__init__(engine=engine, model_config=model_config, served_model_names=served_model_names, lora_modules=lora_modules, prompt_adapters=prompt_adapters, - request_logger=request_logger) + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids) self.response_role = response_role @@ -522,11 +524,14 @@ def _get_top_logprobs( self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int], tokenizer: PreTrainedTokenizer) -> List[ChatCompletionLogProb]: return [ - ChatCompletionLogProb( - token=(token := self._get_decoded_token(p[1], p[0], - tokenizer)), - logprob=max(p[1].logprob, -9999.0), - bytes=list(token.encode("utf-8", errors="replace"))) + ChatCompletionLogProb(token=(token := self._get_decoded_token( + p[1], + p[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids)), + logprob=max(p[1].logprob, -9999.0), + bytes=list( + token.encode("utf-8", errors="replace"))) for i, p in enumerate(logprobs.items()) if top_logprobs and i < top_logprobs ] @@ -546,6 +551,8 @@ def _create_chat_logprobs( step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: token = tokenizer.decode(token_id) + if self.return_tokens_as_token_ids: + token = f"token_id:{token_id}" logprobs_content.append( ChatCompletionLogProbsContent( token=token, @@ -553,7 +560,9 @@ def _create_chat_logprobs( else: logprobs_content.append( ChatCompletionLogProbsContent( - token=step_top_logprobs[token_id].decoded_token, + token=self._get_decoded_token( + step_top_logprobs[token_id], token_id, tokenizer, + self.return_tokens_as_token_ids), logprob=max(step_top_logprobs[token_id].logprob, -9999.0), bytes=list( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 6aef4c9f96150..73e420141813e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -51,13 +51,15 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, ): super().__init__(engine=engine, model_config=model_config, 
served_model_names=served_model_names, lora_modules=lora_modules, prompt_adapters=prompt_adapters, - request_logger=request_logger) + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids) async def create_completion(self, request: CompletionRequest, raw_request: Request): @@ -430,12 +432,17 @@ def _create_completion_logprobs( step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: token = tokenizer.decode(token_id) + if self.return_tokens_as_token_ids: + token = f"token_id:{token_id}" out_tokens.append(token) out_token_logprobs.append(None) out_top_logprobs.append(None) else: - token = self._get_decoded_token(step_top_logprobs[token_id], - token_id, tokenizer) + token = self._get_decoded_token( + step_top_logprobs[token_id], + token_id, + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids) token_logprob = max(step_top_logprobs[token_id].logprob, -9999.0) out_tokens.append(token) @@ -448,7 +455,11 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token(top_lp[1], top_lp[0], tokenizer): + self._get_decoded_token( + top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8c6bd10b9b4d4..321c9ac2c1d5f 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -68,6 +68,7 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, ): super().__init__() @@ -102,6 +103,7 @@ def __init__( prompt_adapter_num_virtual_tokens=num_virtual_tokens)) self.request_logger = request_logger + self.return_tokens_as_token_ids = return_tokens_as_token_ids async def show_available_models(self) -> ModelList: """Show available models. 
Right now we only have one model.""" @@ -384,11 +386,13 @@ def _log_inputs( ) @staticmethod - def _get_decoded_token( - logprob: Logprob, - token_id: int, - tokenizer: AnyTokenizer, - ) -> str: + def _get_decoded_token(logprob: Logprob, + token_id: int, + tokenizer: AnyTokenizer, + return_as_token_id: bool = False) -> str: + if return_as_token_id: + return f"token_id:{token_id}" + if logprob.decoded_token is not None: return logprob.decoded_token return tokenizer.decode(token_id) From 9e169a4c619c33ec4f9a14c5e971e3aa34bc4444 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Thu, 25 Jul 2024 11:59:30 +0800 Subject: [PATCH 05/66] [Model] Adding support for MiniCPM-V (#4087) --- .../dev/multimodal/multimodal_index.rst | 2 + docs/source/models/supported_models.rst | 4 + examples/minicpmv_example.py | 53 ++ tests/conftest.py | 11 +- tests/models/test_minicpmv.py | 163 +++++ vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/llama.py | 4 +- vllm/model_executor/models/minicpm.py | 3 +- vllm/model_executor/models/minicpmv.py | 682 ++++++++++++++++++ vllm/multimodal/__init__.py | 3 +- vllm/multimodal/base.py | 34 +- 11 files changed, 942 insertions(+), 18 deletions(-) create mode 100644 examples/minicpmv_example.py create mode 100644 tests/models/test_minicpmv.py create mode 100644 vllm/model_executor/models/minicpmv.py diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst index 7cdbec2c9e3d4..9784f4cc2e088 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/dev/multimodal/multimodal_index.rst @@ -40,6 +40,8 @@ Registry Base Classes ------------ +.. autodata:: vllm.multimodal.NestedTensors + .. autodata:: vllm.multimodal.BatchedTensors .. autoclass:: vllm.multimodal.MultiModalDataBuiltins diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 068c00da39cd9..dc8bd6fb245df 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -206,6 +206,10 @@ Vision Language Models - Phi-3-Vision - :code:`microsoft/Phi-3-vision-128k-instruct`, etc. - + * - :code:`MiniCPM-V` + - MiniCPM-V + - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py new file mode 100644 index 0000000000000..52366a7030ad0 --- /dev/null +++ b/examples/minicpmv_example.py @@ -0,0 +1,53 @@ +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset + +# 2.0 +# MODEL_NAME = "HwwwH/MiniCPM-V-2" +# 2.5 +MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" + +image = ImageAsset("stop_sign").pil_image.convert("RGB") + +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) +llm = LLM(model=MODEL_NAME, + gpu_memory_utilization=1, + trust_remote_code=True, + max_model_len=4096) + +messages = [{ + 'role': + 'user', + 'content': + '(./)\n' + "What's the content of the image?" 
+}] +prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) +# 2.0 +# stop_token_ids = [tokenizer.eos_id] +# 2.5 +stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] + +sampling_params = SamplingParams( + stop_token_ids=stop_token_ids, + # temperature=0.7, + # top_p=0.8, + # top_k=100, + # seed=3472, + max_tokens=1024, + # min_tokens=150, + temperature=0, + use_beam_search=True, + # length_penalty=1.2, + best_of=3) + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": image + } +}, + sampling_params=sampling_params) +print(outputs[0].outputs[0].text) diff --git a/tests/conftest.py b/tests/conftest.py index 7f507310cd255..59510075b0063 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,7 @@ import torch.nn.functional as F from PIL import Image from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, - AutoTokenizer, BatchEncoding) + AutoTokenizer, BatchEncoding, BatchFeature) from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset @@ -133,7 +133,7 @@ def image_assets() -> _ImageAssets: return IMAGE_ASSETS -_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding) +_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature) class HfRunner: @@ -339,7 +339,6 @@ def generate_greedy_logprobs_limit( processor_kwargs["images"] = images[i] inputs = self.processor(**processor_kwargs) - input_ids = inputs.input_ids output = self.model.generate( **self.wrap_device(inputs), @@ -381,7 +380,7 @@ def generate_greedy_logprobs_limit( all_logprobs.append(seq_logprobs_lst) seq_ids = output.sequences[0] - output_len = seq_ids.shape[0] - input_ids.shape[1] + output_len = len(seq_logprobs_lst) output_ids = seq_ids[-output_len:] all_output_ids.append(output_ids.tolist()) all_output_strs.append(self.tokenizer.decode(output_ids)) @@ -514,10 +513,12 @@ def generate_greedy_logprobs( max_tokens: int, num_logprobs: int, images: Optional[List[Image.Image]] = None, + stop_token_ids: Optional[List[int]] = None, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: greedy_logprobs_params = SamplingParams(temperature=0.0, max_tokens=max_tokens, - logprobs=num_logprobs) + logprobs=num_logprobs, + stop_token_ids=stop_token_ids) outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params, images=images) diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py new file mode 100644 index 0000000000000..9124fa7a6238c --- /dev/null +++ b/tests/models/test_minicpmv.py @@ -0,0 +1,163 @@ +from collections import UserDict +from typing import List, Optional, Tuple, Type + +import pytest +import torch +import torch.types +from transformers import BatchFeature + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \ + "(./)\nWhat's the content of the image?<|eot_id|>" \ + "<|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + "cherry_blossom": + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \ + "(./)\nWhat is the season?<|eot_id|>" \ + "<|start_header_id|>assistant<|end_header_id|>\n\n" +}) + +models = ["openbmb/MiniCPM-Llama3-V-2_5"] + + +def 
trunc_hf_output(hf_output: Tuple[List[int], str, + Optional[SampleLogprobs]]): + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith("<|eot_id|>"): + output_str = output_str.split("<|eot_id|>")[0] + return output_ids, output_str, out_logprobs + + +target_dtype = "half" + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=4096, + max_num_seqs=1, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=vllm_images, + stop_token_ids=stop_token_ids) + for prompts, vllm_images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): + + class NestedInputs(UserDict): + + def __init__(self, model_inputs: BatchFeature): + super().__init__({"model_inputs": model_inputs}) + + self.model_inputs = model_inputs + + def to(self, device: torch.types.Device): + return NestedInputs(self.model_inputs.to(device)) + + hf_processor = hf_model.processor + hf_model.processor = lambda **kw: NestedInputs( + hf_processor(**kw) # type: ignore + ) + + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=hf_images, + tokenizer=tokenizer) + for prompts, hf_images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=[ + trunc_hf_output(hf_output) for hf_output in hf_outputs + ], + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + 
dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 55a039a88d535..7df5b8fa64710 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -50,6 +50,7 @@ "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), + "MiniCPMV": ("minicpmv", "MiniCPMV"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2052c443a8885..306d22e42ed1d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -418,9 +418,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + input_embeds: Optional[torch.Tensor] = None ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + input_embeds) return model_output def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 4ccf1cf0fad76..7a8ac0bb1f949 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -463,10 +463,11 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + input_embeds: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, input_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py new file mode 100644 index 0000000000000..8563216d9c392 --- /dev/null +++ b/vllm/model_executor/models/minicpmv.py @@ -0,0 +1,682 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only MiniCPM-V-2 model compatible with HuggingFace weights.""" +import math +import re +from functools import partial +from typing import Iterable, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torch import nn +from torch.nn.init import trunc_normal_ +from transformers.configuration_utils import PretrainedConfig +from transformers.models.idefics2.modeling_idefics2 import ( + Idefics2VisionTransformer) + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsVision +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.models.minicpm import MiniCPMForCausalLM +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import (cached_get_image_processor, + cached_get_tokenizer) +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: (H, W) + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + # tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size[0], tgt_size[1]), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, + grid_size, + cls_token=False, + version=2.0): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or + [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + if version == 2.0: + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], + axis=0) + else: + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version=2.0): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[0], version) # (H*W, D/2) or (H, W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[1], version) # (H*W, D/2) or (H, W, D/2) + + if version == 2.0: + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + else: + emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) + return emb + + +def 
get_1d_sincos_pos_embed_from_grid(embed_dim, pos, version=2.0): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) / (H, W) + out: (M, D) / (H, W, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. / 10000**omega # (D/2,) + + if version == 2.0: + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + else: + out = np.einsum('hw,d->hwd', pos, omega) # (H, W, D/2), outer product + emb_sin = np.sin(out) # (H, W, D/2) + emb_cos = np.cos(out) # (H, W, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) + return emb + + +class Resampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + + default_norm_layer = partial(nn.LayerNorm, eps=1e-6) + + def __init__(self, + num_queries, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=default_norm_layer, + adaptive=False, + max_size=(70, 70), + version=2.0): + super().__init__() + + self.version = version + if self.version == 2.0: + self.num_queries = grid_size**2 + else: + self.num_queries = num_queries + self.max_size = max_size + self.embed_dim = embed_dim + self.num_heads = num_heads + self.adaptive = adaptive + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=.02) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.ln_post = norm_layer(embed_dim) + self.proj = nn.Parameter( + (embed_dim**-0.5) * torch.randn(embed_dim, embed_dim)) + + if self.version == 2.0: + self.pos_embed = nn.Parameter( + torch.from_numpy( + get_2d_sincos_pos_embed( + embed_dim, grid_size, + version=self.version)).float()).requires_grad_(False) + else: + self._set_2d_pos_cache(self.max_size) + + self.apply(self._init_weights) + + def _set_2d_pos_cache(self, max_size, device='cpu'): + pos_embed = torch.from_numpy( + get_2d_sincos_pos_embed(self.embed_dim, + max_size, + version=self.version)).float().to(device) + self.register_buffer("pos_embed", pos_embed, persistent=False) + + def _adjust_pos_cache(self, tgt_sizes, device): + max_h = torch.max(tgt_sizes[:, 0]) + max_w = torch.max(tgt_sizes[:, 1]) + if max_h > self.max_size[0] or max_w > self.max_size[1]: + self.max_size = [ + max(max_h, self.max_size[0]), + max(max_w, self.max_size[1]) + ] + self._set_2d_pos_cache(self.max_size, device) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward_2_5(self, x, tgt_sizes=None): + assert x.shape[0] == tgt_sizes.shape[0] + bs = x.shape[0] + + device = x.device + dtype = x.dtype + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + self._adjust_pos_cache(tgt_sizes, device=device) + + max_patch_len = torch.max(patch_len) + key_padding_mask = torch.zeros((bs, 
max_patch_len), + dtype=torch.bool, + device=device) + + pos_embed = [] + for i in range(bs): + tgt_h, tgt_w = tgt_sizes[i] + pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape( + (tgt_h * tgt_w, -1)).to(dtype)) # patches * D + key_padding_mask[i, patch_len[i]:] = True + + pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, + batch_first=True, + padding_value=0.0).permute( + 1, 0, + 2) # BLD => L * B * D + + x = self.kv_proj(x) # B * L * D + x = self.ln_kv(x).permute(1, 0, 2) # L * B * D + + q = self.ln_q(self.query) # Q * D + + out = self.attn( + self._repeat(q, bs), # Q * B * D + x + pos_embed, # L * B * D + L * B * D + x, + key_padding_mask=key_padding_mask)[0] + # out: Q * B * D + x = out.permute(1, 0, 2) # B * Q * D + + x = self.ln_post(x) + x = x @ self.proj + return x + + def forward_2(self, x, tgt_sizes=None, attn_mask=None): + if self.adaptive: + pos_embed = torch.Tensor( + get_2d_sincos_pos_embed(self.embed_dim, + tgt_sizes)).float().to(device=x.device, + dtype=x.dtype) + else: + pos_embed = get_abs_pos(self.pos_embed, tgt_sizes) + + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn(self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] + x = out.permute(1, 0, 2) + + x = self.ln_post(x) + x = x @ self.proj + return x + + def forward(self, x, tgt_sizes=None, attn_mask=None): + if self.version == 2.0: + return self.forward_2(x, tgt_sizes=tgt_sizes, attn_mask=attn_mask) + else: + return self.forward_2_5(x, tgt_sizes=tgt_sizes) + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) + + +def get_max_minicpmv_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(PretrainedConfig) + return getattr(hf_config, "query_num", 64) + + +def dummy_seq_data_for_minicpmv(seq_len: int): + token_ids = [0] * seq_len + return SequenceData(token_ids) + + +def dummy_image_for_minicpmv(hf_config): + width = height = hf_config.image_size + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(PretrainedConfig) + + # image_feature_size = get_max_minicpmv_image_tokens(ctx) + + seq_data = dummy_seq_data_for_minicpmv(seq_len) + + mm_data = dummy_image_for_minicpmv(hf_config) + + return seq_data, mm_data + + +def input_processor_for_minicpmv(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + prompt = llm_inputs.get("prompt") + if prompt is None: + token_ids = llm_inputs.get("prompt_token_ids") + prompt = tokenizer.decode(token_ids) + image_processor = cached_get_image_processor(model_config.tokenizer) + + pattern = "(./)" + image = multi_modal_data["image"] + image_tags = re.findall(pattern, prompt) + assert len(image_tags) <= 1 + text_chunks = prompt.split(pattern) + new_prompt = text_chunks[0] \ + + image_processor.get_slice_image_placeholder(image.size) \ + + text_chunks[1] + + new_token_ids = tokenizer.encode(new_prompt) + + llm_inputs = LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + return llm_inputs + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() 
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv) +@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv) +class MiniCPMV(nn.Module, SupportsVision): + + def __init__( + self, + config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.multimodal_config = multimodal_config + + self.version = float(self.config.version) + self.llm = self.init_llm(config, cache_config, quant_config) + self.vpm = self.init_vision_module() + param_dtype = torch.get_default_dtype() + self.vpm.to(dtype=param_dtype) + self.vision_dim = self.vpm.embed_dim if self.version == 2.0 \ + else self.vpm.embeddings.embed_dim + self.embed_dim = self.llm.config.hidden_size + self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) + self.resampler.to(device="cuda", dtype=param_dtype) + self.sampler = Sampler() + + def init_llm(self, config, cache_config, quant_config): + if self.version == 2.0: + return MiniCPMForCausalLM(config, + cache_config=cache_config, + quant_config=quant_config) + else: + return LlamaForCausalLM(config, + cache_config=cache_config, + quant_config=quant_config) + + def init_vision_module(self): + if self.version == 2.0: + try: + import timm + except ImportError: + raise ImportError( + 'Please install timm==0.9.10') from ImportError + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + model = timm.create_model('vit_so400m_patch14_siglip_384.webli', + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True) + torch.set_default_dtype(default_dtype) + if isinstance(model, timm.models.VisionTransformer + ) and model.attn_pool is not None: + model.attn_pool = torch.nn.Identity() + + if self.config.drop_vision_last_layer: + model.blocks = model.blocks[:-1] + else: + model = Idefics2VisionTransformer(self.config.vision_config) + if self.config.drop_vision_last_layer: + model.encoder.layers = model.encoder.layers[:-1] + return model + + def init_resampler(self, embed_dim, vision_dim): + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + if self.version == 2.0: + resampler = Resampler(grid_size=int( + math.sqrt(self.config.query_num)), + num_queries=None, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True, + version=self.version) + else: + resampler = Resampler(num_queries=self.config.query_num, + grid_size=None, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True, + version=self.version) + torch.set_default_dtype(default_dtype) + return resampler + + def get_vision_embedding(self, + pixel_values, + patch_attn_mask=None, + tgt_sizes=None, + version=2.0): + if version == 2.0: + res = [] + dtype = self.vpm.pos_embed.data.dtype + for pixel_value in pixel_values: + # V2.0 start + H, W = pixel_value[0].shape[-2:] + tgt_size = (math.ceil(H / self.vpm.patch_embed.patch_size[0]), + math.ceil(W / self.vpm.patch_embed.patch_size[0])) + # V2.0 end + vision_embedding = self.vpm.forward_features( + pixel_value.unsqueeze(0).type(dtype)) + if hasattr(self.vpm, 'num_prefix_tokens' + ) and self.vpm.num_prefix_tokens > 0: + vision_embedding = vision_embedding[:, self.vpm. 
+ num_prefix_tokens:] + res.append(self.resampler(vision_embedding, tgt_size)) + return torch.vstack(res) + else: + vision_embedding = self.vpm( + pixel_values.type(dtype), + patch_attention_mask=patch_attn_mask).last_hidden_state + vision_embedding = self.resampler(vision_embedding, tgt_sizes) + + def get_image_bounds(self, input_ids): + tokenizer = cached_get_tokenizer(self.config._name_or_path, + trust_remote_code=True) + im_start_token_id = tokenizer.im_start_id + im_end_token_id = tokenizer.im_end_id + image_start_tokens = torch.where(input_ids == im_start_token_id)[0] + image_start_tokens += 1 + image_end_tokens = torch.where(input_ids == im_end_token_id)[0] + valid_image_nums = min(len(image_start_tokens), len(image_end_tokens)) + if valid_image_nums == 0: + return [] + image_bound = torch.hstack([ + image_start_tokens[:valid_image_nums].unsqueeze(-1), + image_end_tokens[:valid_image_nums].unsqueeze(-1), + ]) + + return image_bound + + def get_vision_hidden_states(self, data): + if "vision_hidden_states" not in data: + pixel_values = data["pixel_values"] + tgt_sizes = data["tgt_sizes"] + vision_hidden_states = [] + if self.version == 2.0: + if pixel_values is not None and len(pixel_values) > 0: + vision_hidden_states = self.get_vision_embedding( + pixel_values) + else: + vision_hidden_states = torch.tensor([]).to( + data["input_ids"].device) + else: + device = self.vpm.embeddings.position_embedding.weight.device + dtype = self.vpm.embeddings.position_embedding.weight.dtype + all_pixel_values = [ + i.flatten(end_dim=1).permute(1, 0) for i in pixel_values + ] + if all_pixel_values: + tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) + max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) + all_pixel_values = torch.nn.utils.rnn.pad_sequence( + all_pixel_values, batch_first=True, padding_value=0.0) + B, L, _ = all_pixel_values.shape + all_pixel_values = all_pixel_values.permute( + 0, 2, 1).reshape(B, 3, -1, L) + + patch_attn_mask = torch.zeros((B, 1, max_patches), + dtype=torch.bool, + device=device) + for i in range(B): + patch_attn_mask[i, :tgt_sizes[i][0] * + tgt_sizes[i][1]] = True + + vision_embedding = self.vpm( + all_pixel_values.type(dtype), + patch_attention_mask=patch_attn_mask).last_hidden_state + vision_hidden_states = self.resampler( + vision_embedding, tgt_sizes) + + else: # no image + dummy_feature = [] + vision_hidden_states = dummy_feature + else: + vision_hidden_states = data["vision_hidden_states"] + + return vision_hidden_states + + def get_embedding(self, data): + input_ids = data["input_ids"] + + vision_hidden_states = self.get_vision_hidden_states(data) + if vision_hidden_states is not None and len(vision_hidden_states) > 0: + image_bounds = self.get_image_bounds(input_ids) + else: + image_bounds = [] + + if hasattr(self.llm.config, 'scale_emb'): + vlm_embedding = self.llm.model.embed_tokens( + input_ids) * self.llm.config.scale_emb + else: + vlm_embedding = self.llm.model.embed_tokens(input_ids) + vision_hidden_states = [ + i.type(vlm_embedding.dtype) if isinstance(i, torch.Tensor) else i + for i in vision_hidden_states + ] + + if len(vision_hidden_states) > 0 and len(image_bounds) > 0: + vision_hidden_states = torch.cat(vision_hidden_states, dim=0) + image_indices = torch.stack([ + torch.arange(r[0], r[1], dtype=torch.long) + for r in image_bounds + ]).to(vlm_embedding.device) + vlm_embedding.scatter_( + 0, + image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]), + vision_hidden_states.view(-1, vision_hidden_states.shape[-1])) + return 
vlm_embedding, vision_hidden_states + + def process_multimodal_inputs(self, inputs): + pixel_values = [] + tgt_sizes = [] + for b in range(len(inputs["pixel_values"])): + pixel_values += inputs["pixel_values"][b] + tgt_sizes += inputs["tgt_sizes"][b] + return { + "pixel_values": pixel_values, + "input_ids": inputs["input_ids"], + "tgt_sizes": tgt_sizes + } + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ): + inputs = { + "pixel_values": kwargs.pop("pixel_values", []), + "input_ids": input_ids, + "tgt_sizes": kwargs.pop("tgt_sizes", None), + } + + inputs = self.process_multimodal_inputs(inputs) + + vlm_embeddings, vision_hidden_states = self.get_embedding(inputs) + + output = self.llm(input_ids=None, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + input_embeds=vlm_embeddings) + return output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + return self.llm.compute_logits(hidden_states, sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.llm.sample(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + # for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + # if key_to_modify in name: + # name = name.replace(key_to_modify, new_key) + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + use_default_weight_loading = False + if "vpm" in name or 'resampler' in name: + # We only do sharding for language model and + # not vision model for now. 
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 503dceab5b168..0e3b35d425cb7 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,5 +1,5 @@ from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict, - MultiModalInputs, MultiModalPlugin) + MultiModalInputs, MultiModalPlugin, NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -17,6 +17,7 @@ "MultiModalDataDict", "MultiModalInputs", "MultiModalPlugin", + "NestedTensors", "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 3ebc25c5930cf..0d435bd644e29 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict, - TypeVar, Union) + TypeVar, Union, cast) import torch import torch.types @@ -15,10 +15,17 @@ logger = init_logger(__name__) -BatchedTensors = Union[torch.Tensor, List[torch.Tensor]] +NestedTensors = Union[List[torch.Tensor], torch.Tensor] +""" +Use a list instead of a tensor if the dimensions of each element do not match. +Currently only supports up to singly nested list of tensors. +""" + +BatchedTensors = Union[List[NestedTensors], NestedTensors] """ If each input tensor in the batch has the same size, this is a single batched -tensor; otherwise, this is a list of tensors with one element per batch. +tensor; otherwise, this is a list of :class:`NestedTensors` with one element +per item in the batch. 
""" if sys.version_info < (3, 9): @@ -27,7 +34,7 @@ class _MultiModalInputsBase(UserDict): pass else: - class _MultiModalInputsBase(UserDict[str, torch.Tensor]): + class _MultiModalInputsBase(UserDict[str, NestedTensors]): pass @@ -39,19 +46,26 @@ class MultiModalInputs(_MultiModalInputsBase): @staticmethod def try_concat( - tensors: List[torch.Tensor], + tensors: List[NestedTensors], *, device: torch.types.Device, ) -> BatchedTensors: - unbatched_shape = tensors[0].shape[1:] + # may be list rather than tensors + if isinstance(tensors[0], list): + return [[t.to(device=device) for t in tensor[0]] + for tensor in tensors] + + tensors_ = cast(List[torch.Tensor], tensors) + + unbatched_shape = tensors_[0].shape[1:] - for tensor in tensors: + for tensor in tensors_: if tensor.shape[1:] != unbatched_shape: return [ - tensor.squeeze(0).to(device=device) for tensor in tensors + tensor.squeeze(0).to(device=device) for tensor in tensors_ ] - return torch.cat(tensors, dim=0).to(device=device) + return torch.cat(tensors_, dim=0).to(device=device) @staticmethod def batch( @@ -64,7 +78,7 @@ def batch( keys = inputs_list[0].keys() - item_lists: Dict[str, List[torch.Tensor]] = defaultdict(list) + item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) for inputs in inputs_list: if inputs.keys() != keys: From 309aaef8255fb832bf674c6ed7d9d84211629421 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 24 Jul 2024 22:33:56 -0700 Subject: [PATCH 06/66] [Bugfix] Fix decode tokens w. CUDA graph (#6757) --- tests/worker/test_model_runner.py | 1 + vllm/attention/backends/flash_attn.py | 12 ++++++++++-- vllm/attention/backends/flashinfer.py | 11 ++++++++++- vllm/attention/backends/utils.py | 11 ++++++++++- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index b5742c4338616..4a0e2b4184936 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -193,6 +193,7 @@ def test_prepare_decode_cuda_graph(batch_size): for _ in range(expected_bs - len(seq_lens)): seq_lens.append(1) assert attn_metadata.seq_lens == seq_lens + assert attn_metadata.num_decode_tokens == len(seq_lens) start_idx = 0 start_loc = [start_idx] for _ in context_lens: diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 949bd973cf3c4..7d7aff9dc3cdc 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -272,7 +272,15 @@ def _add_seq_group( def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors.""" + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -297,7 +305,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 9746304347d6e..83a420d76834b 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -320,6 +320,15 @@ def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -334,7 +343,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 5877712b9b7d3..dcd10ed410a79 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -149,6 +149,15 @@ def _add_seq_group( def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -173,7 +182,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. 
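Note on the fix in PATCH 06 above: the `batch_size` argument passed to `build()` is already padded for CUDA graph capture, so adding `cuda_graph_pad_size` on top of it double-counted the decode tokens. A minimal, self-contained sketch of the arithmetic (illustrative only; the helper names and numbers below are hypothetical, not vLLM code):

```python
# Illustrative sketch only (not vLLM code): why `batch_size + cuda_graph_pad_size`
# overcounted decode tokens. Assumes one decode token per sequence and that the
# batch size handed to build() is already padded up to the CUDA graph bucket.

def num_decode_tokens_before_fix(padded_batch_size: int, pad_size: int) -> int:
    # Adds the pad slots a second time on top of the already-padded batch size.
    return padded_batch_size + pad_size

def num_decode_tokens_after_fix(padded_batch_size: int, pad_size: int) -> int:
    # The padded batch size already accounts for the pad slots.
    return padded_batch_size

real_seqs, pad = 3, 5        # hypothetical: 3 live sequences padded up to a bucket of 8
padded = real_seqs + pad
assert num_decode_tokens_before_fix(padded, pad) == 13   # wrong: pad counted twice
assert num_decode_tokens_after_fix(padded, pad) == 8     # matches the captured graph batch
```
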
From 0310029a2fc62171fae87155150326125e082a5a Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Thu, 25 Jul 2024 01:34:11 -0400 Subject: [PATCH 07/66] [Bugfix] Fix awq_marlin and gptq_marlin flags (#6745) --- vllm/model_executor/layers/quantization/awq_marlin.py | 5 +++-- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 092f87b623e7f..5ffbb8e854e87 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -25,7 +25,7 @@ class AWQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, has_zp: bool, lm_head_quantized: bool) -> None: self.weight_bits = weight_bits - self.pack_factor = 32 // self.weight_bits # packed into int32 + self.pack_factor = 32 // self.weight_bits # packed into 32bits self.group_size = group_size self.has_zp = has_zp self.lm_head_quantized = lm_head_quantized @@ -69,7 +69,8 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig": def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg) - is_valid_user_quant = (user_quant is None or user_quant == "marlin") + is_valid_user_quant = (user_quant is None or user_quant == "marlin" + or user_quant == "awq_marlin") if can_convert and is_valid_user_quant: msg = ("The model is convertible to {} during runtime." diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 5b4d614ae2e74..bdcc9c3b4f0c5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -79,7 +79,8 @@ def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg) - is_valid_user_quant = (user_quant is None or user_quant == "marlin") + is_valid_user_quant = (user_quant is None or user_quant == "marlin" + or user_quant == "gptq_marlin") if can_convert and is_valid_user_quant: msg = ("The model is convertible to {} during runtime." 
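Note on PATCH 07 above: the override check previously accepted only `None` or `"marlin"` as the user-supplied method, so explicitly requesting `"awq_marlin"` or `"gptq_marlin"` was rejected even when the checkpoint could be converted. A minimal sketch of the widened check (illustrative only; the function name and return convention are simplified, not the actual AWQMarlinConfig/GPTQMarlinConfig API):

```python
# Minimal sketch of the widened user-quant check (illustrative only; simplified
# names, not the real vLLM quantization-config API).
from typing import Optional

def choose_quant_method(can_convert: bool, user_quant: Optional[str],
                        marlin_method: str = "awq_marlin") -> Optional[str]:
    # Before the fix only None or "marlin" were accepted here, so a user who
    # explicitly asked for "awq_marlin"/"gptq_marlin" missed the conversion path.
    is_valid_user_quant = user_quant in (None, "marlin", marlin_method)
    if can_convert and is_valid_user_quant:
        return marlin_method
    return None  # fall back to the originally configured quantization method

assert choose_quant_method(True, "awq_marlin") == "awq_marlin"
assert choose_quant_method(True, None) == "awq_marlin"
assert choose_quant_method(False, "awq_marlin") is None
```
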
From 316a41ac1de4e6e46933cadb39b9b7af65b01abd Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 24 Jul 2024 22:48:07 -0700 Subject: [PATCH 08/66] [Bugfix] Fix encoding_format in examples/openai_embedding_client.py (#6755) --- examples/openai_embedding_client.py | 13 ++++++++----- tests/entrypoints/openai/test_embedding.py | 1 - 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/openai_embedding_client.py b/examples/openai_embedding_client.py index b73360fe15a24..b4f4c7ad6beb2 100644 --- a/examples/openai_embedding_client.py +++ b/examples/openai_embedding_client.py @@ -13,11 +13,14 @@ models = client.models.list() model = models.data[0].id -responses = client.embeddings.create(input=[ - "Hello my name is", - "The best thing about vLLM is that it supports many different models" -], - model=model) +responses = client.embeddings.create( + input=[ + "Hello my name is", + "The best thing about vLLM is that it supports many different models" + ], + model=model, + encoding_format="float", +) for data in responses.data: print(data.embedding) # list of float of len 4096 diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 2ca0c0d63c25c..c9747339bbf15 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -18,7 +18,6 @@ def embedding_server(): "--enforce-eager", "--max-model-len", "8192", - "--enforce-eager", ] with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: From b75e314fff29bdc94b2fb1dd78519e92f9520e65 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Fri, 26 Jul 2024 00:42:49 +0800 Subject: [PATCH 09/66] [Bugfix] Add image placeholder for OpenAI Compatible Server of MiniCPM-V (#6787) Co-authored-by: hezhihui Co-authored-by: Cyrus Leung --- examples/minicpmv_example.py | 2 ++ vllm/entrypoints/chat_utils.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index 52366a7030ad0..bf20a7ea04ad4 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -4,6 +4,8 @@ from vllm.assets.image import ImageAsset # 2.0 +# The official repo doesn't work yet, so we need to use a fork for now +# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # MODEL_NAME = "HwwwH/MiniCPM-V-2" # 2.5 MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index dca4523d1a27d..1f6d77b828459 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -100,7 +100,9 @@ def _image_token_str(model_config: ModelConfig, if model_type == "phi3_v": # Workaround since this token is not defined in the tokenizer return "<|image_1|>" - if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv", "paligemma"): + if model_type == "minicpmv": + return "(./)" + if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"): # These models do not use image tokens in the prompt return None if model_type.startswith("llava"): From 889da130e747b1382268ed428352f2e73e51a30b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:46:04 -0700 Subject: [PATCH 10/66] [ Misc ] `fp8-marlin` channelwise via `compressed-tensors` (#6524) Co-authored-by: mgoin --- .../configs/Qwen2-1.5B-Instruct-FP8W8.yaml | 11 ++ .../lm-eval-harness/configs/models-small.txt | 1 + 
.../compressed_tensors/compressed_tensors.py | 61 ++++++++-- .../compressed_tensors/schemes/__init__.py | 2 + .../schemes/compressed_tensors_scheme.py | 3 +- .../schemes/compressed_tensors_unquantized.py | 3 +- .../schemes/compressed_tensors_w4a16_24.py | 3 +- .../schemes/compressed_tensors_w8a16_fp8.py | 105 ++++++++++++++++++ .../schemes/compressed_tensors_w8a8_fp8.py | 10 +- .../schemes/compressed_tensors_w8a8_int8.py | 19 ++-- .../schemes/compressed_tensors_wNa16.py | 3 +- .../model_executor/layers/quantization/fp8.py | 33 ++++-- .../quantization/utils/marlin_utils_fp8.py | 14 +-- 13 files changed, 219 insertions(+), 49 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml new file mode 100644 index 0000000000000..42936fbfbe7d4 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.578 + - name: "exact_match,flexible-extract" + value: 0.585 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 1d1b0ed38671d..109692395acf6 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -5,3 +5,4 @@ Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml +Qwen2-1.5B-Instruct-FP8W8.yaml diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index c4d0c9cb981da..39d00bd5733ff 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -10,7 +10,8 @@ W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensorsScheme, CompressedTensorsUnquantized, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, CompressedTensorsWNA16) + CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, + CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, is_activation_quantization_format, @@ -100,14 +101,18 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": def get_config_filenames(cls) -> List[str]: return [] - def _check_scheme_supported(self, min_capability: int): + def _check_scheme_supported(self, + min_capability: int, + error: bool = True) -> bool: capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] - if capability < min_capability: + supported = capability >= min_capability + if error and not supported: raise RuntimeError( "Quantization scheme is not supported for 
", f"the current GPU. Min capability: {min_capability}. ", f"Current capability: {capability}.") + return supported def _is_static_tensor_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -170,6 +175,29 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel, # All conditions satisfied. return True + def _is_fp8_w8a16(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + # Confirm weights quantized. + if weight_quant is None: + return False + + # Confirm we have floating points. + if weight_quant.type != QuantizationType.FLOAT: + return False + + # Confirm weight scheme is supported. + is_symmetric_weight = weight_quant.symmetric + is_static_weight = not weight_quant.dynamic + is_per_tensor_or_channel_weight = (weight_quant.strategy in [ + QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL + ]) + if not (is_symmetric_weight and is_static_weight + and is_per_tensor_or_channel_weight): + return False + + # All conditions satisfied. + return True + def _is_wNa16_group_channel(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: input_quant_none = input_quant is None @@ -204,9 +232,23 @@ def _get_scheme_from_parts( # Detect If Activation Quantization. if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Fp8( + is_fp8_w8a8_supported = self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + if is_fp8_w8a8_supported: + return CompressedTensorsW8A8Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=(not input_quant.dynamic)) + else: + return CompressedTensorsW8A16Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) + + if self._is_fp8_w8a16(weight_quant, input_quant): + return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(not input_quant.dynamic)) + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) if self._is_static_tensor_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8( @@ -257,11 +299,10 @@ def get_scheme( targets=self.target_scheme_map.keys()) # Find the quant_scheme - scheme = self.target_scheme_map[matched_target] - - return self._get_scheme_from_parts( - weight_quant=scheme["weights"], - input_quant=scheme["input_activations"]) + scheme_dict = self.target_scheme_map[matched_target] + scheme = self._get_scheme_from_parts( + weight_quant=scheme_dict["weights"], + input_quant=scheme_dict["input_activations"]) # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index dd94c49827f62..ca9e286ce5b2d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -4,6 +4,7 @@ CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 +from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS, CompressedTensorsWNA16) @@ -11,6 +12,7 @@ "CompressedTensorsScheme", "CompressedTensorsUnquantized", "CompressedTensorsWNA16", + "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24", "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py index d5f37b47bb87e..b4bab33e1fb1d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -12,8 +12,9 @@ class CompressedTensorsScheme(ABC): of different quantization schemes supported by CompressedTensors. """ + @classmethod @abstractmethod - def get_min_capability(self) -> int: + def get_min_capability(cls) -> int: """ Get minimum device capability. """ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index 6203f02d25e90..b7ba29ddc9840 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -18,7 +18,8 @@ class CompressedTensorsUnquantized(CompressedTensorsScheme): in a linear transformation. 
""" - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # volta and up return 70 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index eec523d00372c..b8ffb22d7a89d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -29,7 +29,8 @@ def __init__(self, raise ValueError( "group_size must be given when using strategy group") - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # ampere + up return 80 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py new file mode 100644 index 0000000000000..eeb7c042e1d1f --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -0,0 +1,105 @@ +from typing import Callable, List, Optional + +import torch + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, create_per_channel_scale_param, + create_per_tensor_scale_param) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW8A16Fp8"] + +SUPPORTED_STRATEGIES = [ + QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR +] + + +class CompressedTensorsW8A16Fp8(CompressedTensorsScheme): + + def __init__(self, strategy: str, is_static_input_scheme: bool): + self.strategy = strategy + self.is_static_input_scheme = is_static_input_scheme + + @classmethod + def get_min_capability(cls) -> int: + # ampere and up + return 80 + + # W8A8-Fp8 kernels support only per-tensor and per-channel cases. + # So if we have a fused module (QKV, MLP) with per tensor scales, + # we expand each scale to its shard's channels. 
+ def process_weights_after_loading(self, layer) -> None: + if self.strategy == QuantizationStrategy.TENSOR: + ws_channelwise = convert_to_channelwise(layer.weight_scale, + layer.logical_widths) + layer.weight_scale = torch.nn.Parameter(ws_channelwise, + requires_grad=False) + + # Weights must be transposed for marlin + layer.weight = torch.nn.Parameter(layer.weight.t(), + requires_grad=False) + + prepare_fp8_layer_for_marlin(layer, strategy="channel") + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + + # WEIGHT + weight = torch.nn.Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + requires_grad=False) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + }) + + # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} + if self.strategy == QuantizationStrategy.CHANNEL: + weight_scale = create_per_channel_scale_param( + output_partition_sizes, **layer_kwargs) + elif self.strategy == QuantizationStrategy.TENSOR: + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + else: + raise ValueError( + f"Unsupported weight strategy={self.strategy}, " + f"supported strategies are {SUPPORTED_STRATEGIES}") + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE (to deal with converted checkpoints) + if self.is_static_input_scheme: + input_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("input_scale", input_scale) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return apply_fp8_marlin_linear(input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + workspace=layer.workspace, + size_n=layer.output_size_per_partition, + size_k=layer.input_size_per_partition, + bias=bias) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 51156a3bc07af..cc9d71db140c2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -23,7 +23,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool): self.is_static_input_scheme = is_static_input_scheme self.cutlass_fp8_supported = cutlass_fp8_supported() - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # lovelace and up return 89 @@ -77,19 +78,20 @@ def create_weights(self, layer: torch.nn.Module, }) # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: weight_scale = create_per_channel_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) else: assert self.strategy == QuantizationStrategy.TENSOR weight_scale = 
create_per_tensor_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: input_scale = create_per_tensor_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) layer.register_parameter("input_scale", input_scale) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index e81496c89ac7f..3a80863d3abbe 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -19,7 +19,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # turing and up return 75 @@ -68,19 +69,19 @@ def create_weights(self, layer: torch.nn.Module, # WEIGHT SCALE layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: - scale = create_per_channel_scale_param(output_partition_sizes, - **layer_kwargs) + weight_scale = create_per_channel_scale_param( + output_partition_sizes, **layer_kwargs) else: assert self.strategy == QuantizationStrategy.TENSOR - scale = create_per_tensor_scale_param(output_partition_sizes, - **layer_kwargs) - layer.register_parameter("weight_scale", scale) + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: - scale = create_per_tensor_scale_param(output_partition_sizes, - **layer_kwargs) - layer.register_parameter("input_scale", scale) + input_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("input_scale", input_scale) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index e4cf0c0b5d95b..996cba315c556 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -42,7 +42,8 @@ def __init__(self, group_size=self.group_size, is_sym=True) - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # ampere and up return 80 diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 3a4f2a49a3497..6649b317ca838 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -18,8 +18,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - all_close_1d, apply_fp8_linear, create_per_tensor_scale_param, - cutlass_fp8_supported, per_tensor_dequantize, requantize_with_max_scale) + all_close_1d, 
apply_fp8_linear, convert_to_channelwise, + create_per_tensor_scale_param, cutlass_fp8_supported, + per_tensor_dequantize, requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import print_warning_once @@ -179,19 +180,29 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.input_scale = None - # If checkpoint is fp8, requantize the separately quantized logical - # weights into a single fp8 weight with a single weight scale. + # If checkpoint is fp8, handle that there are N scales for N + # shards in a fused module else: - # Dequant -> Quant with max scale. - max_w_scale, weight = requantize_with_max_scale( - weight=layer.weight, - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths, - ) + # If using marlin (w8a16), kernel uses channelwise weights, + # so extend the weight scales to be channelwise. + if self.use_marlin: + weight = layer.weight + weight_scale = convert_to_channelwise(layer.weight_scale, + layer.logical_widths) + + # If using w8a8, torch._scaled_mm needs per tensor, so + # requantize the logical shards as a single weight. + else: + # Dequant -> Quant with max scale so we can run per tensor. + weight_scale, weight = requantize_with_max_scale( + weight=layer.weight, + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ) # Update layer with new values. layer.weight = Parameter(weight.t(), requires_grad=False) - layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) if self.quant_config.activation_scheme == "static": layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index c878939580f10..5f9d8658a342f 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -46,7 +46,8 @@ def apply_fp8_marlin_linear( return output.reshape(out_shape) -def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None: +def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, + strategy: str = "tensor") -> None: print_warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. 
Weight-only FP8 compression will " @@ -74,16 +75,7 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False) # WEIGHT SCALES - # Currently Marlin doesn't support per-tensor scales, so we - # expand it to channelwise - is_channelwise = (len(layer.weight_scale.shape) > 0 - and layer.weight_scale.shape[0] == part_size_n) - if is_channelwise: - scales = layer.weight_scale - else: - scales = layer.weight_scale.repeat(1, part_size_n) - scales = scales.to(layer.orig_dtype).to(device) - + scales = layer.weight_scale.to(layer.orig_dtype) # Permute scales marlin_scales = marlin_permute_scales(s=scales, size_k=part_size_k, From 65b1f121c885f169da210946eddb0d52524677f1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 25 Jul 2024 12:46:15 -0400 Subject: [PATCH 11/66] [Bugfix] Fix `kv_cache_dtype=fp8` without scales for FP8 checkpoints (#6761) --- tests/quantization/test_fp8.py | 12 ++++++++++-- vllm/model_executor/layers/quantization/kv_cache.py | 6 ++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 0602fedf0b8e3..ad92f1f189f65 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -60,12 +60,20 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") -def test_load_fp16_model(vllm_runner) -> None: - with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) +def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None: + with vllm_runner("facebook/opt-125m", + quantization="fp8", + kv_cache_dtype=kv_cache_dtype) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index c1495711447fa..d79536d196b92 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -46,10 +46,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: elif layer.k_scale < 0.0 and layer.v_scale < 0.0: # If no scales were loaded (both scales are invalid negative # values), use the default value of 1.0 - k_scale = torch.nn.Parameter(torch.tensor(1.0), - requires_grad=False) - v_scale = torch.nn.Parameter(torch.tensor(1.0), - requires_grad=False) + k_scale = 1.0 + v_scale = 1.0 else: # If we find a single kv_scale in the checkpoint, we remap # kv_scale to k_scale during weight loading, and duplicate From 95db75de64bec34f4d80acff92c62d1cdfa94688 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 25 Jul 2024 13:40:01 -0400 Subject: [PATCH 12/66] [Bugfix] Add synchronize to prevent possible data race (#6788) Co-authored-by: Lucas Wilkinson --- vllm/distributed/parallel_state.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py 
index 128096c88a8b1..e9c6fc3a255e4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -243,6 +243,13 @@ def graph_capture( ca_comm = self.ca_comm maybe_ca_context = nullcontext( ) if ca_comm is None else ca_comm.capture() + + # ensure all initialization operations complete before attempting to + # capture the graph on another stream + curr_stream = torch.cuda.current_stream() + if curr_stream != stream: + stream.wait_stream(curr_stream) + with torch.cuda.stream(stream), maybe_ca_context: # In graph mode, we have to be very careful about the collective # operations. The current status is: From 6a1e25b1514a25d3da96d0d78c4568f6e581e242 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 25 Jul 2024 11:57:16 -0700 Subject: [PATCH 13/66] [Doc] Add documentations for nightly benchmarks (#6412) --- .buildkite/nightly-benchmarks/README.md | 80 +++++++++++++++---- README.md | 2 +- docs/source/index.rst | 6 ++ .../performance_benchmark/benchmarks.rst | 23 ++++++ 4 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 docs/source/performance_benchmark/benchmarks.rst diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index c84e150934306..c1aebaf5b3bbe 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -3,30 +3,51 @@ ## Introduction -This directory contains the performance benchmarking CI for vllm. -The goal is to help developers know the impact of their PRs on the performance of vllm. +This directory contains two sets of benchmark for vllm. +- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance +- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. -This benchmark will be *triggered* upon: + +See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. + + +## Performance benchmark quick overview + +**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models. + +**Benchmarking Duration**: about 1hr. + +**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. + + +## Nightly benchmark quick overview + +**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. + +**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. + +**Benchmarking Duration**: about 3.5hrs. + + + +## Trigger the benchmark + +Performance benchmark will be triggered when: - A PR being merged into vllm. - Every commit for those PRs with `perf-benchmarks` label. -**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for more GPUs is comming later), with different models. +Nightly benchmark will be triggered when: +- Every commit for those PRs with `nightly-benchmarks` label. -**Benchmarking Duration**: about 1hr. -**For benchmarking developers**: please try your best to constraint the duration of benchmarking to less than 1.5 hr so that it won't take forever to run. 
-## Configuring the workload +## Performance benchmark details -The benchmarking workload contains three parts: -- Latency tests in `latency-tests.json`. -- Throughput tests in `throughput-tests.json`. -- Serving tests in `serving-tests.json`. +See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. -See [descriptions.md](tests/descriptions.md) for detailed descriptions. -### Latency test +#### Latency test Here is an example of one test inside `latency-tests.json`: @@ -54,12 +75,12 @@ Note that the performance numbers are highly sensitive to the value of the param WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. -### Throughput test +#### Throughput test The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. -### Serving test +#### Serving test We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: ``` @@ -96,9 +117,36 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. -## Visualizing the results +#### Visualizing the results The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. + + + +## Nightly test details + +See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. + + +#### Workflow + +- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. +- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. +- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. +- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. + +#### Nightly tests + +In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. 
+ +#### Docker containers + +The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. + +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. + +WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). + diff --git a/README.md b/README.md index 8e508195cdceb..a9215f4c7e1c5 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ vLLM is fast with: - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Optimized CUDA kernels -**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). +**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). vLLM is flexible and easy to use with: diff --git a/docs/source/index.rst b/docs/source/index.rst index ded9a424ee68c..8f06f2f2e5469 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -117,6 +117,12 @@ Documentation automatic_prefix_caching/apc automatic_prefix_caching/details +.. toctree:: + :maxdepth: 1 + :caption: Performance benchmarks + + performance_benchmark/benchmarks + .. toctree:: :maxdepth: 2 :caption: Developer Documentation diff --git a/docs/source/performance_benchmark/benchmarks.rst b/docs/source/performance_benchmark/benchmarks.rst new file mode 100644 index 0000000000000..9a23aab10d03d --- /dev/null +++ b/docs/source/performance_benchmark/benchmarks.rst @@ -0,0 +1,23 @@ +.. _benchmarks: + +Benchmark suites of vLLM +======================== + + + +vLLM contains two sets of benchmarks: + ++ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results. + ++ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_. + + +Trigger a benchmark +------------------- + +The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`. + + +.. note:: + + Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions on benchmark environment, workload and metrics. 
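Note on PATCH 13 above: the README describes how `convert-results-json-to-markdown.py` turns the per-test JSON results into a markdown table. A rough sketch of that general idea follows (the directory layout and field names below are assumptions for illustration, not the script's actual schema):

```python
# Rough sketch of the "JSON results -> markdown table" step mentioned above.
# The directory layout and field names here are assumptions for illustration,
# not the real schema consumed by convert-results-json-to-markdown.py.
import json
from pathlib import Path


def results_to_markdown(results_dir: Path) -> str:
    rows = ["| test | metric | value |", "| --- | --- | --- |"]
    for path in sorted(results_dir.glob("*.json")):
        data = json.loads(path.read_text())
        for metric, value in data.items():
            rows.append(f"| {path.stem} | {metric} | {value} |")
    return "\n".join(rows)


if __name__ == "__main__":
    print(results_to_markdown(Path("benchmark-results")))
```
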
From cd7edc4e8726d4b87e121f9ec671ecb6dd0c45d6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 25 Jul 2024 18:05:09 -0400 Subject: [PATCH 14/66] [Bugfix] Fix empty (nullptr) channelwise scales when loading wNa16 using compressed tensors (#6798) --- .../schemes/compressed_tensors_wNa16.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 996cba315c556..a41962ccd66d8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -55,7 +55,12 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, output_size_per_partition = sum(output_partition_sizes) # If group_size is -1, we are in channelwise case. - group_size = input_size if self.group_size == -1 else self.group_size + channelwise = (self.group_size == -1) + group_size = input_size if channelwise else self.group_size + row_parallel = (input_size != input_size_per_partition) + # In the case of channelwise quantization, we need to replicate the + # scales across all gpus. + partition_scales = (row_parallel and not channelwise) verify_marlin_supports_shape( output_size_per_partition=output_size_per_partition, @@ -66,8 +71,8 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, weight_scale_dim = None scales_and_zp_size = input_size // group_size - if (input_size != input_size_per_partition - and self.group_size is not None): + if partition_scales: + assert input_size_per_partition % group_size == 0 weight_scale_dim = 1 scales_and_zp_size = input_size_per_partition // group_size From f3ff63c3f45974986f13f60647a258b09913c420 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 15:38:32 -0700 Subject: [PATCH 15/66] [doc][distributed] improve multinode serving doc (#6804) --- docs/source/serving/distributed_serving.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 4fc36a680084c..5f14fd2b0ee0a 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -79,7 +79,7 @@ On the rest of the worker nodes, run the following command: $ --worker \ $ /path/to/the/huggingface/home/in/this/node -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. +Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. 
@@ -101,7 +101,7 @@ You can also use tensor parallel without pipeline parallel, just set the tensor To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. .. warning:: - After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. + After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion `_ for more information. .. warning:: From b7215de2c5fcdf8af96cf941556d63934ea8f353 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 25 Jul 2024 16:47:55 -0700 Subject: [PATCH 16/66] [Docs] Publish 5th meetup slides (#6799) --- README.md | 10 +--------- docs/source/community/meetups.rst | 1 + 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a9215f4c7e1c5..5f23f0813f606 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,8 @@ Easy, fast, and cheap LLM serving for everyone --- -**The Fifth vLLM Bay Area Meetup (July 24th 5pm-8pm PT)** - -We are excited to announce our fifth vLLM Meetup! -Join us to hear the vLLM's recent updates and the upcoming roadmap. -Additionally, our collaborators from AWS will be presenting their insights and experiences in deploying vLLM. -Register now [here](https://lu.ma/lp0gyjqr) and be part of the event! - ---- - *Latest News* πŸ”₯ +- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). 
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index 0fde31ef9b059..3b01b109ebf2c 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ - `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ - `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ From 1adddb14bf0e1a603581bca49e8d29e8bfb337dc Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Thu, 25 Jul 2024 16:53:25 -0700 Subject: [PATCH 17/66] [Core] Fix ray forward_dag error mssg (#6792) --- vllm/executor/ray_gpu_executor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e4aaeaa24c1bc..564fa79acfd40 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -29,6 +29,7 @@ class RayGPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: + self.forward_dag: Optional["ray.dag.CompiledDAG"] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. @@ -60,8 +61,6 @@ def _init_executor(self) -> None: # Create the parallel GPU workers. self._init_workers_ray(placement_group) - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None - def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> Dict[str, Any]: # If nsight profiling is enabled, we need to set the profiling From 443c7cf4cf891e6957d4b31655e58cabceb5a2a7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 17:44:09 -0700 Subject: [PATCH 18/66] [ci][distributed] fix flaky tests (#6806) --- tests/distributed/test_pipeline_parallel.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index d666b8a1d44bd..5ff39ddfbf996 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,3 +1,10 @@ +""" +WARNING: This test runs in both single-node (4 GPUs) and multi-node + (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is + important to set the distributed backend to "mp" to avoid Ray scheduling + all workers in a node other than the head node, which can cause the test + to fail. +""" import os import pytest @@ -78,7 +85,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): "--pipeline-parallel-size", str(PP_SIZE), "--distributed-executor-backend", - "ray", + "mp", ] os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND From 2eb9f4ff262bb39859baebf8d2109abcdadee860 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Thu, 25 Jul 2024 18:08:33 -0700 Subject: [PATCH 19/66] [ci] Mark tensorizer as soft fail and separate from grouped test (#6810) [ci] Mark tensorizer test as soft fail and separate it from grouped test in fast check (#6810) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e7dd1fdb2e660..633bc5ca95bf9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -17,11 +17,10 @@ steps: - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker -- label: Tensorizer, Metrics, Tracing Test +- label: Metrics, Tracing Test fast_check: true fast_check_only: true commands: - - apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer - pytest -v -s metrics # Metrics - "pip install \ opentelemetry-sdk \ @@ -221,6 +220,8 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] + soft_fail: true + fast_check: true commands: - apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn From 062a1d0fab111723ab768f94bdd48a6adc054007 Mon Sep 17 00:00:00 2001 From: QQSong Date: Thu, 25 Jul 2024 19:24:58 -0700 Subject: [PATCH 20/66] Fix ReplicatedLinear weight loading (#6793) --- vllm/model_executor/layers/linear.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0e0a2b72f93d4..b6e280ae65049 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -199,12 +199,16 @@ def __init__(self, self.input_size, self.output_size, self.params_dtype, + weight_loader=self.weight_loader, prefix=prefix) if bias: self.bias = Parameter( torch.empty(self.output_size, dtype=self.params_dtype)) - set_weight_attrs(self.bias, {"output_dim": 0}) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) else: self.register_parameter("bias", None) From 084a01fd3544557990f8af8af6fd3c1185bae848 Mon Sep 17 00:00:00 2001 From: Anthony Platanios Date: Fri, 26 Jul 2024 00:25:35 -0400 Subject: [PATCH 21/66] [Bugfix] [Easy] Fixed a bug in the multiprocessing GPU executor. 
(#6770) --- vllm/executor/multiproc_gpu_executor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 9811fc2a55199..19f7a497cdd9f 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -1,6 +1,7 @@ import asyncio import os import signal +import threading import weakref from functools import partial from typing import Any, List, Optional @@ -115,8 +116,9 @@ def shutdown(signum, frame): if executor := ref(): executor.shutdown() - signal.signal(signal.SIGINT, shutdown) - signal.signal(signal.SIGTERM, shutdown) + if threading.current_thread() is threading.main_thread(): + signal.signal(signal.SIGINT, shutdown) + signal.signal(signal.SIGTERM, shutdown) self.driver_worker = self._create_worker( distributed_init_method=distributed_init_method) From 89a84b0bb7b30706a02836234a94493ea8f780bf Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Fri, 26 Jul 2024 12:31:31 +0800 Subject: [PATCH 22/66] [Core] Use array to speedup padding (#6779) --- vllm/model_executor/layers/sampler.py | 2 +- vllm/model_executor/sampling_metadata.py | 21 ++++++++++++--------- vllm/sequence.py | 23 ++++++++++++++++------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c376797a054f..121458f8156a1 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -220,7 +220,7 @@ def _apply_min_tokens_penalty( seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] - if len(seq_data.output_token_ids) < min_tokens: + if len(seq_data.output_token_ids_array) < min_tokens: seqs_to_penalize.append(j) if seqs_to_penalize: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 390b5d173ebcd..27b37a9d53470 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,4 +1,5 @@ import random +from array import array from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -329,8 +330,8 @@ def from_sampling_metadata( user-defined seed for each sequence. extra_entropy: extra entropy to use when generating seeds. 
""" - prompt_tokens: List[List[int]] = [] - output_tokens: List[List[int]] = [] + prompt_tokens: List[array] = [] + output_tokens: List[array] = [] top_ks: List[int] = [] temperatures: List[float] = [] top_ps: List[float] = [] @@ -432,13 +433,15 @@ def from_sampling_metadata( if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) - prompt_tokens.extend([] for _ in range(prefill_len)) - output_tokens.extend([] for _ in range(prefill_len)) + prompt_tokens.extend( + array('l') for _ in range(prefill_len)) + output_tokens.extend( + array('l') for _ in range(prefill_len)) if seq_group.do_sample: for seq_id in seq_ids: seq_data = seq_group.seq_data[seq_id] - prompt_tokens.append(list(seq_data.prompt_token_ids)) - output_tokens.append(list(seq_data.output_token_ids)) + prompt_tokens.append(seq_data.prompt_token_ids_array) + output_tokens.append(seq_data.output_token_ids_array) sampling_tensors = SamplingTensors.from_lists( temperatures, top_ps, top_ks, min_ps, presence_penalties, @@ -454,9 +457,9 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], frequency_penalties: List[float], repetition_penalties: List[float], sampling_seeds: List[int], sample_indices: List[int], - prompt_tokens: List[List[int]], - output_tokens: List[List[int]], vocab_size: int, - extra_seeds_to_generate: int, device: torch.device, + prompt_tokens: List[array], output_tokens: List[array], + vocab_size: int, extra_seeds_to_generate: int, + device: torch.device, dtype: torch.dtype) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. diff --git a/vllm/sequence.py b/vllm/sequence.py index 0cd4c7e71d78d..72821ecea0f47 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -3,6 +3,7 @@ import enum import math from abc import ABC, abstractmethod +from array import array from collections import defaultdict from dataclasses import dataclass, field from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Set, Tuple, @@ -119,10 +120,10 @@ def __init__( prompt_token_ids: List[int], output_token_ids: Optional[List[int]] = None, ) -> None: - self._prompt_token_ids: List[int] = list(prompt_token_ids) + self._prompt_token_ids = array('l', prompt_token_ids) self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids) - self._output_token_ids: List[int] = ( - list(output_token_ids) if output_token_ids is not None else []) + self._output_token_ids = array( + 'l', output_token_ids if output_token_ids is not None else []) self.cumulative_logprob = 0.0 # The number of tokens that are computed (that run against the model). 
@@ -132,8 +133,8 @@ def __init__( self._update_cached_all_tokens() def _update_cached_all_tokens(self): - self._cached_all_token_ids: List[int] = (self._prompt_token_ids + - self._output_token_ids) + self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + + self._output_token_ids) @property def prompt_token_ids(self) -> Tuple[int, ...]: @@ -141,19 +142,27 @@ def prompt_token_ids(self) -> Tuple[int, ...]: @prompt_token_ids.setter def prompt_token_ids(self, new_prompt_token_ids) -> None: - self._prompt_token_ids = list(new_prompt_token_ids) + self._prompt_token_ids = array('l', new_prompt_token_ids) self._prompt_token_ids_tuple = tuple(new_prompt_token_ids) self._update_cached_all_tokens() + @property + def prompt_token_ids_array(self) -> array: + return self._prompt_token_ids + @property def output_token_ids(self) -> Tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter def output_token_ids(self, new_output_token_ids) -> None: - self._output_token_ids = list(new_output_token_ids) + self._output_token_ids = array('l', new_output_token_ids) self._update_cached_all_tokens() + @property + def output_token_ids_array(self) -> array: + return self._output_token_ids + def append_token_id(self, token_id: int, logprob: float) -> None: self._output_token_ids.append(token_id) self._cached_all_token_ids.append(token_id) From 85ad7e2d012edd87de9e84e93ed3204c80599695 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 21:48:05 -0700 Subject: [PATCH 23/66] [doc][debugging] add known issues for hangs (#6816) --- docs/source/getting_started/debugging.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 2aa52e79888a3..d7066f2325b3a 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -65,6 +65,10 @@ Here are some common issues that can cause hangs: If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. +Some known issues: + +- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_ . + .. warning:: After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. 
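The padding speedup in the "[Core] Use array to speedup padding" change above comes from storing token ids in compact array('l') buffers rather than lists of boxed Python ints, so later tensor construction copies one contiguous run of C longs instead of walking an object list. A minimal standalone sketch of the idea (standard library plus PyTorch only; the variable names here are illustrative, not vLLM's actual classes):

    from array import array

    import torch

    # Compact per-sequence token storage, mirroring the array-based fields
    # introduced above: typecode 'l' holds signed C longs contiguously.
    prompt_token_ids = array('l', [101, 2023, 2003, 1037, 7953])
    output_token_ids = array('l')

    # Appending a sampled token stays O(1), just like list.append().
    output_token_ids.append(102)

    # Concatenating two arrays is a single contiguous copy, and the result
    # can be handed directly to torch.tensor() when building the padded
    # prompt/output token tensors.
    all_token_ids = prompt_token_ids + output_token_ids
    token_tensor = torch.tensor(all_token_ids, dtype=torch.long)
    print(token_tensor)

Keeping the array form all the way down to tensor construction avoids repeated list copies on the hot sampling path, which is the point of the patch.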
From 07278c37ddd898d842bbddc382e4f67ac08dae35 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Jul 2024 14:33:42 -0400 Subject: [PATCH 24/66] [Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) (#6611) --- .../configs/Minitron-4B-Base.yaml | 11 + .../lm-eval-harness/configs/models-small.txt | 1 + vllm/model_executor/layers/activation.py | 16 + .../model_executor/layers/rotary_embedding.py | 3 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/nemotron.py | 531 ++++++++++++++++++ vllm/transformers_utils/config.py | 3 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/nemotron.py | 209 +++++++ 9 files changed, 776 insertions(+), 1 deletion(-) create mode 100644 .buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml create mode 100644 vllm/model_executor/models/nemotron.py create mode 100644 vllm/transformers_utils/configs/nemotron.py diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml new file mode 100644 index 0000000000000..a0466748ea71e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1 +model_name: "nvidia/Minitron-4B-Base" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.252 + - name: "exact_match,flexible-extract" + value: 0.252 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 109692395acf6..e4df4b547aa5e 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -4,5 +4,6 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +Minitron-4B-Base.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-FP8W8.yaml diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5bfdba67b443d..6578193a31597 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -159,6 +159,21 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: +class ReLUSquaredActivation(CustomOp): + """ + Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 + """ + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + relu_applied = nn.functional.relu(x) + squared = torch.square(relu_applied) + return squared + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
@@ -207,6 +222,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): "gelu_new": NewGELU(), "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), "relu": nn.ReLU(), + "relu2": ReLUSquaredActivation(), "quick_gelu": QuickGELU(), } diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 60ba4623edc38..aecba0ae74911 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -774,6 +774,7 @@ def get_rope( is_neox_style: bool = True, rope_scaling: Optional[Dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, + rotary_percent: float = 1.0, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() @@ -786,6 +787,8 @@ def get_rope( rope_scaling_args = tuple(rope_scaling_tuple.items()) else: rope_scaling_args = None + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) key = (head_size, rotary_dim, max_position, base, is_neox_style, rope_scaling_args, dtype) if key in _ROPE_DICT: diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 7df5b8fa64710..ead64c0e92553 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -51,6 +51,7 @@ "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPMV": ("minicpmv", "MiniCPMV"), + "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py new file mode 100644 index 0000000000000..bb85f20ab9802 --- /dev/null +++ b/vllm/model_executor/models/nemotron.py @@ -0,0 +1,531 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Nemotron model compatible with HuggingFace weights.""" +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.transformers_utils.configs import NemotronConfig + +from .interfaces import SupportsLoRA +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers + +# The architecture is pretty similar to Llama, with these changes: +# - There is no gate_proj, just up_proj +# - Normal LayerNorm (with a +1 to the weights) instead of RMSNorm +# - Squared ReLU instead of SwiGLU +# - Adds a rotary_percent to RoPE + + +def _cast_if_autocast_enabled(*args): + if not torch.is_autocast_enabled(): + return args + else: + return torch.cuda.amp.autocast_mode._cast( + args, torch.get_autocast_gpu_dtype()) + + +class NemotronLayerNorm1P(nn.LayerNorm): + + def __init__(self, + normalized_shape: Union[int, List[int], torch.Size], + eps: float = 1e-5, + elementwise_affine: bool = True, + bias: bool = True, + device=None, + dtype=None): + super().__init__(normalized_shape, eps, elementwise_affine, bias, + device, dtype) + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if residual is not None: + x = x + residual + residual = x + args = _cast_if_autocast_enabled(x, self.normalized_shape, + self.weight + 1, self.bias, self.eps) + with torch.cuda.amp.autocast(enabled=False): + x = torch.nn.functional.layer_norm(*args) + return x if residual is None else (x, residual) + + +class NemotronMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.up_proj = ColumnParallelLinear(input_size=hidden_size, + output_size=intermediate_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_fn = get_act_fn(hidden_act) + + def forward(self, x): + up, _ = self.up_proj(x) + x = self.act_fn(up) + x, _ = self.down_proj(x) + return x + + +class NemotronAttention(nn.Module): + + def __init__( + self, + config: NemotronConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 
8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr(config, "head_dim", + self.hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.rotary_percent = config.rope_percent + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + rotary_percent=self.rotary_percent, + ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class NemotronDecoderLayer(nn.Module): + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + self.self_attn = NemotronAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + 
num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = NemotronMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = NemotronLayerNorm1P(config.hidden_size, + eps=config.norm_eps) + self.post_attention_layernorm = NemotronLayerNorm1P( + config.hidden_size, eps=config.norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class NemotronModel(nn.Module): + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + else: + self.embed_tokens = PPMissingLayer() + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: NemotronDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers") + if get_pp_group().is_last_rank: + self.norm = NemotronLayerNorm1P(config.hidden_size, + eps=config.norm_eps) + else: + self.norm = PPMissingLayer() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + 
hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class NemotronForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "up_proj", "down_proj", "embed_tokens", "lm_head" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + } + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + + assert isinstance(config, NemotronConfig) + + self.config = config + self.lora_config = lora_config + + self.model = NemotronModel(config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model") + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + self.sampler = Sampler() + else: + self.lm_head = PPMissingLayer() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) + return model_output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", 
".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 652505a892142..3ba2e01985598 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -8,7 +8,7 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, JAISConfig, MedusaConfig, MLPSpeculatorConfig, MPTConfig, - RWConfig) + NemotronConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -26,6 +26,7 @@ "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, + "nemotron": NemotronConfig, } for name, cls in _CONFIG_REGISTRY.items(): diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 51de11ca3e42a..1750950b3c38b 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -8,6 +8,7 @@ from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig +from vllm.transformers_utils.configs.nemotron import NemotronConfig __all__ = [ "ChatGLMConfig", @@ -17,4 +18,5 @@ "JAISConfig", "MedusaConfig", "MLPSpeculatorConfig", + "NemotronConfig", ] diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py new file mode 100644 index 0000000000000..a22a9f475dda9 --- /dev/null +++ b/vllm/transformers_utils/configs/nemotron.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Nemotron model configuration""" + +from transformers import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class NemotronConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a + [`NemotronModel`]. It is used to instantiate an Nemotron model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the Nemotron-8B. + + Configuration objects inherit from [`PretrainedConfig`] and can be + used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Nemotron model. Defines the number of + different tokens that can be represented by the + `inputs_ids` passed when calling [`NemotronModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the + Transformer decoder. + head_dim (`int`, *optional*, defaults to None): + Projection weights dimension in multi-head attention. Set to + hidden_size // num_attention_heads if None + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to + implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use + Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention + (MQA) otherwise GQA is used. When converting a multi-head + checkpoint to a GQA checkpoint, each group key and value + head should be constructed by meanpooling all the original + heads within that group. For more details checkout + [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it + is not specified, will default to `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the + decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used + with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values + attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE + embeddings. Currently supports two scaling strategies: linear + and dynamic. 
Their scaling factor must be a float greater than 1. + The expected format is `{"type": strategy name, + "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output + projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj and down_proj layers in the MLP + layers. + + ```python + >>> from transformers import NemotronModel, NemotronConfig + + >>> # Initializing a Nemotron nemotron-15b style configuration + >>> configuration = NemotronConfig() + + >>> # Initializing a model from the nemotron-15b style configuration + >>> model = NemotronModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "nemotron" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=256000, + hidden_size=6144, + intermediate_size=24576, + num_hidden_layers=32, + num_attention_heads=48, + head_dim=None, + num_key_value_heads=None, + hidden_act="relu2", + max_position_embeddings=4096, + initializer_range=0.0134, + norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=2, + eos_token_id=3, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + rope_percent=0.5, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + head_dim = head_dim or kwargs.get("kv_channels", None) + self.head_dim = head_dim if head_dim is not None else ( + hidden_size // num_attention_heads) + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_eps = norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + rope_percent = rope_percent or kwargs.get("rope_percentage", None) + self.rope_percent = rope_percent + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, + dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with two fields, " + f"`type` and `factor`, got {self.rope_scaling}") + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in [ + "linear", "dynamic" + ]: + raise ValueError( + "`rope_scaling`'s type field must be one of ['linear', " + f"'dynamic'], got {rope_scaling_type}") + if rope_scaling_factor is None or not isinstance( + rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError( + "`rope_scaling`'s factor field must be a float > 1, got " + f"{rope_scaling_factor}") From 50704f52c4643777fb0e5dc99f6c048dd9f54f2d Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 26 Jul 2024 14:41:04 -0400 Subject: [PATCH 25/66] [Bugfix][Kernel] Promote another index to int64_t (#6838) --- csrc/quantization/fp8/common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 090f95d1bda71..6dae32b25f9c4 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -48,7 +48,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, const scalar_t* __restrict__ input, int64_t num_elems) { __shared__ float cache[1024]; - int i = blockDim.x * blockIdx.x + threadIdx.x; + int64_t i = blockDim.x * blockIdx.x + threadIdx.x; // First store maximum for all values processes by // the current thread in cache[threadIdx.x] From 71734f1bf263ed4877e928d7d9c4522d12e9c61f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 12:28:32 -0700 Subject: [PATCH 26/66] [Build/CI][ROCm] Minor simplification to Dockerfile.rocm (#6811) --- Dockerfile.rocm | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index ff39791456398..7b4c0166a04bd 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -53,9 +53,9 @@ RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(whic # Install torch == 2.5.0 on ROCm RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ - python3 -m pip uninstall -y torch torchaudio torchvision \ + python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \ + torch==2.5.0.dev20240710 \ torchvision==0.20.0.dev20240710 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac @@ -127,13 +127,6 @@ FROM base AS final # Import the vLLM development directory from the build context COPY . . -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
-# Manually remove it so that later steps of numpy upgrade can continue -RUN case "$(which python3)" in \ - *"/opt/conda/envs/py_3.9"*) \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ - *) ;; esac - # Package upgrades for useful functionality or to avoid dependency issues RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade numba scipy huggingface-hub[cli] From aa4867791ecd73a5f55b7bad4d9372954e661fe4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 12:39:49 -0700 Subject: [PATCH 27/66] [Misc][TPU] Support TPU in initialize_ray_cluster (#6812) --- vllm/executor/ray_utils.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index fcbfa30d7a38a..58b864070f727 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_ip, is_hip, is_xpu +from vllm.utils import get_ip, is_hip, is_tpu, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -93,32 +93,38 @@ def initialize_ray_cluster( # Placement group is already set. return + device_str = "GPU" if not is_tpu() else "TPU" # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: # We are in a placement group bundles = current_placement_group.bundle_specs # Verify that we can use the placement group. - gpu_bundles = 0 + device_bundles = 0 for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) - if bundle_gpus > 1: + bundle_devices = bundle.get(device_str, 0) + if bundle_devices > 1: raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") - if bundle_gpus: - gpu_bundles += 1 - if parallel_config.world_size > gpu_bundles: + "Placement group bundle cannot have more than 1 " + f"{device_str}.") + if bundle_devices: + device_bundles += 1 + if parallel_config.world_size > device_bundles: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group." + f"Required number of devices: {parallel_config.world_size}. 
" + f"Total number of devices: {device_bundles}.") else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) - if parallel_config.world_size > num_gpus_in_cluster: + num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) + if parallel_config.world_size > num_devices_in_cluster: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group.") # Create a new placement group - placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size) + placement_group_specs = ([{ + device_str: 1 + }] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all From 3bbb4936dc5aa7737750410ab4b4647817dcf9a3 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 27 Jul 2024 04:50:10 +0800 Subject: [PATCH 28/66] [Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) --- .buildkite/run-cpu-test.sh | 28 +- Dockerfile.cpu | 9 +- cmake/cpu_extension.cmake | 4 + csrc/cpu/torch_bindings.cpp | 7 + csrc/cpu/utils.cpp | 65 +++++ .../getting_started/cpu-installation.rst | 55 +++- requirements-cpu.txt | 4 +- vllm/distributed/parallel_state.py | 3 + vllm/engine/async_llm_engine.py | 2 - vllm/envs.py | 8 +- vllm/executor/cpu_executor.py | 264 +++++++++++++++--- vllm/utils.py | 21 -- vllm/worker/cpu_model_runner.py | 7 +- vllm/worker/cpu_worker.py | 17 +- 14 files changed, 404 insertions(+), 90 deletions(-) create mode 100644 csrc/cpu/utils.cpp diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index a7678aae54644..21deec2bba973 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -3,26 +3,38 @@ set -ex # Try building the docker image -docker build -t cpu-test -f Dockerfile.cpu . -docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu . +numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image +# Run the image, setting --shm-size=4g for tensor parallel. 
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test + --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 + --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 # offline inference -docker exec cpu-test bash -c "python3 examples/offline_inference.py" docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test -docker exec cpu-test bash -c "cd tests; +docker exec cpu-test bash -c " pip install pytest Pillow protobuf - cd ../ pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + +# online inference +docker exec cpu-test bash -c " + export VLLM_CPU_KVCACHE_SPACE=10 + export VLLM_CPU_OMP_THREADS_BIND=48-92 + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f95d748f1e4be..c473ba431e680 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -2,8 +2,8 @@ FROM ubuntu:22.04 AS cpu-test-1 -RUN apt-get update -y \ - && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ +RUN apt-get update -y \ + && apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html @@ -13,8 +13,9 @@ RUN pip install intel-openmp ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD" +RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl RUN pip install --upgrade pip \ && pip install wheel packaging ninja "setuptools>=49.4.0" numpy @@ -25,7 +26,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm -RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... 
ARG VLLM_CPU_DISABLE_AVX512 diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 690559ee265e9..118f9b28e0ae3 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -83,6 +83,8 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +list(APPEND LIBS "numa") + # # Define extension targets @@ -95,6 +97,7 @@ set(VLLM_EXT_SRC "csrc/cpu/activation.cpp" "csrc/cpu/attention.cpp" "csrc/cpu/cache.cpp" + "csrc/cpu/utils.cpp" "csrc/cpu/layernorm.cpp" "csrc/cpu/pos_encoding.cpp" "csrc/cpu/torch_bindings.cpp") @@ -104,6 +107,7 @@ define_gpu_extension_target( DESTINATION vllm LANGUAGE CXX SOURCES ${VLLM_EXT_SRC} + LIBRARIES ${LIBS} COMPILE_FLAGS ${CXX_COMPILE_FLAGS} USE_SABI 3 WITH_SOABI diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 5be0e9810b5b9..7d549e271a30d 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -4,6 +4,8 @@ #include +void init_cpu_threads_env(const std::string& cpu_ids); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -107,4 +109,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { + // CPU utils + utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env); +} + REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp new file mode 100644 index 0000000000000..5782580baa861 --- /dev/null +++ b/csrc/cpu/utils.cpp @@ -0,0 +1,65 @@ +#include +#include +#include +#include + +#include "cpu_types.hpp" + +void init_cpu_threads_env(const std::string& cpu_ids) { + bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); + TORCH_CHECK(omp_cpu_mask->size > 0); + std::vector omp_cpu_ids; + omp_cpu_ids.reserve(omp_cpu_mask->size); + + constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp); + + for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) { + unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size]; + int i = 0; + while (group_mask) { + if (group_mask & 1) { + omp_cpu_ids.emplace_back(offset + i); + } + ++i; + group_mask >>= 1; + } + } + + // Memory node binding + if (numa_available() != -1) { + int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); + bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str()); + bitmask* src_mask = numa_get_membind(); + + int pid = getpid(); + + // move all existing pages to the specified numa node. + *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); + int page_num = numa_migrate_pages(pid, src_mask, mask); + if (page_num == -1) { + TORCH_CHECK(false, + "numa_migrate_pages failed. errno: " + std::to_string(errno)); + } + + // restrict memory allocation node. 
+ numa_set_membind(mask); + numa_set_strict(1); + } + + // OMP threads binding + omp_set_num_threads((int)omp_cpu_ids.size()); + torch::set_num_threads((int)omp_cpu_ids.size()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads()); +#pragma omp parallel for schedule(static, 1) + for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { + cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size); + size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size); + CPU_ZERO_S(size, mask); + CPU_SET_S(omp_cpu_ids[i], size, mask); + sched_setaffinity(0, sizeof(cpu_set_t), mask); + CPU_FREE(mask); + } + + numa_free_nodemask(omp_cpu_mask); +} diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 1c97515dbecd9..7fc469e06844f 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -10,6 +10,7 @@ Table of contents: #. :ref:`Requirements ` #. :ref:`Quick start using Dockerfile ` #. :ref:`Build from source ` +#. :ref:`Related runtime environment variables ` #. :ref:`Intel Extension for PyTorch ` #. :ref:`Performance tips ` @@ -47,7 +48,7 @@ Build from source .. code-block:: console $ sudo apt-get update -y - $ sudo apt-get install -y gcc-12 g++-12 + $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - Second, install Python packages for vLLM CPU backend building: @@ -71,6 +72,15 @@ Build from source - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. +.. _env_intro: + +Related runtime environment variables +------------------------------------- + +- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. + +- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. + .. _ipex_guidance: Intel Extension for PyTorch @@ -78,15 +88,11 @@ Intel Extension for PyTorch - `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. -- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed. - .. _cpu_backend_performance_tips: Performance tips ----------------- -- vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: .. 
code-block:: console @@ -96,11 +102,44 @@ Performance tips $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD $ python examples/offline_inference.py # run vLLM -- vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription. +- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: -- If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading. +.. code-block:: console + + $ export VLLM_CPU_KVCACHE_SPACE=40 + $ export VLLM_CPU_OMP_THREADS_BIND=0-29 + $ vllm serve facebook/opt-125m + +- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: + +.. code-block:: console -- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores and memory nodes, to avoid the remote memory node access. ``numactl`` is an useful tool for CPU core and memory binding on NUMA platform. Besides, ``--cpuset-cpus`` and ``--cpuset-mems`` arguments of ``docker run`` are also useful. + $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores + + # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. + CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ + 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 + 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 + 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 + 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 + 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 + 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 + 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 + 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 + 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 + 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 + 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 + 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 + 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 + 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 + 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + + # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 + $ export VLLM_CPU_OMP_THREADS_BIND=0-7 + $ python examples/offline_inference.py + +- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. 
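The VLLM_CPU_OMP_THREADS_BIND values documented above ("0-7", "0,1,2", "0-31|32-63") follow a small grammar: '|' separates tensor-parallel ranks, ',' separates individual entries, and '-' denotes an inclusive core range. A rough sketch of how such a string expands into per-rank core lists (illustrative only, not vLLM's parsing code; the real binding work happens in the native init_cpu_threads_env helper added in csrc/cpu/utils.cpp via numa_parse_cpustring):

    from typing import List

    def expand_cpu_binding(binding: str) -> List[List[int]]:
        """Expand "0-31|32-63" or "0,1,2" into one core-id list per rank."""
        per_rank: List[List[int]] = []
        for rank_spec in binding.split("|"):
            cores: List[int] = []
            for part in rank_spec.split(","):
                if "-" in part:
                    lo, hi = part.split("-")
                    cores.extend(range(int(lo), int(hi) + 1))  # inclusive range
                else:
                    cores.append(int(part))
            per_rank.append(cores)
        return per_rank

    # Rank 0 is bound to cores 0-31, rank 1 to cores 32-63.
    print(expand_cpu_binding("0-31|32-63"))

Note that the special default value "all" (see the envs.py change below) is not handled by this sketch.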
diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 754070df21c0a..a8ce104d83290 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,6 +2,6 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.3.1+cpu; platform_machine != "ppc64le" -torchvision == 0.18.1+cpu; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +torch == 2.4.0; platform_machine != "ppc64le" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e9c6fc3a255e4..58cae46d9af27 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -296,6 +296,9 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: pynccl_comm = self.pynccl_comm if (pynccl_comm is not None and not pynccl_comm.disabled): pynccl_comm.all_reduce(input_) + elif input_.is_cpu: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) else: torch.distributed.all_reduce(input_, group=self.device_group) return input_ diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 16b7bc64a2849..93cc319f11c42 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -410,8 +410,6 @@ def _get_executor_cls( from vllm.executor.tpu_executor import TPUExecutorAsync executor_class = TPUExecutorAsync elif engine_config.device_config.device_type == "cpu": - assert distributed_executor_backend is None, ( - "Distributed execution is not supported with the CPU backend.") from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync elif engine_config.device_config.device_type == "openvino": diff --git a/vllm/envs.py b/vllm/envs.py index 595992e51db87..f06b6d66ea6f4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,6 +29,7 @@ VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 + VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_OPENVINO_KVCACHE_SPACE: int = 0 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False @@ -241,11 +242,16 @@ def get_default_config_root(): "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), - # CPU key-value cache space + # (CPU backend only) CPU key-value cache space. # default is 4GB "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")), + # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", + # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. 
+ "VLLM_CPU_OMP_THREADS_BIND": + lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"), + # OpenVINO key-value cache space # default is 4GB "VLLM_OPENVINO_KVCACHE_SPACE": diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 23e429dac7232..3229e5ad20afa 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -1,16 +1,21 @@ -from typing import List, Set, Tuple +import os +from functools import partial +from typing import Any, Awaitable, List, Optional, Set, Tuple, Union import torch import vllm.envs as envs from vllm.config import CacheConfig, ModelConfig, SchedulerConfig from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, + ResultHandler, WorkerMonitor) from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) +from vllm.utils import (get_distributed_init_method, get_open_port, + get_vllm_instance_id, make_async) +from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -22,46 +27,173 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" assert self.lora_config is None, "cpu backend doesn't support LoRA" + + # + # Environment variables for CPU executor + # + + # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers + os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() + + # Disable torch async compiling which won't work with daemonic processes + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + + # Intel OpenMP setting + ld_prealod_str = os.getenv("LD_PRELOAD", "") + if "libiomp5.so" in ld_prealod_str: + # The time(milliseconds) that a thread should wait after + # completing the execution of a parallel region, before sleeping. + os.environ['KMP_BLOCKTIME'] = "1" + # Prevents the CPU to run into low performance state + os.environ['KMP_TPAUSE'] = "0" + # Provides fine granularity parallelism + os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" + + # To hint IPEX uses shared memory based AllReduce + os.environ["LOCAL_WORLD_SIZE"] = str( + self.parallel_config.tensor_parallel_size) + self.model_config = _verify_and_get_model_config(self.model_config) self.cache_config = _verify_and_get_cache_config(self.cache_config) self.scheduler_config = _verify_and_get_scheduler_config( self.scheduler_config) - # Instantiate the worker and load the model to CPU. - self._init_worker() - - def _init_worker(self): - from vllm.worker.cpu_worker import CPUWorker + # Multiprocessing-based executor does not support multi-node setting. + # Since it only works for single node, we can use the loopback address + # 127.0.0.1 for communication. 
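+        # A note on the process layout set up below: rank 0 acts as the
+        # driver worker, while any additional tensor-parallel ranks are
+        # launched as ProcessWorkerWrapper subprocesses and supervised by a
+        # shared ResultHandler/WorkerMonitor pair.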
+ ip = "127.0.0.1" + port = get_open_port() + self.distributed_init_method = get_distributed_init_method(ip, port) + + is_async = isinstance(self, CPUExecutorAsync) + + world_size = self.parallel_config.tensor_parallel_size + result_handler = ResultHandler() + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + self.workers = [] + + if is_async: + self.workers = [ + ProcessWorkerWrapper( + result_handler, + partial( + self._create_worker, + rank=rank, + local_rank=rank, + )) for rank in range(0, world_size) + ] + self.driver_worker = self.workers[0] + self.workers = self.workers[1:] + self.driver_method_invoker = _async_driver_method_invoker + else: + self.driver_worker = self._create_worker() + self.driver_method_invoker = _driver_method_invoker + + if world_size != 1: + self.workers = [ + ProcessWorkerWrapper( + result_handler, + partial( + self._create_worker, + rank=rank, + local_rank=rank, + )) for rank in range(1, world_size) + ] + + if world_size != 1 or is_async: + if is_async: + async_worker_list = self.workers + [self.driver_worker] + else: + async_worker_list = self.workers + self.worker_monitor = WorkerMonitor(async_worker_list, + result_handler) + result_handler.start() + self.worker_monitor.start() + + self._run_workers("init_device") + self._run_workers("load_model") + + def _create_worker( + self, + local_rank: int = 0, + rank: int = 0, + ): + worker_module_name = "vllm.worker.cpu_worker" + worker_class_name = "CPUWorker" + + wrapper = WorkerWrapperBase( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + ) - assert self.parallel_config.world_size == 1, ( - "CPUExecutor only supports single CPU socket currently.") + assert self.distributed_init_method is not None - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = CPUWorker( + kwargs = dict( model_config=self.model_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, device_config=self.device_config, cache_config=self.cache_config, load_config=self.load_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, + local_rank=local_rank, + rank=rank, + distributed_init_method=self.distributed_init_method, lora_config=self.lora_config, multimodal_config=self.multimodal_config, kv_cache_dtype=self.cache_config.cache_dtype, prompt_adapter_config=self.prompt_adapter_config, - is_driver_worker=True, + is_driver_worker=rank == 0, ) - self.driver_worker.init_device() - self.driver_worker.load_model() + wrapper.init_worker(**kwargs) + + return wrapper.worker + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers. + + Args: + async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than + blocking on the results. + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + # Start the workers first. + worker_outputs = [ + worker.execute_method(method, *args, **kwargs) + for worker in self.workers + ] + + if async_run_remote_workers_only: + # Just return futures + return worker_outputs + + driver_worker_output = self.driver_method_invoker( + self.driver_worker, method, *args, **kwargs) + + # Get the results of the workers. 
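+        # Driver output comes first; output.get() blocks until the
+        # corresponding remote worker has returned its result.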
+ return [driver_worker_output + ] + [output.get() for output in worker_outputs] def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available KV blocks by invoking the underlying worker. """ - return self.driver_worker.determine_num_available_blocks() + return self.driver_method_invoker(self.driver_worker, + "determine_num_available_blocks") def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: @@ -74,43 +206,95 @@ def initialize_cache(self, num_gpu_blocks: int, # referred as `gpu block`. Because we want to reuse the existing block # management procedure. logger.info("# CPU blocks: %d", num_gpu_blocks) - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = self.driver_worker.execute_model(execute_model_req) + if (self.parallel_config.tensor_parallel_size > 1 + and self.parallel_worker_tasks is None): + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + ) + output = self.driver_method_invoker(self.driver_worker, + "execute_model", execute_model_req) return output + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + """ + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + self.driver_method_invoker(self.driver_worker, "execute_model", None) + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + def add_lora(self, lora_request: LoRARequest) -> bool: - return self.driver_worker.add_lora(lora_request) + return all(self._run_workers("add_lora", lora_request)) def remove_lora(self, lora_id: int) -> bool: - return self.driver_worker.remove_lora(lora_id) + return all(self._run_workers("remove_lora", lora_id)) def pin_lora(self, lora_id: int) -> bool: - return self.driver_worker.pin_lora(lora_id) + assert lora_id > 0, "lora_id must be greater than 0." + return all(self._run_workers( + "pin_lora", + lora_id=lora_id, + )) def list_loras(self) -> Set[int]: - return self.driver_worker.list_loras() + return self.driver_method_invoker(self.driver_worker, "list_loras") def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return self.driver_worker.add_prompt_adapter(prompt_adapter_request) + return all( + self._run_workers( + "add_prompt_adapter", + prompt_adapter_request, + )) def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.driver_worker.remove_prompt_adapter(prompt_adapter_id) + return all( + self._run_workers( + "remove_prompt_adapter", + prompt_adapter_id, + )) def list_prompt_adapters(self) -> Set[int]: - return self.driver_worker.list_prompt_adapters() + return self.driver_method_invoker(self.driver_worker, + "list_prompt_adapters") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.driver_worker.pin_prompt_adapter(prompt_adapter_id) + return all(self._run_workers( + "pin_prompt_adapter", + prompt_adapter_id, + )) def check_health(self) -> None: - # CPUExecutor will always be healthy as long as - # it's running. 
- return + """Raises an error if engine is unhealthy.""" + if self.worker_monitor is not None and not self.worker_monitor.is_alive( + ): + raise RuntimeError("Worker processes are not running") + + def shutdown(self): + if (worker_monitor := getattr(self, "worker_monitor", + None)) is not None: + worker_monitor.close() + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + for result in parallel_worker_tasks: + result.get() class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): @@ -118,14 +302,12 @@ class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = await make_async(self.driver_worker.execute_model + output = await make_async(self.execute_model )(execute_model_req=execute_model_req, ) return output async def check_health_async(self) -> None: - # CPUExecutor will always be healthy as long as - # it's running. - return + self.check_health() def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: @@ -170,3 +352,11 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: f" {kv_cache_space}, expect a positive integer value.") return config + + +def _driver_method_invoker(driver, method: str, *args, **kwargs): + return getattr(driver, method)(*args, **kwargs) + + +def _async_driver_method_invoker(driver, method: str, *args, **kwargs): + return driver.execute_method(method, *args, **kwargs).get() diff --git a/vllm/utils.py b/vllm/utils.py index 876c3bf90b02c..90be09fc7b967 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -404,27 +404,6 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def init_kmp_env(): - if not is_cpu(): - return - - ld_prealod_str = os.getenv("LD_PRELOAD", "") - if "libiomp5.so" not in ld_prealod_str: - return - - # The time(milliseconds) that a thread should wait after completing the - # execution of a parallel region, before sleeping. 
- os.environ['KMP_BLOCKTIME'] = "1" - # dump settings on start up - os.environ['KMP_SETTINGS'] = "1" - # Prevents the CPU to run into low performance state - os.environ['KMP_TPAUSE'] = "0" - # Provides fine granularity parallelism - os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" - - def chunk_list(lst: List[T], chunk_size: int): """Yield successive chunk_size chunks from lst.""" for i in range(0, len(lst), chunk_size): diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 83f4ba69fb728..71763c08ec45f 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -42,6 +42,7 @@ class CPUModelInput(ModelRunnerInputBase): attn_metadata: Optional["AttentionMetadata"] = None sampling_metadata: Optional["SamplingMetadata"] = None multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None + virtual_engine: Optional[int] = None def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -204,8 +205,8 @@ def _prepare_prompt( attn_metadata = self.attn_backend.make_metadata( is_prompt=True, seq_lens=seq_lens, - seq_lens_tensor=None, - max_decode_seq_len=None, + seq_lens_tensor=torch.tensor([]), + max_decode_seq_len=0, num_prefills=len(seq_lens), num_prefill_tokens=num_prompt_tokens, num_decode_tokens=0, @@ -345,7 +346,7 @@ def prepare_model_input( multi_modal_kwargs=multi_modal_kwargs, ) - @torch.inference_mode() + @torch.no_grad() def execute_model( self, model_input: CPUModelInput, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3c22c73267b7f..735d48c908d61 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -4,6 +4,7 @@ import torch import torch.distributed +import vllm.envs as envs from vllm.attention import get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, @@ -13,7 +14,7 @@ from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, init_kmp_env +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.cpu_model_runner import CPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerInput) @@ -152,13 +153,18 @@ def __init__( if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." - # try to initialize intel openmp optimized tunings - init_kmp_env() - if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() + + # Setup OpenMP threads affinity. + omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND + if omp_cpuids == "all": + self.local_omp_cpuid = "all" + else: + self.local_omp_cpuid = omp_cpuids.split("|")[rank] + self.model_runner: CPUModelRunner = CPUModelRunner( model_config, parallel_config, @@ -177,6 +183,9 @@ def __init__( self.cpu_cache: List[List[torch.Tensor]] def init_device(self) -> None: + if self.local_omp_cpuid != "all": + torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) + self.init_distributed_environment() # Set random seed. 
set_random_seed(self.model_config.seed) From 281977bd6eccade50be461f5a22cc51b74006976 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Jul 2024 17:32:44 -0400 Subject: [PATCH 29/66] [Doc] Add Nemotron to supported model docs (#6843) --- docs/source/models/supported_models.rst | 4 ++++ vllm/model_executor/layers/activation.py | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index dc8bd6fb245df..483f552bba238 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -113,6 +113,10 @@ Decoder-only Language Models - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - + * - :code:`NemotronForCausalLM` + - Nemotron-3, Nemotron-4, Minitron + - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + - βœ…οΈŽ * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 6578193a31597..4c14fe476ee4a 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -166,9 +166,7 @@ class ReLUSquaredActivation(CustomOp): def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" - relu_applied = nn.functional.relu(x) - squared = torch.square(relu_applied) - return squared + return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: return self.forward_native(x) From 150a1ffbfd3d0429d30fa5ab841f53903a0a8a62 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 26 Jul 2024 17:39:10 -0400 Subject: [PATCH 30/66] [Doc] Update SkyPilot doc for wrong indents and instructions for update service (#4283) --- docs/source/serving/run_on_sky.rst | 430 ++++++++++++++++------------- 1 file changed, 243 insertions(+), 187 deletions(-) diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst index bd33c76cec3de..674b14a879bc3 100644 --- a/docs/source/serving/run_on_sky.rst +++ b/docs/source/serving/run_on_sky.rst @@ -5,9 +5,9 @@ Deploying and scaling up with SkyPilot .. raw:: html -

- vLLM -

+

+ vLLM +

vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery `__. @@ -21,8 +21,8 @@ Prerequisites .. code-block:: console - pip install skypilot-nightly - sky check + pip install skypilot-nightly + sky check Run on a single instance @@ -32,64 +32,64 @@ See the vLLM SkyPilot YAML for serving, `serving.yaml # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): .. code-block:: console - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. .. code-block:: console - (task, pid=7431) Running on public URL: https://.gradio.live + (task, pid=7431) Running on public URL: https://.gradio.live **Optional**: Serve the 70B model instead of the default 8B and use more GPU: .. 
code-block:: console - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct Scale up to multiple replicas @@ -99,151 +99,212 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut .. code-block:: yaml - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_tokens: 1 - + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + .. raw:: html -
- Click to see the full recipe YAML +
+ Click to see the full recipe YAML .. code-block:: yaml - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? max_tokens: 1 - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log .. raw:: html -
+
Start the serving the Llama-3 8B model on multiple replicas: .. code-block:: console - HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN + HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN Wait until the service is ready: .. code-block:: console - watch -n10 sky serve status vllm + watch -n10 sky serve status vllm .. raw:: html -
- Example outputs: +
+ Example outputs: .. code-block:: console - Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - vllm 1 35s READY 2/2 xx.yy.zz.100:30001 + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + vllm 1 35s READY 2/2 xx.yy.zz.100:30001 - Service Replicas - SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION - vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP({'L4': 1}) READY us-east4 - vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP({'L4': 1}) READY us-east4 + Service Replicas + SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION + vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 + vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 .. raw:: html - -
+ +
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: .. code-block:: console - ENDPOINT=$(sky serve status --endpoint 8081 vllm) - curl -L http://$ENDPOINT/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" - } - ], - "stop_token_ids": [128009, 128001] - }' - -To enable autoscaling, you could specify additional configs in `services`: + ENDPOINT=$(sky serve status --endpoint 8081 vllm) + curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' + +To enable autoscaling, you could replace the `replicas` with the following configs in `service`: .. code-block:: yaml - services: - replica_policy: - min_replicas: 0 - max_replicas: 3 - target_qps_per_replica: 2 + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 This will scale the service up to when the QPS exceeds 2 for each replica. + +.. raw:: html + +
+ Click to see the full recipe YAML + + +.. code-block:: yaml + + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log + + +.. raw:: html + +
+ +To update the service with the new config: + +.. code-block:: console + + HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN + + +To stop the service: + +.. code-block:: console + + sky serve down vllm + **Optional**: Connect a GUI to the endpoint ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -253,58 +314,53 @@ It is also possible to access the Llama-3 service with a separate GUI frontend, .. raw:: html -
- Click to see the full GUI YAML +
+ Click to see the full GUI YAML .. code-block:: yaml - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct - ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. - - resources: - cpus: 2 - - setup: | - conda activate vllm - if [ $? -ne 0 ]; then - conda create -n vllm python=3.10 -y - conda activate vllm - fi - - # Install Gradio for web UI. - pip install gradio openai - - run: | - conda activate vllm - export PATH=$PATH:/sbin - WORKER_IP=$(hostname -I | cut -d' ' -f1) - CONTROLLER_PORT=21001 - WORKER_PORT=21002 - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://$ENDPOINT/v1 \ - --stop-token-ids 128009,128001 | tee ~/gradio.log + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. + + resources: + cpus: 2 + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + + run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log + .. raw:: html - -
+ +
1. Start the chat web UI: .. code-block:: console - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) 2. Then, we can access the GUI at the returned gradio link: .. code-block:: console - | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live + | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live From b5f49ee55beac7bb314fd8880bdb718ed117dacb Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami <143527450+gurpreet-dhami@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:26:45 -0400 Subject: [PATCH 31/66] Update README.md (#6847) --- examples/fp8/quantizer/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fp8/quantizer/README.md b/examples/fp8/quantizer/README.md index 8f89a74a6a367..0b6944f688b49 100644 --- a/examples/fp8/quantizer/README.md +++ b/examples/fp8/quantizer/README.md @@ -16,7 +16,7 @@ #### Run on H100 system for speed if FP8; number of GPUs depends on the model size #### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: -`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` +`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1` Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) ``` From bb5494676f5f57f1cf7cf72598de5434a2a22865 Mon Sep 17 00:00:00 2001 From: chenqianfzh <51831990+chenqianfzh@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:32:20 -0700 Subject: [PATCH 32/66] enforce eager mode with bnb quantization temporarily (#6846) --- vllm/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 6403a53f86281..92fde449b43fd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -282,6 +282,10 @@ def verify_with_parallel_config( raise ValueError( "BitAndBytes quantization with TP or PP is not supported yet.") + if self.quantization == "bitsandbytes" and self.enforce_eager is False: + raise ValueError( + "BitAndBytes with enforce_eager = False is not supported yet.") + def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled.""" From d09b94ca588c6de1e627194264357e14460ae2eb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 18:45:57 -0700 Subject: [PATCH 33/66] [TPU] Support collective communications in XLA devices (#6813) --- .../device_communicators/tpu_communicator.py | 30 +++++++++++++++++++ vllm/distributed/parallel_state.py | 22 ++++++++++++++ vllm/lora/layers.py | 4 +++ .../model_executor/layers/logits_processor.py | 16 ++++++++-- 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 vllm/distributed/device_communicators/tpu_communicator.py diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py new file mode 100644 index 0000000000000..69a9a516f3ebe --- /dev/null +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -0,0 +1,30 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + +if current_platform.is_tpu(): + import torch_xla.core.xla_model as xm + from torch_xla._internal 
import pjrt + + +class TpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_tpu(): + self.disabled = True + return + self.disabled = False + + local_rank = dist.get_rank(group) + world_size = dist.get_world_size(group) + pjrt.initialize_multiprocess(local_rank, world_size) + xm._init_world_size_ordinal() + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + return xm.all_reduce(xm.REDUCE_SUM, x) + + def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + assert dim == -1, "TPUs only support dim=-1 for all-gather." + return xm.all_gather(x, dim=dim) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 58cae46d9af27..4116b1729d188 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -133,6 +133,7 @@ def __init__( torch_distributed_backend: Union[str, Backend], use_pynccl: bool, use_custom_allreduce: bool, + use_tpu_communicator: bool, use_message_queue_broadcaster: bool = False, ): @@ -164,6 +165,7 @@ def __init__( self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce + self.use_tpu_communicator = use_tpu_communicator # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -190,6 +192,12 @@ def __init__( else: self.ca_comm = None + from vllm.distributed.device_communicators.tpu_communicator import ( + TpuCommunicator) + self.tpu_communicator: Optional[TpuCommunicator] + if use_tpu_communicator and self.world_size > 1: + self.tpu_communicator = TpuCommunicator(group=self.cpu_group) + from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) self.mq_broadcaster: Optional[MessageQueue] = None @@ -289,6 +297,12 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: # Bypass the function if we are using only 1 GPU. if self.world_size == 1: return input_ + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_reduce(input_) + if ca_comm is not None: out = ca_comm.custom_all_reduce(input_) if out is not None: @@ -310,6 +324,12 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: return input_ assert -input_.dim() <= dim < input_.dim(), ( f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_gather(input_, dim) + if dim < 0: # Convert negative dim to positive. 
dim += input_.dim() @@ -727,6 +747,7 @@ def init_world_group(ranks: List[int], local_rank: int, torch_distributed_backend=backend, use_pynccl=False, use_custom_allreduce=False, + use_tpu_communicator=False, ) @@ -745,6 +766,7 @@ def init_model_parallel_group( torch_distributed_backend=backend, use_pynccl=True, use_custom_allreduce=use_custom_allreduce, + use_tpu_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 40de134c0a5ee..87de285a373a2 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1067,6 +1067,10 @@ def scale(self): def soft_cap(self): return self.base_layer.soft_cap + @property + def use_gather(self): + return self.base_layer.use_gather + @property def org_vocab_size(self): return self.base_layer.org_vocab_size diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index f6fcf49ef464b..bd3e7e114204f 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -5,10 +5,12 @@ import torch import torch.nn as nn -from vllm.distributed import tensor_model_parallel_gather +from vllm.distributed import (tensor_model_parallel_all_gather, + tensor_model_parallel_gather) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform class LogitsProcessor(nn.Module): @@ -39,6 +41,8 @@ def __init__(self, self.org_vocab_size = org_vocab_size or vocab_size # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap + # Whether to use gather or all-gather to gather the logits. + self.use_gather = not current_platform.is_tpu() def forward( self, @@ -76,7 +80,15 @@ def _get_logits(self, hidden_states: torch.Tensor, logits = lm_head.linear_method.apply(lm_head, hidden_states, bias=embedding_bias) - logits = tensor_model_parallel_gather(logits) + if self.use_gather: + logits = tensor_model_parallel_gather(logits) + else: + # Gather is not supported for some devices such as TPUs. + # Use all-gather instead. + # NOTE(woosuk): Here, the outputs of every device should not be None + # because XLA requires strict SPMD among all devices. Every device + # should execute the same operations after gathering the logits. + logits = tensor_model_parallel_all_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[:, :self.org_vocab_size] From 981b0d567355063d5453e382a85970cae083c615 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 27 Jul 2024 09:58:25 +0800 Subject: [PATCH 34/66] [Frontend] Factor out code for running uvicorn (#6828) --- vllm/entrypoints/api_server.py | 74 ++++++++++++++++++--------- vllm/entrypoints/openai/api_server.py | 72 ++++++++------------------ vllm/server/__init__.py | 3 ++ vllm/server/launch.py | 42 +++++++++++++++ 4 files changed, 116 insertions(+), 75 deletions(-) create mode 100644 vllm/server/__init__.py create mode 100644 vllm/server/launch.py diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 66941442c8c9c..3476357658522 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -5,12 +5,12 @@ We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. 
""" - +import asyncio import json import ssl -from typing import AsyncGenerator +from argparse import Namespace +from typing import Any, AsyncGenerator, Optional -import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -18,8 +18,10 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.logger import init_logger from vllm.sampling_params import SamplingParams +from vllm.server import serve_http from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") @@ -81,6 +83,50 @@ async def stream_results() -> AsyncGenerator[bytes, None]: return JSONResponse(ret) +def build_app(args: Namespace) -> FastAPI: + global app + + app.root_path = args.root_path + return app + + +async def init_app( + args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, +) -> FastAPI: + app = build_app(args) + + global engine + + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = (llm_engine + if llm_engine is not None else AsyncLLMEngine.from_engine_args( + engine_args, usage_context=UsageContext.API_SERVER)) + + return app + + +async def run_server(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs: Any) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) + + app = await init_app(args, llm_engine) + await serve_http( + app, + host=args.host, + port=args.port, + log_level=args.log_level, + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + **uvicorn_kwargs, + ) + + if __name__ == "__main__": parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default=None) @@ -105,25 +151,5 @@ async def stream_results() -> AsyncGenerator[bytes, None]: parser.add_argument("--log-level", type=str, default="debug") parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngine.from_engine_args( - engine_args, usage_context=UsageContext.API_SERVER) - - app.root_path = args.root_path - logger.info("Available routes are:") - for route in app.routes: - if not hasattr(route, 'methods'): - continue - methods = ', '.join(route.methods) - logger.info("Route: %s, Methods: %s", route.path, methods) - - uvicorn.run(app, - host=args.host, - port=args.port, - log_level=args.log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs) + asyncio.run(run_server(args)) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0fe4dd245b5e6..c1640a10a407d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,14 +2,12 @@ import importlib import inspect import re -import signal +from argparse import Namespace from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Optional, Set +from typing import Any, Optional, Set -import fastapi -import uvicorn -from fastapi import APIRouter, Request +from fastapi import APIRouter, FastAPI, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from 
fastapi.responses import JSONResponse, Response, StreamingResponse @@ -38,6 +36,7 @@ from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.logger import init_logger +from vllm.server import serve_http from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION @@ -57,7 +56,7 @@ @asynccontextmanager -async def lifespan(app: fastapi.FastAPI): +async def lifespan(app: FastAPI): async def _force_log(): while True: @@ -75,7 +74,7 @@ async def _force_log(): router = APIRouter() -def mount_metrics(app: fastapi.FastAPI): +def mount_metrics(app: FastAPI): # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app()) # Workaround for 307 Redirect for /metrics @@ -165,8 +164,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return JSONResponse(content=generator.model_dump()) -def build_app(args): - app = fastapi.FastAPI(lifespan=lifespan) +def build_app(args: Namespace) -> FastAPI: + app = FastAPI(lifespan=lifespan) app.include_router(router) app.root_path = args.root_path @@ -214,11 +213,8 @@ async def authentication(request: Request, call_next): return app -async def build_server( - args, - llm_engine: Optional[AsyncLLMEngine] = None, - **uvicorn_kwargs, -) -> uvicorn.Server: +async def init_app(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None) -> FastAPI: app = build_app(args) if args.served_model_name is not None: @@ -281,14 +277,17 @@ async def build_server( ) app.root_path = args.root_path - logger.info("Available routes are:") - for route in app.routes: - if not hasattr(route, 'methods'): - continue - methods = ', '.join(route.methods) - logger.info("Route: %s, Methods: %s", route.path, methods) + return app + + +async def run_server(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs: Any) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) - config = uvicorn.Config( + app = await init_app(args, llm_engine) + await serve_http( app, host=args.host, port=args.port, @@ -301,36 +300,6 @@ async def build_server( **uvicorn_kwargs, ) - return uvicorn.Server(config) - - -async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None: - logger.info("vLLM API server version %s", VLLM_VERSION) - logger.info("args: %s", args) - - server = await build_server( - args, - llm_engine, - **uvicorn_kwargs, - ) - - loop = asyncio.get_running_loop() - - server_task = loop.create_task(server.serve()) - - def signal_handler() -> None: - # prevents the uvicorn signal handler to exit early - server_task.cancel() - - loop.add_signal_handler(signal.SIGINT, signal_handler) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - try: - await server_task - except asyncio.CancelledError: - print("Gracefully stopping http server") - await server.shutdown() - if __name__ == "__main__": # NOTE(simon): @@ -339,4 +308,5 @@ def signal_handler() -> None: description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) args = parser.parse_args() + asyncio.run(run_server(args)) diff --git a/vllm/server/__init__.py b/vllm/server/__init__.py new file mode 100644 index 0000000000000..17c98b4dad6c9 --- /dev/null +++ b/vllm/server/__init__.py @@ -0,0 +1,3 @@ +from .launch import serve_http + +__all__ = ["serve_http"] diff --git a/vllm/server/launch.py b/vllm/server/launch.py new file mode 100644 index 
0000000000000..1a8aeb7f1022b --- /dev/null +++ b/vllm/server/launch.py @@ -0,0 +1,42 @@ +import asyncio +import signal +from typing import Any + +import uvicorn +from fastapi import FastAPI + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +async def serve_http(app: FastAPI, **uvicorn_kwargs: Any) -> None: + logger.info("Available routes are:") + for route in app.routes: + methods = getattr(route, "methods", None) + path = getattr(route, "path", None) + + if methods is None or path is None: + continue + + logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) + + config = uvicorn.Config(app, **uvicorn_kwargs) + server = uvicorn.Server(config) + + loop = asyncio.get_running_loop() + + server_task = loop.create_task(server.serve()) + + def signal_handler() -> None: + # prevents the uvicorn signal handler to exit early + server_task.cancel() + + loop.add_signal_handler(signal.SIGINT, signal_handler) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + try: + await server_task + except asyncio.CancelledError: + logger.info("Gracefully stopping http server") + await server.shutdown() From 55712941e57bcfd662db2905811d6e2807b9153f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 26 Jul 2024 22:27:44 -0400 Subject: [PATCH 35/66] [Bug Fix] Illegal memory access, FP8 Llama 3.1 405b (#6852) --- .../broadcast_load_epilogue_c3x.hpp | 46 +++++++++++++++---- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp index 877a9f5b9e5de..e4bc9752ed7db 100644 --- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp @@ -328,20 +328,36 @@ struct Sm90ColOrScalarBroadcast { return EmptyProducerLoadCallbacks{}; } - template + template struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { CUTLASS_DEVICE - ConsumerStoreCallbacks(GTensor&& tCgCol, RTensor&& tCrCol, Params const& params) - : tCgCol(cute::forward(tCgCol)), - tCrCol(cute::forward(tCrCol)), - params(params) {} + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + params(params) {} GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) - RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) Params const& params; + int m; CUTLASS_DEVICE void begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + if (!params.col_broadcast) { fill(tCrCol, *(params.ptr_col)); return; @@ -349,7 +365,7 @@ struct Sm90ColOrScalarBroadcast { // Filter so we don't issue redundant copies over stride-0 modes // (only works if 0-strides are in same location, which is by construction) - copy_aligned(filter(tCgCol), filter(tCrCol)); + copy_if(pred, filter(tCgCol), filter(tCrCol)); } template @@ -381,8 +397,20 @@ struct Sm90ColOrScalarBroadcast { mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) - return ConsumerStoreCallbacks( - cute::move(tCgCol), cute::move(tCrCol), params); + // Generate an identity tensor matching 
the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + cute::move(tCcCol), + args.problem_shape_mnkl, + params + ); } }; From 969d03226514d99f43cf7d17d1e336231d91751a Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Fri, 26 Jul 2024 23:02:25 -0400 Subject: [PATCH 36/66] [Bugfix]: Fix Tensorizer test failures (#6835) --- .buildkite/test-pipeline.yaml | 1 - tests/tensorizer_loader/conftest.py | 45 ++++++++++++++++++++++ tests/tensorizer_loader/test_tensorizer.py | 7 +--- 3 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 tests/tensorizer_loader/conftest.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 633bc5ca95bf9..5b4a786305e1f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -220,7 +220,6 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] - soft_fail: true fast_check: true commands: - apt-get install -y curl libsodium23 diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py new file mode 100644 index 0000000000000..c5c6fc1057d31 --- /dev/null +++ b/tests/tensorizer_loader/conftest.py @@ -0,0 +1,45 @@ +# isort: skip_file + +import contextlib +import gc + +import pytest +import ray +import torch + +from vllm.distributed import (destroy_distributed_environment, + destroy_model_parallel) +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + + +def cleanup(): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + torch.cuda.empty_cache() + ray.shutdown() + + +@pytest.fixture() +def should_do_global_cleanup_after_test(request) -> bool: + """Allow subdirectories to skip global cleanup by overriding this fixture. + This can provide a ~10x speedup for non-GPU unit tests since they don't need + to initialize torch. 
+ """ + + return True + + +@pytest.fixture(autouse=True) +def cleanup_fixture(should_do_global_cleanup_after_test: bool): + yield + if should_do_global_cleanup_after_test: + cleanup() + + +@pytest.fixture(autouse=True) +def tensorizer_config(): + config = TensorizerConfig(tensorizer_uri="vllm") + return config \ No newline at end of file diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b7030e3cd6d42..2adeae8874bdb 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -40,7 +40,6 @@ tensorize_model_for_testing_script = os.path.join( os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py") - def is_curl_installed(): try: subprocess.check_call(['curl', '--version']) @@ -63,10 +62,6 @@ def write_keyfile(keyfile_path: str): with open(keyfile_path, 'wb') as f: f.write(encryption_params.key) -@pytest.fixture(autouse=True) -def tensorizer_config(): - config = TensorizerConfig(tensorizer_uri="vllm") - return config @patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent') @@ -105,6 +100,7 @@ def test_can_deserialize_s3(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): + cleanup() with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") @@ -316,6 +312,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): + cleanup() model_ref = "facebook/opt-125m" model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) From ced36cd89b9c012eb066ef863b2d1ecf052f3e00 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 20:16:13 -0700 Subject: [PATCH 37/66] [ROCm] Upgrade PyTorch nightly version (#6845) --- Dockerfile.rocm | 4 ++-- docs/source/getting_started/amd-installation.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 7b4c0166a04bd..64bc0f3c12c75 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -55,8 +55,8 @@ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 \ - torchvision==0.20.0.dev20240710 \ + torch==2.5.0.dev20240726 \ + torchvision==0.20.0.dev20240726 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 1c7d274b7c47e..9648d07d2790c 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -117,7 +117,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. 
$ # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 + $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 $ # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi From 3c3012398e4aecde9e40981d79a0576203158d24 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:20:16 -0700 Subject: [PATCH 38/66] [Doc] add VLLM_TARGET_DEVICE=neuron to documentation for neuron (#6844) Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com> --- docs/source/getting_started/neuron-installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index 62bf779c339d5..0816524468cab 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -131,6 +131,6 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able $ git clone https://github.com/vllm-project/vllm.git $ cd vllm $ pip install -U -r requirements-neuron.txt - $ pip install . + $ VLLM_TARGET_DEVICE="neuron" pip install . If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. From ed94e4f427bce8611e198d051dbd3b0097b448e8 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Sat, 27 Jul 2024 06:45:31 +0300 Subject: [PATCH 39/66] [Bugfix][Model] Jamba assertions and no chunked prefill by default for Jamba (#6784) --- vllm/engine/arg_utils.py | 6 +++++- vllm/model_executor/models/jamba.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cd64d3345b830..bad5be4917216 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -754,10 +754,14 @@ def create_engine_config(self, ) -> EngineConfig: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None + has_seqlen_agnostic_layers = ( + model_config.contains_seqlen_agnostic_layers( + parallel_config)) if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and not self.enable_prefix_caching): + and not self.enable_prefix_caching + and not has_seqlen_agnostic_layers): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index d4e4f0055aa2b..3444578227259 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -644,6 +644,11 @@ def __init__( lora_config: Optional[LoRAConfig] = None, scheduler_config: Optional[SchedulerConfig] = None, ) -> None: + assert not scheduler_config.chunked_prefill_enabled, \ + "Jamba currently does not support chunked prefill" + assert not cache_config.enable_prefix_caching, \ + "Jamba currently does not support prefix caching" + super().__init__() self.config = config self.scheduler_config = scheduler_config From 14dbd5a7674e5de2862c18adb711d9feecd35063 Mon Sep 17 00:00:00 2001 From: Joe Date: Fri, 26 Jul 2024 20:47:50 -0700 Subject: [PATCH 40/66] [Model] H2O Danube3-4b (#6451) --- .buildkite/run-cpu-test.sh | 2 +- 
.../kernels/benchmark_paged_attention.py | 2 +- benchmarks/kernels/benchmark_rope.py | 2 +- csrc/attention/attention_kernels.cu | 6 +++ tests/kernels/test_attention.py | 4 +- tests/kernels/test_cache.py | 8 ++- tests/kernels/test_pos_encoding.py | 2 +- tests/models/test_danube3_4b.py | 52 +++++++++++++++++++ vllm/attention/ops/paged_attn.py | 2 +- vllm/utils.py | 6 +++ 10 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 tests/models/test_danube3_4b.py diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 21deec2bba973..45bc8eb2f8477 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest Pillow protobuf - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 78cac8a555d1b..a04433142da42 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -175,7 +175,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--use-alibi", action="store_true") diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 78736c7a7ba6f..f542684a9a2a9 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -94,7 +94,7 @@ def benchmark_rope_kernels_multi_lora( parser.add_argument("--num-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) parser.add_argument("--dtype", diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 350dbce1d7ba9..875570a1e894f 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -751,6 +751,9 @@ void paged_attention_v1_launcher( case 112: LAUNCH_PAGED_ATTENTION_V1(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V1(128); break; @@ -912,6 +915,9 @@ void paged_attention_v2_launcher( case 112: LAUNCH_PAGED_ATTENTION_V2(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V2(128); break; diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 2e6412c28958e..c7c6707461c3e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -28,7 +28,7 @@ # FlashAttention forward only supports head dimension at most 128 # 
https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] @@ -134,6 +134,8 @@ def test_paged_attention( seed: int, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f9a609464abfc..3fb9b59be1701 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -11,7 +11,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing @@ -52,6 +52,8 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -124,6 +126,8 @@ def test_reshape_and_cache( device: str, kv_cache_dtype: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -325,6 +329,8 @@ def test_swap_blocks( ) -> None: if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 4c83659929d41..4a7ad6e0fa21d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,7 +10,7 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing diff --git a/tests/models/test_danube3_4b.py b/tests/models/test_danube3_4b.py new file mode 100644 index 0000000000000..bfaa275f73c19 --- /dev/null +++ b/tests/models/test_danube3_4b.py @@ -0,0 +1,52 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +This tests danube3 separately because its head size isn't supported on CPU yet. + +Run `pytest tests/models/test_danube3_4b.py`. 
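+
+(The reason is its head size of 120: the CUDA paged-attention kernels in this
+change add support for it, while the CPU backend does not have it yet.)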
+""" +import pytest + +from .utils import check_outputs_equal + +MODELS = ["h2oai/h2o-danube3-4b-base"] + +target_dtype = "half" + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +def test_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index ce7b4d129779c..0f6d2f2d1ab3f 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -31,7 +31,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 192, 256] + return [64, 80, 96, 112, 120, 128, 192, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/utils.py b/vllm/utils.py index 90be09fc7b967..1448316e66edb 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -508,6 +508,12 @@ def create_kv_caches_with_random( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + + if cache_dtype == "fp8" and head_size % 16: + raise ValueError( + f"Does not support key cache of type fp8 with head_size {head_size}" + ) + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) From 52f07e3dec2b76045208f5cfea5670b85a719cc6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 20:54:27 -0700 Subject: [PATCH 41/66] [Hardware][TPU] Implement tensor parallelism with Ray (#5871) --- requirements-tpu.txt | 1 + vllm/attention/backends/pallas.py | 4 +- vllm/engine/llm_engine.py | 10 +- vllm/executor/ray_tpu_executor.py | 313 ++++++++++++++++++++++++++++++ vllm/worker/tpu_model_runner.py | 42 ++-- vllm/worker/tpu_worker.py | 16 +- 6 files changed, 365 insertions(+), 21 deletions(-) create mode 100644 vllm/executor/ray_tpu_executor.py diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 22487f5524dd7..c2140fbffec9f 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -4,4 +4,5 @@ # Dependencies for TPU # Currently, the TPU backend uses a nightly version of PyTorch XLA. # You can install the dependencies in Dockerfile.tpu. +ray triton # To avoid import errors diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index b83a83bb177d4..c53a2f91b89d7 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -55,8 +55,8 @@ class PallasMetadata(AttentionMetadata): # Currently, input sequences can only contain all prefills # or all decoding. 
- block_tables: Optional[torch.Tensor] - context_lens: Optional[torch.Tensor] + block_tables: Optional[torch.Tensor] = None + context_lens: Optional[torch.Tensor] = None @property def prefill_metadata(self) -> Optional["PallasMetadata"]: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 48d5305892219..004348d4c49a3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -394,8 +394,14 @@ def _get_executor_cls(cls, from vllm.executor.neuron_executor import NeuronExecutor executor_class = NeuronExecutor elif engine_config.device_config.device_type == "tpu": - from vllm.executor.tpu_executor import TPUExecutor - executor_class = TPUExecutor + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_tpu_executor import RayTPUExecutor + executor_class = RayTPUExecutor + else: + assert distributed_executor_backend is None + from vllm.executor.tpu_executor import TPUExecutor + executor_class = TPUExecutor elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py new file mode 100644 index 0000000000000..7048d47980723 --- /dev/null +++ b/vllm/executor/ray_tpu_executor.py @@ -0,0 +1,313 @@ +import asyncio +import os +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple, + Union) + +import vllm.envs as envs +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.executor.tpu_executor import TPUExecutor +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + get_vllm_instance_id, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + + +class RayTPUExecutor(TPUExecutor): + + def __init__(self, *args, **kwargs): + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + super().__init__(*args, **kwargs) + + def _init_executor(self) -> None: + assert self.parallel_config.distributed_executor_backend == "ray" + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel TPU workers. + self._init_workers_ray(placement_group) + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Create the workers. 
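+        # One Ray actor (a RayWorkerWrapper around TPUWorker) is created per
+        # TPU bundle in the placement group. The actor co-located with the
+        # driver node is kept only as a dummy resource holder; the driver
+        # process runs its own in-process worker instead.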
+ driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("TPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + assert self.speculative_config is None + worker_module_name = "vllm.worker.tpu_worker" + worker_class_name = "TPUWorker" + + worker = ray.remote( + num_cpus=0, + resources={"TPU": 1}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any TPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "TPU node.") + + # Get the set of TPU IDs used on each node. + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) + for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for _ in worker_node_and_gpu_ids] + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + + if len(node_workers) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. 
+ """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers. Can be used in the following + ways: + + - async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than blocking + on the results. + - args/kwargs: All workers share the same args/kwargs + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + # Get the results of the ray workers. + if self.workers: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + num_blocks = self._run_workers("determine_num_available_blocks", ) + num_tpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + return num_tpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + **self.extra_execute_model_run_workers_kwargs) + + # Only the driver worker returns the sampling results. 
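+        # The remote workers stay inside start_worker_execution_loop; this
+        # call surfaces only the driver worker's output for the current step.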
+ return self._driver_execute_model(execute_model_req) + + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + + self._driver_execute_model() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + + +class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_method = make_async(self.driver_worker.execute_method) + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + # Start model execution loop running in the parallel workers + self.parallel_worker_tasks = asyncio.create_task( + self._start_worker_execution_loop()) + + # Only the driver worker returns the sampling results. + return await self._driver_execute_model_async(execute_model_req) + + async def stop_remote_worker_execution_loop_async(self) -> None: + if self.parallel_worker_tasks is None: + return + + await self._driver_execute_model_async() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + await parallel_worker_tasks + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 8a8b412db6731..e5bb101fc7df4 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -1,6 +1,7 @@ import time from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from unittest.mock import patch import numpy as np import torch @@ -45,6 +46,7 @@ class ModelInputForTPU(ModelRunnerInputBase): num_samples: int best_of: List[int] seq_groups: List[List[int]] + virtual_engine: int = 0 def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -55,6 +57,9 @@ def as_broadcastable_tensor_dict( "t": self.t, "p": self.p, "num_samples": self.num_samples, + "best_of": self.best_of, + "seq_groups": self.seq_groups, + "virtual_engine": self.virtual_engine, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -113,16 +118,30 @@ def __init__( def load_model(self) -> None: self.device = self.device_config.device - model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - parallel_config=self.parallel_config, - cache_config=self.cache_config, - scheduler_config=self.scheduler_config, - multimodal_config=self.multimodal_config, - lora_config=None, - ) + # NOTE(woosuk): While the executor assigns the TP ranks to the worker + # process, the ranks can be different from the ranks internally assigned + # by the xm runtime. Therefore, there is a mismatch in the rank + # assignment between the gloo (cpu) runtime and the xm (tpu) runtime. 
+ # This is not a problem in linear layers because all-reduce is + # rank-agnostic. However, it matters for all-gather as the ranks + # determine the order of concatenating the output tensors. + # As a workaround, we use the xm's rank assignment only when loading + # the embedding weights. + xm_tp_rank = xm.get_ordinal() + with patch( + "vllm.model_executor.layers.vocab_parallel_embedding." + "get_tensor_model_parallel_rank", + return_value=xm_tp_rank): + model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + parallel_config=self.parallel_config, + cache_config=self.cache_config, + scheduler_config=self.scheduler_config, + multimodal_config=self.multimodal_config, + lora_config=None, + ) model = model.eval() xm.wait_device_ops() @@ -463,10 +482,11 @@ def make_model_input_from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend) return model_input + @torch.no_grad() def execute_model( self, model_input: ModelInputForTPU, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: Optional[List[Any]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, ) -> List[SamplerOutput]: diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 03011e03058d8..c88aba7ae08cd 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -70,13 +70,13 @@ def __init__( def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" - self.device = xm.xla_device() - self.device_config.device = self.device torch.set_grad_enabled(False) torch.set_default_dtype(self.model_config.dtype) - # NOTE(woosuk): This is just a hack to initialize the TP group. - # This cannot perform the actual communication ops. + # NOTE(woosuk): This is just to initialize the TP group and broadcast + # the input objects on CPU. The all-reduce and all-gather ops on TPU + # are invoked by `xm.all_reduce` and `xm.all_gather` which use their + # own context. init_distributed_environment( world_size=self.parallel_config.world_size, rank=self.rank, @@ -88,6 +88,11 @@ def init_device(self) -> None: self.parallel_config.tensor_parallel_size, self.parallel_config.pipeline_parallel_size) + # Device initialization should happen after initializing the distributed + # runtime. + self.device = xm.xla_device() + self.device_config.device = self.device + # Set random seed. set_random_seed(self.model_config.seed) xm.set_rng_state(self.model_config.seed, self.device) @@ -200,8 +205,7 @@ def get_cache_block_size_bytes(self) -> int: @property def do_metadata_broadcast(self) -> bool: - # TODO(woosuk): Support TP. 
- return False + return self.parallel_config.tensor_parallel_size > 1 @property def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: From c53041ae3b8ded4ac4c3fc745be6bc695b9f0c78 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 27 Jul 2024 05:47:33 +0100 Subject: [PATCH 42/66] [Doc] Add missing mock import to docs `conf.py` (#6834) --- .readthedocs.yaml | 1 + docs/source/conf.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 428e199088589..f1959ad2743f3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,6 +10,7 @@ build: sphinx: configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: diff --git a/docs/source/conf.py b/docs/source/conf.py index f4cec05663fcd..b867bfd89dc17 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -94,6 +94,7 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ + "aiohttp", "cpuinfo", "torch", "transformers", @@ -141,5 +142,6 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: } autodoc_preserve_defaults = True +autodoc_warningiserror = True navigation_with_keys = False From 593e79e7337f7fd9e92b7554dabdff96769dbf15 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Fri, 26 Jul 2024 23:15:20 -0600 Subject: [PATCH 43/66] [Bugfix] torch.set_num_threads() in multiproc_gpu_executor (#6802) [Bugfix] Use torch.set_num_threads() to configure parallelism in multiproc_gpu_executor (#6802) Signed-off-by: Travis Johnson --- vllm/executor/multiproc_gpu_executor.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 19f7a497cdd9f..e1e92958e667c 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -6,6 +6,8 @@ from functools import partial from typing import Any, List, Optional +import torch + from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.gpu_executor import create_worker @@ -45,10 +47,23 @@ def _init_executor(self) -> None: # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU - # contention amongst the shards - if "OMP_NUM_THREADS" not in os.environ: - os.environ["OMP_NUM_THREADS"] = "1" + # Configure thread parallelism if OMP_NUM_THREADS isn't set + # + # Helps to avoid CPU contention. The default of spawning a thread per + # core combined with multiprocessing for each GPU can have a negative + # impact on performance. The contention is amplified when running in a + # container where CPU limits can cause throttling. + default_omp_num_threads = 1 + if "OMP_NUM_THREADS" not in os.environ and ( + current_parallelism := + torch.get_num_threads()) > default_omp_num_threads: + logger.warning( + "Reducing Torch parallelism from %d threads to %d to avoid " + "unnecessary CPU contention. 
Set OMP_NUM_THREADS in the " + "external environment to tune this value as needed.", + current_parallelism, default_omp_num_threads) + os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) + torch.set_num_threads(default_omp_num_threads) # workaround for https://github.com/vllm-project/vllm/issues/6103 if world_size > 1: From aa46953a20685377fc51dcde172114ddd7ffdc68 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:44:13 -0700 Subject: [PATCH 44/66] [Misc][VLM][Doc] Consolidate offline examples for vision language models (#6858) Co-authored-by: Cyrus Leung --- examples/fuyu_example.py | 31 ---- examples/llava_example.py | 25 --- examples/llava_next_example.py | 36 ---- examples/minicpmv_example.py | 55 ------ examples/offline_inference_vision_language.py | 174 ++++++++++++++++++ examples/paligemma_example.py | 25 --- examples/phi3v_example.py | 40 ---- 7 files changed, 174 insertions(+), 212 deletions(-) delete mode 100644 examples/fuyu_example.py delete mode 100644 examples/llava_example.py delete mode 100644 examples/llava_next_example.py delete mode 100644 examples/minicpmv_example.py create mode 100644 examples/offline_inference_vision_language.py delete mode 100644 examples/paligemma_example.py delete mode 100644 examples/phi3v_example.py diff --git a/examples/fuyu_example.py b/examples/fuyu_example.py deleted file mode 100644 index c92b8fb4bc286..0000000000000 --- a/examples/fuyu_example.py +++ /dev/null @@ -1,31 +0,0 @@ -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_fuyu(): - llm = LLM(model="adept/fuyu-8b", max_model_len=4096) - - # single-image prompt - prompt = "What is the highest life expectancy at of male?\n" - url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png" - image = Image.open(requests.get(url, stream=True).raw) - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_fuyu() diff --git a/examples/llava_example.py b/examples/llava_example.py deleted file mode 100644 index 4c9eabd261e5c..0000000000000 --- a/examples/llava_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_llava(): - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_llava() diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py deleted file mode 100644 index fd53a6def1a13..0000000000000 --- a/examples/llava_next_example.py +++ /dev/null @@ -1,36 +0,0 @@ -from io import BytesIO - -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_llava_next(): - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096) - - prompt = "[INST] \nWhat is shown in this image? 
[/INST]" - url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg" - image = Image.open(BytesIO(requests.get(url).content)) - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=100) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - } - }, - sampling_params=sampling_params) - - generated_text = "" - for o in outputs: - generated_text += o.outputs[0].text - - print(f"LLM output:{generated_text}") - - -if __name__ == "__main__": - run_llava_next() diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py deleted file mode 100644 index bf20a7ea04ad4..0000000000000 --- a/examples/minicpmv_example.py +++ /dev/null @@ -1,55 +0,0 @@ -from transformers import AutoTokenizer - -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - -# 2.0 -# The official repo doesn't work yet, so we need to use a fork for now -# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -# MODEL_NAME = "HwwwH/MiniCPM-V-2" -# 2.5 -MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" - -image = ImageAsset("stop_sign").pil_image.convert("RGB") - -tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) -llm = LLM(model=MODEL_NAME, - gpu_memory_utilization=1, - trust_remote_code=True, - max_model_len=4096) - -messages = [{ - 'role': - 'user', - 'content': - '(./)\n' + "What's the content of the image?" -}] -prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) -# 2.0 -# stop_token_ids = [tokenizer.eos_id] -# 2.5 -stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - -sampling_params = SamplingParams( - stop_token_ids=stop_token_ids, - # temperature=0.7, - # top_p=0.8, - # top_k=100, - # seed=3472, - max_tokens=1024, - # min_tokens=150, - temperature=0, - use_beam_search=True, - # length_penalty=1.2, - best_of=3) - -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - } -}, - sampling_params=sampling_params) -print(outputs[0].outputs[0].text) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py new file mode 100644 index 0000000000000..4a09f77ca59f8 --- /dev/null +++ b/examples/offline_inference_vision_language.py @@ -0,0 +1,174 @@ +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset +from vllm.utils import FlexibleArgumentParser + +# Input image and question +image = ImageAsset("cherry_blossom").pil_image.convert("RGB") +question = "What is the content of this image?" 
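+
+# Each run_* helper below builds the model-specific prompt format and returns
+# an (LLM, prompt) pair; main() picks the helper from model_example_map based
+# on the --model-type argument.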
+ + +# LLaVA-1.5 +def run_llava(question): + + prompt = f"USER: \n{question}\nASSISTANT:" + + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + return llm, prompt + + +# LLaVA-1.6/LLaVA-NeXT +def run_llava_next(question): + + prompt = f"[INST] \n{question} [/INST]" + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf") + + return llm, prompt + + +# Fuyu +def run_fuyu(question): + + prompt = f"{question}\n" + llm = LLM(model="adept/fuyu-8b") + + return llm, prompt + + +# Phi-3-Vision +def run_phi3v(question): + + prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501 + # Note: The default setting of max_num_seqs (256) and + # max_model_len (128k) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # In this example, we override max_num_seqs to 5 while + # keeping the original context length of 128k. + llm = LLM( + model="microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + +# PaliGemma +def run_paligemma(question): + + prompt = question + llm = LLM(model="google/paligemma-3b-mix-224") + + return llm, prompt + + +# Chameleon +def run_chameleon(question): + + prompt = f"{question}" + llm = LLM(model="facebook/chameleon-7b") + return llm, prompt + + +# MiniCPM-V +def run_minicpmv(question): + + # 2.0 + # The official repo doesn't work yet, so we need to use a fork for now + # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa + # model_name = "HwwwH/MiniCPM-V-2" + + # 2.5 + model_name = "openbmb/MiniCPM-Llama3-V-2_5" + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + llm = LLM( + model=model_name, + trust_remote_code=True, + ) + + messages = [{ + 'role': 'user', + 'content': f'(./)\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + return llm, prompt + + +model_example_map = { + "llava": run_llava, + "llava-next": run_llava_next, + "fuyu": run_fuyu, + "phi3_v": run_phi3v, + "paligemma": run_paligemma, + "chameleon": run_chameleon, + "minicpmv": run_minicpmv, +} + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + llm, prompt = model_example_map[model](question) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
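+    # (With temperature=0, i.e. greedy sampling, identical prompts would all
+    # yield identical completions.)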
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=64) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models') + args = parser.parse_args() + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + + args = parser.parse_args() + main(args) diff --git a/examples/paligemma_example.py b/examples/paligemma_example.py deleted file mode 100644 index 92a3cb3ac4129..0000000000000 --- a/examples/paligemma_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_paligemma(): - llm = LLM(model="google/paligemma-3b-mix-224") - - prompt = "caption es" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_paligemma() diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py deleted file mode 100644 index ae8c38d84e8fd..0000000000000 --- a/examples/phi3v_example.py +++ /dev/null @@ -1,40 +0,0 @@ -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - - -def run_phi3v(): - model_path = "microsoft/Phi-3-vision-128k-instruct" - - # Note: The default setting of max_num_seqs (256) and - # max_model_len (128k) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # In this example, we override max_num_seqs to 5 while - # keeping the original context length of 128k. 
- llm = LLM( - model=model_path, - trust_remote_code=True, - max_num_seqs=5, - ) - - image = ImageAsset("cherry_blossom").pil_image - - # single-image prompt - prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501 - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_phi3v() From 925de97e05dd4709fcd80691cb37da5e582c22e8 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 26 Jul 2024 23:24:08 -0700 Subject: [PATCH 45/66] [Bugfix] Fix VLM example typo (#6859) --- examples/offline_inference_vision_language.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4a09f77ca59f8..8a63653343db6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -158,7 +158,6 @@ def main(args): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models') - args = parser.parse_args() parser.add_argument('--model-type', '-m', type=str, From a57d75821c6177da75fdebf171d528eef5301961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?= Date: Sat, 27 Jul 2024 17:07:02 +0800 Subject: [PATCH 46/66] [bugfix] make args.stream work (#6831) --- examples/api_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/api_client.py b/examples/api_client.py index 27a2a08b7b0c3..49a085febdc57 100644 --- a/examples/api_client.py +++ b/examples/api_client.py @@ -31,7 +31,10 @@ def post_http_request(prompt: str, "max_tokens": 16, "stream": stream, } - response = requests.post(api_url, headers=headers, json=pload, stream=True) + response = requests.post(api_url, + headers=headers, + json=pload, + stream=stream) return response From ecb33a28cb6c10ebf3b1aa139f72e759cacb8c15 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 27 Jul 2024 02:54:14 -0700 Subject: [PATCH 47/66] [CI/Build][Doc] Update CI and Doc for VLM example changes (#6860) --- .buildkite/test-pipeline.yaml | 3 +-- docs/source/models/vlm.rst | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5b4a786305e1f..be8807df0b098 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -140,14 +140,13 @@ steps: working_dir: "/vllm-workspace/examples" mirror_hardwares: [amd] commands: - # install aws cli for llava_example.py # install tensorizer for tensorize_vllm_model.py - pip install awscli tensorizer - python3 offline_inference.py - python3 cpu_offload.py - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - - python3 llava_example.py + - python3 offline_inference_vision_language.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - label: Inputs Test diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index ef4ce0d44a162..a385605c9f8f6 100644 --- a/docs/source/models/vlm.rst 
+++ b/docs/source/models/vlm.rst @@ -73,7 +73,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI generated_text = o.outputs[0].text print(generated_text) -A code example can be found in `examples/llava_example.py `_. +A code example can be found in `examples/offline_inference_vision_language.py `_. Online OpenAI Vision API Compatible Inference From 1ad86acf1789650e2ff27586e36a8159d52755dd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 27 Jul 2024 19:53:07 +0800 Subject: [PATCH 48/66] [Model] Initial support for BLIP-2 (#5920) Co-authored-by: ywang96 --- docs/source/models/supported_models.rst | 8 + examples/offline_inference_vision_language.py | 11 + examples/template_blip2.jinja | 11 + tests/models/test_blip2.py | 102 +++ tests/models/test_fuyu.py | 8 +- tests/models/test_minicpmv.py | 8 +- tests/models/test_phi3v.py | 8 +- vllm/model_executor/models/__init__.py | 6 +- vllm/model_executor/models/blip.py | 269 +++++++ vllm/model_executor/models/blip2.py | 669 ++++++++++++++++++ vllm/model_executor/models/opt.py | 17 +- vllm/multimodal/base.py | 11 +- 12 files changed, 1107 insertions(+), 21 deletions(-) create mode 100644 examples/template_blip2.jinja create mode 100644 tests/models/test_blip2.py create mode 100644 vllm/model_executor/models/blip.py create mode 100644 vllm/model_executor/models/blip2.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 483f552bba238..83c1b9c8bce86 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo The following is the list of model architectures that are currently supported by vLLM. Alongside each architecture, we include some popular models that use it. +---- + Decoder-only Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. list-table:: @@ -186,6 +188,10 @@ Vision Language Models - Models - Example HuggingFace Models - :ref:`LoRA ` + * - :code:`Blip2ForConditionalGeneration` + - BLIP-2 + - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. + - * - :code:`ChameleonForConditionalGeneration` - Chameleon - :code:`facebook/chameleon-7b` etc. @@ -215,6 +221,8 @@ Vision Language Models - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc. - +---- + If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` for instructions on how to implement support for your model. diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 8a63653343db6..04ba1a96314c9 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -106,6 +106,16 @@ def run_minicpmv(question): return llm, prompt +# BLIP-2 +def run_blip2(question): + + # BLIP-2 prompt format is inaccurate on HuggingFace model repository. 
+ # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa + prompt = f"Question: {question} Answer:" + llm = LLM(model="Salesforce/blip2-opt-2.7b") + return llm, prompt + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -114,6 +124,7 @@ def run_minicpmv(question): "paligemma": run_paligemma, "chameleon": run_chameleon, "minicpmv": run_minicpmv, + "blip-2": run_blip2, } diff --git a/examples/template_blip2.jinja b/examples/template_blip2.jinja new file mode 100644 index 0000000000000..fd41a7f7fa666 --- /dev/null +++ b/examples/template_blip2.jinja @@ -0,0 +1,11 @@ +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- 'Question: ' + message['content'] + ' ' -}} + {%- elif message['role'] == 'assistant' -%} + {{- 'Answer: ' + message['content'] + ' ' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{- 'Answer:' -}} +{% endif %} diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py new file mode 100644 index 0000000000000..26afd57ae6106 --- /dev/null +++ b/tests/models/test_blip2.py @@ -0,0 +1,102 @@ +from typing import List, Optional, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "Question: What's the content of the image? Answer:", + "cherry_blossom": + "Question: What is the season? Answer:", +}) + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + _, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "\n" + + tokenizer = AutoTokenizer.from_pretrained(model) + hf_output_ids = tokenizer.encode(hf_output_str) + assert hf_output_ids[0] == tokenizer.bos_token_id + hf_output_ids = hf_output_ids[1:] + + return hf_output_ids, hf_output_str, out_logprobs + + +@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py index 25f63a3d64d0e..7d0f3be5ea008 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/test_fuyu.py @@ -77,8 +77,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model: @@ -89,9 +89,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index 9124fa7a6238c..c57f0f8c08548 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -88,9 +88,9 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images, + images=images, stop_token_ids=stop_token_ids) - for prompts, vllm_images in inputs_per_image + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): @@ -114,9 +114,9 @@ def to(self, device: torch.types.Device): hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, tokenizer=tokenizer) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 9da25ab8d78fe..35ffe4ef50a85 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -101,8 +101,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] # use eager mode for hf runner, since phi3_v didn't work with flash_attn @@ -114,9 +114,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for 
hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index ead64c0e92553..fe04c6db5fbc2 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -16,6 +16,8 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), + "Blip2ForConditionalGeneration": + ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), @@ -56,8 +58,8 @@ "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), - "PaliGemmaForConditionalGeneration": - ("paligemma", "PaliGemmaForConditionalGeneration"), + "PaliGemmaForConditionalGeneration": ("paligemma", + "PaliGemmaForConditionalGeneration"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py new file mode 100644 index 0000000000000..0b124d5e8a85a --- /dev/null +++ b/vllm/model_executor/models/blip.py @@ -0,0 +1,269 @@ +"""Minimal implementation of BlipVisionModel intended to be only used +within a vision language model.""" +from typing import Optional, Union + +import torch +import torch.nn as nn +from PIL import Image +from transformers import Blip2VisionConfig, BlipVisionConfig +from transformers.models.blip.modeling_blip import BlipAttention + +from vllm.config import ModelConfig +from vllm.inputs import LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal.image import (cached_get_tokenizer, + repeat_and_pad_image_tokens) +from vllm.sequence import SequenceData + + +def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return image_size // patch_size + + +def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: + grid_length = get_blip_patch_grid_length(image_size=image_size, + patch_size=patch_size) + return grid_length * grid_length + + +def get_blip_image_feature_size( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_num_patches(image_size=hf_config.image_size, + patch_size=hf_config.patch_size) + + +def get_max_blip_image_tokens( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_image_feature_size(hf_config) + + +def dummy_seq_data_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + seq_len: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + token_ids = [image_token_id] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + return SequenceData(token_ids) + + +def dummy_image_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + *, + image_width_override: Optional[int] = None, + 
image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def input_processor_for_blip( + model_config: ModelConfig, + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + llm_inputs: LLMInputs, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + new_prompt, new_token_ids = repeat_and_pad_image_tokens( + tokenizer, + llm_inputs.get("prompt"), + llm_inputs["prompt_token_ids"], + image_token_id=image_token_id, + repeat_count=image_feature_size, + ) + + # NOTE: Create a defensive copy of the original inputs + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa +class BlipVisionEmbeddings(nn.Module): + + def __init__(self, config: BlipVisionConfig): + super().__init__() + + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = get_blip_num_patches(image_size=self.image_size, + patch_size=self.patch_size) + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + + position_embeds = self.position_embedding.to(target_dtype) + embeddings = embeddings + position_embeds[:, :embeddings.size(1), :] + + return embeddings + + +class BlipMLP(nn.Module): + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.config = config + + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class BlipEncoderLayer(nn.Module): + + def __init__(self, + config: BlipVisionConfig, 
+ quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = BlipAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = BlipMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class BlipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`BlipEncoderLayer`]. + + Args: + config: BlipConfig + """ + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + + self.layers = nn.ModuleList([ + BlipEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class BlipVisionModel(nn.Module): + config_class = BlipVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + ) + self.post_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + hidden_states = self.embeddings(pixel_values) + hidden_states = self.encoder(inputs_embeds=hidden_states) + + return self.post_layernorm(hidden_states) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py new file mode 100644 index 0000000000000..e00e6c0806957 --- /dev/null +++ b/vllm/model_executor/models/blip2.py @@ -0,0 +1,669 @@ +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + +import torch +import torch.nn as nn +from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, + apply_chunking_to_forward) + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.opt import OPTModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from 
vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData + +from .blip import (BlipVisionModel, dummy_image_for_blip, + get_max_blip_image_tokens) +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +class Blip2QFormerMultiHeadAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.config = config + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of " + f"the number of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = (config.hidden_size // + config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + kv_hidden_size = config.encoder_hidden_size + else: + kv_hidden_size = config.hidden_size + self.key = nn.Linear(kv_hidden_size, self.all_head_size) + self.value = nn.Linear(kv_hidden_size, self.all_head_size) + + self.position_embedding_type = getattr(config, + "position_embedding_type", + "absolute") + if self.position_embedding_type != "absolute": + raise NotImplementedError("Unsupported position_embedding_type: " + f"{self.position_embedding_type}") + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + x = x.view(*x.size()[:-1], self.num_attention_heads, + self.attention_head_size) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ): + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_probs = torch.softmax(attention_scores * self.scaling, + dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
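+        # Shapes at this point: the query/key/value layers are
+        # (batch, num_heads, seq_len, head_size) after transpose_for_scores,
+        # so attention_probs is (batch, num_heads, query_len, kv_len).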
+ attention_probs_dropped = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + context_layer = context_layer.view(*context_layer.size()[:-2], + self.all_head_size) + + return context_layer + + +class Blip2QFormerSelfOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.attention = Blip2QFormerMultiHeadAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=is_cross_attention, + ) + + self.output = Blip2QFormerSelfOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.Tensor]: + self_output = self.attention( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + attention_output = self.output(self_output, hidden_states) + + return attention_output + + +class Blip2QFormerIntermediate(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = get_act_fn(config.hidden_act) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class Blip2QFormerOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + layer_idx: int, + ) -> None: + super().__init__() + + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config, + quant_config=quant_config, + cache_config=cache_config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + 
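+        # Feed-forward (intermediate + output) applied to the query tokens;
+        # used by feed_forward_chunk_query below.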
self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ): + attention_output = self.attention(hidden_states) + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + query_attention_output = self.crossattention( + query_attention_output, + encoder_hidden_states=encoder_hidden_states, + ) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], + dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + + return layer_output + + def feed_forward_chunk(self, + attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query( + self, attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layer = nn.ModuleList([ + Blip2QFormerLayer(config, + quant_config=quant_config, + cache_config=cache_config, + layer_idx=layer_idx) + for layer_idx in range(config.num_hidden_layers) + ]) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ) -> torch.Tensor: + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + + hidden_states = layer_module( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return hidden_states + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1025 +class Blip2QFormerModel(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config, + quant_config=quant_config, + cache_config=cache_config) + + def forward( + self, + query_embeds: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + ) -> torch.Tensor: + query_length = query_embeds.shape[1] + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + sequence_output = self.encoder( + embedding_output, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return sequence_output + + +class 
Blip2ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +Blip2ImageInputs = Blip2ImagePixelInputs + +# We use this internally as placeholders since there is no image token +# defined on the HuggingFace repo +BLIP2_IMAGE_TOKEN = "" +BLIP2_IMAGE_TOKEN_ID = 50265 + + +def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: + return hf_config.num_query_tokens + + +def get_max_blip2_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + if isinstance(vision_config, Blip2VisionConfig): + return get_max_blip_image_tokens(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def dummy_data_for_blip2(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + image_feature_size = get_blip2_image_feature_size(hf_config) + token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + seq_data = SequenceData(token_ids) + + if isinstance(vision_config, Blip2VisionConfig): + mm_data = dummy_image_for_blip(vision_config) + + return seq_data, mm_data + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def input_processor_for_blip2(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + hf_config = ctx.get_hf_config(Blip2Config) + image_feature_size = get_blip2_image_feature_size(hf_config) + + # The original model places image tokens at the front + # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 + new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + new_token_ids += llm_inputs["prompt_token_ids"] + + new_prompt = llm_inputs.get("prompt") + if new_prompt is not None: + new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) +@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +class Blip2ForConditionalGeneration(nn.Module, SupportsVision): + + def __init__(self, + config: Blip2Config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + # TODO: Optionally initializes this for supporting embeddings. 
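+        # The components below follow the BLIP-2 pipeline: the BLIP vision
+        # tower encodes pixel values, the Q-Former compresses them into
+        # config.num_query_tokens embeddings via the learned query_tokens,
+        # and language_projection maps those into the hidden size of the OPT
+        # language model.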
+ self.vision_model = BlipVisionModel(config.vision_config) + + self.query_tokens = nn.Parameter( + torch.zeros(1, config.num_query_tokens, + config.qformer_config.hidden_size)) + + self.qformer = Blip2QFormerModel(config.qformer_config, + cache_config=cache_config, + quant_config=quant_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + self.quant_config = quant_config + + self.language_model = OPTModel(config.text_config, cache_config, + quant_config) + + self.unpadded_vocab_size = config.text_config.vocab_size + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size) + self.sampler = Sampler() + + def get_lm_head(self): + return self.language_model.decoder.embed_tokens + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Blip2ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return Blip2ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def _image_pixels_to_features(self, vision_model: BlipVisionModel, + pixel_values: torch.Tensor) -> torch.Tensor: + + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_model(pixel_values) + + return image_features + + def _process_image_pixels(self, + inputs: Blip2ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + + return self._image_pixels_to_features(self.vision_model, pixel_values) + + def _process_image_input(self, + image_input: Blip2ImageInputs) -> torch.Tensor: + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, + -1) + query_output = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_features, + ) + + return self.language_projection(query_output) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + """Run forward pass for BLIP-2. + + One key thing to understand is the `input_ids` already accounts for the + positions of the to-be-inserted image embeddings. + + Concretely, consider a text prompt: + `"Question: What's the content of the image? Answer:"`. + + Tokenizer outputs: + `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`. + + To reserve space in KV cache, we have to insert placeholder tokens + before they are inputted to the model, so the input processor prepends + dummy tokens (denoted as `50265`), resulting in: + `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`. 
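+        (`50265` is `BLIP2_IMAGE_TOKEN_ID`, the placeholder defined above
+        because the HuggingFace repo defines no dedicated image token.)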
+ + We insert 32 tokens since it corresponds to the number of query + embeddings outputted by the Q-Former and inputted to the language model. + + This way, the `positions` and `attn_metadata` are consistent + with the `input_ids`. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + pixel_values: The pixels in each input image. + + See also: + :class:`Blip2ImageInputs` + """ + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vision_embeddings, + BLIP2_IMAGE_TOKEN_ID) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.get_lm_head(), hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # only doing this for language model part for now. + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in weights: + if "lm_head.weight" in name: + continue + if "rotary_emb.inv_freq" in name: + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + use_default_weight_loading = False + if "vision" in name: + if self.vision_model is not None: + # We only do sharding for language model and + # not vision model for now. 
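+                    # Vision weights therefore fall through to the
+                    # default_weight_loader path below.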
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index edc16710c0229..a05090cd46648 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -237,14 +237,19 @@ def __init__( for _ in range(config.num_hidden_layers) ]) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - inputs_embeds = self.embed_tokens(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) pos_embeds = self.embed_positions(positions) if self.project_in is not None: inputs_embeds, _ = self.project_in(inputs_embeds) @@ -272,14 +277,22 @@ def __init__( super().__init__() self.decoder = OPTDecoder(config, cache_config, quant_config) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.decoder.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.decoder(input_ids, positions, kv_caches, attn_metadata) + return self.decoder(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) class OPTForCausalLM(nn.Module): diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 0d435bd644e29..5abd0ad61cdf9 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,8 +1,9 @@ import sys from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict, - TypeVar, Union, cast) +from typing import Any, Callable, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import Type, TypedDict, TypeVar, Union, cast import torch import torch.types @@ -15,13 +16,13 @@ logger = init_logger(__name__) -NestedTensors = Union[List[torch.Tensor], torch.Tensor] +NestedTensors = Union[GenericSequence[torch.Tensor], torch.Tensor] """ Use a list instead of a tensor if the dimensions of each element do not match. Currently only supports up to singly nested list of tensors. 
""" -BatchedTensors = Union[List[NestedTensors], NestedTensors] +BatchedTensors = Union[GenericSequence[NestedTensors], NestedTensors] """ If each input tensor in the batch has the same size, this is a single batched tensor; otherwise, this is a list of :class:`NestedTensors` with one element @@ -53,7 +54,7 @@ def try_concat( # may be list rather than tensors if isinstance(tensors[0], list): return [[t.to(device=device) for t in tensor[0]] - for tensor in tensors] + for tensor in cast(List[List[torch.Tensor]], tensors)] tensors_ = cast(List[torch.Tensor], tensors) From f954d0715c8b68e780aac4a4f3ffd1ab56bebfcd Mon Sep 17 00:00:00 2001 From: Chenggang Wu Date: Sat, 27 Jul 2024 09:24:46 -0700 Subject: [PATCH 49/66] [Docs] Add RunLLM chat widget (#6857) --- docs/source/_static/custom.js | 16 ++++++++++++++++ docs/source/conf.py | 2 ++ 2 files changed, 18 insertions(+) create mode 100644 docs/source/_static/custom.js diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js new file mode 100644 index 0000000000000..f475be71fc448 --- /dev/null +++ b/docs/source/_static/custom.js @@ -0,0 +1,16 @@ +document.addEventListener("DOMContentLoaded", function () { + var script = document.createElement("script"); + script.type = "module"; + script.id = "runllm-widget-script" + + script.src = "https://widget.runllm.com"; + + script.setAttribute("version", "stable"); + script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. + script.setAttribute("runllm-name", "vLLM"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-assistant-id", "207"); + + script.async = true; + document.head.appendChild(script); + }); \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index b867bfd89dc17..1093b30bca11d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,6 +68,8 @@ 'use_repository_button': True, 'use_edit_page_button': True, } +html_static_path = ["_static"] +html_js_files = ["custom.js"] # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') From fad5576c58864a6c2cf528f67e60e03a949b3dac Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 27 Jul 2024 10:28:33 -0700 Subject: [PATCH 50/66] [TPU] Reduce compilation time & Upgrade PyTorch XLA version (#6856) --- Dockerfile.tpu | 2 +- docs/source/getting_started/tpu-installation.rst | 9 ++++++++- vllm/attention/backends/pallas.py | 1 - .../device_communicators/tpu_communicator.py | 3 ++- vllm/worker/tpu_model_runner.py | 15 +++++++++++++-- vllm/worker/tpu_worker.py | 1 - 6 files changed, 24 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index be7dbe63cb237..4fc14d6bd186c 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20240713" +ARG NIGHTLY_DATE="20240726" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 5e2f514a4a509..2e6c522422c22 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -56,7 +56,7 @@ First, install the dependencies: $ pip uninstall torch torch-xla -y $ # Install PyTorch and PyTorch XLA. 
- $ export DATE="+20240713" + $ export DATE="+20240726" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl @@ -75,6 +75,13 @@ Next, build vLLM from source. This will only take a few seconds: $ VLLM_TARGET_DEVICE="tpu" python setup.py develop +.. note:: + + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. + The compilation time may take 20~30 minutes in the first run. + However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + + .. tip:: If you encounter the following error: diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index c53a2f91b89d7..2269ac2606e89 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -3,7 +3,6 @@ import torch import torch_xla.experimental.custom_kernel # Required to register custom ops. -import torch_xla.experimental.dynamo_set_buffer_donor from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index 69a9a516f3ebe..16525887cf4eb 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -6,6 +6,7 @@ if current_platform.is_tpu(): import torch_xla.core.xla_model as xm + import torch_xla.runtime as xr from torch_xla._internal import pjrt @@ -20,7 +21,7 @@ def __init__(self, group: ProcessGroup): local_rank = dist.get_rank(group) world_size = dist.get_world_size(group) pjrt.initialize_multiprocess(local_rank, world_size) - xm._init_world_size_ordinal() + xr._init_world_size_ordinal() def all_reduce(self, x: torch.Tensor) -> torch.Tensor: return xm.all_reduce(xm.REDUCE_SUM, x) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index e5bb101fc7df4..1692094af8c41 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -7,6 +7,7 @@ import torch import torch.nn as nn import torch_xla.core.xla_model as xm +import torch_xla.runtime as xr from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, @@ -127,7 +128,7 @@ def load_model(self) -> None: # determine the order of concatenating the output tensors. # As a workaround, we use the xm's rank assignment only when loading # the embedding weights. - xm_tp_rank = xm.get_ordinal() + xm_tp_rank = xr.global_ordinal() with patch( "vllm.model_executor.layers.vocab_parallel_embedding." "get_tensor_model_parallel_rank", @@ -146,7 +147,17 @@ def load_model(self) -> None: xm.wait_device_ops() model = ModelWrapper(model) - self.model = torch.compile(model, backend="openxla", fullgraph=True) + # NOTE(woosuk): There are two stages of compilation: torch.compile and + # XLA compilation. Setting dynamic=True can reduce the torch.compile + # overhead by reusing the FX graph for different shapes. + # However, the XLA graph will still require static shapes and needs to + # be re-compiled for every different shapes. 
This overhead is inevitable + # in the first run, but can be skipped afterwards as we cache the XLA + # graphs in the disk (VLLM_XLA_CACHE_PATH). + self.model = torch.compile(model, + backend="openxla", + fullgraph=True, + dynamic=True) def _dummy_run( self, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index c88aba7ae08cd..17fa5c35457c2 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -3,7 +3,6 @@ import torch import torch_xla.core.xla_model as xm -import torch_xla.experimental.dynamo_set_buffer_donor # noqa: F401 import torch_xla.runtime as xr import vllm.envs as envs From 75acdaa4b616c2e95c55a47d3158ceec9c72c503 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Sat, 27 Jul 2024 17:52:33 -0400 Subject: [PATCH 51/66] [Kernel] Increase precision of GPTQ/AWQ Marlin kernel (#6795) --- benchmarks/kernels/benchmark_marlin.py | 23 ++- csrc/ops.h | 3 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 150 ++++++++++++++---- tests/kernels/test_marlin_gemm.py | 13 +- vllm/_custom_ops.py | 6 +- .../layers/quantization/utils/marlin_utils.py | 17 +- 6 files changed, 168 insertions(+), 44 deletions(-) diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 3da4cecd7eeff..684985b81f690 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -10,7 +10,7 @@ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS) + MARLIN_SUPPORTED_GROUP_SIZES, MARLIN_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace, marlin_quantize) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( @@ -56,6 +56,8 @@ def bench_run(results: List[benchmark.Measurement], model: str, (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = marlin_24_quantize(b, num_bits, group_size) + marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) + # GPTQ quant (w_ref, q_w, s, g_idx, rand_perm) = quantize_weights(b, num_bits, group_size, act_order) @@ -87,6 +89,7 @@ def bench_run(results: List[benchmark.Measurement], model: str, "marlin_w_ref": marlin_w_ref, "marlin_q_w": marlin_q_w, "marlin_s": marlin_s, + "marlin_zp": marlin_zp, "marlin_g_idx": marlin_g_idx, "marlin_sort_indices": marlin_sort_indices, "marlin_rand_perm": marlin_rand_perm, @@ -125,11 +128,21 @@ def bench_run(results: List[benchmark.Measurement], model: str, results.append( benchmark.Timer( stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501 + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp16", + ).blocked_autorange(min_run_time=min_run_time)) + + results.append( + benchmark.Timer( + stmt= + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501 globals=globals, label=label, 
sub_label=sub_label, - description="gptq_marlin_gemm", + description="gptq_marlin_gemm_fp32", ).blocked_autorange(min_run_time=min_run_time)) if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS @@ -183,12 +196,12 @@ def main(args): ) > 0 and is_k_full not in args.limit_k_full: continue - for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS: + for num_bits in MARLIN_SUPPORTED_NUM_BITS: if len(args.limit_num_bits ) > 0 and num_bits not in args.limit_num_bits: continue - for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES: + for group_size in MARLIN_SUPPORTED_GROUP_SIZES: if len( args.limit_group_size ) > 0 and group_size not in args.limit_group_size: diff --git a/csrc/ops.h b/csrc/ops.h index 9ef1fcb465bf3..f075850248d1c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -93,7 +93,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp); + bool is_k_full, bool has_zp, + bool use_fp32_reduce); torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 122c5c16b58ce..36ae2bfafa7c2 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -59,14 +59,16 @@ __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int* __restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // whether to use fp32 global reduce ) {} } // namespace gptq_marlin @@ -532,16 +534,18 @@ __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape // (k/groupsize)x(n/pack_factor) const int* __restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // whether to use 
fp32 global reduce ) { // Each threadblock processes one "stripe" of the B matrix with (roughly) the // same size, which might involve multiple column "slices" (of width 16 * @@ -595,6 +599,8 @@ __global__ void Marlin( int slice_idx; // index of threadblock in current slice; numbered bottom to // top + int par_id = 0; + // We can easily implement parallel problem execution by just remapping // indices and advancing global pointers if (slice_col_par >= n_tiles) { @@ -602,6 +608,7 @@ __global__ void Marlin( C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; locks += (slice_col_par / n_tiles) * n_tiles; slice_col = slice_col_par % n_tiles; + par_id = slice_col_par / n_tiles; } // Compute all information about the current slice which is required for @@ -632,6 +639,7 @@ __global__ void Marlin( C += 16 * thread_m_blocks * prob_n / 8; locks += n_tiles; slice_col = 0; + par_id++; } }; init_slice(); @@ -1321,7 +1329,7 @@ __global__ void Marlin( // finally have to globally reduce over the results. As the striped // partitioning minimizes the number of such reductions and our outputs are // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { + auto global_reduce_fp16 = [&](bool first = false, bool last = false) { // We are very careful here to reduce directly in the output buffer to // maximize L2 cache utilization in this step. To do this, we write out // results in FP16 (but still reduce with FP32 compute). @@ -1382,6 +1390,53 @@ __global__ void Marlin( } }; + // Globally reduce over threadblocks that compute the same column block. + // We use a tmp C buffer to reduce in full fp32 precision. + auto global_reduce_fp32 = [&](bool first = false, bool last = false) { + constexpr int tb_m = thread_m_blocks * 16; + constexpr int tb_n = thread_n_blocks * 16; + + constexpr int c_size = tb_m * tb_n * sizeof(float) / 16; + + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + + int par_offset = c_size * n_tiles * par_id; + int slice_offset = c_size * slice_col; + + constexpr int num_floats = thread_m_blocks * 4 * 2 * 4; + constexpr int th_size = num_floats * sizeof(float) / 16; + + int c_cur_offset = par_offset + slice_offset; + + if (!is_th_active) { + return; + } + + if (!first) { + float* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + sh[threadIdx.x] = + C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; + + float* sh_c_ptr = reinterpret_cast(&sh[threadIdx.x]); + #pragma unroll + for (int f = 0; f < 4; f++) { + frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; + } + } + } + + if (!last) { + int4* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k]; + } + } + }; + // Write out the reduce final result in the correct layout. We only actually // reshuffle matrix fragments in this step, the reduction above is performed // in fragment layout. 
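The numerical motivation for the new fp32 global reduce can be sketched outside the kernel with a few lines of PyTorch (an illustrative sketch only; the partial-result shape and the serial loop are assumptions and do not mirror how the CUDA kernel iterates): accumulating many partial tile results directly in fp16 loses low-order bits, while reducing the same partials in fp32 and casting once at the end keeps only the final rounding error.

    import torch

    torch.manual_seed(0)
    # 64 partial results per output element, standing in for per-threadblock tiles.
    partials = torch.randn(64, 256, dtype=torch.float32)
    reference = partials.sum(dim=0)

    # Serial fp16 accumulation (analogous to the existing global_reduce_fp16 path).
    acc_fp16 = torch.zeros(256, dtype=torch.float16)
    for part in partials:
        acc_fp16 = acc_fp16 + part.to(torch.float16)

    # fp32 accumulation with a single final cast (analogous to use_fp32_reduce=True).
    acc_fp32 = partials.sum(dim=0).to(torch.float16)

    print((acc_fp16.float() - reference).abs().max())  # larger accumulated error
    print((acc_fp32.float() - reference).abs().max())  # only the final-cast rounding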
@@ -1606,7 +1661,11 @@ __global__ void Marlin( if (slice_count > 1) { // only globally reduce if there is more than one // block in a slice barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); + if (use_fp32_reduce) { + global_reduce_fp32(slice_idx == 0, last); + } else { + global_reduce_fp16(slice_idx == 0, last); + } barrier_release(&locks[slice_col], last); } if (last) // only the last block in a slice actually writes the result @@ -1661,8 +1720,8 @@ __global__ void Marlin( THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \ HAS_ZP, GROUP_BLOCKS> \ <<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, zp_ptr, g_idx_ptr, num_groups, \ - prob_m, prob_n, prob_k, locks); \ + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ + num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce); \ } typedef struct { @@ -1801,6 +1860,27 @@ bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, return true; } +int determine_reduce_max_m(int prob_m, int max_par) { + constexpr int tile_m_size = 16; + + if (prob_m <= tile_m_size) { + return tile_m_size; + + } else if (prob_m <= tile_m_size * 2) { + return tile_m_size * 2; + + } else if (prob_m <= tile_m_size * 3) { + return tile_m_size * 3; + + } else if (prob_m <= tile_m_size * 4) { + return tile_m_size * 4; + + } else { + int cur_par = min(div_ceil(prob_m, tile_m_size * 4), max_par); + return tile_m_size * 4 * cur_par; + } +} + exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, @@ -1880,13 +1960,13 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) template -void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp, - void* g_idx, void* perm, void* a_tmp, int prob_m, - int prob_n, int prob_k, void* workspace, int num_bits, - bool has_act_order, bool is_k_full, bool has_zp, - int num_groups, int group_size, int dev, +void marlin_mm_f16i4(const void* A, const void* B, void* C, void* C_tmp, + void* s, void* zp, void* g_idx, void* perm, void* a_tmp, + int prob_m, int prob_n, int prob_k, void* workspace, + int num_bits, bool has_act_order, bool is_k_full, + bool has_zp, int num_groups, int group_size, int dev, cudaStream_t stream, int thread_k, int thread_n, int sms, - int max_par) { + int max_par, bool use_fp32_reduce) { TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. Got = ", num_bits); TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, @@ -1970,6 +2050,7 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp, const int4* A_ptr = (const int4*)A; const int4* B_ptr = (const int4*)B; int4* C_ptr = (int4*)C; + int4* C_tmp_ptr = (int4*)C_tmp; const int4* s_ptr = (const int4*)s; const int4* zp_ptr = (const int4*)zp; const int* g_idx_ptr = (const int*)g_idx; @@ -2049,7 +2130,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp) { + bool is_k_full, bool has_zp, + bool use_fp32_reduce) { // Verify num_bits TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. 
Got = ", num_bits); @@ -2099,6 +2181,17 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor c = torch::empty({size_m, size_n}, options); torch::Tensor a_tmp = torch::empty({size_m, size_k}, options); + // Alloc C tmp buffer that is going to be used for the global reduce + int reduce_max_m = marlin::determine_reduce_max_m(size_m, marlin::max_par); + int reduce_n = size_n; + auto options_fp32 = + torch::TensorOptions().dtype(at::kFloat).device(a.device()); + if (!use_fp32_reduce) { + reduce_max_m = 0; + reduce_n = 0; + } + torch::Tensor c_tmp = torch::empty({reduce_max_m, reduce_n}, options_fp32); + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as // auto -1) int thread_k = -1; @@ -2171,20 +2264,21 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, if (a.scalar_type() == at::ScalarType::Half) { marlin::marlin_mm_f16i4( a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), - perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, + c_tmp.data_ptr(), b_scales.data_ptr(), + b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else if (a.scalar_type() == at::ScalarType::BFloat16) { marlin::marlin_mm_f16i4( a.data_ptr(), b_q_weight.data_ptr(), - c.data_ptr(), b_scales.data_ptr(), - b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), - a_tmp.data_ptr(), size_m, size_n, size_k, + c.data_ptr(), c_tmp.data_ptr(), + b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else { TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16"); } diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 42087fdcce959..bd35ef2eb2552 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -27,6 +27,7 @@ ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] +USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] MARLIN_N_CHUNKS = [64, 128, 256] @@ -175,6 +176,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS) @pytest.mark.parametrize("is_k_full", K_FULL_OPTS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) def test_gptq_marlin_gemm( k_chunk, n_chunk, @@ -183,6 +185,7 @@ def test_gptq_marlin_gemm( mnk_factors, act_order, is_k_full, + use_fp32_reduce, ): m_factor, n_factor, k_factor = mnk_factors @@ -222,8 +225,9 @@ def test_gptq_marlin_gemm( a_input.shape[0], b_weight.shape[1], a_input.shape[1], - is_k_full, + is_k_full=is_k_full, has_zp=False, + use_fp32_reduce=use_fp32_reduce, ) output_ref = torch.matmul(a_input, w_ref) @@ -365,12 +369,14 @@ def test_fp8_marlin_gemm( @pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS) @pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("mnk_factors", 
MNK_FACTORS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) def test_awq_marlin_gemm( k_chunk, n_chunk, num_bits, group_size, mnk_factors, + use_fp32_reduce, ): m_factor, n_factor, k_factor = mnk_factors @@ -407,8 +413,9 @@ def test_awq_marlin_gemm( a_input.shape[0], b_weight.shape[1], a_input.shape[1], - is_k_full, - has_zp, + is_k_full=is_k_full, + has_zp=has_zp, + use_fp32_reduce=use_fp32_reduce, ) output_ref = torch.matmul(a_input, w_ref) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0186594656cc1..ad9f01be6ddd4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -286,12 +286,12 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, b_zeros: torch.Tensor, g_idx: torch.Tensor, perm: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, - size_n: int, size_k: int, is_k_full: bool, - has_zp: bool) -> torch.Tensor: + size_n: int, size_k: int, is_k_full: bool, has_zp: bool, + use_fp32_reduce: bool) -> torch.Tensor: return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros, g_idx, perm, workspace, num_bits, size_m, size_n, size_k, is_k_full, - has_zp) + has_zp, use_fp32_reduce) # fp8 marlin diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 25a7cd7bde653..b789ca20cadb3 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -16,6 +16,11 @@ MARLIN_SUPPORTED_NUM_BITS = [4, 8] MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] +# In case there is a performance issue with Marlin, the variable below can be +# changed to False, which allows Marlin to perform global reductions in fp16 +# precision (instead of fp32), and therefore, save on some memory movements. 
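+# Both apply_gptq_marlin_linear and apply_awq_marlin_linear below take a
+# use_fp32_reduce argument that defaults to this value and forward it to the
+# gptq_marlin_gemm custom op.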
+USE_FP32_REDUCE_DEFAULT = True + def _check_marlin_supported(num_bits: int, group_size: int, is_sym: bool, min_capability: Optional[int], @@ -244,7 +249,8 @@ def apply_gptq_marlin_linear( output_size_per_partition: int, input_size_per_partition: int, is_k_full: bool, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor: reshaped_x = input.reshape(-1, input.shape[-1]) out_shape = input.shape[:-1] + (output_size_per_partition, ) @@ -260,7 +266,8 @@ def apply_gptq_marlin_linear( size_n=output_size_per_partition, size_k=input_size_per_partition, is_k_full=is_k_full, - has_zp=False) + has_zp=False, + use_fp32_reduce=use_fp32_reduce) if bias is not None: output.add_(bias) # In-place add @@ -279,7 +286,8 @@ def apply_awq_marlin_linear( num_bits: int, output_size_per_partition: int, input_size_per_partition: int, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor: reshaped_x = input.reshape(-1, input.shape[-1]) out_shape = input.shape[:-1] + (output_size_per_partition, ) @@ -295,7 +303,8 @@ def apply_awq_marlin_linear( size_n=output_size_per_partition, size_k=input_size_per_partition, is_k_full=True, - has_zp=True) + has_zp=True, + use_fp32_reduce=use_fp32_reduce) if bias is not None: output.add_(bias) # In-place add From b1366a953498fde9c5e7ab91915367ebc69008b2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 27 Jul 2024 18:05:17 -0400 Subject: [PATCH 52/66] Add Nemotron to PP_SUPPORTED_MODELS (#6863) --- vllm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config.py b/vllm/config.py index 92fde449b43fd..e7b54e04b00d5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -39,6 +39,7 @@ "Phi3ForCausalLM", "GPT2LMHeadModel", "MixtralForCausalLM", + "NemotronForCausalLM", ] From 3eeb148f467e3619e8890b1a5ebe86a173f91bc9 Mon Sep 17 00:00:00 2001 From: Elsa Granger <6374697+zeyugao@users.noreply.github.com> Date: Sun, 28 Jul 2024 23:13:49 +0800 Subject: [PATCH 53/66] [Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (#6871) --- .../layers/quantization/fbgemm_fp8.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 5e8d1f1947421..e7c3859967c71 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -9,6 +9,7 @@ UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -72,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: FBGEMMFp8Config): self.quant_config = quant_config + self.cutlass_fp8_supported = cutlass_fp8_supported() def create_weights( self, @@ -139,11 +141,12 @@ def apply(self, size_k=layer.input_size_per_partition, bias=bias) - return apply_fp8_linear(input=x, - weight=layer.weight, - weight_scale=layer.weight_scale, - input_scale=None, - input_scale_ub=layer.input_scale_ub, - bias=bias, - cutlass_fp8_supported=True, - 
use_per_token_if_dynamic=True) + return apply_fp8_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + input_scale_ub=layer.input_scale_ub, + bias=bias, + cutlass_fp8_supported=self.cutlass_fp8_supported, + use_per_token_if_dynamic=True) From 7cbd9ec7a9bfd4952ad522355b6bbb8e82b54fc9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 29 Jul 2024 18:16:30 +0800 Subject: [PATCH 54/66] [Model] Initialize support for InternVL2 series models (#6514) Co-authored-by: Roger Wang --- docs/source/models/supported_models.rst | 4 + examples/offline_inference_vision_language.py | 15 + examples/openai_vision_api_client.py | 2 + requirements-test.txt | 1 + tests/models/test_internvl.py | 201 ++++++++ vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/intern_vit.py | 270 ++++++++++ vllm/model_executor/models/internlm2.py | 10 +- vllm/model_executor/models/internvl.py | 471 ++++++++++++++++++ vllm/model_executor/models/qwen2.py | 10 +- vllm/transformers_utils/config.py | 8 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/internvl.py | 51 ++ 14 files changed, 1042 insertions(+), 6 deletions(-) create mode 100644 tests/models/test_internvl.py create mode 100644 vllm/model_executor/models/intern_vit.py create mode 100644 vllm/model_executor/models/internvl.py create mode 100644 vllm/transformers_utils/configs/internvl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 83c1b9c8bce86..4fe33e5ab5d80 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -200,6 +200,10 @@ Vision Language Models - Fuyu - :code:`adept/fuyu-8b` etc. - + * - :code:`InternVLChatModel` + - InternVL2 + - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. + - * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. 
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 04ba1a96314c9..846246a2062a6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -106,6 +106,20 @@ def run_minicpmv(question): return llm, prompt +# InternVL +def run_internvl(question): + # Generally, InternVL can use chatml template for conversation + TEMPLATE = "<|im_start|>User\n{prompt}<|im_end|>\n<|im_start|>Assistant\n" + prompt = f"\n{question}\n" + prompt = TEMPLATE.format(prompt=prompt) + llm = LLM( + model="OpenGVLab/InternVL2-4B", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + # BLIP-2 def run_blip2(question): @@ -125,6 +139,7 @@ def run_blip2(question): "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, + "internvl_chat": run_internvl, } diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index 2082c378e267c..be90394511f89 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -42,6 +42,7 @@ ], }], model=model, + max_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -78,6 +79,7 @@ def encode_image_base64_from_url(image_url: str) -> str: ], }], model=model, + max_tokens=64, ) result = chat_completion_from_base64.choices[0].message.content diff --git a/requirements-test.txt b/requirements-test.txt index a7604d2e1015e..9b88fcce3e842 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -16,6 +16,7 @@ ray sentence-transformers # required for embedding sparseml==1.8.0 # required for compressed-tensors compressed-tensors==0.4.0 # required for compressed-tensors +timm # required for internvl test # Benchmarking aiohttp diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py new file mode 100644 index 0000000000000..66cb8dda248db --- /dev/null +++ b/tests/models/test_internvl.py @@ -0,0 +1,201 @@ +import types +from typing import List, Optional, Type + +import pytest +import torch +from huggingface_hub import snapshot_download +from PIL.Image import Image + +from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END, + IMG_START, + image_to_pixel_values) +from vllm.multimodal.utils import rescale_image_size +from vllm.utils import is_cpu + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|im_start|>User\n\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + "cherry_blossom": + "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 +}) + +# we use snapshot_download to prevent conflicts between +# dynamic_module and trust_remote_code for hf_runner +models = [ + snapshot_download("OpenGVLab/InternVL2-1B"), + snapshot_download("OpenGVLab/InternVL2-2B"), + # snapshot_download("OpenGVLab/InternVL2-4B"), # broken +] + + +class InternVLProcessor: + """A simple processor for InternVL2 HF model which misses a processor.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + self.dtype = hf_runner.model.dtype + + def __call__(self, text: str, images: Image, **kwargs): + pixel_values = image_to_pixel_values(images).to(self.dtype) + num_patches_list = [pixel_values.shape[0]] + for num_patches in 
num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py +def generate( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + **generate_kwargs, +) -> torch.LongTensor: + """Generate method for InternVL2 model without fixed use_cache.""" + assert self.img_context_token_id is not None + vit_embeds = self.extract_feature(pixel_values) + input_embeds = self.language_model.get_input_embeddings()(input_ids) + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + input_ids = input_ids.reshape(B * N) + selected = (input_ids == self.img_context_token_id) + assert selected.sum() != 0 + input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) + + input_embeds = input_embeds.reshape(B, N, C) + + outputs = self.language_model.generate( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + return outputs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
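Illustrative aside (not part of the applied diff): `num_image_token` in the processor above is the number of IMG_CONTEXT placeholders inserted per image tile, and it is what `max_model_len` has to leave room for in the test below. A small worked example, assuming the InternVL2 defaults that appear elsewhere in this patch (448x448 tiles, downsample_ratio 0.5) plus the 14-pixel ViT patch size of InternViT, which is not visible in this diff.

# Mirrors the formula used by InternVLChatModel.__init__ later in this patch:
image_size, patch_size, downsample_ratio = 448, 14, 0.5
num_image_token = int((image_size // patch_size) ** 2 * downsample_ratio ** 2)
assert num_image_token == 256  # 32 * 32 ViT tokens, reduced 4x by pixel_shuffle
# A two-tile image plus its thumbnail therefore contributes 3 * 256 = 768 tokens.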
+ + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype) as hf_model: + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( + "") + hf_model.model.img_context_token_id = img_context_token_id + hf_model.processor = InternVLProcessor(hf_model) + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.get_output_embeddings() + hf_model.model.generate = types.MethodType(generate, hf_model.model) + eos_token_id = hf_model.tokenizer.eos_token_id + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=hf_images, + eos_token_id=eos_token_id) + for prompts, hf_images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +target_dtype = "half" +if is_cpu(): + target_dtype = "bfloat16" + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 1f6d77b828459..fbb7f70b55e16 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -107,7 +107,7 @@ def _image_token_str(model_config: ModelConfig, return None if model_type.startswith("llava"): return tokenizer.decode(model_config.hf_config.image_token_index) - if model_type == "chameleon": + if model_type in ("chameleon", "internvl_chat"): return "" raise TypeError(f"Unknown model type: {model_type}") diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index fe04c6db5fbc2..94c3cea98be7b 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -37,6 +37,7 @@ "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), + "InternVLChatModel": ("internvl", "InternVLChatModel"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), "LlavaForConditionalGeneration": diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py new file mode 100644 index 0000000000000..86d0930d80126 --- /dev/null +++ 
b/vllm/model_executor/models/intern_vit.py @@ -0,0 +1,270 @@ +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PretrainedConfig + +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig + +NORM2FN = { + 'rms_norm': RMSNorm, + 'layer_norm': nn.LayerNorm, +} + + +class InternVisionEmbeddings(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d(in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size) + + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = pos_embed.float().reshape( + 1, self.image_size // self.patch_size, + self.image_size // self.patch_size, -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, + size=(H, W), + mode='bicubic', + align_corners=False) + pos_embed = pos_embed.reshape(1, -1, H * W).permute(0, 2, + 1).to(target_dtype) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + target_dtype)) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, + -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + position_embedding = torch.cat([ + self.position_embedding[:, :1, :], + self._get_pos_embed(self.position_embedding[:, 1:, :], height, + width) + ], + dim=1) + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +class InternAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f'embed_dim must be divisible by num_heads ' + f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' + f' {self.num_heads}).') + + self.scale = self.head_dim**-0.5 + self.qkv = nn.Linear(self.embed_dim, + 3 * self.embed_dim, + bias=config.qkv_bias) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = 
RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + self.proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + + if self.qk_normalization: + B_, H_, N_, D_ = q.shape + q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view( + B_, N_, H_, D_).transpose(1, 2) + k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view( + B_, N_, H_, D_).transpose(1, 2) + + x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + x = x.transpose(1, 2).reshape(B, N, C) + + x = self.proj(x) + return x + + +class InternMLP(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class InternVisionEncoderLayer(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.embed_dim = config.hidden_size + self.intermediate_size = config.intermediate_size + self.norm_type = config.norm_type + + self.attn = InternAttention(config) + self.mlp = InternMLP(config, quant_config=quant_config) + self.norm1 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps) + self.norm2 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps) + + self.ls1 = nn.Parameter(config.initializer_factor * + torch.ones(self.embed_dim)) + self.ls2 = nn.Parameter(config.initializer_factor * + torch.ones(self.embed_dim)) + + def forward( + self, + hidden_states: torch.Tensor, + ): + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states)) * self.ls1 + + hidden_states = hidden_states + self.mlp( + self.norm2(hidden_states)) * self.ls2 + + return hidden_states + + +class InternVisionEncoder(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + self.layers = nn.ModuleList([ + InternVisionEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class InternVisionModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + self.config = config + + self.embeddings = InternVisionEmbeddings(config) + self.encoder = InternVisionEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override) + + def 
resize_pos_embeddings(self, old_size, new_size, patch_size): + pos_emb = self.embeddings.position_embedding + _, num_positions, embed_dim = pos_emb.shape + cls_emb = pos_emb[:, :1, :] + pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, + old_size // patch_size, + -1).permute(0, 3, 1, 2) + pos_emb = F.interpolate(pos_emb.float(), + size=new_size // patch_size, + mode='bicubic', + align_corners=False) + pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, + -1).permute(0, 2, 1) + pos_emb = torch.cat([cls_emb, pos_emb], dim=1) + self.embeddings.position_embedding = nn.Parameter(pos_emb) + self.embeddings.image_size = new_size + + def get_input_embeddings(self): + return self.embeddings + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + pixel_embeds: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + if pixel_values is None and pixel_embeds is None: + raise ValueError( + 'You have to specify pixel_values or pixel_embeds') + + if pixel_embeds is not None: + hidden_states = pixel_embeds + elif pixel_values is not None: + if pixel_values.ndim == 4: + hidden_states = self.embeddings(pixel_values) + else: + raise ValueError( + f'wrong pixel_values size: {pixel_values.shape}') + + encoder_outputs = self.encoder(inputs_embeds=hidden_states) + + return encoder_outputs diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 22132f40fc5e6..745fbf99a902d 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -219,14 +219,22 @@ def __init__( ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.tok_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: IntermediateTensors = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.tok_embeddings(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.tok_embeddings(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py new file mode 100644 index 0000000000000..f64c78c15f8ee --- /dev/null +++ b/vllm/model_executor/models/internvl.py @@ -0,0 +1,471 @@ +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union + +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image +from transformers import PretrainedConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models.intern_vit import InternVisionModel +from vllm.model_executor.sampling_metadata import SamplingMetadata 
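Illustrative aside (not part of the applied diff): the `inputs_embeds` argument added to `InternLM2Model.forward` above exists so that the multimodal wrapper defined in this new file can embed the text itself, overwrite the positions holding the image-context token with projected vision features, and hand the merged tensor to the language model. The snippet below is only a toy stand-in for that merge step; it mimics the spirit of `merge_vision_embeddings`, the token id 92546 is made up, and the tensors are placeholders.

import torch

def merge(input_ids: torch.Tensor, text_embeds: torch.Tensor,
          vision_embeds: torch.Tensor, image_token_id: int) -> torch.Tensor:
    # Replace the embedding of every image-context token with one vision vector.
    mask = input_ids == image_token_id
    text_embeds[mask] = vision_embeds.reshape(-1, text_embeds.shape[-1])
    return text_embeds

input_ids = torch.tensor([1, 92546, 92546, 2])
inputs_embeds = merge(input_ids, torch.zeros(4, 8), torch.ones(2, 8), 92546)
# inputs_embeds would then be passed to the LM via forward(..., inputs_embeds=inputs_embeds)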
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors
+from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.image import cached_get_tokenizer
+from vllm.sequence import IntermediateTensors, SamplerOutput
+
+from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
+                   get_clip_num_patches)
+from .interfaces import SupportsVision
+from .utils import merge_vision_embeddings
+
+IMG_START = '<img>'
+IMG_END = '</img>'
+IMG_CONTEXT = '<IMG_CONTEXT>'
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+MAX_IMAGE_FEATURE_SIZE_WIDTH = 3000
+MAX_IMAGE_FEATURE_SIZE_HEIGHT = 500
+
+
+class InternVLImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: BatchedTensors
+    """
+    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+
+    Note that `num_patches` may be different for each batch, in which case
+    the data is passed as a list instead of a batched tensor.
+    """
+
+
+# copied from https://huggingface.co/OpenGVLab/InternVL2-1B
+def build_transform(input_size):
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Resize((input_size, input_size),
+                 interpolation=T.InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=MEAN, std=STD)
+    ])
+    return transform
+
+
+# copied from https://huggingface.co/OpenGVLab/InternVL2-1B
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
+                              image_size):
+    best_ratio_diff = float('inf')
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+
+def calculate_num_blocks(orig_width: int,
+                         orig_height: int,
+                         min_num=1,
+                         max_num=6,
+                         image_size=448):
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = set((i, j) for n in range(min_num, max_num + 1)
+                        for i in range(1, n + 1) for j in range(1, n + 1)
+                        if i * j <= max_num and i * j >= min_num)
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
+                                                    target_ratios, orig_width,
+                                                    orig_height, image_size)
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+    return blocks, target_width, target_height
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def dynamic_preprocess(image,
+                       min_num=1,
+                       max_num=6,
+                       image_size=448,
+                       use_thumbnail=False):
+    orig_width, orig_height = image.size
+
+    blocks, target_width, target_height = calculate_num_blocks(
+        orig_width, orig_height, min_num, max_num, image_size)
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = ((i % (target_width // image_size)) * image_size,
+               (i // (target_width // image_size)) * image_size,
+               ((i % (target_width // image_size)) + 1) * image_size,
+               ((i // (target_width // image_size)) + 1) * image_size)
+        # split the image
+        split_img = resized_img.crop(box)
+
processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def image_to_pixel_values(image: Image.Image, input_size=448, max_num=6): + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, + image_size=input_size, + use_thumbnail=True, + max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + +def get_internvl_num_patches(image_size: int, patch_size: int, + downsample_ratio: float): + return int( + get_clip_num_patches(image_size=image_size, patch_size=patch_size) * + (downsample_ratio**2)) + + +def get_max_internvl_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + return num_patches * 7 + + +def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + width, height = image_data.size + num_blocks, _, _ = calculate_num_blocks(width, height) + elif isinstance(image_data, torch.Tensor): + raise NotImplementedError("Embeddings input is not supported yet") + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + prompt = llm_inputs["prompt"] + prompt_token_ids = llm_inputs["prompt_token_ids"] + if prompt is None: + prompt = tokenizer.decode(prompt_token_ids) + image_prompt = IMG_START + IMG_CONTEXT * (num_blocks + + 1) * num_patches + IMG_END + new_prompt = prompt.replace('', image_prompt, 1) + new_prompt_token_ids = tokenizer.encode(new_prompt) + + return LLMInputs(prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data) + + +def input_mapper_for_internvl(ctx: InputContext, data: object): + if isinstance(data, Image.Image): + data = image_to_pixel_values(data) + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + image_token_id = tokenizer.encode(IMG_CONTEXT, + add_special_tokens=False, + return_tensors="pt")[0] + + return MultiModalInputs({ + "pixel_values": data, + "image_token_id": image_token_id + }) + + +def dummy_data_for_internvl(ctx: InputContext, seq_len: int): + + image_feature_size = get_max_internvl_image_tokens(ctx) + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + seq_data = 
dummy_seq_data_for_clip( + vision_config, + seq_len, + image_token_id=tokenizer.encode(IMG_CONTEXT, + add_special_tokens=False)[0], + image_feature_size_override=image_feature_size, + ) + mm_data = dummy_image_for_clip( + vision_config, + image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, + image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + ) + + return seq_data, mm_data + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) +@INPUT_REGISTRY.register_input_processor(input_processor_for_internvl) +class InternVLChatModel(nn.Module, SupportsVision): + + def __init__(self, + config: PretrainedConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + self.select_layer = config.select_layer + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + + vision_feature_layer = self.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + self.vision_model = InternVisionModel( + config.vision_config, num_hidden_layers_override=num_hidden_layers) + + llm_class = ModelRegistry.load_model_cls( + config.text_config.architectures[0]) + self.language_model = llm_class(config.text_config, cache_config, + quant_config) + + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.text_config.hidden_size + + self.mlp1 = nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2, + llm_hidden_size), nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size)) + + self.img_context_token_id = None + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + x = x.view(n, int(h * scale_factor), int(w * scale_factor), + int(c / (scale_factor * scale_factor))) + if self.ps_version == 'v1': + pass + else: + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def extract_feature(self, pixel_values): + vit_embeds = self.vision_model(pixel_values=pixel_values) + vit_embeds = vit_embeds[:, 1:, :] + + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) + return vit_embeds + + def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: + if list(data.shape[1:]) != [2]: + raise ValueError( + f"The expected image sizes shape is batch dimension plus " + f"{[2]}. 
You supplied {data.shape}.") + + return data + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values in each batch element " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[InternVLImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_token_id = kwargs.pop("image_token_id", None) + + if pixel_values is None: + return None + + self.img_context_token_id = image_token_id[0] + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return InternVLImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is not None: + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + vit_embeds = self.extract_feature(image_input["data"]) + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vit_embeds, + self.img_context_token_id) + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + None, + inputs_embeds=inputs_embeds) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + (".gate_up_proj", ".w1", 0), + (".gate_up_proj", ".w3", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.config.text_config.tie_word_embeddings \ + and "lm_head.weight" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # We only do sharding for language model + # and not vision model for now. + if "vision_embed_tokens" in name and self.vision_embed_tokens: + continue + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
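Illustrative aside (not part of the applied diff): the shape bookkeeping done by `pixel_shuffle` above, written out with the defaults used in this patch. A 448x448 tile gives 32x32 = 1024 ViT tokens, and downsample_ratio = 0.5 turns them into 256 tokens of four times the width, which is why `mlp1` starts from `vit_hidden_size * int(1 / downsample_ratio) ** 2`. The hidden size of 1024 is the InternViT-300M value, an assumption not visible in this hunk, and the tensor is random since only the shapes matter.

import torch

n, w, h, c, scale = 1, 32, 32, 1024, 0.5
x = torch.randn(n, w, h, c)
x = x.view(n, w, int(h * scale), int(c / scale))   # (1, 32, 16, 2048)
x = x.permute(0, 2, 1, 3).contiguous()             # (1, 16, 32, 2048)
x = x.view(n, int(h * scale), int(w * scale),
           int(c / (scale * scale)))               # (1, 16, 16, 4096)
print(x.flatten(1, 2).shape)                       # 256 tokens, width 4096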
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + if "wqkv" in name: + config = self.config.text_config + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) + head_dim = config.hidden_size // config.num_attention_heads + loaded_weight = loaded_weight.view(-1, 2 + kv_groups, + head_dim, + loaded_weight.shape[-1]) + wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1], + dim=1) + wq = wq.reshape(-1, wq.shape[-1]) + wk = wk.reshape(-1, wk.shape[-1]) + wv = wv.reshape(-1, wv.shape[-1]) + weight_loader = param.weight_loader + weight_loader(param, wq, 'q') + weight_loader(param, wk, 'k') + weight_loader(param, wv, 'v') + continue + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e9aa4416eded4..3deb3d8840cc4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -243,14 +243,22 @@ def __init__( ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3ba2e01985598..3d13631b9b2b6 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,9 +6,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - JAISConfig, MedusaConfig, - MLPSpeculatorConfig, MPTConfig, - NemotronConfig, RWConfig) + InternVLChatConfig, JAISConfig, + MedusaConfig, MLPSpeculatorConfig, + MPTConfig, NemotronConfig, + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -26,6 +27,7 @@ "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, + "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 1750950b3c38b..5ccacd4a4c40a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -4,6 +4,7 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
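Illustrative aside (not part of the applied diff): the `wqkv` branch in `load_weights` above unpacks InternLM2's fused attention weight, which stores the query heads of each key/value group next to that group's single key and value head. A toy-sized illustration of the same reshape and split, using hidden=8, 4 query heads and 2 kv heads (so kv_groups=2 and head_dim=2); real checkpoints are just larger.

import torch

hidden, num_heads, num_kv_heads = 8, 4, 2
kv_groups = num_heads // num_kv_heads
head_dim = hidden // num_heads

wqkv = torch.randn(num_kv_heads * (2 + kv_groups) * head_dim, hidden)
w = wqkv.view(-1, 2 + kv_groups, head_dim, wqkv.shape[-1])
wq, wk, wv = torch.split(w, [kv_groups, 1, 1], dim=1)
wq = wq.reshape(-1, wq.shape[-1])   # (8, 8): all query heads
wk = wk.reshape(-1, wk.shape[-1])   # (4, 8): key heads
wv = wv.reshape(-1, wv.shape[-1])   # (4, 8): value heads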
from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig @@ -15,6 +16,7 @@ "DbrxConfig", "MPTConfig", "RWConfig", + "InternVLChatConfig", "JAISConfig", "MedusaConfig", "MLPSpeculatorConfig", diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py new file mode 100644 index 0000000000000..ac2492317aa36 --- /dev/null +++ b/vllm/transformers_utils/configs/internvl.py @@ -0,0 +1,51 @@ +# Adapted from +# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from transformers.configuration_utils import PretrainedConfig + + +class InternVLChatConfig(PretrainedConfig): + model_type = 'internvl_chat' + is_composition = True + + def __init__(self, + vision_config=None, + llm_config=None, + use_backbone_lora=0, + use_llm_lora=0, + select_layer=-1, + force_image_size=None, + downsample_ratio=0.5, + template=None, + dynamic_image_size=False, + use_thumbnail=False, + ps_version='v1', + min_dynamic_patch=1, + max_dynamic_patch=6, + **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + + if llm_config is None: + llm_config = {} + + self.vision_config = PretrainedConfig(**vision_config) + self.text_config = PretrainedConfig(**llm_config) + + self.use_backbone_lora = use_backbone_lora + self.use_llm_lora = use_llm_lora + self.select_layer = select_layer + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail = use_thumbnail + self.ps_version = ps_version # pixel shuffle version + self.min_dynamic_patch = min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch From 766435e660a786933392eb8ef0a873bc38cf0c8b Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 29 Jul 2024 11:42:35 -0400 Subject: [PATCH 55/66] [Kernel] Tuned FP8 Kernels for Ada Lovelace (#6677) Co-authored-by: Varun Sundar Rabindranath --- .../cutlass_benchmarks/w8a8_benchmarks.py | 2 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 520 ++---------------- .../cutlass_w8a8/scaled_mm_c2x.cuh | 340 ++++++++++++ .../scaled_mm_c2x_sm80_dispatch.cuh | 139 +++++ .../scaled_mm_c2x_sm89_dispatch.cuh | 362 ++++++++++++ tests/kernels/test_cutlass.py | 4 +- 6 files changed, 877 insertions(+), 490 deletions(-) create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_dispatch.cuh diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 234c2c8a1074c..70247e94e63cf 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -13,7 +13,7 @@ from vllm import _custom_ops as ops from vllm.utils import FlexibleArgumentParser -DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 
16, 32, 64, 128, 256, 512] DEFAULT_TP_SIZES = [1] diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 6ce25c5ac897b..d26c43de522c9 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -1,470 +1,16 @@ #include #include - -#include - -// clang-format will break include orders -// clang-format off -#include "cute/tensor.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cutlass/numeric_types.h" - -#include "cutlass/util/device_memory.h" - #include "cutlass/cutlass.h" -#include "cutlass/gemm_coord.h" -#include "cutlass/arch/mma_sm75.h" -#include "cutlass/arch/arch.h" -#include "cutlass/arch/mma.h" -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" -#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" - -#include "broadcast_load_epilogue_c2x.hpp" -#include "common.hpp" -// clang-format on - -using namespace cute; +#include "scaled_mm_c2x.cuh" +#include "scaled_mm_c2x_sm80_dispatch.cuh" +#include "scaled_mm_c2x_sm89_dispatch.cuh" /* This file defines quantized GEMM operations using the CUTLASS 2.x API, for NVIDIA GPUs with SM versions prior to sm90 (Hopper). - - Epilogue functions can be defined to post-process the output before it is - written to GPU memory. - Epilogues must contain a public type named EVTCompute of type Sm80EVT, - as well as a static prepare_args function that constructs an - EVTCompute::Arguments struct. */ -namespace { - -// Wrappers for the GEMM kernel that is used to guard against compilation on -// architectures that will never use the kernel. The purpose of this is to -// reduce the size of the compiled binary. -// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef -// into code that will be executed on the device where it is defined. -template -struct enable_sm75_to_sm80 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -template -struct enable_sm80_to_sm89 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -template -struct enable_sm89_to_sm90 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -/* - * This class provides the common ScaleA and ScaleB descriptors for the - * ScaledEpilogue and ScaledEpilogueBias classes. - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; - - using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< - OutputTileThreadMap, float, Stride, Int<0>, Int<0>>>; - - using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< - OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. 
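Illustrative aside (not part of the applied diff): the scaling semantics described in the comment block being moved out of this file, restated in plain PyTorch for reference. Dtypes and shapes are arbitrary, and this ignores the fused bias epilogue.

import torch

A = torch.randint(-128, 127, (4, 8), dtype=torch.int8)
B = torch.randint(-128, 127, (8, 3), dtype=torch.int8)
a_scales = torch.rand(4, 1)   # per-row of A (or a single per-tensor scale)
b_scales = torch.rand(1, 3)   # per-column of B (or a single per-tensor scale)

# D = (a_scales * A) @ (b_scales * B), broadcast numpy-style, then cast to the
# requested output dtype by the epilogue.
D = ((a_scales * A.float()) @ (b_scales * B.float())).to(torch.float16)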
- A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::ScaleA; - using ScaleB = typename SUPER::ScaleB; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - using ScaleAArgs = typename ScaleA::Arguments; - using ScaleBArgs = typename ScaleB::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - - typename EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args}; - return evt_compute_args; - } -}; - -template -struct ScaledEpilogueBias - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::ScaleA; - using ScaleB = typename SUPER::ScaleB; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< - OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; - - public: - using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - using ScaleAArgs = typename ScaleA::Arguments; - using ScaleBArgs = typename ScaleB::Arguments; - using BiasArgs = typename Bias::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; - - typename EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, - bias_args}; - return evt_compute_args; - } -}; - -template typename ArchGuard, - typename ElementAB_, typename ElementD_, - template typename Epilogue_, typename TileShape, - typename WarpShape, typename InstructionShape, int32_t MainLoopStages> -struct cutlass_2x_gemm { - using ElementAB = ElementAB_; - using ElementD = ElementD_; - - using ElementAcc = - typename std::conditional, int32_t, - float>::type; - - using 
Operator = - typename std::conditional, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::arch::OpMultiplyAdd>::type; - - using OutputTileThreadMap = - cutlass::epilogue::threadblock::OutputTileThreadLayout< - TileShape, WarpShape, float, 4, 1 /* epilogue stages */ - >; - - using Epilogue = Epilogue_; - using EVTCompute = typename Epilogue::EVTCompute; - - using D = cutlass::epilogue::threadblock::VisitorAuxStore< - OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest, - Stride, Int<0>>>; - - using EVTD = cutlass::epilogue::threadblock::Sm80EVT; - - // clang-format off - using RowMajor = typename cutlass::layout::RowMajor; - using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = - ArchGuard::GemmKernel>; - // clang-format on - - using Op = cutlass::gemm::device::GemmUniversalAdapter; -}; - -template -void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... epilogue_params) { - using ElementAB = typename Gemm::ElementAB; - using ElementD = typename Gemm::ElementD; - - int32_t m = a.size(0); - int32_t n = b.size(1); - int32_t k = a.size(1); - cutlass::gemm::GemmCoord problem_size{m, n, k}; - - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideC = Stride, Int<0>>; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto c_ptr = static_cast(out.data_ptr()); - - typename Gemm::D::Arguments d_args{c_ptr, c_stride}; - - using Epilogue = typename Gemm::Epilogue; - auto evt_args = - Epilogue::prepare_args(std::forward(epilogue_params)...); - - typename Gemm::EVTD::Arguments epilogue_args{ - evt_args, - d_args, - }; - - typename Gemm::Op::Arguments args{ - cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, // universal mode - problem_size, // problem size - 1, // batch count - epilogue_args, - a_ptr, - b_ptr, - nullptr, - nullptr, - 0, - 0, - 0, - 0, - lda, - ldb, - ldc, - ldc}; - - // Launch the CUTLASS GEMM kernel. - typename Gemm::Op gemm_op; - size_t workspace_size = gemm_op.get_workspace_size(args); - cutlass::device_memory::allocation workspace(workspace_size); - - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); - - CUTLASS_CHECK(gemm_op.can_implement(args)); - cutlass::Status status = gemm_op(args, workspace.get(), stream); - CUTLASS_CHECK(status); -} - -template -void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... args) { - // In some cases, the GPU isn't able to accommodate the - // shared memory requirements of the Gemm. In such cases, use - // the FallbackGemm instead. 
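Illustrative aside (not part of the applied diff): the guard that `fallback_cutlass_gemm_caller` applies, restated as plain Python so the numbers are easy to check. The shared-memory sizes come from the sm80_config_* comments below; 101376 and 166912 bytes are the opt-in per-block limits of sm86/sm89 and sm80 parts respectively, quoted from memory rather than from this diff, so treat them as illustrative.

def pick_kernel(gemm_smem: int, fallback_smem: int, max_smem_optin: int) -> str:
    # Use the preferred config when its SharedStorage fits within the device's
    # opt-in shared-memory limit, otherwise fall back to the smaller-tile config.
    if gemm_smem <= max_smem_optin:
        return "preferred"
    assert fallback_smem <= max_smem_optin, "even the fallback does not fit"
    return "fallback"

# sm80_config_M64 needs 122880 B of shared memory, sm80_config_M32 only 61440 B.
print(pick_kernel(122880, 61440, max_smem_optin=101376))   # "fallback"
print(pick_kernel(122880, 61440, max_smem_optin=166912))   # "preferred"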
- static const int max_shared_mem_per_block_opt_in = - get_cuda_max_shared_memory_per_block_opt_in(0); - - size_t const gemm_shared_mem_size = - sizeof(typename Gemm::KernelType::SharedStorage); - size_t const fallback_gemm_shared_mem_size = - sizeof(typename FallbackGemm::KernelType::SharedStorage); - - if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { - return cutlass_gemm_caller(out, a, b, - std::forward(args)...); - } else { - TORCH_CHECK(fallback_gemm_shared_mem_size <= - max_shared_mem_per_block_opt_in); - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - -template typename Epilogue> -struct sm80_config_default { - // This config is used in 2 cases, - // - M in (128, inf) - // - M in (64, 128] and N >= 8192 - // Shared Memory required by this Gemm - 81920 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M64 { - // This config is used in 2 cases, - // - M in (32, 64] - // - M in (64, 128] and N < 8192 - // Shared Memory required by this Gemm - 122880 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M32 { - // M in (16, 32] - // Shared Memory required by this Gemm - 61440 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M16 { - // M in [1, 16] - // Shared Memory required by this Gemm - 51200 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -} // namespace - -template typename Epilogue, - typename... EpilogueArgs> -void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kInt8); - TORCH_CHECK(b.dtype() == torch::kInt8); - - using Cutlass2xGemmDefault = - typename sm80_config_default::Cutlass2xGemm; - using Cutlass2xGemmM128BigN = - typename sm80_config_default::Cutlass2xGemm; - using Cutlass2xGemmM128SmallN = - typename sm80_config_M64::Cutlass2xGemm; - using Cutlass2xGemmM64 = - typename sm80_config_M64::Cutlass2xGemm; - using Cutlass2xGemmM32 = - typename sm80_config_M32::Cutlass2xGemm; - using Cutlass2xGemmM16 = - typename sm80_config_M16::Cutlass2xGemm; - - // Due to shared memory requirements, some Gemms may fail to run on some - // GPUs. As the name indicates, the Fallback Gemm is used as an alternative - // in such cases. - // sm80_config_M16 has the least shared-memory requirement. 
However, - // based on some profiling, we select sm80_config_M32 as a better alternative - // performance wise. - using FallbackGemm = - typename sm80_config_M32::Cutlass2xGemm; - - uint32_t const m = a.size(0); - uint32_t const mp2 = - std::max(static_cast(16), next_pow_2(m)); // next power of 2 - if (mp2 <= 16) { - // M in [1, 16] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 32) { - // M in (16, 32] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 64) { - // M in (32, 64] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 128) { - // M in (64, 128] - uint32_t const n = out.size(1); - bool const small_n = n < 8192; - if (small_n) { - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else { - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } - } else { - // M in (128, inf) - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - template
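Illustrative aside (not part of the applied diff): the M-based config selection removed from this file (presumably moved into scaled_mm_c2x_sm80_dispatch.cuh, judging by the new include), restated in Python to make the bucketing visible. Config names refer to the sm80_config_* structs above, and next_pow_2 matches the rounding the C++ helper applies for m >= 1.

def next_pow_2(x: int) -> int:
    return 1 << max(0, (x - 1).bit_length())

def select_sm80_config(m: int, n: int) -> str:
    mp2 = max(16, next_pow_2(m))
    if mp2 <= 16:                  # M in [1, 16]
        return "sm80_config_M16"
    if mp2 <= 32:                  # M in (16, 32]
        return "sm80_config_M32"
    if mp2 <= 64:                  # M in (32, 64]
        return "sm80_config_M64"
    if mp2 <= 128:                 # M in (64, 128]: tile choice depends on N
        return "sm80_config_M64" if n < 8192 else "sm80_config_default"
    return "sm80_config_default"   # M in (128, inf)

assert select_sm80_config(8, 4096) == "sm80_config_M16"
assert select_sm80_config(100, 4096) == "sm80_config_M64"
assert select_sm80_config(100, 8192) == "sm80_config_default"
assert select_sm80_config(2048, 8192) == "sm80_config_default"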