[ROCm][Hardware][AMD] Use Triton Kernel for default FA on ROCm #3643

Merged

Changes from 7 commits (26 commits total)
14 changes: 14 additions & 0 deletions Dockerfile.rocm
@@ -26,6 +26,9 @@ ARG BUILD_FA="1"
# whether to build cupy on rocm
ARG BUILD_CUPY="1"

# whether to build triton on rocm
ARG BUILD_TRITON="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

@@ -95,6 +98,17 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \
&& cd ..; \
fi

# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCmSoftwarePlatform/triton.git \
&& cd triton/python \
&& pip3 install . \
&& cd ../..; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
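A quick, illustrative way to confirm inside the built image that the ROCm Triton install above succeeded (this snippet is not part of the diff; the --build-arg name comes from the ARG declared above):

# Illustrative check, not part of this PR: confirm the Triton package built in
# the BUILD_TRITON stage is importable inside the image.
try:
    import triton
    print("triton available:", triton.__version__)
except ImportError:
    print("triton not installed; rebuild with --build-arg BUILD_TRITON=1")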
41 changes: 28 additions & 13 deletions vllm/attention/backends/flash_attn.py
@@ -14,6 +14,7 @@
AttentionMetadata)
from vllm.attention.ops.paged_attn import (PagedAttention,
PagedAttentionMetadata)
from vllm.utils import is_hip


class FlashAttentionBackend(AttentionBackend):
@@ -192,19 +193,33 @@ def forward(
# normal attention
# When block_tables are not filled, it means q and k are the
# prompt, and they have the same length.
output = flash_attn_varlen_func(
q=query,
k=key,
v=value,
cu_seqlens_q=attn_metadata.seq_start_loc,
cu_seqlens_k=attn_metadata.seq_start_loc,
max_seqlen_q=attn_metadata.max_prompt_len,
max_seqlen_k=attn_metadata.max_prompt_len,
softmax_scale=self.scale,
causal=True,
window_size=self.sliding_window,
alibi_slopes=self.alibi_slopes,
)
if is_hip():
output = flash_attn_varlen_func(
q=query,
k=key,
v=value,
cu_seqlens_q=attn_metadata.seq_start_loc,
cu_seqlens_k=attn_metadata.seq_start_loc,
max_seqlen_q=attn_metadata.max_prompt_len,
max_seqlen_k=attn_metadata.max_prompt_len,
softmax_scale=self.scale,
causal=True,
)
else:
output = flash_attn_varlen_func(
q=query,
k=key,
v=value,
cu_seqlens_q=attn_metadata.seq_start_loc,
cu_seqlens_k=attn_metadata.seq_start_loc,
max_seqlen_q=attn_metadata.max_prompt_len,
max_seqlen_k=attn_metadata.max_prompt_len,
softmax_scale=self.scale,
causal=True,
window_size=self.sliding_window,
alibi_slopes=self.alibi_slopes,
)

WoosukKwon (Collaborator) commented on Mar 31, 2024:
QQ: Actually in #3648, we are planning to use FlashAttention's recent APIs that support attention keys and values stored in paged KV cache. I believe the new APIs are incompatible with AMD GPUs.

Can we just use Triton FlashAttention at all times?

else:
# prefix-enabled attention
output = PagedAttention.forward_prefix(
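To make the question above concrete: the PR's stated goal is to make the Triton kernel the default FA path on ROCm, and "use Triton FlashAttention at all times" would simply mean dropping the platform check. The following is a hypothetical sketch only; the real backend-selection code in vLLM is not shown in this diff, and the location of the AttentionBackend base class in vllm.attention.backends.abstract is assumed. The two backend classes are the ones defined in the files of this PR.

# Hypothetical sketch, not part of this diff: route ROCm to the Triton backend
# and keep the flash-attn backend elsewhere.
from typing import Type

from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.flash_attn import FlashAttentionBackend
from vllm.attention.backends.flash_attn_triton import (
    FlashAttentionTritonBackend)
from vllm.utils import is_hip


def select_flash_attn_backend() -> Type[AttentionBackend]:
    # Using Triton "at all times" would amount to returning
    # FlashAttentionTritonBackend unconditionally here.
    if is_hip():
        return FlashAttentionTritonBackend
    return FlashAttentionBackend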
176 changes: 176 additions & 0 deletions vllm/attention/backends/flash_attn_triton.py
@@ -0,0 +1,176 @@
"""Attention layer with Flash and PagedAttention.

NOTE(woosuk): At the moment, this file includes a lot of duplicated code from
XFormers backend. The duplicated code will be removed once we use flash-attn or
flashinfer for all the attention operations.
"""
from typing import List, Optional, Type

import torch

from vllm.attention.backends.abstract import AttentionImpl
from vllm.attention.backends.flash_attn import (FlashAttentionBackend,
FlashAttentionMetadata)
from vllm.attention.ops.flash_attention_triton import triton_attention
from vllm.attention.ops.paged_attn import PagedAttention


class FlashAttentionTritonBackend(FlashAttentionBackend):

@staticmethod
def get_impl_cls() -> Type["FlashAttentionTritonImpl"]:
return FlashAttentionTritonImpl


class FlashAttentionTritonImpl(AttentionImpl):
"""
If the input tensors contain prompt tokens, the layout is as follows:
|<--------------- num_prompt_tokens -------------->|
|<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|

Otherwise, the layout is as follows:
|<------------------ num_generation_tokens (M) ----------------->|
|<--generation_0-->|..........|<--generation_M-1-->|<--padding-->|

Generation tokens can contain padding when cuda-graph is used.
Currently, prompt tokens don't contain any padding.

The prompts might have different lengths, while the generation tokens
always have length 1.

NOTE: This code is mostly duplicated from the flash_attn backend.
"""

def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: Optional[int] = None,
alibi_slopes: Optional[List[float]] = None,
sliding_window: Optional[int] = None,
) -> None:
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
self.sliding_window = ((sliding_window, sliding_window)
if sliding_window is not None else (-1, -1))
if alibi_slopes is not None:
alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
self.alibi_slopes = alibi_slopes

assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads

supported_head_sizes = PagedAttention.get_supported_head_sizes()
if head_size not in supported_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by PagedAttention. "
f"Supported head sizes are: {supported_head_sizes}.")

def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
"""torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
tokens, n_kv_heads, head_dim = x.shape
return (x[:, :,
None, :].expand(tokens, n_kv_heads, n_rep,
head_dim).reshape(tokens, n_kv_heads * n_rep,
head_dim))

def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: FlashAttentionMetadata,
) -> torch.Tensor:
"""Forward pass with FlashAttention and PagedAttention.

Args:
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
num_tokens, hidden_size = query.shape
# Reshape the query, key, and value tensors.
query = query.view(-1, self.num_heads, self.head_size)
key = key.view(-1, self.num_kv_heads, self.head_size)
value = value.view(-1, self.num_kv_heads, self.head_size)

if kv_cache is not None:
key_cache, value_cache = PagedAttention.split_kv_cache(
kv_cache, self.num_kv_heads, self.head_size)

# Reshape the input keys and values and store them in the cache.
# If kv_cache is not provided, the new key and value tensors are
# not cached. This happens during the initial memory profiling run.
PagedAttention.write_to_paged_cache(
key,
value,
key_cache,
value_cache,
attn_metadata.slot_mapping,
attn_metadata.kv_cache_dtype,
)

if attn_metadata.is_prompt:
# Prompt run.
if kv_cache is None or attn_metadata.block_tables.numel() == 0:
# triton attention
# When block_tables are not filled, it means q and k are the
# prompt, and they have the same length.

if self.num_kv_heads != self.num_heads:
# Interleave for MQA workaround.
key = self.repeat_kv(key, self.num_queries_per_kv)
value = self.repeat_kv(value, self.num_queries_per_kv)

output, _ = triton_attention(
query,
key,
value,
None,
attn_metadata.seq_start_loc,
attn_metadata.seq_start_loc,
attn_metadata.max_prompt_len,
attn_metadata.max_prompt_len,
True,
self.scale,
)
else:
# prefix-enabled attention
output = PagedAttention.forward_prefix(
query,
key,
value,
key_cache,
value_cache,
attn_metadata.block_tables,
attn_metadata.subquery_start_loc,
attn_metadata.prompt_lens_tensor,
attn_metadata.context_lens,
attn_metadata.max_subquery_len,
self.alibi_slopes,
)
else:
# Decoding run.
output = PagedAttention.forward_decode(
query,
key_cache,
value_cache,
attn_metadata.block_tables,
attn_metadata.context_lens,
attn_metadata.max_context_len,
attn_metadata.kv_cache_dtype,
self.num_kv_heads,
self.scale,
self.alibi_slopes,
)

# Reshape the output tensor.
return output.view(num_tokens, hidden_size)
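As a standalone illustration (not part of the diff) of the MQA/GQA workaround used in the prompt path above, the expand-and-reshape trick in repeat_kv can be checked against torch.repeat_interleave directly; the tensor shapes below are arbitrary examples.

# Standalone sketch: verify that the expand/reshape trick used by
# FlashAttentionTritonImpl.repeat_kv matches torch.repeat_interleave.
import torch


def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    # [tokens, n_kv_heads, head_dim] -> [tokens, n_kv_heads * n_rep, head_dim]
    tokens, n_kv_heads, head_dim = x.shape
    return (x[:, :, None, :]
            .expand(tokens, n_kv_heads, n_rep, head_dim)
            .reshape(tokens, n_kv_heads * n_rep, head_dim))


# Example: 2 KV heads broadcast to 8 query heads (n_rep = 4).
x = torch.randn(6, 2, 64)
assert torch.equal(repeat_kv(x, 4),
                   torch.repeat_interleave(x, repeats=4, dim=1))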