From f60fd43a5cc67d09e217d2c920c5cbd0523c083e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= <chenxu2048@gmail.com>
Date: Wed, 6 Mar 2024 18:15:51 +0800
Subject: [PATCH 1/9] remove triton version check

---
 .../layers/triton_kernel/prefix_prefill.py    | 1384 ++++++++---------
 1 file changed, 691 insertions(+), 693 deletions(-)

diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
index 70f09224f1cf6..65f183a5c55e8 100644
--- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
+++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
@@ -5,698 +5,646 @@
 import triton
 import triton.language as tl
 
-if triton.__version__ >= "2.1.0":
-
-    @triton.jit
-    def _fwd_kernel(
-        Q,
-        K,
-        V,
-        K_cache,
-        V_cache,
-        B_Loc,
-        sm_scale,
-        B_Start_Loc,
-        B_Seqlen,
-        B_Ctxlen,
-        block_size,
-        x,
-        Out,
-        stride_b_loc_b,
-        stride_b_loc_s,
-        stride_qbs,
-        stride_qh,
-        stride_qd,
-        stride_kbs,
-        stride_kh,
-        stride_kd,
-        stride_vbs,
-        stride_vh,
-        stride_vd,
-        stride_obs,
-        stride_oh,
-        stride_od,
-        stride_k_cache_bs,
-        stride_k_cache_h,
-        stride_k_cache_d,
-        stride_k_cache_bl,
-        stride_k_cache_x,
-        stride_v_cache_bs,
-        stride_v_cache_h,
-        stride_v_cache_d,
-        stride_v_cache_bl,
-        num_queries_per_kv: int,
-        BLOCK_M: tl.constexpr,
-        BLOCK_DMODEL: tl.constexpr,
-        BLOCK_N: tl.constexpr,
-    ):
-        cur_batch = tl.program_id(0)
-        cur_head = tl.program_id(1)
-        start_m = tl.program_id(2)
-
-        cur_kv_head = cur_head // num_queries_per_kv
-
-        cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
-        cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
-        cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
-
-        block_start_loc = BLOCK_M * start_m
-
-        # initialize offsets
-        offs_n = tl.arange(0, BLOCK_N)
-        offs_d = tl.arange(0, BLOCK_DMODEL)
-        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-        off_q = (
-            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-            cur_head * stride_qh + offs_d[None, :] * stride_qd)
-
-        q = tl.load(
-            Q + off_q,
-            mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-            other=0.0)
-
-        # # initialize pointer to m and l
-        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
-        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-
-        for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
-            start_n = tl.multiple_of(start_n, BLOCK_N)
-            # -- compute qk ----
-            bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                         ((start_n + offs_n) // block_size) * stride_b_loc_s,
-                         mask=(start_n + offs_n) < cur_batch_ctx_len,
-                         other=0)
-            off_k = (bn[None, :] * stride_k_cache_bs +
-                     cur_kv_head * stride_k_cache_h +
-                     (offs_d[:, None] // x) * stride_k_cache_d +
-                     ((start_n + offs_n[None, :]) % block_size) *
-                     stride_k_cache_bl +
-                     (offs_d[:, None] % x) * stride_k_cache_x)
-            off_v = (
-                bn[:, None] * stride_v_cache_bs +
-                cur_kv_head * stride_v_cache_h +
-                offs_d[None, :] * stride_v_cache_d +
-                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
-            k = tl.load(K_cache + off_k,
-                        mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
-                        other=0.0)
-
-            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-            qk += tl.dot(q, k)
-            qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
-                          float("-inf"))
-            qk *= sm_scale
-
-            # -- compute m_ij, p, l_ij
-            m_ij = tl.max(qk, 1)
-            p = tl.exp(qk - m_ij[:, None])
-            l_ij = tl.sum(p, 1)
-            # -- update m_i and l_i
-            m_i_new = tl.maximum(m_i, m_ij)
-            alpha = tl.exp(m_i - m_i_new)
-            beta = tl.exp(m_ij - m_i_new)
-            l_i_new = alpha * l_i + beta * l_ij
-            # -- update output accumulator --
-            # scale p
-            p_scale = beta / l_i_new
-            p = p * p_scale[:, None]
-            # scale acc
-            acc_scale = l_i / l_i_new * alpha
-            acc = acc * acc_scale[:, None]
-            # update acc
-            v = tl.load(V_cache + off_v,
-                        mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
-                        other=0.0)
-
-            p = p.to(v.dtype)
-            acc += tl.dot(p, v)
-            # # update m_i and l_i
-            l_i = l_i_new
-            m_i = m_i_new
-
-        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-                 offs_d[:, None] * stride_kd)
-        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-                 offs_d[None, :] * stride_vd)
-        k_ptrs = K + off_k
-        v_ptrs = V + off_v
-
-        block_mask = tl.where(
-            block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
-
-        for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
-            start_n = tl.multiple_of(start_n, BLOCK_N)
-            # -- compute qk ----
-            k = tl.load(k_ptrs +
-                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
-                        mask=(start_n + offs_n[None, :]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
-                        other=0.0)
-
-            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-            qk += tl.dot(q, k)
-            qk *= sm_scale
-            qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                          float("-inf"))
-
-            # -- compute m_ij, p, l_ij
-            m_ij = tl.max(qk, 1)
-            p = tl.exp(qk - m_ij[:, None])
-            l_ij = tl.sum(p, 1)
-            # -- update m_i and l_i
-            m_i_new = tl.maximum(m_i, m_ij)
-            alpha = tl.exp(m_i - m_i_new)
-            beta = tl.exp(m_ij - m_i_new)
-            l_i_new = alpha * l_i + beta * l_ij
-            # -- update output accumulator --
-            # scale p
-            p_scale = beta / l_i_new
-            p = p * p_scale[:, None]
-            # scale acc
-            acc_scale = l_i / l_i_new * alpha
-            acc = acc * acc_scale[:, None]
-            # update acc
-            v = tl.load(v_ptrs +
-                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
-                        mask=(start_n + offs_n[:, None]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
-                        other=0.0)
-
-            p = p.to(v.dtype)
-            acc += tl.dot(p, v)
-            # update m_i and l_i
-            l_i = l_i_new
-            m_i = m_i_new
-        # initialize pointers to output
-        off_o = (
-            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-            cur_head * stride_oh + offs_d[None, :] * stride_od)
-        out_ptrs = Out + off_o
-        tl.store(out_ptrs,
-                 acc,
-                 mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
-        return
-
-    @triton.jit
-    def _fwd_kernel_flash_attn_v2(
-        Q,
-        K,
-        V,
-        K_cache,
-        V_cache,
-        B_Loc,
-        sm_scale,
-        B_Start_Loc,
-        B_Seqlen,
-        B_Ctxlen,
-        block_size,
-        x,
-        Out,
-        stride_b_loc_b,
-        stride_b_loc_s,
-        stride_qbs,
-        stride_qh,
-        stride_qd,
-        stride_kbs,
-        stride_kh,
-        stride_kd,
-        stride_vbs,
-        stride_vh,
-        stride_vd,
-        stride_obs,
-        stride_oh,
-        stride_od,
-        stride_k_cache_bs,
-        stride_k_cache_h,
-        stride_k_cache_d,
-        stride_k_cache_bl,
-        stride_k_cache_x,
-        stride_v_cache_bs,
-        stride_v_cache_h,
-        stride_v_cache_d,
-        stride_v_cache_bl,
-        num_queries_per_kv: int,
-        BLOCK_M: tl.constexpr,
-        BLOCK_DMODEL: tl.constexpr,
-        BLOCK_N: tl.constexpr,
-    ):
-        cur_batch = tl.program_id(0)
-        cur_head = tl.program_id(1)
-        start_m = tl.program_id(2)
-
-        cur_kv_head = cur_head // num_queries_per_kv
-
-        cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
-        cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
-        cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
-
-        block_start_loc = BLOCK_M * start_m
-
-        # initialize offsets
-        offs_n = tl.arange(0, BLOCK_N)
-        offs_d = tl.arange(0, BLOCK_DMODEL)
-        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-        off_q = (
-            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-            cur_head * stride_qh + offs_d[None, :] * stride_qd)
-
-        q = tl.load(
-            Q + off_q,
-            mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-            other=0.0)
-
-        # # initialize pointer to m and l
-        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
-        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-
-        for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
-            start_n = tl.multiple_of(start_n, BLOCK_N)
-            # -- compute qk ----
-            bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                         ((start_n + offs_n) // block_size) * stride_b_loc_s,
-                         mask=(start_n + offs_n) < cur_batch_ctx_len,
-                         other=0)
-            off_k = (bn[None, :] * stride_k_cache_bs +
-                     cur_kv_head * stride_k_cache_h +
-                     (offs_d[:, None] // x) * stride_k_cache_d +
-                     ((start_n + offs_n[None, :]) % block_size) *
-                     stride_k_cache_bl +
-                     (offs_d[:, None] % x) * stride_k_cache_x)
-            off_v = (
-                bn[:, None] * stride_v_cache_bs +
-                cur_kv_head * stride_v_cache_h +
-                offs_d[None, :] * stride_v_cache_d +
-                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
-            k = tl.load(K_cache + off_k,
-                        mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
-                        other=0.0)
-
-            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-            qk += tl.dot(q, k)
-            qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
-                          float("-inf"))
-            qk *= sm_scale
-
-            # -- compute m_ij, p, l_ij
-            m_ij = tl.max(qk, 1)
-            m_i_new = tl.maximum(m_i, m_ij)
-            p = tl.math.exp(qk - m_i_new[:, None])
-            l_ij = tl.sum(p, 1)
-            # -- update m_i and l_i
-
-            alpha = tl.math.exp(m_i - m_i_new)
-            l_i_new = alpha * l_i + l_ij
-            # -- update output accumulator --
-            # scale p
-            # scale acc
-            acc_scale = alpha
-            # acc_scale = l_i / l_i_new * alpha
-            acc = acc * acc_scale[:, None]
-            # update acc
-            v = tl.load(V_cache + off_v,
-                        mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
-                        other=0.0)
-
-            p = p.to(v.dtype)
-            acc += tl.dot(p, v)
-            # update m_i and l_i
-            l_i = l_i_new
-            m_i = m_i_new
-
-        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-                 offs_d[:, None] * stride_kd)
-        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-                 offs_d[None, :] * stride_vd)
-        k_ptrs = K + off_k
-        v_ptrs = V + off_v
-
-        block_mask = tl.where(
-            block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
-
-        for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
-            start_n = tl.multiple_of(start_n, BLOCK_N)
-            # -- compute qk ----
-            k = tl.load(k_ptrs +
-                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
-                        mask=(start_n + offs_n[None, :]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
-                        other=0.0)
-
-            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-            qk += tl.dot(q, k)
-            qk *= sm_scale
-            qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                          float("-inf"))
-
-            # -- compute m_ij, p, l_ij
-            m_ij = tl.max(qk, 1)
-            m_i_new = tl.maximum(m_i, m_ij)
-            p = tl.math.exp(qk - m_i_new[:, None])
-            l_ij = tl.sum(p, 1)
-            # -- update m_i and l_i
-
-            alpha = tl.math.exp(m_i - m_i_new)
-            l_i_new = alpha * l_i + l_ij
-            # -- update output accumulator --
-            # scale p
-            # scale acc
-            acc_scale = alpha
-            # acc_scale = l_i / l_i_new * alpha
-            acc = acc * acc_scale[:, None]
-            # update acc
-            v = tl.load(v_ptrs +
-                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
-                        mask=(start_n + offs_n[:, None]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
-                        other=0.0)
-
-            p = p.to(v.dtype)
-            acc += tl.dot(p, v)
-            # update m_i and l_i
-            l_i = l_i_new
-            m_i = m_i_new
-
-        # acc /= l_i[:, None]
-        # initialize pointers to output
-        off_o = (
-            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-            cur_head * stride_oh + offs_d[None, :] * stride_od)
-        out_ptrs = Out + off_o
-        tl.store(out_ptrs,
-                 acc,
-                 mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
-        return
-
-    @triton.jit
-    def _fwd_kernel_alibi(
-        Q,
-        K,
-        V,
-        K_cache,
-        V_cache,
-        B_Loc,
-        sm_scale,
-        B_Start_Loc,
-        B_Seqlen,
-        B_Ctxlen,
-        Alibi_slopes,
-        block_size,
-        x,
-        Out,
-        stride_b_loc_b,
-        stride_b_loc_s,
-        stride_qbs,
-        stride_qh,
-        stride_qd,
-        stride_kbs,
-        stride_kh,
-        stride_kd,
-        stride_vbs,
-        stride_vh,
-        stride_vd,
-        stride_obs,
-        stride_oh,
-        stride_od,
-        stride_k_cache_bs,
-        stride_k_cache_h,
-        stride_k_cache_d,
-        stride_k_cache_bl,
-        stride_k_cache_x,
-        stride_v_cache_bs,
-        stride_v_cache_h,
-        stride_v_cache_d,
-        stride_v_cache_bl,
-        num_queries_per_kv: int,
-        BLOCK_M: tl.constexpr,
-        BLOCK_DMODEL: tl.constexpr,
-        BLOCK_N: tl.constexpr,
-    ):
-        # attn_bias[]
-        cur_batch = tl.program_id(0)
-        cur_head = tl.program_id(1)
-        start_m = tl.program_id(2)
-
-        cur_kv_head = cur_head // num_queries_per_kv
-
-        # cur_batch_seq_len: the length of prompts
-        # cur_batch_ctx_len: the length of prefix
-        # cur_batch_in_all_start_index: the start id of the dim=0
-        cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
-        cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
-        cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
-
-        block_start_loc = BLOCK_M * start_m
-
-        # initialize offsets
-        offs_n = tl.arange(0, BLOCK_N)
-        offs_d = tl.arange(0, BLOCK_DMODEL)
-        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-        off_q = (
-            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-            cur_head * stride_qh + offs_d[None, :] * stride_qd)
-
-        q = tl.load(
-            Q + off_q,
-            mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-            other=0.0)
-
-        # # initialize pointer to m and l
-        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
-        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-
-        alibi_slope = tl.load(Alibi_slopes + cur_head)
-        alibi_start_q = tl.arange(
-            0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
-        alibi_start_k = 0
-        for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
-            start_n = tl.multiple_of(start_n, BLOCK_N)
-            # -- compute qk ----
-            bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                         ((start_n + offs_n) // block_size) * stride_b_loc_s,
-                         mask=(start_n + offs_n) < cur_batch_ctx_len,
-                         other=0)
-            off_k = (bn[None, :] * stride_k_cache_bs +
-                     cur_kv_head * stride_k_cache_h +
-                     (offs_d[:, None] // x) * stride_k_cache_d +
-                     ((start_n + offs_n[None, :]) % block_size) *
-                     stride_k_cache_bl +
-                     (offs_d[:, None] % x) * stride_k_cache_x)
-            off_v = (
-                bn[:, None] * stride_v_cache_bs +
-                cur_kv_head * stride_v_cache_h +
-                offs_d[None, :] * stride_v_cache_d +
-                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
-            k = tl.load(K_cache + off_k,
-                        mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
-                        other=0.0)
-
-            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-            qk += tl.dot(q, k)
-            qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
-                          float("-inf"))
-            qk *= sm_scale
-
-            # load alibi
-            alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
-                     alibi_start_q[:, None]) * alibi_slope
-            alibi = tl.where(
-                (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
-                alibi, float("-inf"))
-            qk += alibi
-            alibi_start_k += BLOCK_N
-
-            # -- compute m_ij, p, l_ij
-            m_ij = tl.max(qk, 1)
-            m_i_new = tl.maximum(m_i, m_ij)
-            p = tl.math.exp(qk - m_i_new[:, None])
-            l_ij = tl.sum(p, 1)
-            # -- update m_i and l_i
-
-            alpha = tl.math.exp(m_i - m_i_new)
-            l_i_new = alpha * l_i + l_ij
-            # -- update output accumulator --
-            # scale p
-            # scale acc
-            acc_scale = alpha
-            # acc_scale = l_i / l_i_new * alpha
-            acc = acc * acc_scale[:, None]
-            # update acc
-            v = tl.load(V_cache + off_v,
-                        mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
-                        other=0.0)
-
-            p = p.to(v.dtype)
-            acc += tl.dot(p, v, allow_tf32=False)
-            # update m_i and l_i
-            l_i = l_i_new
-            m_i = m_i_new
-
-        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-                 offs_d[:, None] * stride_kd)
-        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-                 offs_d[None, :] * stride_vd)
-        k_ptrs = K + off_k
-        v_ptrs = V + off_v
-
-        block_mask = tl.where(
-            block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
-
-        # init alibi
-        alibi_slope = tl.load(Alibi_slopes + cur_head)
-        alibi_start_q = tl.arange(
-            0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
-        alibi_start_k = cur_batch_ctx_len
-        # # init debugger
-        # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc
-        # offset_db_k = tl.arange(0, BLOCK_N)
-        # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL]
-        for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
-            start_n = tl.multiple_of(start_n, BLOCK_N)
-            # -- compute qk ----
-            k = tl.load(k_ptrs +
-                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
-                        mask=(start_n + offs_n[None, :]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
-                        other=0.0)
-
-            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-            qk += tl.dot(q, k, allow_tf32=False)
-            qk *= sm_scale
-            qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                          float("-inf"))
-
-            # load alibi
-            alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
-                     alibi_start_q[:, None]) * alibi_slope
-            alibi = tl.where(
-                (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
-                alibi, float("-inf"))
-            qk += alibi
-            alibi_start_k += BLOCK_N
-
-            # -- compute m_ij, p, l_ij
-            m_ij = tl.max(qk, 1)
-            m_i_new = tl.maximum(m_i, m_ij)
-            p = tl.math.exp(qk - m_i_new[:, None])
-            l_ij = tl.sum(p, 1)
-            # -- update m_i and l_i
-
-            alpha = tl.math.exp(m_i - m_i_new)
-            l_i_new = alpha * l_i + l_ij
-            # -- update output accumulator --
-            # scale p
-            # scale acc
-            acc_scale = alpha
-            # acc_scale = l_i / l_i_new * alpha
-            acc = acc * acc_scale[:, None]
-            # update acc
-            v = tl.load(v_ptrs +
-                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
-                        mask=(start_n + offs_n[:, None]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
-                        other=0.0)
-
-            p = p.to(v.dtype)
-            acc += tl.dot(p, v, allow_tf32=False)
-            # update m_i and l_i
-            l_i = l_i_new
-            m_i = m_i_new
-
-        acc = acc / l_i[:, None]
-
-        # initialize pointers to output
-        off_o = (
-            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-            cur_head * stride_oh + offs_d[None, :] * stride_od)
-        out_ptrs = Out + off_o
-        tl.store(out_ptrs,
-                 acc,
-                 mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
-        return
-
-    @torch.inference_mode()
-    def context_attention_fwd(q,
-                              k,
-                              v,
-                              o,
-                              k_cache,
-                              v_cache,
-                              b_loc,
-                              b_start_loc,
-                              b_seq_len,
-                              b_ctx_len,
-                              max_input_len,
-                              alibi_slopes=None):
-
-        cap = torch.cuda.get_device_capability()
-        BLOCK = 128 if cap[0] >= 8 else 64
-        # shape constraints
-        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
-        assert Lq == Lk and Lk == Lv
-        assert Lk in {16, 32, 64, 128}
-
-        sm_scale = 1.0 / (Lq**0.5)
-        batch, head = b_seq_len.shape[0], q.shape[1]
-        num_queries_per_kv = q.shape[1] // k.shape[1]
-
-        grid = (batch, head, triton.cdiv(max_input_len, BLOCK))  # batch, head,
-
-        num_warps = 8 if Lk <= 64 else 8
-        if alibi_slopes is not None:
-            _fwd_kernel_alibi[grid](
-                q,
-                k,
-                v,
-                k_cache,
-                v_cache,
-                b_loc,
-                sm_scale,
-                b_start_loc,
-                b_seq_len,
-                b_ctx_len,
-                alibi_slopes,
-                v_cache.shape[3],
-                8,
-                o,
-                b_loc.stride(0),
-                b_loc.stride(1),
-                q.stride(0),
-                q.stride(1),
-                q.stride(2),
-                k.stride(0),
-                k.stride(1),
-                k.stride(2),
-                v.stride(0),
-                v.stride(1),
-                v.stride(2),
-                o.stride(0),
-                o.stride(1),
-                o.stride(2),
-                k_cache.stride(0),
-                k_cache.stride(1),
-                k_cache.stride(2),
-                k_cache.stride(3),
-                k_cache.stride(
-                    4
-                ),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
-                v_cache.stride(0),
-                v_cache.stride(1),
-                v_cache.stride(2),
-                v_cache.stride(
-                    3),  #[num_blocks, num_kv_heads, head_size, block_size]
-                num_queries_per_kv=num_queries_per_kv,
-                BLOCK_M=BLOCK,
-                BLOCK_DMODEL=Lk,
-                BLOCK_N=BLOCK,
-                num_warps=num_warps,
-                num_stages=1,
-            )
-            return
-
-        _fwd_kernel[grid](
+@triton.jit
+def _fwd_kernel(
+    Q,
+    K,
+    V,
+    K_cache,
+    V_cache,
+    B_Loc,
+    sm_scale,
+    B_Start_Loc,
+    B_Seqlen,
+    B_Ctxlen,
+    block_size,
+    x,
+    Out,
+    stride_b_loc_b,
+    stride_b_loc_s,
+    stride_qbs,
+    stride_qh,
+    stride_qd,
+    stride_kbs,
+    stride_kh,
+    stride_kd,
+    stride_vbs,
+    stride_vh,
+    stride_vd,
+    stride_obs,
+    stride_oh,
+    stride_od,
+    stride_k_cache_bs,
+    stride_k_cache_h,
+    stride_k_cache_d,
+    stride_k_cache_bl,
+    stride_k_cache_x,
+    stride_v_cache_bs,
+    stride_v_cache_h,
+    stride_v_cache_d,
+    stride_v_cache_bl,
+    num_queries_per_kv: int,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    start_m = tl.program_id(2)
+
+    cur_kv_head = cur_head // num_queries_per_kv  # num_queries_per_kv query heads map to one K/V head
+
+    cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+
+    block_start_loc = BLOCK_M * start_m
+
+    # index ranges: offs_n spans BLOCK_N keys, offs_d spans BLOCK_DMODEL, offs_m spans this program's BLOCK_M query rows
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    off_q = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+        cur_head * stride_qh + offs_d[None, :] * stride_qd)
+
+    q = tl.load(
+        Q + off_q,
+        mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+        other=0.0)
+
+    # per-row running max (m_i, starts at -inf), running sum (l_i) and output accumulator (acc), all float32
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+
+    for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk against K_cache; bn (read from B_Loc) is the cache block holding each position ----
+        bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
+                        ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                        mask=(start_n + offs_n) < cur_batch_ctx_len,
+                        other=0)
+        off_k = (bn[None, :] * stride_k_cache_bs +
+                    cur_kv_head * stride_k_cache_h +
+                    (offs_d[:, None] // x) * stride_k_cache_d +
+                    ((start_n + offs_n[None, :]) % block_size) *
+                    stride_k_cache_bl +
+                    (offs_d[:, None] % x) * stride_k_cache_x)
+        off_v = (
+            bn[:, None] * stride_v_cache_bs +
+            cur_kv_head * stride_v_cache_h +
+            offs_d[None, :] * stride_v_cache_d +
+            (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+        k = tl.load(K_cache + off_k,
+                    mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
+                    other=0.0)
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k)
+        qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
+                        float("-inf"))  # columns at or past cur_batch_ctx_len get -inf
+        qk *= sm_scale
+
+        # -- compute m_ij (row max of this tile), p (exp of the shifted scores), l_ij (row sum of p)
+        m_ij = tl.max(qk, 1)
+        p = tl.exp(qk - m_ij[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- merge this tile's max and sum into the running m_i and l_i
+        m_i_new = tl.maximum(m_i, m_ij)
+        alpha = tl.exp(m_i - m_i_new)
+        beta = tl.exp(m_ij - m_i_new)
+        l_i_new = alpha * l_i + beta * l_ij
+        # -- update output accumulator --
+        # scale p by beta / l_i_new so it is normalized by the merged sum
+        p_scale = beta / l_i_new
+        p = p * p_scale[:, None]
+        # rescale the previous acc from the old sum (l_i) to the merged sum (l_i_new)
+        acc_scale = l_i / l_i_new * alpha
+        acc = acc * acc_scale[:, None]
+        # load the V_cache tile and add its p-weighted contribution to acc
+        v = tl.load(V_cache + off_v,
+                    mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
+                    other=0.0)
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v)
+        # carry the merged statistics into the next iteration
+        l_i = l_i_new
+        m_i = m_i_new
+
+    off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
+                offs_d[:, None] * stride_kd)
+    off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
+                offs_d[None, :] * stride_vd)
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+
+    block_mask = tl.where(
+        block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)  # 0 gives the loop below an empty range
+
+    for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk against K, reading rows offset by cur_batch_in_all_start_index ----
+        k = tl.load(k_ptrs +
+                    (cur_batch_in_all_start_index + start_n) * stride_kbs,
+                    mask=(start_n + offs_n[None, :]) <
+                    cur_batch_seq_len - cur_batch_ctx_len,
+                    other=0.0)
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k)
+        qk *= sm_scale
+        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
+                        float("-inf"))  # keep only key columns at or before the query row index
+
+        # -- compute m_ij (row max of this tile), p (exp of the shifted scores), l_ij (row sum of p)
+        m_ij = tl.max(qk, 1)
+        p = tl.exp(qk - m_ij[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- merge this tile's max and sum into the running m_i and l_i
+        m_i_new = tl.maximum(m_i, m_ij)
+        alpha = tl.exp(m_i - m_i_new)
+        beta = tl.exp(m_ij - m_i_new)
+        l_i_new = alpha * l_i + beta * l_ij
+        # -- update output accumulator --
+        # scale p by beta / l_i_new so it is normalized by the merged sum
+        p_scale = beta / l_i_new
+        p = p * p_scale[:, None]
+        # rescale the previous acc from the old sum (l_i) to the merged sum (l_i_new)
+        acc_scale = l_i / l_i_new * alpha
+        acc = acc * acc_scale[:, None]
+        # load the V tile and add its p-weighted contribution to acc
+        v = tl.load(v_ptrs +
+                    (cur_batch_in_all_start_index + start_n) * stride_vbs,
+                    mask=(start_n + offs_n[:, None]) <
+                    cur_batch_seq_len - cur_batch_ctx_len,
+                    other=0.0)
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v)
+        # carry the merged statistics into the next iteration
+        l_i = l_i_new
+        m_i = m_i_new
+    # store acc to Out for the rows with offs_m < cur_batch_seq_len - cur_batch_ctx_len
+    off_o = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+        cur_head * stride_oh + offs_d[None, :] * stride_od)
+    out_ptrs = Out + off_o
+    tl.store(out_ptrs,
+                acc,
+                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+    return
+
+@triton.jit
+def _fwd_kernel_flash_attn_v2(
+    Q,
+    K,
+    V,
+    K_cache,
+    V_cache,
+    B_Loc,
+    sm_scale,
+    B_Start_Loc,
+    B_Seqlen,
+    B_Ctxlen,
+    block_size,
+    x,
+    Out,
+    stride_b_loc_b,
+    stride_b_loc_s,
+    stride_qbs,
+    stride_qh,
+    stride_qd,
+    stride_kbs,
+    stride_kh,
+    stride_kd,
+    stride_vbs,
+    stride_vh,
+    stride_vd,
+    stride_obs,
+    stride_oh,
+    stride_od,
+    stride_k_cache_bs,
+    stride_k_cache_h,
+    stride_k_cache_d,
+    stride_k_cache_bl,
+    stride_k_cache_x,
+    stride_v_cache_bs,
+    stride_v_cache_h,
+    stride_v_cache_d,
+    stride_v_cache_bl,
+    num_queries_per_kv: int,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    start_m = tl.program_id(2)
+
+    cur_kv_head = cur_head // num_queries_per_kv
+
+    cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+
+    block_start_loc = BLOCK_M * start_m
+
+    # initialize offsets
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    off_q = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+        cur_head * stride_qh + offs_d[None, :] * stride_qd)
+
+    q = tl.load(
+        Q + off_q,
+        mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+        other=0.0)
+
+    # # initialize pointer to m and l
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+
+    for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk ----
+        bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
+                        ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                        mask=(start_n + offs_n) < cur_batch_ctx_len,
+                        other=0)
+        off_k = (bn[None, :] * stride_k_cache_bs +
+                    cur_kv_head * stride_k_cache_h +
+                    (offs_d[:, None] // x) * stride_k_cache_d +
+                    ((start_n + offs_n[None, :]) % block_size) *
+                    stride_k_cache_bl +
+                    (offs_d[:, None] % x) * stride_k_cache_x)
+        off_v = (
+            bn[:, None] * stride_v_cache_bs +
+            cur_kv_head * stride_v_cache_h +
+            offs_d[None, :] * stride_v_cache_d +
+            (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+        k = tl.load(K_cache + off_k,
+                    mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
+                    other=0.0)
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k)
+        qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
+                        float("-inf"))
+        qk *= sm_scale
+
+        # -- compute m_ij, p, l_ij
+        m_ij = tl.max(qk, 1)
+        m_i_new = tl.maximum(m_i, m_ij)
+        p = tl.math.exp(qk - m_i_new[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- update m_i and l_i
+
+        alpha = tl.math.exp(m_i - m_i_new)
+        l_i_new = alpha * l_i + l_ij
+        # -- update output accumulator --
+        # scale p
+        # scale acc
+        acc_scale = alpha
+        # acc_scale = l_i / l_i_new * alpha
+        acc = acc * acc_scale[:, None]
+        # update acc
+        v = tl.load(V_cache + off_v,
+                    mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
+                    other=0.0)
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v)
+        # update m_i and l_i
+        l_i = l_i_new
+        m_i = m_i_new
+
+    off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
+                offs_d[:, None] * stride_kd)
+    off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
+                offs_d[None, :] * stride_vd)
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+
+    block_mask = tl.where(
+        block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
+
+    for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk ----
+        k = tl.load(k_ptrs +
+                    (cur_batch_in_all_start_index + start_n) * stride_kbs,
+                    mask=(start_n + offs_n[None, :]) <
+                    cur_batch_seq_len - cur_batch_ctx_len,
+                    other=0.0)
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k)
+        qk *= sm_scale
+        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
+                        float("-inf"))
+
+        # -- compute m_ij, p, l_ij
+        m_ij = tl.max(qk, 1)
+        m_i_new = tl.maximum(m_i, m_ij)
+        p = tl.math.exp(qk - m_i_new[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- update m_i and l_i
+
+        alpha = tl.math.exp(m_i - m_i_new)
+        l_i_new = alpha * l_i + l_ij
+        # -- update output accumulator --
+        # scale p
+        # scale acc
+        acc_scale = alpha
+        # acc_scale = l_i / l_i_new * alpha
+        acc = acc * acc_scale[:, None]
+        # update acc
+        v = tl.load(v_ptrs +
+                    (cur_batch_in_all_start_index + start_n) * stride_vbs,
+                    mask=(start_n + offs_n[:, None]) <
+                    cur_batch_seq_len - cur_batch_ctx_len,
+                    other=0.0)
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v)
+        # update m_i and l_i
+        l_i = l_i_new
+        m_i = m_i_new
+
+    # acc /= l_i[:, None]
+    # initialize pointers to output
+    off_o = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+        cur_head * stride_oh + offs_d[None, :] * stride_od)
+    out_ptrs = Out + off_o
+    tl.store(out_ptrs,
+                acc,
+                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+    return
+
+@triton.jit
+def _fwd_kernel_alibi(
+    Q,
+    K,
+    V,
+    K_cache,
+    V_cache,
+    B_Loc,
+    sm_scale,
+    B_Start_Loc,
+    B_Seqlen,
+    B_Ctxlen,
+    Alibi_slopes,
+    block_size,
+    x,
+    Out,
+    stride_b_loc_b,
+    stride_b_loc_s,
+    stride_qbs,
+    stride_qh,
+    stride_qd,
+    stride_kbs,
+    stride_kh,
+    stride_kd,
+    stride_vbs,
+    stride_vh,
+    stride_vd,
+    stride_obs,
+    stride_oh,
+    stride_od,
+    stride_k_cache_bs,
+    stride_k_cache_h,
+    stride_k_cache_d,
+    stride_k_cache_bl,
+    stride_k_cache_x,
+    stride_v_cache_bs,
+    stride_v_cache_h,
+    stride_v_cache_d,
+    stride_v_cache_bl,
+    num_queries_per_kv: int,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    # attn_bias[]
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    start_m = tl.program_id(2)
+
+    cur_kv_head = cur_head // num_queries_per_kv
+
+    # cur_batch_seq_len: the length of prompts
+    # cur_batch_ctx_len: the length of prefix
+    # cur_batch_in_all_start_index: the start id of the dim=0
+    cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+
+    block_start_loc = BLOCK_M * start_m
+
+    # initialize offsets
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    off_q = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+        cur_head * stride_qh + offs_d[None, :] * stride_qd)
+
+    q = tl.load(
+        Q + off_q,
+        mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+        other=0.0)
+
+    # # initialize pointer to m and l
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+
+    alibi_slope = tl.load(Alibi_slopes + cur_head)
+    alibi_start_q = tl.arange(
+        0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
+    alibi_start_k = 0
+    for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk ----
+        bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
+                        ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                        mask=(start_n + offs_n) < cur_batch_ctx_len,
+                        other=0)
+        off_k = (bn[None, :] * stride_k_cache_bs +
+                    cur_kv_head * stride_k_cache_h +
+                    (offs_d[:, None] // x) * stride_k_cache_d +
+                    ((start_n + offs_n[None, :]) % block_size) *
+                    stride_k_cache_bl +
+                    (offs_d[:, None] % x) * stride_k_cache_x)
+        off_v = (
+            bn[:, None] * stride_v_cache_bs +
+            cur_kv_head * stride_v_cache_h +
+            offs_d[None, :] * stride_v_cache_d +
+            (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+        k = tl.load(K_cache + off_k,
+                    mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
+                    other=0.0)
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k)
+        qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
+                        float("-inf"))
+        qk *= sm_scale
+
+        # load alibi
+        alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
+                    alibi_start_q[:, None]) * alibi_slope
+        alibi = tl.where(
+            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
+            alibi, float("-inf"))
+        qk += alibi
+        alibi_start_k += BLOCK_N
+
+        # -- compute m_ij, p, l_ij
+        m_ij = tl.max(qk, 1)
+        m_i_new = tl.maximum(m_i, m_ij)
+        p = tl.math.exp(qk - m_i_new[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- update m_i and l_i
+
+        alpha = tl.math.exp(m_i - m_i_new)
+        l_i_new = alpha * l_i + l_ij
+        # -- update output accumulator --
+        # scale p
+        # scale acc
+        acc_scale = alpha
+        # acc_scale = l_i / l_i_new * alpha
+        acc = acc * acc_scale[:, None]
+        # update acc
+        v = tl.load(V_cache + off_v,
+                    mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
+                    other=0.0)
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v, allow_tf32=False)
+        # update m_i and l_i
+        l_i = l_i_new
+        m_i = m_i_new
+
+    off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
+                offs_d[:, None] * stride_kd)
+    off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
+                offs_d[None, :] * stride_vd)
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+
+    block_mask = tl.where(
+        block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
+
+    # init alibi
+    alibi_slope = tl.load(Alibi_slopes + cur_head)
+    alibi_start_q = tl.arange(
+        0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
+    alibi_start_k = cur_batch_ctx_len
+    # # init debugger
+    # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc
+    # offset_db_k = tl.arange(0, BLOCK_N)
+    # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL]
+    for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk ----
+        k = tl.load(k_ptrs +
+                    (cur_batch_in_all_start_index + start_n) * stride_kbs,
+                    mask=(start_n + offs_n[None, :]) <
+                    cur_batch_seq_len - cur_batch_ctx_len,
+                    other=0.0)
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k, allow_tf32=False)
+        qk *= sm_scale
+        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
+                        float("-inf"))
+
+        # load alibi
+        alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
+                    alibi_start_q[:, None]) * alibi_slope
+        alibi = tl.where(
+            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
+            alibi, float("-inf"))
+        qk += alibi
+        alibi_start_k += BLOCK_N
+
+        # -- compute m_ij, p, l_ij
+        m_ij = tl.max(qk, 1)
+        m_i_new = tl.maximum(m_i, m_ij)
+        p = tl.math.exp(qk - m_i_new[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- update m_i and l_i
+
+        alpha = tl.math.exp(m_i - m_i_new)
+        l_i_new = alpha * l_i + l_ij
+        # -- update output accumulator --
+        # scale p
+        # scale acc
+        acc_scale = alpha
+        # acc_scale = l_i / l_i_new * alpha
+        acc = acc * acc_scale[:, None]
+        # update acc
+        v = tl.load(v_ptrs +
+                    (cur_batch_in_all_start_index + start_n) * stride_vbs,
+                    mask=(start_n + offs_n[:, None]) <
+                    cur_batch_seq_len - cur_batch_ctx_len,
+                    other=0.0)
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v, allow_tf32=False)
+        # update m_i and l_i
+        l_i = l_i_new
+        m_i = m_i_new
+
+    acc = acc / l_i[:, None]
+
+    # initialize pointers to output
+    off_o = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+        cur_head * stride_oh + offs_d[None, :] * stride_od)
+    out_ptrs = Out + off_o
+    tl.store(out_ptrs,
+                acc,
+                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+    return
+
+@torch.inference_mode()
+def context_attention_fwd(q,
+                            k,
+                            v,
+                            o,
+                            k_cache,
+                            v_cache,
+                            b_loc,
+                            b_start_loc,
+                            b_seq_len,
+                            b_ctx_len,
+                            max_input_len,
+                            alibi_slopes=None):
+
+    cap = torch.cuda.get_device_capability()
+    BLOCK = 128 if cap[0] >= 8 else 64
+    # shape constraints
+    Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
+    assert Lq == Lk and Lk == Lv
+    assert Lk in {16, 32, 64, 128}
+
+    sm_scale = 1.0 / (Lq**0.5)
+    batch, head = b_seq_len.shape[0], q.shape[1]
+    num_queries_per_kv = q.shape[1] // k.shape[1]
+
+    grid = (batch, head, triton.cdiv(max_input_len, BLOCK))  # batch, head,
+
+    num_warps = 8 if Lk <= 64 else 8
+    if alibi_slopes is not None:
+        _fwd_kernel_alibi[grid](
             q,
             k,
             v,
@@ -707,6 +655,7 @@ def context_attention_fwd(q,
             b_start_loc,
             b_seq_len,
             b_ctx_len,
+            alibi_slopes,
             v_cache.shape[3],
             8,
             o,
@@ -729,7 +678,8 @@ def context_attention_fwd(q,
             k_cache.stride(2),
             k_cache.stride(3),
             k_cache.stride(
-                4),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
+                4
+            ),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
             v_cache.stride(0),
             v_cache.stride(1),
             v_cache.stride(2),
@@ -743,3 +693,51 @@ def context_attention_fwd(q,
             num_stages=1,
         )
         return
+
+    _fwd_kernel[grid](
+        q,
+        k,
+        v,
+        k_cache,
+        v_cache,
+        b_loc,
+        sm_scale,
+        b_start_loc,
+        b_seq_len,
+        b_ctx_len,
+        v_cache.shape[3],
+        8,
+        o,
+        b_loc.stride(0),
+        b_loc.stride(1),
+        q.stride(0),
+        q.stride(1),
+        q.stride(2),
+        k.stride(0),
+        k.stride(1),
+        k.stride(2),
+        v.stride(0),
+        v.stride(1),
+        v.stride(2),
+        o.stride(0),
+        o.stride(1),
+        o.stride(2),
+        k_cache.stride(0),
+        k_cache.stride(1),
+        k_cache.stride(2),
+        k_cache.stride(3),
+        k_cache.stride(
+            4),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
+        v_cache.stride(0),
+        v_cache.stride(1),
+        v_cache.stride(2),
+        v_cache.stride(
+            3),  #[num_blocks, num_kv_heads, head_size, block_size]
+        num_queries_per_kv=num_queries_per_kv,
+        BLOCK_M=BLOCK,
+        BLOCK_DMODEL=Lk,
+        BLOCK_N=BLOCK,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+    return

From 145e9c9d808fa9c15029bf3f63ae0e68f80ea98b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= <chenxu2048@gmail.com>
Date: Wed, 6 Mar 2024 20:34:06 +0800
Subject: [PATCH 2/9] feat: prefix caching with fp8 kvcache

---
 vllm/model_executor/layers/attention.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index 2a82325b80213..bd0b2c39acf93 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -209,6 +209,9 @@ def forward(
             else:
                 # prefix-enabled attention
                 output = torch.empty_like(query)
+                if input_metadata.kv_cache_dtype == "fp8_e5m2":
+                    key_cache = key_cache.view(dtype=torch.float8_e5m2)
+                    value_cache = value_cache.view(dtype=torch.float8_e5m2)
                 context_attention_fwd(
                     query,
                     key,

From 9970b790d78d5bc1d222778881fb475ef8e79641 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= <chenxu2048@gmail.com>
Date: Wed, 6 Mar 2024 20:37:54 +0800
Subject: [PATCH 3/9] add triton version check

---
 .../layers/triton_kernel/prefix_prefill.py    | 219 +++++++++---------
 1 file changed, 104 insertions(+), 115 deletions(-)

diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
index 65f183a5c55e8..e28654c6d8598 100644
--- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
+++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
@@ -4,6 +4,11 @@
 import torch
 import triton
 import triton.language as tl
+import packaging
+
+assert packaging.version.parse(triton.__version__) >= packaging.version.parse(
+    "2.1.0"), "Triton version >= 2.1.0 is required."
+
 
 @triton.jit
 def _fwd_kernel(
@@ -64,14 +69,12 @@ def _fwd_kernel(
     offs_n = tl.arange(0, BLOCK_N)
     offs_d = tl.arange(0, BLOCK_DMODEL)
     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    off_q = (
-        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-        cur_head * stride_qh + offs_d[None, :] * stride_qd)
+    off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+             cur_head * stride_qh + offs_d[None, :] * stride_qd)
 
-    q = tl.load(
-        Q + off_q,
-        mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-        other=0.0)
+    q = tl.load(Q + off_q,
+                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+                other=0.0)
 
     # # initialize pointer to m and l
     m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -82,20 +85,18 @@ def _fwd_kernel(
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
         bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                        ((start_n + offs_n) // block_size) * stride_b_loc_s,
-                        mask=(start_n + offs_n) < cur_batch_ctx_len,
-                        other=0)
-        off_k = (bn[None, :] * stride_k_cache_bs +
-                    cur_kv_head * stride_k_cache_h +
-                    (offs_d[:, None] // x) * stride_k_cache_d +
-                    ((start_n + offs_n[None, :]) % block_size) *
-                    stride_k_cache_bl +
-                    (offs_d[:, None] % x) * stride_k_cache_x)
-        off_v = (
-            bn[:, None] * stride_v_cache_bs +
-            cur_kv_head * stride_v_cache_h +
-            offs_d[None, :] * stride_v_cache_d +
-            (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+                     ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                     mask=(start_n + offs_n) < cur_batch_ctx_len,
+                     other=0)
+        off_k = (
+            bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
+            (offs_d[:, None] // x) * stride_k_cache_d +
+            ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl +
+            (offs_d[:, None] % x) * stride_k_cache_x)
+        off_v = (bn[:, None] * stride_v_cache_bs +
+                 cur_kv_head * stride_v_cache_h +
+                 offs_d[None, :] * stride_v_cache_d +
+                 (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
         k = tl.load(K_cache + off_k,
                     mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
                     other=0.0)
@@ -103,7 +104,7 @@ def _fwd_kernel(
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
         qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
-                        float("-inf"))
+                      float("-inf"))
         qk *= sm_scale
 
         # -- compute m_ij, p, l_ij
@@ -134,9 +135,9 @@ def _fwd_kernel(
         m_i = m_i_new
 
     off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-                offs_d[:, None] * stride_kd)
+             offs_d[:, None] * stride_kd)
     off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-                offs_d[None, :] * stride_vd)
+             offs_d[None, :] * stride_vd)
     k_ptrs = K + off_k
     v_ptrs = V + off_v
 
@@ -156,7 +157,7 @@ def _fwd_kernel(
         qk += tl.dot(q, k)
         qk *= sm_scale
         qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                        float("-inf"))
+                      float("-inf"))
 
         # -- compute m_ij, p, l_ij
         m_ij = tl.max(qk, 1)
@@ -187,15 +188,15 @@ def _fwd_kernel(
         l_i = l_i_new
         m_i = m_i_new
     # initialize pointers to output
-    off_o = (
-        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-        cur_head * stride_oh + offs_d[None, :] * stride_od)
+    off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+             cur_head * stride_oh + offs_d[None, :] * stride_od)
     out_ptrs = Out + off_o
     tl.store(out_ptrs,
-                acc,
-                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+             acc,
+             mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
     return
 
+
 @triton.jit
 def _fwd_kernel_flash_attn_v2(
     Q,
@@ -255,14 +256,12 @@ def _fwd_kernel_flash_attn_v2(
     offs_n = tl.arange(0, BLOCK_N)
     offs_d = tl.arange(0, BLOCK_DMODEL)
     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    off_q = (
-        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-        cur_head * stride_qh + offs_d[None, :] * stride_qd)
+    off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+             cur_head * stride_qh + offs_d[None, :] * stride_qd)
 
-    q = tl.load(
-        Q + off_q,
-        mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-        other=0.0)
+    q = tl.load(Q + off_q,
+                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+                other=0.0)
 
     # # initialize pointer to m and l
     m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -273,20 +272,18 @@ def _fwd_kernel_flash_attn_v2(
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
         bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                        ((start_n + offs_n) // block_size) * stride_b_loc_s,
-                        mask=(start_n + offs_n) < cur_batch_ctx_len,
-                        other=0)
-        off_k = (bn[None, :] * stride_k_cache_bs +
-                    cur_kv_head * stride_k_cache_h +
-                    (offs_d[:, None] // x) * stride_k_cache_d +
-                    ((start_n + offs_n[None, :]) % block_size) *
-                    stride_k_cache_bl +
-                    (offs_d[:, None] % x) * stride_k_cache_x)
-        off_v = (
-            bn[:, None] * stride_v_cache_bs +
-            cur_kv_head * stride_v_cache_h +
-            offs_d[None, :] * stride_v_cache_d +
-            (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+                     ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                     mask=(start_n + offs_n) < cur_batch_ctx_len,
+                     other=0)
+        off_k = (
+            bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
+            (offs_d[:, None] // x) * stride_k_cache_d +
+            ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl +
+            (offs_d[:, None] % x) * stride_k_cache_x)
+        off_v = (bn[:, None] * stride_v_cache_bs +
+                 cur_kv_head * stride_v_cache_h +
+                 offs_d[None, :] * stride_v_cache_d +
+                 (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
         k = tl.load(K_cache + off_k,
                     mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
                     other=0.0)
@@ -294,7 +291,7 @@ def _fwd_kernel_flash_attn_v2(
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
         qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
-                        float("-inf"))
+                      float("-inf"))
         qk *= sm_scale
 
         # -- compute m_ij, p, l_ij
@@ -324,9 +321,9 @@ def _fwd_kernel_flash_attn_v2(
         m_i = m_i_new
 
     off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-                offs_d[:, None] * stride_kd)
+             offs_d[:, None] * stride_kd)
     off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-                offs_d[None, :] * stride_vd)
+             offs_d[None, :] * stride_vd)
     k_ptrs = K + off_k
     v_ptrs = V + off_v
 
@@ -346,7 +343,7 @@ def _fwd_kernel_flash_attn_v2(
         qk += tl.dot(q, k)
         qk *= sm_scale
         qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                        float("-inf"))
+                      float("-inf"))
 
         # -- compute m_ij, p, l_ij
         m_ij = tl.max(qk, 1)
@@ -378,15 +375,15 @@ def _fwd_kernel_flash_attn_v2(
 
     # acc /= l_i[:, None]
     # initialize pointers to output
-    off_o = (
-        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-        cur_head * stride_oh + offs_d[None, :] * stride_od)
+    off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+             cur_head * stride_oh + offs_d[None, :] * stride_od)
     out_ptrs = Out + off_o
     tl.store(out_ptrs,
-                acc,
-                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+             acc,
+             mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
     return
 
+
 @triton.jit
 def _fwd_kernel_alibi(
     Q,
@@ -451,14 +448,12 @@ def _fwd_kernel_alibi(
     offs_n = tl.arange(0, BLOCK_N)
     offs_d = tl.arange(0, BLOCK_DMODEL)
     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    off_q = (
-        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-        cur_head * stride_qh + offs_d[None, :] * stride_qd)
+    off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+             cur_head * stride_qh + offs_d[None, :] * stride_qd)
 
-    q = tl.load(
-        Q + off_q,
-        mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-        other=0.0)
+    q = tl.load(Q + off_q,
+                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+                other=0.0)
 
     # # initialize pointer to m and l
     m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -466,27 +461,24 @@ def _fwd_kernel_alibi(
     acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
 
     alibi_slope = tl.load(Alibi_slopes + cur_head)
-    alibi_start_q = tl.arange(
-        0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
+    alibi_start_q = tl.arange(0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
     alibi_start_k = 0
     for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
         bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                        ((start_n + offs_n) // block_size) * stride_b_loc_s,
-                        mask=(start_n + offs_n) < cur_batch_ctx_len,
-                        other=0)
-        off_k = (bn[None, :] * stride_k_cache_bs +
-                    cur_kv_head * stride_k_cache_h +
-                    (offs_d[:, None] // x) * stride_k_cache_d +
-                    ((start_n + offs_n[None, :]) % block_size) *
-                    stride_k_cache_bl +
-                    (offs_d[:, None] % x) * stride_k_cache_x)
-        off_v = (
-            bn[:, None] * stride_v_cache_bs +
-            cur_kv_head * stride_v_cache_h +
-            offs_d[None, :] * stride_v_cache_d +
-            (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+                     ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                     mask=(start_n + offs_n) < cur_batch_ctx_len,
+                     other=0)
+        off_k = (
+            bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
+            (offs_d[:, None] // x) * stride_k_cache_d +
+            ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl +
+            (offs_d[:, None] % x) * stride_k_cache_x)
+        off_v = (bn[:, None] * stride_v_cache_bs +
+                 cur_kv_head * stride_v_cache_h +
+                 offs_d[None, :] * stride_v_cache_d +
+                 (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
         k = tl.load(K_cache + off_k,
                     mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
                     other=0.0)
@@ -494,15 +486,15 @@ def _fwd_kernel_alibi(
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
         qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
-                        float("-inf"))
+                      float("-inf"))
         qk *= sm_scale
 
         # load alibi
         alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
-                    alibi_start_q[:, None]) * alibi_slope
+                 alibi_start_q[:, None]) * alibi_slope
         alibi = tl.where(
-            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
-            alibi, float("-inf"))
+            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), alibi,
+            float("-inf"))
         qk += alibi
         alibi_start_k += BLOCK_N
 
@@ -533,9 +525,9 @@ def _fwd_kernel_alibi(
         m_i = m_i_new
 
     off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-                offs_d[:, None] * stride_kd)
+             offs_d[:, None] * stride_kd)
     off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-                offs_d[None, :] * stride_vd)
+             offs_d[None, :] * stride_vd)
     k_ptrs = K + off_k
     v_ptrs = V + off_v
 
@@ -544,8 +536,7 @@ def _fwd_kernel_alibi(
 
     # init alibi
     alibi_slope = tl.load(Alibi_slopes + cur_head)
-    alibi_start_q = tl.arange(
-        0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
+    alibi_start_q = tl.arange(0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
     alibi_start_k = cur_batch_ctx_len
     # # init debugger
     # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc
@@ -564,14 +555,14 @@ def _fwd_kernel_alibi(
         qk += tl.dot(q, k, allow_tf32=False)
         qk *= sm_scale
         qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                        float("-inf"))
+                      float("-inf"))
 
         # load alibi
         alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
-                    alibi_start_q[:, None]) * alibi_slope
+                 alibi_start_q[:, None]) * alibi_slope
         alibi = tl.where(
-            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
-            alibi, float("-inf"))
+            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), alibi,
+            float("-inf"))
         qk += alibi
         alibi_start_k += BLOCK_N
 
@@ -606,28 +597,28 @@ def _fwd_kernel_alibi(
     acc = acc / l_i[:, None]
 
     # initialize pointers to output
-    off_o = (
-        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-        cur_head * stride_oh + offs_d[None, :] * stride_od)
+    off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+             cur_head * stride_oh + offs_d[None, :] * stride_od)
     out_ptrs = Out + off_o
     tl.store(out_ptrs,
-                acc,
-                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+             acc,
+             mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
     return
 
+
 @torch.inference_mode()
 def context_attention_fwd(q,
-                            k,
-                            v,
-                            o,
-                            k_cache,
-                            v_cache,
-                            b_loc,
-                            b_start_loc,
-                            b_seq_len,
-                            b_ctx_len,
-                            max_input_len,
-                            alibi_slopes=None):
+                          k,
+                          v,
+                          o,
+                          k_cache,
+                          v_cache,
+                          b_loc,
+                          b_start_loc,
+                          b_seq_len,
+                          b_ctx_len,
+                          max_input_len,
+                          alibi_slopes=None):
 
     cap = torch.cuda.get_device_capability()
     BLOCK = 128 if cap[0] >= 8 else 64
@@ -678,8 +669,7 @@ def context_attention_fwd(q,
             k_cache.stride(2),
             k_cache.stride(3),
             k_cache.stride(
-                4
-            ),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
+                4),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
             v_cache.stride(0),
             v_cache.stride(1),
             v_cache.stride(2),
@@ -731,8 +721,7 @@ def context_attention_fwd(q,
         v_cache.stride(0),
         v_cache.stride(1),
         v_cache.stride(2),
-        v_cache.stride(
-            3),  #[num_blocks, num_kv_heads, head_size, block_size]
+        v_cache.stride(3),  #[num_blocks, num_kv_heads, head_size, block_size]
         num_queries_per_kv=num_queries_per_kv,
         BLOCK_M=BLOCK,
         BLOCK_DMODEL=Lk,

From 5d31fe716fe66d3f64abcf1e0d165bb15093a7a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= <chenxu2048@gmail.com>
Date: Thu, 7 Mar 2024 16:22:26 +0800
Subject: [PATCH 4/9] update triton version to 2.2.0

---
 requirements.txt                                           | 2 +-
 vllm/model_executor/layers/triton_kernel/prefix_prefill.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 05ec2e804e13b..6b46f778f57fb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,6 @@ uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
 prometheus_client >= 0.18.0
 pynvml == 11.5.0
-triton >= 2.1.0
+triton >= 2.2.0
 outlines >= 0.0.27
 cupy-cuda12x == 12.1.0  # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead.
diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
index e28654c6d8598..d76f2ad75588f 100644
--- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
+++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
@@ -7,8 +7,7 @@
 import packaging
 
 assert packaging.version.parse(triton.__version__) >= packaging.version.parse(
-    "2.1.0"), "Triton version >= 2.1.0 is required."
-
+    "2.2.0"), "Triton version >= 2.2.0 is required."
 
 @triton.jit
 def _fwd_kernel(
@@ -99,7 +98,7 @@ def _fwd_kernel(
                  (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
         k = tl.load(K_cache + off_k,
                     mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
-                    other=0.0)
+                    other=0.0).to(q.dtype)
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
@@ -126,7 +125,7 @@ def _fwd_kernel(
         # update acc
         v = tl.load(V_cache + off_v,
                     mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
-                    other=0.0)
+                    other=0.0).to(k.dtype)
 
         p = p.to(v.dtype)
         acc += tl.dot(p, v)

From 22eddd882f45e36e088bedfd95a8f1f99e18ff61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= <chenxu2048@gmail.com>
Date: Thu, 7 Mar 2024 16:50:27 +0800
Subject: [PATCH 5/9] format code

---
 vllm/model_executor/layers/triton_kernel/prefix_prefill.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
index d76f2ad75588f..7caacdfe4ae48 100644
--- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
+++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
@@ -9,6 +9,7 @@
 assert packaging.version.parse(triton.__version__) >= packaging.version.parse(
     "2.2.0"), "Triton version >= 2.2.0 is required."
 
+
 @triton.jit
 def _fwd_kernel(
     Q,

From d2a3036bb8dd1606a230b9d22b223b2f61f09eeb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= <chenxu2048@gmail.com>
Date: Thu, 7 Mar 2024 16:53:30 +0800
Subject: [PATCH 6/9] convert to fp16 in _fwd_kernel_alibi

---
 vllm/model_executor/layers/triton_kernel/prefix_prefill.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
index 7caacdfe4ae48..232cf026d37a3 100644
--- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
+++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
@@ -481,7 +481,7 @@ def _fwd_kernel_alibi(
                  (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
         k = tl.load(K_cache + off_k,
                     mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
-                    other=0.0)
+                    other=0.0).to(q.dtype)
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
@@ -516,7 +516,7 @@ def _fwd_kernel_alibi(
         # update acc
         v = tl.load(V_cache + off_v,
                     mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
-                    other=0.0)
+                    other=0.0).to(q.dtype)
 
         p = p.to(v.dtype)
         acc += tl.dot(p, v, allow_tf32=False)

From 82239781fbd91c851e8eab47a9575563517d55e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= <chenxu2048@gmail.com>
Date: Mon, 11 Mar 2024 12:59:50 +0800
Subject: [PATCH 7/9] Align to Q type

---
 vllm/model_executor/layers/attention/ops/prefix_prefill.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/attention/ops/prefix_prefill.py b/vllm/model_executor/layers/attention/ops/prefix_prefill.py
index 232cf026d37a3..c50c34e076a7c 100644
--- a/vllm/model_executor/layers/attention/ops/prefix_prefill.py
+++ b/vllm/model_executor/layers/attention/ops/prefix_prefill.py
@@ -126,7 +126,7 @@ def _fwd_kernel(
         # update acc
         v = tl.load(V_cache + off_v,
                     mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
-                    other=0.0).to(k.dtype)
+                    other=0.0).to(q.dtype)
 
         p = p.to(v.dtype)
         acc += tl.dot(p, v)
@@ -286,7 +286,7 @@ def _fwd_kernel_flash_attn_v2(
                  (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
         k = tl.load(K_cache + off_k,
                     mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
-                    other=0.0)
+                    other=0.0).to(q.dtype)
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
@@ -312,7 +312,7 @@ def _fwd_kernel_flash_attn_v2(
         # update acc
         v = tl.load(V_cache + off_v,
                     mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
-                    other=0.0)
+                    other=0.0).to(q.dtype)
 
         p = p.to(v.dtype)
         acc += tl.dot(p, v)

From 8ecc90d383e3360f9912eb077f3666479b5670a9 Mon Sep 17 00:00:00 2001
From: zhaoyang-star <zhaoyangstar@foxmail.com>
Date: Wed, 13 Mar 2024 07:38:19 +0000
Subject: [PATCH 8/9] Use torch.float8_e5m2 instead of torch.uint8 in Python
 interface

---
 csrc/dispatch_utils.h                                  | 3 ++-
 tests/kernels/test_cache.py                            | 8 ++++++++
 vllm/model_executor/layers/attention/ops/paged_attn.py | 4 ----
 vllm/utils.py                                          | 4 ++--
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h
index 91abd9e85b4bb..d754fc46a0167 100644
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -19,12 +19,13 @@
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)      \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)       \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)\
   AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
 
 #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...)           \
   AT_DISPATCH_SWITCH(                                                    \
     TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
-    
+
 #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...)             \
   AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)      \
   AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)      \
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index d8dc74bc7b003..0c56052c39213 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -92,9 +92,17 @@ def test_copy_blocks(
 
     # Compare the results.
     for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
+        # NOTE: torch.allclose does not yet support the
+        # torch.float8_e5m2/torch.float8_e4m3fn dtypes.
+        if kv_cache_dtype == "fp8_e5m2":
+            key_cache = key_cache.view(torch.half)
+            cloned_key_cache = cloned_key_cache.view(torch.half)
         assert torch.allclose(key_cache, cloned_key_cache)
     for value_cache, cloned_value_cache in zip(value_caches,
                                                cloned_value_caches):
+        if kv_cache_dtype == "fp8_e5m2":
+            value_cache = value_cache.view(torch.half)
+            cloned_value_cache = cloned_value_cache.view(torch.half)
         assert torch.allclose(value_cache, cloned_value_cache)
 
 
diff --git a/vllm/model_executor/layers/attention/ops/paged_attn.py b/vllm/model_executor/layers/attention/ops/paged_attn.py
index 7e2939c25d21a..c5a9618c2395b 100644
--- a/vllm/model_executor/layers/attention/ops/paged_attn.py
+++ b/vllm/model_executor/layers/attention/ops/paged_attn.py
@@ -121,10 +121,6 @@ def forward_prefix(
         alibi_slopes: Optional[torch.Tensor],
     ) -> torch.Tensor:
         output = torch.empty_like(query)
-        if input_metadata.kv_cache_dtype == "fp8_e5m2":
-            # Convert cache from uint8 to float8_e5m2.
-            key_cache = key_cache.view(dtype=torch.float8_e5m2)
-            value_cache = value_cache.view(dtype=torch.float8_e5m2)
         context_attention_fwd(
             query,
             key,
diff --git a/vllm/utils.py b/vllm/utils.py
index fe6fd27962cd3..7ba7a16b1e7da 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -29,7 +29,7 @@
     "half": torch.half,
     "bfloat16": torch.bfloat16,
     "float": torch.float,
-    "fp8_e5m2": torch.uint8,
+    "fp8_e5m2": torch.float8_e5m2,
 }
 
 
@@ -270,7 +270,7 @@ def create_kv_caches_with_random(
         elif cache_dtype in ["half", "bfloat16", "float"]:
             torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
         elif cache_dtype == "fp8_e5m2":
-            torch_dtype = torch.uint8
+            torch_dtype = torch.float8_e5m2
         else:
             raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
     elif isinstance(cache_dtype, torch.dtype):

From 5afb543025dd5e040d0d5b3eb5c5e979fbbc5d81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= <chenxu2048@gmail.com>
Date: Wed, 13 Mar 2024 16:22:06 +0800
Subject: [PATCH 9/9] Use torch.float8_e5m2 as fp8 KV cache dtype

---
 vllm/utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index 7ba7a16b1e7da..29a8151f64889 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -267,10 +267,8 @@ def create_kv_caches_with_random(
                 torch_dtype = model_dtype
             else:
                 raise ValueError(f"Invalid model dtype: {model_dtype}")
-        elif cache_dtype in ["half", "bfloat16", "float"]:
+        elif cache_dtype in STR_DTYPE_TO_TORCH_DTYPE:
             torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
-        elif cache_dtype == "fp8_e5m2":
-            torch_dtype = torch.float8_e5m2
         else:
             raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
     elif isinstance(cache_dtype, torch.dtype):