diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0f920c7ec1442..f7c1569696249 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -15,10 +15,8 @@ steps: commands: - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py - - VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_basic_correctness.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_chunked_prefill.py - label: Core Test command: pytest -v -s core diff --git a/Dockerfile.rocm b/Dockerfile.rocm index b1c5fac9d78ef..3f84b949481d1 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -14,7 +14,7 @@ RUN echo "Base image is $BASE_IMAGE" ARG FA_GFX_ARCHS="gfx90a;gfx942" RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" -ARG FA_BRANCH="3d2b6f5" +ARG FA_BRANCH="ae7928c" RUN echo "FA_BRANCH is $FA_BRANCH" # whether to build flash-attention @@ -92,13 +92,10 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \ COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip numba -RUN python3 -m pip install xformers==0.0.23 --no-deps RUN cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ - && if [ "$BUILD_FA" = "1" ]; then \ - bash patch_xformers.rocm.sh; fi \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \ && python3 setup.py install \ && cd .. diff --git a/patch_xformers.rocm.sh b/patch_xformers.rocm.sh deleted file mode 100644 index de427b24d306f..0000000000000 --- a/patch_xformers.rocm.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -e - -XFORMERS_VERSION="0.0.23" - -export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)') - -if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then - echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed" - exit 1 -fi - -export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)') -export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)') - -echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}" -echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}" - -if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then - echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}" - patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch" - echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}" -else - echo "${XFORMERS_FMHA_FLASH_PATH} was patched before" -fi - -if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then - echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}" - patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch" - echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}" -else - echo "${XFORMERS_FMHA_COMMON_PATH} was patched before" -fi diff --git a/rocm_patch/commonpy_xformers-0.0.23.rocm.patch b/rocm_patch/commonpy_xformers-0.0.23.rocm.patch deleted file mode 100644 index 4d7495cf13e1d..0000000000000 --- a/rocm_patch/commonpy_xformers-0.0.23.rocm.patch +++ /dev/null @@ -1,13 +0,0 @@ ---- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py 2023-11-29 03:17:03.930103539 +0000 -+++ common.py 2023-11-28 16:14:19.846233146 +0000 -@@ -298,8 +298,8 @@ - dtype = d.query.dtype - if device_type not in cls.SUPPORTED_DEVICES: - reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})") -- if device_type == "cuda" and not _built_with_cuda: -- reasons.append("xFormers wasn't build with CUDA support") -+ #if device_type == "cuda" and not _built_with_cuda: -+ # reasons.append("xFormers wasn't build with CUDA support") - if device_type == "cuda": - device_capability = torch.cuda.get_device_capability(d.device) - if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY: diff --git a/rocm_patch/flashpy_xformers-0.0.23.rocm.patch b/rocm_patch/flashpy_xformers-0.0.23.rocm.patch deleted file mode 100644 index ac846728a7a91..0000000000000 --- a/rocm_patch/flashpy_xformers-0.0.23.rocm.patch +++ /dev/null @@ -1,152 +0,0 @@ ---- flash_ori.py 2023-12-13 05:43:31.530752623 +0000 -+++ flash_patch.py 2023-12-13 06:00:45.962403104 +0000 -@@ -36,44 +36,44 @@ - - FLASH_VERSION = "0.0.0" - try: -- try: -- from ... import _C_flashattention # type: ignore[attr-defined] -- from ..._cpp_lib import _build_metadata -- -- if _build_metadata is not None: -- FLASH_VERSION = _build_metadata.flash_version -- except ImportError: -- import flash_attn -- from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention -- -- FLASH_VERSION = flash_attn.__version__ -- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3]) -- if ( -- flash_ver_parsed != (2, 3, 6) -- and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1" -- ): -- raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api") -+ #try: -+ # from ... import _C_flashattention # type: ignore[attr-defined] -+ # from ..._cpp_lib import _build_metadata -+ -+ # if _build_metadata is not None: -+ # FLASH_VERSION = _build_metadata.flash_version -+ #except ImportError: -+ import flash_attn -+ from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention -+ -+ FLASH_VERSION = flash_attn.__version__ -+ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3]) -+ # if ( -+ # flash_ver_parsed != (2, 3, 6) -+ # and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1" -+ # ): -+ # raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api") - - # create library so that flash-attn goes through the PyTorch Dispatcher -- _flash_lib = torch.library.Library("xformers_flash", "DEF") -- -- _flash_lib.define( -- "flash_fwd(Tensor query, Tensor key, Tensor value, " -- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, " -- "int max_seqlen_q, int max_seqlen_k, " -- "float p, float softmax_scale, " -- "bool is_causal, int window_left, " -- "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)" -- ) -+ #_flash_lib = torch.library.Library("xformers_flash", "DEF") - -- _flash_lib.define( -- "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " -- "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " -- "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " -- "int max_seqlen_q, int max_seqlen_k, " -- "float p, float softmax_scale, bool is_causal, " -- "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)" -- ) -+ #_flash_lib.define( -+ # "flash_fwd(Tensor query, Tensor key, Tensor value, " -+ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, " -+ # "int max_seqlen_q, int max_seqlen_k, " -+ # "float p, float softmax_scale, " -+ # "bool is_causal, int window_left, " -+ # "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)" -+ #) -+ -+ #_flash_lib.define( -+ # "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " -+ # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " -+ # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " -+ # "int max_seqlen_q, int max_seqlen_k, " -+ # "float p, float softmax_scale, bool is_causal, " -+ # "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)" -+ #) - - def _flash_fwd( - query, -@@ -111,8 +111,8 @@ - p, - softmax_scale, - is_causal, -- window_left, # window_size_left -- window_right, # window_size_right -+ # window_left, # window_size_left -+ # window_right, # window_size_right - return_softmax, - None, # rng - ) -@@ -134,15 +134,15 @@ - out, - cu_seq_lens_q, - cu_seq_lens_k, -- seqused_k, -+ # seqused_k, - max_seq_len_q, - max_seq_len_k, - p, - softmax_scale, - False, - is_causal, -- window_left, -- window_right, -+ # window_left, -+ # window_right, - return_softmax, - None, - ) -@@ -184,8 +184,8 @@ - p, - softmax_scale, - is_causal, -- window_left, -- window_right, -+ # window_left, -+ # window_right, - None, - rng_state, - ) -@@ -208,15 +208,15 @@ - softmax_scale, - False, # zero_tensors - is_causal, -- window_left, -- window_right, -+ # window_left, -+ # window_right, - None, - rng_state, - ) - return dq, dk, dv - -- _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA") -- _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA") -+ #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA") -+ #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA") - except ImportError: - pass - -@@ -400,7 +400,7 @@ - implementation. - """ - -- OPERATOR = get_operator("xformers_flash", "flash_fwd") -+ OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd") - SUPPORTED_DEVICES: Set[str] = {"cuda"} - CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0) - SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16} diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index c42660fb8f74f..dbaa71fd16add 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -154,25 +154,30 @@ def __init__( f"Head size {head_size} is not supported by PagedAttention. " f"Supported head sizes are: {suppored_head_sizes}.") - self.use_naive_attn = torch.cuda.get_device_capability()[0] != 9 + self.use_naive_attn = False # NOTE: Allow for switching between Triton and CK. Defaulting to triton. self.use_triton_flash_attn = (os.environ.get( "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")) - if self.use_naive_attn: - # AMD Radeon 7900 series (gfx1100) currently does not support - # xFormers nor FlashAttention. As a temporary workaround, we use - # naive PyTorch implementation of attention. - self.attn_fuc = _naive_attention - logger.debug("Using naive attention in ROCmBackend") - elif self.use_triton_flash_attn: + if self.use_triton_flash_attn: from vllm.attention.ops.triton_flash_attention import ( # noqa: F401 triton_attention) self.attn_func = triton_attention logger.debug("Using Triton FA in ROCmBackend") else: - from flash_attn import flash_attn_varlen_func # noqa: F401 - self.attn_func = flash_attn_varlen_func - logger.debug("Using CK FA in ROCmBackend") + # if not using triton, navi3x not use flash-attn either + if torch.cuda.get_device_capability()[0] == 11: + self.use_naive_attn = True + else: + try: + from flash_attn import flash_attn_varlen_func # noqa: F401 + self.attn_func = flash_attn_varlen_func + logger.debug("Using CK FA in ROCmBackend") + except ModuleNotFoundError: + self.use_naive_attn = True + + if self.use_naive_attn: + self.attn_func = _naive_attention + logger.debug("Using naive attention in ROCmBackend") def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: """torch.repeat_interleave(x, dim=1, repeats=n_rep)""" @@ -247,13 +252,13 @@ def forward( # triton attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. - if self.use_naive_attn or self.use_triton_flash_attn: + if self.use_triton_flash_attn or self.use_naive_attn: if self.num_kv_heads != self.num_heads: # Interleave for MQA workaround. key = self.repeat_kv(key, self.num_queries_per_kv) value = self.repeat_kv(value, self.num_queries_per_kv) if self.use_naive_attn: - out = self.attn_fuc( + out = self.attn_func( query, key, value,