From f7436ac26590c73f4f543c4d6f314fd76225f766 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Fri, 4 Oct 2024 05:54:02 +0000
Subject: [PATCH] set VLLM_GPU_ARCHES before flash attention in the cuda case

---
 CMakeLists.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a6c1fb14b2a9..7b24c4abc650e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -482,6 +482,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
   return()
 endif ()
 
+# vLLM flash attention expects VLLM_GPU_ARCHES to list the target arches in
+# CMake syntax (75-real, 89-virtual, etc). In the CUDA case the arches are
+# cleared and gencodes are set on a per file basis instead, so populate it
+# here: each CUDA_ARCHS entry with its "." removed and "-real" appended.
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  foreach(_cuda_arch ${CUDA_ARCHS})
+    string(REPLACE "." "" _cuda_arch "${_cuda_arch}")
+    list(APPEND VLLM_GPU_ARCHES "${_cuda_arch}-real")
+  endforeach()
+endif()
+
 #
 # Build vLLM flash attention from source
 #