From f7436ac26590c73f4f543c4d6f314fd76225f766 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Date: Fri, 4 Oct 2024 05:54:02 +0000
Subject: [PATCH] set VLLM_GPU_ARCHES before flash attention in the cuda case

---
 CMakeLists.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a6c1fb14b2a9..7b24c4abc650e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -482,6 +482,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
   return()
 endif ()
 
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in CMake syntax (75-real, 89-virtual, etc.). Since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per-file basis),
+# we need to set VLLM_GPU_ARCHES manually here.
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(_REAL_ARCHES ${CUDA_ARCHS})
+  list(TRANSFORM _REAL_ARCHES REPLACE "\\." "")  # drop the dot: 8.9 -> 89
+  list(TRANSFORM _REAL_ARCHES APPEND "-real")    # add suffix: 89 -> 89-real
+  list(APPEND VLLM_GPU_ARCHES ${_REAL_ARCHES})
+endif()
+
 #
 # Build vLLM flash attention from source
 #