Skip to content

Commit

Permalink
refine
Browse files Browse the repository at this point in the history
  • Loading branch information
jzhou committed Nov 19, 2024
1 parent ae62c82 commit 1b0e54b
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 2 deletions.
4 changes: 2 additions & 2 deletions vllm/model_executor/models/glm4_vision_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
RowParallelLinear)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.utils import is_navi3


class PatchEmbedding(nn.Module):
Expand Down Expand Up @@ -80,8 +81,7 @@ def __init__(
self.output_dropout = torch.nn.Dropout(config.dropout_prob)

def forward(self, x: torch.Tensor) -> torch.Tensor:
_ON_NAVI3 = "gfx11" in torch.cuda.get_device_properties("cuda").gcnArchName
if _ON_NAVI3:
if is_navi3():
try:
# git clone -b howiejay/navi_support https://github.com/ROCm/flash-attention.git
from flash_attn import flash_attn_func
Expand Down
8 changes: 8 additions & 0 deletions vllm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1641,6 +1641,14 @@ def is_navi() -> bool:
archName = torch.cuda.get_device_properties('cuda').gcnArchName
return archName is not None and "gfx1" in archName

@lru_cache(maxsize=None)
def is_navi3() -> bool:
    """Return True when running on an AMD Navi3 (RDNA3, ``gfx11*``) GPU.

    Cached for the process lifetime: the visible device set does not
    change at runtime, so the device-property query only runs once.
    """
    if not current_platform.is_rocm() or not torch.cuda.is_available():
        return False
    # NOTE(review): like is_navi(), this checks only device 0 and assumes
    # all visible GPUs are of the same architecture — confirm for
    # heterogeneous multi-GPU setups.
    arch_name = torch.cuda.get_device_properties('cuda').gcnArchName
    return arch_name is not None and "gfx11" in arch_name

def weak_ref_tensors(
tensors: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]]
Expand Down

0 comments on commit 1b0e54b

Please sign in to comment.