[ROCm][Bugfix] Fixed several bugs related to rccl path and attention selector logic #3699

Merged 7 commits on Mar 29, 2024
2 changes: 1 addition & 1 deletion Dockerfile.rocm
@@ -90,6 +90,6 @@ RUN cd /app \
    && cd ..

RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]
+RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3

CMD ["/bin/bash"]
2 changes: 1 addition & 1 deletion requirements-rocm.txt
@@ -3,7 +3,7 @@ ninja # For faster builds.
typing-extensions>=4.8.0
starlette
psutil
-ray >= 2.9
+ray == 2.9.3
sentencepiece # Required for LLaMA tokenizer.
numpy
tokenizers>=0.15.0
4 changes: 2 additions & 2 deletions vllm/attention/backends/xformers.py
@@ -405,8 +405,8 @@ def _check_use_naive_attention() -> bool:
    if not is_hip():
        return False
    # For ROCm, check whether flash attention is installed or not.
-   has_flash_attn = importlib.util.find_spec("flash_attn") is None
-   if not has_flash_attn:
+   use_naive_attention = importlib.util.find_spec("flash_attn") is None
+   if use_naive_attention:
        logger.warning("flash_attn is not installed. Using naive attention. "
                       "This will take significantly more GPU memory.")
        return True
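The change is more than a variable rename: importlib.util.find_spec returns None when a module cannot be found, so the old "has_flash_attn = ... is None" assignment was inverted, and the function warned and chose naive attention precisely when flash_attn was installed. A minimal standalone sketch of the corrected check (the helper name below is illustrative, not part of vLLM):

import importlib.util

def should_use_naive_attention() -> bool:
    # find_spec returns None when the module cannot be found,
    # so a None result means flash_attn is NOT installed.
    return importlib.util.find_spec("flash_attn") is None

if should_use_naive_attention():
    print("flash_attn not found; falling back to naive attention")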
40 changes: 40 additions & 0 deletions vllm/model_executor/parallel_utils/find_lib.py
@@ -0,0 +1,40 @@
import logging
import os
import re
import subprocess

import torch

logger = logging.getLogger(__name__)


def get_library_path(library_name):
    # Robust way to find the library path from the torch installation.
    # Hard-coding a library path is error prone.
    try:
        torch_dir = os.path.dirname(torch.__file__)
        torch_path = os.path.join(torch_dir, "lib", "libtorch.so")

        # ldd lists the shared objects that libtorch.so links against.
        result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path],
                                capture_output=True, text=True)
        if result.returncode == 0:
            output_lines = result.stdout.split("\n")
            for line in output_lines:
                if library_name in line:
                    match = re.search(r'=>\s*(\S+)', line)
                    if match:
                        library_path = match.group(1)
                        return library_path
        else:
            logger.error(f"PyTorch is not installed properly. {result.stderr}")
    except Exception as e:
        logger.error(f"Error finding library path: {e}")
    return None


# You can test this module directly.
if __name__ == "__main__":
    # This works for librccl.so, librccl.so.1, etc.
    rccl_path = get_library_path("librccl.so")
    if rccl_path:
        print(f"location is {rccl_path}")
    else:
        print("librccl.so not found")
11 changes: 9 additions & 2 deletions vllm/model_executor/parallel_utils/pynccl.py
@@ -28,10 +28,17 @@
import torch
import torch.distributed as dist
from torch.distributed import ReduceOp
+from vllm.model_executor.parallel_utils.find_lib import get_library_path
+from vllm.utils import is_hip

logger = logging.getLogger(__name__)

-so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
+if is_hip():
+    # A robust way to get the path of librccl, whether it is librccl.so or librccl.so.1.
+    so_file = get_library_path("librccl.so")
+else:
+    so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")

# manually load the nccl library
if so_file:
@@ -41,7 +48,7 @@
    if torch.version.cuda is not None:
        so_file = "libnccl.so.2"
    elif torch.version.hip is not None:
-        so_file = "librccl.so.2"
+        so_file = "librccl.so.1"
Member

I looked at https://rocm.docs.amd.com/projects/rccl/en/latest/api.html, and it says the current version is 2.18.3. Quite strange that the library name is librccl.so.1.

Collaborator Author

That is why I am not assuming what the suffix is.

Member

Can you ask the rccl team why this is the case? If they keep librccl.so.1 that would also be fine, but please don't let it be too random.

Collaborator Author

My initial test with the current head is that it does not work for ROCm. There are a bunch of other issues in addition to the ones described in this pull request.

We have tested using cupy and verified that it worked for the hipgraph path with our in-development newer ROCm.

However, this does not work for us.

Another question: would it be possible to still opt in to using cupy for all-reduce? Can it be abstracted so that people can choose cupy, nccl, or whatever else?

Collaborator Author

As for how the rccl .so file name and its version are defined: I found the information in the ROCm/rccl repo. Links below:

https://github.com/ROCm/rccl/blob/2f6d59e2e651914d9d6e51b2b702b9a9ac0ea99d/makefiles/version.mk#L2
and
https://github.com/ROCm/rccl/blob/2f6d59e2e651914d9d6e51b2b702b9a9ac0ea99d/CMakeLists.txt#L669C1-L669C19

Hope this answers your question. Let's take a step back: we want to solve the problem of cudagraph mode.
My understanding is that the possible approaches are:

  • cupy
  • user-defined nccl/rccl
  • custom all reduce
  • pytorch native all-reduce

How can we easily choose one over the other, and what is our long-term plan?

Member

cupy is deprecated and removed now, because we got many bug reports with regard to cupy.

pytorch native all-reduce is not available in cudagraph mode, because it usually contains some additional checks that will fail graph capture.

Going forward, we will focus on the pynccl wrapper as the first choice, and custom all reduce as a backup plan (it is disabled by default because of instability).
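To make the "how do we choose" question concrete, here is one hypothetical way an all-reduce backend could be selected behind a single entry point; the registry, function names, and environment variable are invented for this sketch and are not vLLM's actual API:

import os
from typing import Callable, Dict

import torch

# Hypothetical registry mapping a backend name to an all-reduce callable.
# "pynccl" would wrap NCCL/RCCL; "custom" would wrap the custom kernel.
_ALL_REDUCE_BACKENDS: Dict[str, Callable[[torch.Tensor], torch.Tensor]] = {}

def register_backend(name: str):
    def decorator(fn: Callable[[torch.Tensor], torch.Tensor]):
        _ALL_REDUCE_BACKENDS[name] = fn
        return fn
    return decorator

@register_backend("pytorch")
def _torch_all_reduce(t: torch.Tensor) -> torch.Tensor:
    # Placeholder: in a single process there is nothing to reduce.
    return t

def all_reduce(t: torch.Tensor) -> torch.Tensor:
    # The environment variable name here is invented for this sketch.
    name = os.environ.get("ALL_REDUCE_BACKEND", "pytorch")
    try:
        return _ALL_REDUCE_BACKENDS[name](t)
    except KeyError:
        raise ValueError(f"unknown all-reduce backend: {name}") from None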

Collaborator Author

@youkaichao Our users need the fixes for the other parts, like the one related to naive attention, since it has now become the default for those users and it was quite slow.
I need to simplify this PR so that it can be merged quickly.

    else:
        raise ValueError("NCCL only supports CUDA and ROCm backends.")
    logger.debug(f"Loading nccl from library {so_file}")
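As background on the "manually load the nccl library" step that consumes so_file, a minimal sketch of loading the resolved shared object with ctypes and calling ncclGetVersion (a real NCCL/RCCL entry point) to confirm it works; this illustrates the general pattern rather than reproducing vLLM's pynccl wrapper:

import ctypes

# Illustrative value; in the diff above so_file comes from
# get_library_path("librccl.so") on ROCm or VLLM_NCCL_SO_PATH otherwise.
so_file = "librccl.so.1"

lib = ctypes.CDLL(so_file)

# ncclResult_t ncclGetVersion(int *version) exists in both NCCL and RCCL
# and is a cheap way to verify the library loaded and is callable.
version = ctypes.c_int()
ret = lib.ncclGetVersion(ctypes.byref(version))
if ret == 0:  # 0 == ncclSuccess
    print(f"loaded {so_file}, version {version.value}")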