From b92361aa6acaa42a1d25079b6299e24fde170982 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxyan@amd.com>
Date: Thu, 28 Mar 2024 19:42:45 +0000
Subject: [PATCH 1/7] fix several bugs related to rccl path and attention
 selector logic

---
 Dockerfile.rocm                               |  2 +-
 vllm/attention/backends/xformers.py           |  4 +-
 .../model_executor/parallel_utils/find_lib.py | 40 +++++++++++++++++++
 vllm/model_executor/parallel_utils/pynccl.py  | 11 ++++-
 4 files changed, 52 insertions(+), 5 deletions(-)
 create mode 100644 vllm/model_executor/parallel_utils/find_lib.py

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index a09de99f7a468..65a367994f960 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -90,6 +90,6 @@ RUN cd /app \
     && cd ..
 
 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]
+RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
 
 CMD ["/bin/bash"]
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index fcd903ddf5f51..8e510f975059e 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -405,8 +405,8 @@ def _check_use_naive_attention() -> bool:
     if not is_hip():
         return False
     # For ROCm, check whether flash attention is installed or not.
-    has_flash_attn = importlib.util.find_spec("flash_attn") is None
-    if not has_flash_attn:
+    use_naive_attention = importlib.util.find_spec("flash_attn") is None
+    if use_naive_attention:
         logger.warning("flash_attn is not installed. Using naive attention. "
                        "This will take significantly more GPU memory.")
         return True
diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py
new file mode 100644
index 0000000000000..eecc7f588ee0c
--- /dev/null
+++ b/vllm/model_executor/parallel_utils/find_lib.py
@@ -0,0 +1,40 @@
+import torch
+import subprocess
+
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+import re
+
+def get_library_path(library_name):
+    # Robust way to find the library path from torch installation
+    # Hard coding a library parth is error prone
+    try:
+        torch_dir = os.path.dirname(torch.__file__)
+        torch_path = os.path.join(torch_dir, "lib", "libtorch.so")
+
+        result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], capture_output=True, text=True)
+        if result.returncode == 0:
+            output_lines = result.stdout.split("\n")
+            for line in output_lines:
+                if library_name in line:
+                    match = re.search(r'=>\s*(\S+)', line)
+                    if match:
+                        library_path = match.group(1)
+                        return library_path
+        else:
+            logger.error(f"PyTorch is not installed properly. {result.stderr}")
+    except Exception as e:
+        logger.error(f"Error finding library path: {e}")
+        return None
+                
+# you can test this
+if __name__ == "__main__":
+
+    # this works for librccl.so, librccl.so.1, etc
+    rccl_path = get_library_path("librccl.so")
+    if rccl_path:
+        print(f"location is {rccl_path}")
+    else:
+        print("librccl.so not found")
\ No newline at end of file
diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py
index 0eb75e02d62cf..54ac1ab5e60fe 100644
--- a/vllm/model_executor/parallel_utils/pynccl.py
+++ b/vllm/model_executor/parallel_utils/pynccl.py
@@ -28,10 +28,17 @@
 import torch
 import torch.distributed as dist
 from torch.distributed import ReduceOp
+from vllm.model_executor.parallel_utils.find_lib import get_library_path
+from vllm.utils import is_hip
 
 logger = logging.getLogger(__name__)
 
-so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
+
+if is_hip():
+    # a robust way to get the path of librccl, no matter it is librccl.so, or librccl.so.1
+    so_file = get_library_path("librccl.so")
+else:
+    so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
 
 # manually load the nccl library
 if so_file:
@@ -41,7 +48,7 @@
     if torch.version.cuda is not None:
         so_file = "libnccl.so.2"
     elif torch.version.hip is not None:
-        so_file = "librccl.so.2"
+        so_file = "librccl.so.1"
     else:
         raise ValueError("NCCL only supports CUDA and ROCm backends.")
     logger.debug(f"Loading nccl from library {so_file}")

From 415665c00bd2618fe97bcd7737e6333870eb4192 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxyan@amd.com>
Date: Thu, 28 Mar 2024 20:15:43 +0000
Subject: [PATCH 2/7] pin ray to 2.9.3 in requirements-rocm.txt as well

---
 requirements-rocm.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index 6acf70695cef8..4883c44280a2e 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -3,7 +3,7 @@ ninja  # For faster builds.
 typing-extensions>=4.8.0
 starlette
 psutil
-ray >= 2.9
+ray == 2.9.3
 sentencepiece  # Required for LLaMA tokenizer.
 numpy
 tokenizers>=0.15.0

From 55ab3ecdbb115f5b0754dc66c91b7222b1883a4b Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxyan@amd.com>
Date: Thu, 28 Mar 2024 20:41:14 +0000
Subject: [PATCH 3/7] format fix

---
 vllm/model_executor/parallel_utils/find_lib.py | 11 +++++++----
 vllm/model_executor/parallel_utils/pynccl.py   |  4 ++--
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py
index eecc7f588ee0c..24ec98afe3713 100644
--- a/vllm/model_executor/parallel_utils/find_lib.py
+++ b/vllm/model_executor/parallel_utils/find_lib.py
@@ -3,9 +3,10 @@
 
 import logging
 import os
+import re
 
 logger = logging.getLogger(__name__)
-import re
+
 
 def get_library_path(library_name):
     # Robust way to find the library path from torch installation
@@ -14,7 +15,8 @@ def get_library_path(library_name):
         torch_dir = os.path.dirname(torch.__file__)
         torch_path = os.path.join(torch_dir, "lib", "libtorch.so")
 
-        result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], capture_output=True, text=True)
+        result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], 
+                                capture_output=True, text=True)
         if result.returncode == 0:
             output_lines = result.stdout.split("\n")
             for line in output_lines:
@@ -28,8 +30,9 @@ def get_library_path(library_name):
     except Exception as e:
         logger.error(f"Error finding library path: {e}")
         return None
-                
-# you can test this
+   
+
+# simple test
 if __name__ == "__main__":
 
     # this works for librccl.so, librccl.so.1, etc
diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py
index 54ac1ab5e60fe..facc6213cf46e 100644
--- a/vllm/model_executor/parallel_utils/pynccl.py
+++ b/vllm/model_executor/parallel_utils/pynccl.py
@@ -33,9 +33,9 @@
 
 logger = logging.getLogger(__name__)
 
-
 if is_hip():
-    # a robust way to get the path of librccl, no matter it is librccl.so, or librccl.so.1
+    # a robust way to get the path of librccl, 
+    # no matter it is librccl.so, or librccl.so.1
     so_file = get_library_path("librccl.so")
 else:
     so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")

From 5106190ab70de086e9541e3e8ba20933e7611304 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxyan@amd.com>
Date: Thu, 28 Mar 2024 21:45:48 +0000
Subject: [PATCH 4/7] format

---
 vllm/model_executor/parallel_utils/find_lib.py | 11 ++++++-----
 vllm/model_executor/parallel_utils/pynccl.py   |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py
index 24ec98afe3713..79d69967a17b7 100644
--- a/vllm/model_executor/parallel_utils/find_lib.py
+++ b/vllm/model_executor/parallel_utils/find_lib.py
@@ -10,13 +10,14 @@
 
 def get_library_path(library_name):
     # Robust way to find the library path from torch installation
-    # Hard coding a library parth is error prone
+    # Hard coding a library path is error prone
     try:
         torch_dir = os.path.dirname(torch.__file__)
         torch_path = os.path.join(torch_dir, "lib", "libtorch.so")
 
-        result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], 
-                                capture_output=True, text=True)
+        result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path],
+                                capture_output=True,
+                                text=True)
         if result.returncode == 0:
             output_lines = result.stdout.split("\n")
             for line in output_lines:
@@ -30,7 +31,7 @@ def get_library_path(library_name):
     except Exception as e:
         logger.error(f"Error finding library path: {e}")
         return None
-   
+
 
 # simple test
 if __name__ == "__main__":
@@ -40,4 +41,4 @@ def get_library_path(library_name):
     if rccl_path:
         print(f"location is {rccl_path}")
     else:
-        print("librccl.so not found")
\ No newline at end of file
+        print("librccl.so not found")
diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py
index facc6213cf46e..f2062c0e960c0 100644
--- a/vllm/model_executor/parallel_utils/pynccl.py
+++ b/vllm/model_executor/parallel_utils/pynccl.py
@@ -34,7 +34,7 @@
 logger = logging.getLogger(__name__)
 
 if is_hip():
-    # a robust way to get the path of librccl, 
+    # a robust way to get the path of librccl,
     # no matter it is librccl.so, or librccl.so.1
     so_file = get_library_path("librccl.so")
 else:

From 96f1594c9e2e529828dcd29ff1038201b109cbfe Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxyan@amd.com>
Date: Thu, 28 Mar 2024 22:15:54 +0000
Subject: [PATCH 5/7] format.sh

---
 vllm/model_executor/parallel_utils/find_lib.py | 6 +++---
 vllm/model_executor/parallel_utils/pynccl.py   | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py
index 79d69967a17b7..0027d1d19a986 100644
--- a/vllm/model_executor/parallel_utils/find_lib.py
+++ b/vllm/model_executor/parallel_utils/find_lib.py
@@ -1,9 +1,9 @@
-import torch
-import subprocess
-
 import logging
 import os
 import re
+import subprocess
+
+import torch
 
 logger = logging.getLogger(__name__)
 
diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py
index f2062c0e960c0..4ea3f7f80884e 100644
--- a/vllm/model_executor/parallel_utils/pynccl.py
+++ b/vllm/model_executor/parallel_utils/pynccl.py
@@ -28,6 +28,7 @@
 import torch
 import torch.distributed as dist
 from torch.distributed import ReduceOp
+
 from vllm.model_executor.parallel_utils.find_lib import get_library_path
 from vllm.utils import is_hip
 

From 01431b6859457f82ca3eec879c30cdcbc8e02e2f Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxyan@amd.com>
Date: Fri, 29 Mar 2024 21:29:44 +0000
Subject: [PATCH 6/7] remove find_lib.py for now; revisit a better rccl lookup later

---
 .../model_executor/parallel_utils/find_lib.py | 44 -------------------
 vllm/model_executor/parallel_utils/pynccl.py  | 12 ++---
 2 files changed, 3 insertions(+), 53 deletions(-)
 delete mode 100644 vllm/model_executor/parallel_utils/find_lib.py

diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py
deleted file mode 100644
index 0027d1d19a986..0000000000000
--- a/vllm/model_executor/parallel_utils/find_lib.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import logging
-import os
-import re
-import subprocess
-
-import torch
-
-logger = logging.getLogger(__name__)
-
-
-def get_library_path(library_name):
-    # Robust way to find the library path from torch installation
-    # Hard coding a library path is error prone
-    try:
-        torch_dir = os.path.dirname(torch.__file__)
-        torch_path = os.path.join(torch_dir, "lib", "libtorch.so")
-
-        result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path],
-                                capture_output=True,
-                                text=True)
-        if result.returncode == 0:
-            output_lines = result.stdout.split("\n")
-            for line in output_lines:
-                if library_name in line:
-                    match = re.search(r'=>\s*(\S+)', line)
-                    if match:
-                        library_path = match.group(1)
-                        return library_path
-        else:
-            logger.error(f"PyTorch is not installed properly. {result.stderr}")
-    except Exception as e:
-        logger.error(f"Error finding library path: {e}")
-        return None
-
-
-# simple test
-if __name__ == "__main__":
-
-    # this works for librccl.so, librccl.so.1, etc
-    rccl_path = get_library_path("librccl.so")
-    if rccl_path:
-        print(f"location is {rccl_path}")
-    else:
-        print("librccl.so not found")
diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py
index 4ea3f7f80884e..6aac4f8019c6d 100644
--- a/vllm/model_executor/parallel_utils/pynccl.py
+++ b/vllm/model_executor/parallel_utils/pynccl.py
@@ -29,17 +29,9 @@
 import torch.distributed as dist
 from torch.distributed import ReduceOp
 
-from vllm.model_executor.parallel_utils.find_lib import get_library_path
-from vllm.utils import is_hip
-
 logger = logging.getLogger(__name__)
 
-if is_hip():
-    # a robust way to get the path of librccl,
-    # no matter it is librccl.so, or librccl.so.1
-    so_file = get_library_path("librccl.so")
-else:
-    so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
+so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
 
 # manually load the nccl library
 if so_file:
@@ -229,6 +221,7 @@ def __init__(
                                     pg_options=pg_options)
         self.world_size = dist.get_world_size()
         self.rank = dist.get_rank()
+        # this also caused invalid device ordinal (why we need two init process group? one from pytorch.dist, one from this place, again, duplicated)
         torch.cuda.set_device(self.rank)
         if self.rank == 0:
             self.unique_id = ncclGetUniqueId()
@@ -263,4 +256,5 @@ def all_reduce(self,
 
     def __del__(self):
         dist.destroy_process_group()
+        # AttributeError: 'NCCLCommunicator' object has no attribute 'comm'
         _c_ncclCommDestroy(self.comm)

From 1b66ceb41bf82fcabca04a6400cd82e4b5a76150 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxyan@amd.com>
Date: Fri, 29 Mar 2024 21:31:39 +0000
Subject: [PATCH 7/7] remove comments added during testing

---
 vllm/model_executor/parallel_utils/pynccl.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py
index 6aac4f8019c6d..f9a7f0509352c 100644
--- a/vllm/model_executor/parallel_utils/pynccl.py
+++ b/vllm/model_executor/parallel_utils/pynccl.py
@@ -221,7 +221,6 @@ def __init__(
                                     pg_options=pg_options)
         self.world_size = dist.get_world_size()
         self.rank = dist.get_rank()
-        # this also caused invalid device ordinal (why we need two init process group? one from pytorch.dist, one from this place, again, duplicated)
         torch.cuda.set_device(self.rank)
         if self.rank == 0:
             self.unique_id = ncclGetUniqueId()
@@ -256,5 +255,4 @@ def all_reduce(self,
 
     def __del__(self):
         dist.destroy_process_group()
-        # AttributeError: 'NCCLCommunicator' object has no attribute 'comm'
         _c_ncclCommDestroy(self.comm)