From b92361aa6acaa42a1d25079b6299e24fde170982 Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Thu, 28 Mar 2024 19:42:45 +0000 Subject: [PATCH 1/7] fix several bugs related to rccl path and attention selector logic --- Dockerfile.rocm | 2 +- vllm/attention/backends/xformers.py | 4 +- .../model_executor/parallel_utils/find_lib.py | 40 +++++++++++++++++++ vllm/model_executor/parallel_utils/pynccl.py | 11 ++++- 4 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 vllm/model_executor/parallel_utils/find_lib.py diff --git a/Dockerfile.rocm b/Dockerfile.rocm index a09de99f7a468..65a367994f960 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -90,6 +90,6 @@ RUN cd /app \ && cd .. RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir ray[all] +RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3 CMD ["/bin/bash"] diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index fcd903ddf5f51..8e510f975059e 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -405,8 +405,8 @@ def _check_use_naive_attention() -> bool: if not is_hip(): return False # For ROCm, check whether flash attention is installed or not. - has_flash_attn = importlib.util.find_spec("flash_attn") is None - if not has_flash_attn: + use_naive_attention = importlib.util.find_spec("flash_attn") is None + if use_naive_attention: logger.warning("flash_attn is not installed. Using naive attention. " "This will take significantly more GPU memory.") return True diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py new file mode 100644 index 0000000000000..eecc7f588ee0c --- /dev/null +++ b/vllm/model_executor/parallel_utils/find_lib.py @@ -0,0 +1,40 @@ +import torch +import subprocess + +import logging +import os + +logger = logging.getLogger(__name__) +import re + +def get_library_path(library_name): + # Robust way to find the library path from torch installation + # Hard coding a library parth is error prone + try: + torch_dir = os.path.dirname(torch.__file__) + torch_path = os.path.join(torch_dir, "lib", "libtorch.so") + + result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], capture_output=True, text=True) + if result.returncode == 0: + output_lines = result.stdout.split("\n") + for line in output_lines: + if library_name in line: + match = re.search(r'=>\s*(\S+)', line) + if match: + library_path = match.group(1) + return library_path + else: + logger.error(f"PyTorch is not installed properly. {result.stderr}") + except Exception as e: + logger.error(f"Error finding library path: {e}") + return None + +# you can test this +if __name__ == "__main__": + + # this works for librccl.so, librccl.so.1, etc + rccl_path = get_library_path("librccl.so") + if rccl_path: + print(f"location is {rccl_path}") + else: + print("librccl.so not found") \ No newline at end of file diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 0eb75e02d62cf..54ac1ab5e60fe 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -28,10 +28,17 @@ import torch import torch.distributed as dist from torch.distributed import ReduceOp +from vllm.model_executor.parallel_utils.find_lib import get_library_path +from vllm.utils import is_hip logger = logging.getLogger(__name__) -so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") + +if is_hip(): + # a robust way to get the path of librccl, no matter it is librccl.so, or librccl.so.1 + so_file = get_library_path("librccl.so") +else: + so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") # manually load the nccl library if so_file: @@ -41,7 +48,7 @@ if torch.version.cuda is not None: so_file = "libnccl.so.2" elif torch.version.hip is not None: - so_file = "librccl.so.2" + so_file = "librccl.so.1" else: raise ValueError("NCCL only supports CUDA and ROCm backends.") logger.debug(f"Loading nccl from library {so_file}") From 415665c00bd2618fe97bcd7737e6333870eb4192 Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Thu, 28 Mar 2024 20:15:43 +0000 Subject: [PATCH 2/7] update requirements-rocm.txt as well for ray version --- requirements-rocm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 6acf70695cef8..4883c44280a2e 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -3,7 +3,7 @@ ninja # For faster builds. typing-extensions>=4.8.0 starlette psutil -ray >= 2.9 +ray == 2.9.3 sentencepiece # Required for LLaMA tokenizer. numpy tokenizers>=0.15.0 From 55ab3ecdbb115f5b0754dc66c91b7222b1883a4b Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Thu, 28 Mar 2024 20:41:14 +0000 Subject: [PATCH 3/7] format fix --- vllm/model_executor/parallel_utils/find_lib.py | 11 +++++++---- vllm/model_executor/parallel_utils/pynccl.py | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py index eecc7f588ee0c..24ec98afe3713 100644 --- a/vllm/model_executor/parallel_utils/find_lib.py +++ b/vllm/model_executor/parallel_utils/find_lib.py @@ -3,9 +3,10 @@ import logging import os +import re logger = logging.getLogger(__name__) -import re + def get_library_path(library_name): # Robust way to find the library path from torch installation @@ -14,7 +15,8 @@ def get_library_path(library_name): torch_dir = os.path.dirname(torch.__file__) torch_path = os.path.join(torch_dir, "lib", "libtorch.so") - result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], capture_output=True, text=True) + result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], + capture_output=True, text=True) if result.returncode == 0: output_lines = result.stdout.split("\n") for line in output_lines: @@ -28,8 +30,9 @@ def get_library_path(library_name): except Exception as e: logger.error(f"Error finding library path: {e}") return None - -# you can test this + + +# simple test if __name__ == "__main__": # this works for librccl.so, librccl.so.1, etc diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 54ac1ab5e60fe..facc6213cf46e 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -33,9 +33,9 @@ logger = logging.getLogger(__name__) - if is_hip(): - # a robust way to get the path of librccl, no matter it is librccl.so, or librccl.so.1 + # a robust way to get the path of librccl, + # no matter it is librccl.so, or librccl.so.1 so_file = get_library_path("librccl.so") else: so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") From 5106190ab70de086e9541e3e8ba20933e7611304 Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Thu, 28 Mar 2024 21:45:48 +0000 Subject: [PATCH 4/7] format --- vllm/model_executor/parallel_utils/find_lib.py | 11 ++++++----- vllm/model_executor/parallel_utils/pynccl.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py index 24ec98afe3713..79d69967a17b7 100644 --- a/vllm/model_executor/parallel_utils/find_lib.py +++ b/vllm/model_executor/parallel_utils/find_lib.py @@ -10,13 +10,14 @@ def get_library_path(library_name): # Robust way to find the library path from torch installation - # Hard coding a library parth is error prone + # Hard coding a library path is error prone try: torch_dir = os.path.dirname(torch.__file__) torch_path = os.path.join(torch_dir, "lib", "libtorch.so") - result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], - capture_output=True, text=True) + result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], + capture_output=True, + text=True) if result.returncode == 0: output_lines = result.stdout.split("\n") for line in output_lines: @@ -30,7 +31,7 @@ def get_library_path(library_name): except Exception as e: logger.error(f"Error finding library path: {e}") return None - + # simple test if __name__ == "__main__": @@ -40,4 +41,4 @@ def get_library_path(library_name): if rccl_path: print(f"location is {rccl_path}") else: - print("librccl.so not found") \ No newline at end of file + print("librccl.so not found") diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index facc6213cf46e..f2062c0e960c0 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -34,7 +34,7 @@ logger = logging.getLogger(__name__) if is_hip(): - # a robust way to get the path of librccl, + # a robust way to get the path of librccl, # no matter it is librccl.so, or librccl.so.1 so_file = get_library_path("librccl.so") else: From 96f1594c9e2e529828dcd29ff1038201b109cbfe Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Thu, 28 Mar 2024 22:15:54 +0000 Subject: [PATCH 5/7] format.sh --- vllm/model_executor/parallel_utils/find_lib.py | 6 +++--- vllm/model_executor/parallel_utils/pynccl.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py index 79d69967a17b7..0027d1d19a986 100644 --- a/vllm/model_executor/parallel_utils/find_lib.py +++ b/vllm/model_executor/parallel_utils/find_lib.py @@ -1,9 +1,9 @@ -import torch -import subprocess - import logging import os import re +import subprocess + +import torch logger = logging.getLogger(__name__) diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index f2062c0e960c0..4ea3f7f80884e 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -28,6 +28,7 @@ import torch import torch.distributed as dist from torch.distributed import ReduceOp + from vllm.model_executor.parallel_utils.find_lib import get_library_path from vllm.utils import is_hip From 01431b6859457f82ca3eec879c30cdcbc8e02e2f Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Fri, 29 Mar 2024 21:29:44 +0000 Subject: [PATCH 6/7] check more later on better way --- .../model_executor/parallel_utils/find_lib.py | 44 ------------------- vllm/model_executor/parallel_utils/pynccl.py | 12 ++--- 2 files changed, 3 insertions(+), 53 deletions(-) delete mode 100644 vllm/model_executor/parallel_utils/find_lib.py diff --git a/vllm/model_executor/parallel_utils/find_lib.py b/vllm/model_executor/parallel_utils/find_lib.py deleted file mode 100644 index 0027d1d19a986..0000000000000 --- a/vllm/model_executor/parallel_utils/find_lib.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging -import os -import re -import subprocess - -import torch - -logger = logging.getLogger(__name__) - - -def get_library_path(library_name): - # Robust way to find the library path from torch installation - # Hard coding a library path is error prone - try: - torch_dir = os.path.dirname(torch.__file__) - torch_path = os.path.join(torch_dir, "lib", "libtorch.so") - - result = subprocess.run(['ldd', '-v', '-r', '-d', torch_path], - capture_output=True, - text=True) - if result.returncode == 0: - output_lines = result.stdout.split("\n") - for line in output_lines: - if library_name in line: - match = re.search(r'=>\s*(\S+)', line) - if match: - library_path = match.group(1) - return library_path - else: - logger.error(f"PyTorch is not installed properly. {result.stderr}") - except Exception as e: - logger.error(f"Error finding library path: {e}") - return None - - -# simple test -if __name__ == "__main__": - - # this works for librccl.so, librccl.so.1, etc - rccl_path = get_library_path("librccl.so") - if rccl_path: - print(f"location is {rccl_path}") - else: - print("librccl.so not found") diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 4ea3f7f80884e..6aac4f8019c6d 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -29,17 +29,9 @@ import torch.distributed as dist from torch.distributed import ReduceOp -from vllm.model_executor.parallel_utils.find_lib import get_library_path -from vllm.utils import is_hip - logger = logging.getLogger(__name__) -if is_hip(): - # a robust way to get the path of librccl, - # no matter it is librccl.so, or librccl.so.1 - so_file = get_library_path("librccl.so") -else: - so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") +so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") # manually load the nccl library if so_file: @@ -229,6 +221,7 @@ def __init__( pg_options=pg_options) self.world_size = dist.get_world_size() self.rank = dist.get_rank() + # this also caused invalid device ordinal (why we need two init process group? one from pytorch.dist, one from this place, again, duplicated) torch.cuda.set_device(self.rank) if self.rank == 0: self.unique_id = ncclGetUniqueId() @@ -263,4 +256,5 @@ def all_reduce(self, def __del__(self): dist.destroy_process_group() + # AttributeError: 'NCCLCommunicator' object has no attribute 'comm' _c_ncclCommDestroy(self.comm) From 1b66ceb41bf82fcabca04a6400cd82e4b5a76150 Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Fri, 29 Mar 2024 21:31:39 +0000 Subject: [PATCH 7/7] remove comments added during testing --- vllm/model_executor/parallel_utils/pynccl.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 6aac4f8019c6d..f9a7f0509352c 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -221,7 +221,6 @@ def __init__( pg_options=pg_options) self.world_size = dist.get_world_size() self.rank = dist.get_rank() - # this also caused invalid device ordinal (why we need two init process group? one from pytorch.dist, one from this place, again, duplicated) torch.cuda.set_device(self.rank) if self.rank == 0: self.unique_id = ncclGetUniqueId() @@ -256,5 +255,4 @@ def all_reduce(self, def __del__(self): dist.destroy_process_group() - # AttributeError: 'NCCLCommunicator' object has no attribute 'comm' _c_ncclCommDestroy(self.comm)