diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f39c3302ac2e9..2263dee20fbed 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -33,6 +33,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. commands: - pytest -v -s test_pynccl.py + - pytest -v -s test_pynccl_library.py - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py diff --git a/tests/distributed/test_pynccl_library.py b/tests/distributed/test_pynccl_library.py new file mode 100644 index 0000000000000..ec60a5ed3114d --- /dev/null +++ b/tests/distributed/test_pynccl_library.py @@ -0,0 +1,43 @@ +import multiprocessing +import tempfile + + +def target_fn(env, filepath): + from vllm.utils import update_environment_variables + update_environment_variables(env) + from vllm.utils import nccl_integrity_check + nccl_integrity_check(filepath) + + +def test_library_file(): + # note: don't import vllm.distributed.device_communicators.pynccl + # before running this test, otherwise the library file will be loaded + # and it might interfere with the test + from vllm.utils import find_nccl_library + so_file = find_nccl_library() + with open(so_file, 'rb') as f: + content = f.read() + try: + # corrupt the library file, should raise an exception + with open(so_file, 'wb') as f: + f.write(content[:len(content) // 2]) + p = multiprocessing.Process(target=target_fn, args=({}, so_file)) + p.start() + p.join() + assert p.exitcode != 0 + + # move the library file to a tmp path + # test VLLM_NCCL_SO_PATH + fd, path = tempfile.mkstemp() + with open(path, 'wb') as f: + f.write(content) + p = multiprocessing.Process(target=target_fn, + args=({ + "VLLM_NCCL_SO_PATH": path + }, path)) + p.start() + p.join() + assert p.exitcode == 0 + finally: + with open(so_file, 'wb') as f: + f.write(content) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 0a8bb860efa1c..c57a4f59d442c 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -21,8 +21,7 @@ import ctypes import datetime -import glob -import os +import platform # ===================== import region ===================== import torch @@ -30,40 +29,27 @@ from torch.distributed import ReduceOp from vllm.logger import init_logger +from vllm.utils import find_nccl_library, nccl_integrity_check logger = init_logger(__name__) -so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") - -# check if we have vllm-managed nccl -vllm_nccl_path = None -if torch.version.cuda is not None: - cuda_major = torch.version.cuda.split(".")[0] - path = os.path.expanduser( - f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*") - files = glob.glob(path) - vllm_nccl_path = files[0] if files else None - -# manually load the nccl library -if so_file: - logger.info( - f"Loading nccl from environment variable VLLM_NCCL_SO_PATH={so_file}") -else: - if torch.version.cuda is not None: - so_file = vllm_nccl_path or "libnccl.so.2" - elif torch.version.hip is not None: - so_file = "librccl.so.1" - else: - raise ValueError("NCCL only supports CUDA and ROCm backends.") - logger.info(f"Loading nccl from library {so_file}") +so_file = find_nccl_library() try: + # load the library in another process. + # if it core dumps, it will not crash the current process + nccl_integrity_check(so_file) nccl = ctypes.CDLL(so_file) except Exception as e: logger.error( f"Failed to load NCCL library from {so_file} ." "It is expected if you are not running on NVIDIA/AMD GPUs." - "Otherwise please set the environment variable VLLM_NCCL_SO_PATH" + "Otherwise, the nccl library might not exist, be corrupted " + f"or it does not support the current platform {platform.platform()}." + f"One solution is to download libnccl2 version 2.18 from " + f"https://developer.download.nvidia.com/compute/cuda/repos/ " + f"and extract the libnccl.so.2 file. If you already have the " + f"library, please set the environment variable VLLM_NCCL_SO_PATH" " to point to the correct nccl library path.") raise e diff --git a/vllm/utils.py b/vllm/utils.py index e132575e7bf81..49e7033c23e6a 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,6 +1,7 @@ import asyncio import enum import gc +import glob import os import socket import subprocess @@ -517,3 +518,53 @@ def init_cached_hf_modules(): """ from transformers.dynamic_module_utils import init_hf_modules init_hf_modules() + + +def nccl_integrity_check(filepath): + """ + when the library is corrupted, we cannot catch + the exception in python. it will crash the process. + instead, we use the exit code of `ldd` to check + if the library is corrupted. if not, we will return + the version of the library. + """ + exit_code = os.system(f"ldd {filepath} 2>&1 > /dev/null") + if exit_code != 0: + raise RuntimeError(f"Failed to load NCCL library from {filepath} .") + import ctypes + + nccl = ctypes.CDLL(filepath) + version = ctypes.c_int() + nccl.ncclGetVersion.restype = ctypes.c_int + nccl.ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)] + result = nccl.ncclGetVersion(ctypes.byref(version)) + assert result == 0 + return version.value + + +def find_nccl_library(): + so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") + + # check if we have vllm-managed nccl + vllm_nccl_path = None + if torch.version.cuda is not None: + cuda_major = torch.version.cuda.split(".")[0] + path = os.path.expanduser( + f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*") + files = glob.glob(path) + vllm_nccl_path = files[0] if files else None + + # manually load the nccl library + if so_file: + logger.info( + f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}" + ) + else: + if torch.version.cuda is not None: + so_file = vllm_nccl_path or "libnccl.so.2" + elif torch.version.hip is not None: + so_file = "librccl.so.1" + else: + raise ValueError("NCCL only supports CUDA and ROCm backends.") + logger.info(f"Found nccl from library {so_file}") + return so_file