From 57329c9ca01496031d9ee5797fd48168d90d518e Mon Sep 17 00:00:00 2001
From: Adam Tilghman
Date: Thu, 18 Apr 2024 15:32:47 -0700
Subject: [PATCH] [Bugfix] Fix CustomAllreduce nvlink topology detection (#3974)

[Bugfix] Fix CustomAllreduce pcie nvlink topology detection (#3974) (#4159)
---
 vllm/distributed/device_communicators/custom_all_reduce.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index f83caef879da3..7602897d3dd8f 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -145,8 +145,10 @@ def _is_full_nvlink(rank, world_size):
     for i in range(world_size):
         if i != rank:
             try:
-                link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i)
-                if not link_state:
+                peer_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                    handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                if p2p_status != pynvml.NVML_P2P_STATUS_OK:
                     return False
             except pynvml.NVMLError as error:
                 logger.info(