diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index d8e6d27b82484..9cc18a0ea4611 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -8,12 +8,13 @@
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.utils import get_device_capability_stateless
 
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
 
-capability = torch.cuda.get_device_capability()
+capability = get_device_capability_stateless()
 capability = capability[0] * 10 + capability[1]
 
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 29085916afb4d..5c1b5ad9b9c07 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,6 +1,7 @@
 import torch
 
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import get_device_capability_stateless
 
 
 def is_quant_method_supported(quant_method: str) -> bool:
@@ -8,7 +9,7 @@ def is_quant_method_supported(quant_method: str) -> bool:
     if not torch.cuda.is_available():
         return False
 
-    capability = torch.cuda.get_device_capability()
+    capability = get_device_capability_stateless()
     capability = capability[0] * 10 + capability[1]
     return (capability >=
             QUANTIZATION_METHODS[quant_method].get_min_capability())
diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py
index 300211e70bb79..637cfda214964 100644
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -2,13 +2,13 @@
 
 import torch
 
-from vllm.utils import is_cpu, is_hip
+from vllm.utils import get_device_capability_stateless, is_cpu, is_hip
 
 from .utils import (dense_to_crow_col, get_head_sliding_step,
                     get_sparse_attn_mask)
 
 IS_COMPUTE_8_OR_ABOVE = (torch.cuda.is_available()
-                         and torch.cuda.get_device_capability()[0] >= 8)
+                         and get_device_capability_stateless()[0] >= 8)
 
 if IS_COMPUTE_8_OR_ABOVE:
     from .blocksparse_attention_kernel import blocksparse_flash_attn_varlen_fwd
@@ -235,4 +235,4 @@ def forward(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None):
                                 v,
                                 cu_seqlens_k,
                                 cu_seqlens_q=cu_seqlens_q,
-                                sm_scale=sm_scale)
\ No newline at end of file
+                                sm_scale=sm_scale)
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index b99cf9a50d105..ca9f28fcb7013 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -5,6 +5,8 @@
 import triton
 import triton.language as tl
 
+from vllm.utils import get_device_capability_stateless
+
 if triton.__version__ >= "2.1.0":
 
     @triton.jit
@@ -683,7 +685,7 @@ def context_attention_fwd(q,
                               alibi_slopes=None,
                               sliding_window=None):
 
-        cap = torch.cuda.get_device_capability()
+        cap = get_device_capability_stateless()
         BLOCK = 128 if cap[0] >= 8 else 64
         # shape constraints
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index b0cb21a022789..a303d0bd25a3c 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -11,66 +11,18 @@
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import is_in_the_same_node
 from vllm.logger import init_logger
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils import cuda_device_count_stateless, is_full_nvlink
 
 try:
-    import pynvml
-
-    # Simulate ImportError if custom_ar ops are not supported.
-    if not ops.is_custom_op_supported("_C_custom_ar::meta_size"):
-        raise ImportError("custom_ar", __file__)
-
+    assert ops.is_custom_op_supported("_C_custom_ar::meta_size")
     custom_ar = True
-
-    @contextmanager
-    def _nvml():
-        try:
-            pynvml.nvmlInit()
-            yield
-        finally:
-            pynvml.nvmlShutdown()
-
-except ImportError:
-    # For AMD GPUs
+except Exception:
+    # For AMD GPUs and CPUs
    custom_ar = False
-    pynvml = None
-
-    @contextmanager
-    def _nvml():
-        try:
-            yield
-        finally:
-            pass
-
 
 logger = init_logger(__name__)
 
 
-@_nvml()
-def _is_full_nvlink(device_ids: List[int]) -> bool:
-    """
-    query if the set of gpus are fully connected by nvlink (1 hop)
-    Note that `pynvml` is not affected by `CUDA_VISIBLE_DEVICES`,
-    so it works on real physical device ids.
-    """
-    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
-    for i, handle in enumerate(handles):
-        for j, peer_handle in enumerate(handles):
-            if i < j:
-                try:
-                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
-                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
-                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
-                        return False
-                except pynvml.NVMLError as error:
-                    logger.error(
-                        "NVLink detection failed. This is normal if your"
-                        " machine has no NVLink equipped.",
-                        exc_info=error)
-                    return False
-    return True
-
-
 def _can_p2p(rank: int, world_size: int) -> bool:
     for i in range(world_size):
         if i == rank:
@@ -161,7 +113,7 @@ def __init__(self,
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
         # this checks hardware and driver support for NVLink
-        full_nvlink = _is_full_nvlink(physical_device_ids)
+        full_nvlink = is_full_nvlink(physical_device_ids)
         if world_size > 2 and not full_nvlink:
             logger.warning(
                 "Custom allreduce is disabled because it's not supported on"
diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
index 7ecaa450f1758..f30b2c13f28b8 100644
--- a/vllm/lora/punica.py
+++ b/vllm/lora/punica.py
@@ -5,13 +5,14 @@
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.utils import get_device_capability_stateless
 
 
 def _check_punica_support():
     if ops.is_custom_op_supported("_punica_C::dispatch_bgmv"):
         return
 
-    if torch.cuda.get_device_capability() < (8, 0):
+    if get_device_capability_stateless() < (8, 0):
         raise ImportError(
             "punica LoRA kernels require compute capability >= 8.0")
     else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 664eac3f9e97b..491396c3da6f7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -14,6 +14,7 @@
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat, QuantizationArgs, QuantizationStrategy,
     find_first_name_or_class_match)
+from vllm.utils import get_device_capability_stateless
 
 
 class CompressedTensorsConfig(QuantizationConfig):
@@ -84,7 +85,7 @@ def get_config_filenames(cls) -> List[str]:
         return []
 
     def _check_gptq_and_marlin_can_run(self):
-        capability = torch.cuda.get_device_capability()
+        capability = get_device_capability_stateless()
         capability = capability[0] * 10 + capability[1]
         if capability < 80:
             raise RuntimeError("The quantization config is not supported for ",
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index df6fe4c3d572e..5d503a2215c96 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -10,7 +10,7 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import print_warning_once
+from vllm.utils import get_device_capability_stateless, print_warning_once
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
@@ -18,7 +18,7 @@
 
 
 def cutlass_fp8_supported() -> bool:
-    capability = torch.cuda.get_device_capability()
+    capability = get_device_capability_stateless()
     capability = capability[0] * 10 + capability[1]
 
     return ops.cutlass_scaled_mm_supports_fp8(capability)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 599070f1550ca..c6e9279c8baea 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -11,6 +11,7 @@
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.utils import get_device_capability_stateless
 
 logger = init_logger(__name__)
 
@@ -165,7 +166,7 @@ def is_marlin_compatible(cls, quant_config: Dict[str, Any]):
             return False
 
         # If the capability of the device is too low, cannot convert.
-        major, minor = torch.cuda.get_device_capability()
+        major, minor = get_device_capability_stateless()
         device_capability = major * 10 + minor
         if device_capability < cls.get_min_capability():
             return False
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 0d027d0620ab3..6f4aa2d77e680 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -12,8 +12,9 @@
     marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     get_pack_factor, quantize_weights, sort_weights)
+from vllm.utils import get_device_capability_stateless
 
-__cuda_arch = torch.cuda.get_device_capability()
+__cuda_arch = get_device_capability_stateless()
 
 MARLIN_TILE = 16
 
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index e91bf7cf35b41..361dc7322f1b6 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -35,7 +35,7 @@
 from vllm.model_executor.models.interfaces import (supports_lora,
                                                    supports_vision)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import is_tpu
+from vllm.utils import get_device_capability_stateless, is_tpu
 
 logger = init_logger(__name__)
 
@@ -46,7 +46,7 @@ def _get_quantization_config(
     """Get the quantization config."""
     if model_config.quantization is not None:
         quant_config = get_quant_config(model_config, load_config)
-        capability = torch.cuda.get_device_capability()
+        capability = get_device_capability_stateless()
         capability = capability[0] * 10 + capability[1]
         if capability < quant_config.get_min_capability():
             raise ValueError(
diff --git a/vllm/utils.py b/vllm/utils.py
index 6e8d4624cd713..1977bc05d75e2 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -816,6 +816,63 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
+# NVML utils
+# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
+# all the related functions work on real physical device ids.
+# the major benefit of using NVML is that it will not initialize CUDA
+
+try:
+    import pynvml
+except ImportError:
+    # For non-NV devices
+    pynvml = None
+
+
+def with_nvml_context(fn):
+
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        if pynvml is not None:
+            pynvml.nvmlInit()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            if pynvml is not None:
+                pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+@with_nvml_context
+def is_full_nvlink(device_ids: List[int]) -> bool:
+    """
+    query if the set of gpus are fully connected by nvlink (1 hop)
+    """
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
+    for i, handle in enumerate(handles):
+        for j, peer_handle in enumerate(handles):
+            if i < j:
+                try:
+                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                        return False
+                except pynvml.NVMLError as error:
+                    logger.error(
+                        "NVLink detection failed. This is normal if your"
+                        " machine has no NVLink equipped.",
+                        exc_info=error)
+                    return False
+    return True
+
+
+@lru_cache(maxsize=8)
+@with_nvml_context
+def get_device_capability_stateless(device_id: int = 0) -> Tuple[int, int]:
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+    return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+
+
 #From: https://stackoverflow.com/a/4104188/2749989
 def run_once(f):
 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 156d5278a292a..cc27d06b511f5 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -16,6 +16,7 @@
 from vllm.model_executor import set_random_seed
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.sequence import ExecuteModelRequest
+from vllm.utils import get_device_capability_stateless
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.embedding_model_runner import EmbeddingModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -322,7 +323,7 @@ def init_worker_distributed_environment(
 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     # Check if the GPU supports the dtype.
    if torch_dtype == torch.bfloat16:
-        compute_capability = torch.cuda.get_device_capability()
+        compute_capability = get_device_capability_stateless()
         if compute_capability[0] < 8:
             gpu_name = torch.cuda.get_device_name()
             raise ValueError(
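
Note (illustrative, not part of the patch): a minimal sketch of how a caller might exercise the helpers this diff adds to vllm/utils.py, assuming pynvml is installed and physical NVIDIA GPUs 0 and 1 are present; the script below is hypothetical and not taken from the vLLM repository. The major * 10 + minor packing mirrors the call sites updated above.

# Hypothetical usage sketch, not part of the diff above.
# Assumes pynvml is installed and physical GPUs 0 and 1 exist; the
# NVML-backed helpers query the hardware without initializing CUDA.
from vllm.utils import get_device_capability_stateless, is_full_nvlink

major, minor = get_device_capability_stateless()  # capability of physical device 0
packed = major * 10 + minor                       # same packing used by the call sites above

if packed >= 80:
    # NVML ignores CUDA_VISIBLE_DEVICES, so these are physical device indices.
    print("NVLink fully connected:", is_full_nvlink([0, 1]))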