diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 90fce1a0349b2..42f4284c6c775 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -41,6 +41,8 @@ def _can_use_flash_attn(dtype: torch.dtype) -> bool:
     try:
         import flash_attn  # noqa: F401
     except ImportError:
-        logger.info("flash_attn is not found.")
+        logger.info(
+            "Cannot use FlashAttention because the package is not found. "
+            "Please install it for better performance.")
         return False
     return True
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index c5c8d0a05539b..160a86556f031 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -230,13 +230,12 @@ def __init__(
         self.watermark_blocks = int(watermark * num_gpu_blocks)
 
         if self.enable_caching:
-            logger.info("enable automatic prefix caching")
+            logger.info("Automatic prefix caching is enabled.")
             self.gpu_allocator = CachedBlockAllocator(Device.GPU, block_size,
                                                       num_gpu_blocks)
             self.cpu_allocator = CachedBlockAllocator(Device.CPU, block_size,
                                                       num_cpu_blocks)
         else:
-            logger.info("disable automatic prefix caching")
             self.gpu_allocator = UncachedBlockAllocator(
                 Device.GPU, block_size, num_gpu_blocks)
             self.cpu_allocator = UncachedBlockAllocator(
diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py
index 5b8ee4c4de598..5b5eebbde44f6 100644
--- a/vllm/model_executor/parallel_utils/pynccl_utils.py
+++ b/vllm/model_executor/parallel_utils/pynccl_utils.py
@@ -10,7 +10,6 @@
 try:
     from vllm.model_executor.parallel_utils.pynccl import (NCCLCommunicator,
                                                            ncclGetVersion)
-    logger.info(f"vLLM is using nccl=={ncclGetVersion()}")
 except Exception as e:
     # in non-NVIDIA environments, we can't import the nccl module
     # e.g. when running on machines with AMD GPUs
@@ -40,6 +39,7 @@ def init_process_group(world_size: int, local_rank: int, rank: int,
                        init_method: str) -> None:
     assert not is_initialized()
     global comm
+    logger.info(f"vLLM is using nccl=={ncclGetVersion()}")
     comm = NCCLCommunicator(init_method=init_method,
                             world_size=world_size,
                             local_rank=local_rank,