From eb03b0299cd2b91600bc5939762f0edaa3fdcfcb Mon Sep 17 00:00:00 2001 From: gtyinstinct Date: Mon, 25 Mar 2024 16:24:32 +0800 Subject: [PATCH] fix automatic prefix args and add log info --- vllm/core/block_manager.py | 5 +++++ vllm/engine/arg_utils.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index ad9b557fd9a83..c26514b8c0936 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -9,6 +9,9 @@ from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor +from vllm.logger import init_logger + +logger = init_logger(__name__) class BlockAllocatorBase(ABC): @@ -241,11 +244,13 @@ def __init__( self.watermark_blocks = int(watermark * num_gpu_blocks) if self.enable_caching: + logger.info("enable automatic prefix caching") self.gpu_allocator = CachedBlockAllocator(Device.GPU, block_size, num_gpu_blocks) self.cpu_allocator = CachedBlockAllocator(Device.CPU, block_size, num_cpu_blocks) else: + logger.info("disable automatic prefix caching") self.gpu_allocator = UncachedBlockAllocator( Device.GPU, block_size, num_gpu_blocks) self.cpu_allocator = UncachedBlockAllocator( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2070686ea6e8e..a47edaf05a356 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -337,7 +337,8 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - model_config.get_sliding_window()) + model_config.get_sliding_window(), + self.enable_prefix_caching) parallel_config = ParallelConfig( self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, self.max_parallel_loading_workers,