diff --git a/vllm/config.py b/vllm/config.py
index 1310c07ade482..05d5f4998d74d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -970,7 +970,7 @@ def __init__(self,
                  max_num_batched_tokens: Optional[int],
                  max_num_seqs: int,
                  max_model_len: int,
-                 use_v2_block_manager: bool = False,
+                 use_v2_block_manager: bool = True,
                  num_lookahead_slots: int = 0,
                  delay_factor: float = 0.0,
                  enable_chunked_prefill: bool = False,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c97b6ffb093f7..097fe7c02444c 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -107,7 +107,7 @@ class EngineArgs:
     block_size: int = 16
     enable_prefix_caching: bool = False
     disable_sliding_window: bool = False
-    use_v2_block_manager: bool = False
+    use_v2_block_manager: bool = True
     swap_space: float = 4  # GiB
     cpu_offload_gb: float = 0  # GiB
     gpu_memory_utilization: float = 0.90
@@ -369,9 +369,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                             action='store_true',
                             help='Disables sliding window, '
                             'capping to sliding window size')
-        parser.add_argument('--use-v2-block-manager',
-                            action='store_true',
-                            help='Use BlockSpaceMangerV2.')
+        parser.add_argument(
+            '--use-v2-block-manager',
+            default=EngineArgs.use_v2_block_manager,
+            action='store_true',
+            help='Use BlockSpaceManagerV2. By default this is set to True. '
+            'Pass --no-use-v2-block-manager to use BlockSpaceManagerV1.')
+        parser.add_argument(
+            '--no-use-v2-block-manager',
+            dest='use_v2_block_manager',
+            default=EngineArgs.use_v2_block_manager,
+            action='store_false',
+            help='Use BlockSpaceManagerV1 instead of BlockSpaceManagerV2.')
         parser.add_argument(
             '--num-lookahead-slots',
             type=int,