From 252a0c7c2c491ce5c4e6ff70463cfacb7c2ed18e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 3 Apr 2024 14:17:28 -0700 Subject: [PATCH 001/109] wip --- vllm/executor/executor_base.py | 39 +++++++++++++++++++ vllm/worker/worker_base.py | 70 ++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 vllm/worker/worker_base.py diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 55180d6110b6b..7e21ded9b134e 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -28,6 +28,45 @@ def __init__( ) -> None: raise NotImplementedError + + #@abstractmethod + #def init_workers(self) -> None: + # """Initialize workers, such as loading the model or preparing on-device + # tensors. + # """ + # raise NotImplementedError + + + #@abstractmethod + #def profile_num_available_blocks(self, block_size: int, + # gpu_memory_utilization: float, + # cpu_swap_space: float, + # cache_dtype: str) -> tuple[int, int]: + # """Profile the model on-device to determine the maximum number of KV + # blocks that can be allocated. + + # Returns a tuple[num_device_blocks, num_cpu_blocks], where + # num_device_blocks refers to the number of blocks in the "active" KV + # cache (e.g. where blocks are appended to), and num_cpu_blocks refers + # to the number of blocks in the "passive" KV cache (e.g. where blocks + # are swapped to). + + # Examples: + # - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. + # - A future CPUExecutor can return [num_cpu_blocks, 0] or + # [num_cpu_blocks, num_swap_cpu_blocks]. + # """ + # raise NotImplementedError + + + #@abstractmethod + #def init_cache(self, cache_config: CacheConfig) -> None: + # """Given a fully-specified cache config, initialize the KV cache. This + # is separate from init_workers as profiling may be required to determine + # the maxmimum allowed KV cache size. + # """ + # raise NotImplementedError + @abstractmethod def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py new file mode 100644 index 0000000000000..c12b876451ca3 --- /dev/null +++ b/vllm/worker/worker_base.py @@ -0,0 +1,70 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + + +class WorkerBase(ABC): + @abstractmethod + def init_device(self) -> None: + """Initialize device state, such as loading the model or other on-device + memory allocations. + """ + raise NotImplementedError + + @abstractmethod + def profile_num_available_blocks(self, block_size: int, + gpu_memory_utilization: float, + cpu_swap_space: float, + cache_dtype: str) -> tuple[int, int]: + """Profile the model on-device to determine the maximum number of KV + blocks that can be allocated. + + Returns a tuple[num_device_blocks, num_cpu_blocks], where + num_device_blocks refers to the number of blocks in the "active" KV + cache (e.g. where blocks are appended to), and num_cpu_blocks refers + to the number of blocks in the "passive" KV cache (e.g. where blocks + are swapped to). + + Examples: + - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. + - A future CPUExecutor can return [num_cpu_blocks, 0] or + [num_cpu_blocks, num_swap_cpu_blocks]. 
+ """ + raise NotImplementedError + + @abstractmethod + def init_cache(self, cache_config: CacheConfig) -> None: + """Given a fully-specified cache config, initialize the KV cache. This + is separate from init_workers as profiling may be required to determine + the maxmimum allowed KV cache size. + """ + raise NotImplementedError + + @abstractmethod + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + @abstractmethod + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + @abstractmethod + def list_loras(self) -> List[int]: + raise NotImplementedError + + @abstractmethod + def check_health(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError From a34800fbf0270814f370b5b06b535a8f70c16e16 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 3 Apr 2024 16:11:21 -0700 Subject: [PATCH 002/109] wip --- vllm/engine/llm_engine.py | 8 ++++ vllm/entrypoints/llm.py | 1 + vllm/executor/executor_base.py | 6 +++ vllm/executor/gpu_executor.py | 83 +++++++++++++++++++++++++--------- 4 files changed, 76 insertions(+), 22 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5c343921f07f7..831627ac72e91 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -127,6 +127,13 @@ def __init__( speculative_config=speculative_config, ) + # TODO cleanup location + profile_result = self.model_executor.profile_num_available_blocks() + self.model_executor.allocate_kv_cache( + num_active_kv_blocks=profile_result.num_active_kv_blocks, + num_swapped_kv_blocks=profile_result.num_swapped_kv_blocks, + ) + # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): from vllm.model_executor.model_loader import ( @@ -212,6 +219,7 @@ def from_engine_args( log_stats=not engine_args.disable_log_stats, usage_context=usage_context, ) + return engine def __reduce__(self): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5777e8179a1c1..b079d7c117d84 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -109,6 +109,7 @@ def __init__( disable_custom_all_reduce=disable_custom_all_reduce, **kwargs, ) + self.llm_engine = LLMEngine.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) self.request_counter = Counter() diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 1a069f2a971d3..94b2fe420838a 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from typing import Dict, List, Optional +from dataclasses import dataclass from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, @@ -115,3 +116,8 @@ async def check_health_async(self) -> None: """Checks if the executor is healthy. 
If not, it should raise an exception.""" raise NotImplementedError + +@dataclass(frozen=True) +class KvCacheProfileResult: + num_active_kv_blocks: int + num_swapped_kv_blocks: int diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 7b683107d30e5..95cd0fa8940f9 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -3,7 +3,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase, KvCacheProfileResult from vllm.executor.utils import check_block_size_valid from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -42,7 +42,7 @@ def __init__( self._init_worker() # Profile the memory usage and initialize the cache. - self._init_cache() + #self._init_cache() def _init_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers @@ -70,17 +70,8 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine first profiles the existing memory usage. - Then, it allocates the remaining memory for KV blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. + def profile_num_available_blocks(self) -> KvCacheProfileResult: + # TODO clean up datastructure num_gpu_blocks, num_cpu_blocks = ( self.driver_worker.profile_num_available_blocks( block_size=self.cache_config.block_size, @@ -90,27 +81,75 @@ def _init_cache(self) -> None: cache_dtype=self.cache_config.cache_dtype, )) + return KvCacheProfileResult( + num_active_kv_blocks=num_gpu_blocks, + num_swapped_kv_blocks=num_cpu_blocks, + ) + + def allocate_kv_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks + forced_num_active_kv_blocks = self.cache_config.forced_num_gpu_blocks + logger.info(f"Replacing profiled {num_active_kv_blocks=} with " + f"{forced_num_active_kv_blocks=}") + num_active_kv_blocks = forced_num_active_kv_blocks - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") + logger.info(f"# GPU blocks: {num_active_kv_blocks}, " + f"# CPU blocks: {num_swapped_kv_blocks}") - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + check_block_size_valid(num_active_kv_blocks, self.cache_config.block_size, self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks + self.cache_config.num_gpu_blocks = num_active_kv_blocks + self.cache_config.num_cpu_blocks = num_swapped_kv_blocks # Initialize the cache. self.driver_worker.init_cache_engine(cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph # if enforce_eager is False. self.driver_worker.warm_up_model() + #def _init_cache(self) -> None: + # """Profiles the memory usage and initializes the KV cache. 
+ + # The engine first profiles the existing memory usage. + # Then, it allocates the remaining memory for KV blocks. + + # .. tip:: + # You may limit the usage of GPU memory + # by adjusting the `gpu_memory_utilization` parameter. + # """ + # # Get the maximum number of blocks that can be allocated on GPU and CPU. + # num_gpu_blocks, num_cpu_blocks = ( + # self.driver_worker.profile_num_available_blocks( + # block_size=self.cache_config.block_size, + # gpu_memory_utilization=self.cache_config. + # gpu_memory_utilization, + # cpu_swap_space=self.cache_config.swap_space_bytes, + # cache_dtype=self.cache_config.cache_dtype, + # )) + + # if self.cache_config.forced_num_gpu_blocks is not None: + # forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + # logger.info(f"Replacing profiled {num_gpu_blocks=} with " + # f"{forced_num_gpu_blocks=}") + # num_gpu_blocks = forced_num_gpu_blocks + + # logger.info(f"# GPU blocks: {num_gpu_blocks}, " + # f"# CPU blocks: {num_cpu_blocks}") + + # check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + # self.model_config.max_model_len) + + # self.cache_config.num_gpu_blocks = num_gpu_blocks + # self.cache_config.num_cpu_blocks = num_cpu_blocks + + # # Initialize the cache. + # self.driver_worker.init_cache_engine(cache_config=self.cache_config) + # # Warm up the model. This includes capturing the model into CUDA graph + # # if enforce_eager is False. + # self.driver_worker.warm_up_model() + def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], From 09f30bde56f9f7709fcd14e5edcf4e98d345cf73 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 3 Apr 2024 16:45:05 -0700 Subject: [PATCH 003/109] wip --- vllm/executor/gpu_executor.py | 41 +---------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 95cd0fa8940f9..df1f30b0e4028 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -86,6 +86,7 @@ def profile_num_available_blocks(self) -> KvCacheProfileResult: num_swapped_kv_blocks=num_cpu_blocks, ) + def allocate_kv_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: if self.cache_config.forced_num_gpu_blocks is not None: forced_num_active_kv_blocks = self.cache_config.forced_num_gpu_blocks @@ -109,46 +110,6 @@ def allocate_kv_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> # if enforce_eager is False. self.driver_worker.warm_up_model() - #def _init_cache(self) -> None: - # """Profiles the memory usage and initializes the KV cache. - - # The engine first profiles the existing memory usage. - # Then, it allocates the remaining memory for KV blocks. - - # .. tip:: - # You may limit the usage of GPU memory - # by adjusting the `gpu_memory_utilization` parameter. - # """ - # # Get the maximum number of blocks that can be allocated on GPU and CPU. - # num_gpu_blocks, num_cpu_blocks = ( - # self.driver_worker.profile_num_available_blocks( - # block_size=self.cache_config.block_size, - # gpu_memory_utilization=self.cache_config. 
- # gpu_memory_utilization, - # cpu_swap_space=self.cache_config.swap_space_bytes, - # cache_dtype=self.cache_config.cache_dtype, - # )) - - # if self.cache_config.forced_num_gpu_blocks is not None: - # forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - # logger.info(f"Replacing profiled {num_gpu_blocks=} with " - # f"{forced_num_gpu_blocks=}") - # num_gpu_blocks = forced_num_gpu_blocks - - # logger.info(f"# GPU blocks: {num_gpu_blocks}, " - # f"# CPU blocks: {num_cpu_blocks}") - - # check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - # self.model_config.max_model_len) - - # self.cache_config.num_gpu_blocks = num_gpu_blocks - # self.cache_config.num_cpu_blocks = num_cpu_blocks - - # # Initialize the cache. - # self.driver_worker.init_cache_engine(cache_config=self.cache_config) - # # Warm up the model. This includes capturing the model into CUDA graph - # # if enforce_eager is False. - # self.driver_worker.warm_up_model() def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], From 8b5bb8b98320d235fcba5c960c6fb1778bd314c6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:33:02 -0700 Subject: [PATCH 004/109] clean --- vllm/engine/llm_engine.py | 2 +- vllm/executor/gpu_executor.py | 26 +++++++++++++++----------- vllm/worker/worker.py | 22 +++++++++++++++------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 831627ac72e91..c311c96b76e86 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -129,7 +129,7 @@ def __init__( # TODO cleanup location profile_result = self.model_executor.profile_num_available_blocks() - self.model_executor.allocate_kv_cache( + self.model_executor.initialize_cache( num_active_kv_blocks=profile_result.num_active_kv_blocks, num_swapped_kv_blocks=profile_result.num_swapped_kv_blocks, ) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index df1f30b0e4028..8095659c092e3 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -55,16 +55,17 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, - kv_cache_dtype=self.cache_config.cache_dtype, + #kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) self.driver_worker.init_device() @@ -74,12 +75,15 @@ def profile_num_available_blocks(self) -> KvCacheProfileResult: # TODO clean up datastructure num_gpu_blocks, num_cpu_blocks = ( self.driver_worker.profile_num_available_blocks( - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config. - gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, + #self.cache_config, )) + #self.driver_worker.profile_num_available_blocks( + # block_size=self.cache_config.block_size, + # gpu_memory_utilization=self.cache_config. 
+ # gpu_memory_utilization, + # cpu_swap_space=self.cache_config.swap_space_bytes, + # cache_dtype=self.cache_config.cache_dtype, + #)) return KvCacheProfileResult( num_active_kv_blocks=num_gpu_blocks, @@ -87,7 +91,7 @@ def profile_num_available_blocks(self) -> KvCacheProfileResult: ) - def allocate_kv_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: + def initialize_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: if self.cache_config.forced_num_gpu_blocks is not None: forced_num_active_kv_blocks = self.cache_config.forced_num_gpu_blocks logger.info(f"Replacing profiled {num_active_kv_blocks=} with " diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 48facb57de190..58a8752d9dcc6 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -35,18 +35,20 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, local_rank: int, rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, - kv_cache_dtype: Optional[str] = "auto", + #kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, ) -> None: self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config + self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -66,12 +68,12 @@ def __init__( scheduler_config, device_config, lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, + kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, vision_language_config=vision_language_config) # Uninitialized cache engine. Will be initialized by # self.init_cache_engine(). - self.cache_config = None + #self.cache_config = None self.cache_engine = None self.gpu_cache = None @@ -109,10 +111,10 @@ def load_model(self): @torch.inference_mode() def profile_num_available_blocks( self, - block_size: int, - gpu_memory_utilization: float, - cpu_swap_space: int, - cache_dtype: str, + #block_size: int, + #gpu_memory_utilization: float, + #cpu_swap_space: int, + #cache_dtype: str, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. @@ -122,6 +124,12 @@ def profile_num_available_blocks( gpu_memory_utilization: The fraction of the total GPU memory to use. cpu_swap_space: The size of the CPU swap space in bytes. """ + + block_size = self.cache_config.block_size + gpu_memory_utilization = self.cache_config.gpu_memory_utilization + cpu_swap_space = self.cache_config.swap_space_bytes + cache_dtype = self.cache_config.cache_dtype + # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
torch.cuda.empty_cache() From 6fd424f4391a5a4f8138e696c68ace58906e913c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:40:24 -0700 Subject: [PATCH 005/109] wip --- vllm/engine/llm_engine.py | 5 ++-- vllm/executor/executor_base.py | 5 ---- vllm/executor/gpu_executor.py | 45 ++++++++++++---------------------- vllm/worker/worker.py | 6 ++--- 4 files changed, 19 insertions(+), 42 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c311c96b76e86..8758edf0ef9b5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -128,10 +128,9 @@ def __init__( ) # TODO cleanup location - profile_result = self.model_executor.profile_num_available_blocks() + num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() self.model_executor.initialize_cache( - num_active_kv_blocks=profile_result.num_active_kv_blocks, - num_swapped_kv_blocks=profile_result.num_swapped_kv_blocks, + num_gpu_blocks, num_cpu_blocks, ) # If usage stat is enabled, collect relevant info. diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 94b2fe420838a..b531d2080be51 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -116,8 +116,3 @@ async def check_health_async(self) -> None: """Checks if the executor is healthy. If not, it should raise an exception.""" raise NotImplementedError - -@dataclass(frozen=True) -class KvCacheProfileResult: - num_active_kv_blocks: int - num_swapped_kv_blocks: int diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 8095659c092e3..7fa2c4eb6024b 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -1,9 +1,9 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase, KvCacheProfileResult +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.executor.utils import check_block_size_valid from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -71,41 +71,26 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def profile_num_available_blocks(self) -> KvCacheProfileResult: - # TODO clean up datastructure - num_gpu_blocks, num_cpu_blocks = ( - self.driver_worker.profile_num_available_blocks( - #self.cache_config, - )) - #self.driver_worker.profile_num_available_blocks( - # block_size=self.cache_config.block_size, - # gpu_memory_utilization=self.cache_config. 
- # gpu_memory_utilization, - # cpu_swap_space=self.cache_config.swap_space_bytes, - # cache_dtype=self.cache_config.cache_dtype, - #)) - - return KvCacheProfileResult( - num_active_kv_blocks=num_gpu_blocks, - num_swapped_kv_blocks=num_cpu_blocks, - ) + + def profile_num_available_blocks(self) -> Tuple[int, int]: + return self.driver_worker.profile_num_available_blocks() - def initialize_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_active_kv_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_active_kv_blocks=} with " - f"{forced_num_active_kv_blocks=}") - num_active_kv_blocks = forced_num_active_kv_blocks + forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + logger.info(f"Replacing profiled {num_gpu_blocks=} with " + f"{forced_num_gpu_blocks=}") + num_gpu_blocks = forced_num_gpu_blocks - logger.info(f"# GPU blocks: {num_active_kv_blocks}, " - f"# CPU blocks: {num_swapped_kv_blocks}") + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") - check_block_size_valid(num_active_kv_blocks, self.cache_config.block_size, + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_active_kv_blocks - self.cache_config.num_cpu_blocks = num_swapped_kv_blocks + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks # Initialize the cache. self.driver_worker.init_cache_engine(cache_config=self.cache_config) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 58a8752d9dcc6..8176529ba7bea 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -111,15 +111,12 @@ def load_model(self): @torch.inference_mode() def profile_num_available_blocks( self, - #block_size: int, - #gpu_memory_utilization: float, - #cpu_swap_space: int, - #cache_dtype: str, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. Args: + # TODO block_size: The size of the cache block. gpu_memory_utilization: The fraction of the total GPU memory to use. cpu_swap_space: The size of the CPU swap space in bytes. 
@@ -161,6 +158,7 @@ def profile_num_available_blocks( self.model_runner.remove_all_loras() gc.collect() torch.cuda.empty_cache() + return num_gpu_blocks, num_cpu_blocks def init_cache_engine(self, cache_config: CacheConfig) -> None: From 2a347bb39a4284fcf7710541838218ac1666b6ef Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:50:15 -0700 Subject: [PATCH 006/109] wip --- vllm/engine/llm_engine.py | 32 +++++++++++++++++++++++++++++--- vllm/executor/gpu_executor.py | 15 --------------- vllm/executor/utils.py | 1 + 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8758edf0ef9b5..e0e732681f7d0 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -129,9 +129,20 @@ def __init__( # TODO cleanup location num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() - self.model_executor.initialize_cache( - num_gpu_blocks, num_cpu_blocks, - ) + + if self.cache_config.forced_num_gpu_blocks is not None: + forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + logger.info(f"Replacing profiled {num_gpu_blocks=} with " + f"{forced_num_gpu_blocks=}") + num_gpu_blocks = forced_num_gpu_blocks + + raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + + logger.info( + f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): @@ -841,3 +852,18 @@ def list_loras(self) -> List[int]: def check_health(self) -> None: self.model_executor.check_health() + + +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 7fa2c4eb6024b..bc88571542f6c 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -41,9 +41,6 @@ def __init__( # Instantiate the worker and load the model to GPU. self._init_worker() - # Profile the memory usage and initialize the cache. 
- #self._init_cache() - def _init_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker @@ -77,18 +74,6 @@ def profile_num_available_blocks(self) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: - if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks - - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py index 44976696a77c6..666ab4b2927db 100644 --- a/vllm/executor/utils.py +++ b/vllm/executor/utils.py @@ -1,3 +1,4 @@ +# TODO def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: if num_gpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " From 658ff9be6ac2a7bd3dadf3ff9542763c3f2518d0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:55:26 -0700 Subject: [PATCH 007/109] wip --- vllm/config.py | 15 +++++++++++++++ vllm/engine/llm_engine.py | 4 +++- vllm/executor/gpu_executor.py | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e27c8eb4fd257..7a137006d0b58 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -367,6 +367,21 @@ def metrics_info(self): # metrics info return {key: str(value) for key, value in self.__dict__.items()} + def shallow_copy(self): + cache_config = CacheConfig( + block_size=self.block_size, + gpu_memory_utilization=self.gpu_memory_utilization, + swap_space=self.swap_space_bytes // _GB, + cache_dtype=self.cache_dtype, + forced_num_gpu_blocks=self.forced_num_gpu_blocks, + sliding_window=self.sliding_window, + enable_prefix_caching=self.enable_prefix_caching + ) + + cache_config.num_gpu_blocks = self.num_gpu_blocks + cache_config.num_cpu_blocks = self.num_cpu_blocks + return cache_config + def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e0e732681f7d0..11dad5e5e86cf 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -118,7 +118,7 @@ def __init__( self.model_executor = executor_class( model_config=model_config, - cache_config=cache_config, + cache_config=cache_config.shallow_copy(), parallel_config=parallel_config, scheduler_config=scheduler_config, device_config=device_config, @@ -137,6 +137,8 @@ def __init__( num_gpu_blocks = forced_num_gpu_blocks raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks logger.info( f"# GPU blocks: {num_gpu_blocks}, " diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index bc88571542f6c..0156b735c7b29 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -56,7 +56,7 @@ def _init_worker(self): parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, device_config=self.device_config, - cache_config=self.cache_config, + 
cache_config=self.cache_config.shallow_copy(), local_rank=0, rank=0, distributed_init_method=distributed_init_method, From acee7bec37362863b7bd57eafcbc693bff76a64e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:59:41 -0700 Subject: [PATCH 008/109] wip --- vllm/engine/llm_engine.py | 1 + vllm/executor/executor_base.py | 51 ++++++++++++++++------------------ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 11dad5e5e86cf..f030e7ebf6797 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -137,6 +137,7 @@ def __init__( num_gpu_blocks = forced_num_gpu_blocks raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index b531d2080be51..9dd372156b9ff 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -40,35 +40,32 @@ def __init__( # raise NotImplementedError - #@abstractmethod - #def profile_num_available_blocks(self, block_size: int, - # gpu_memory_utilization: float, - # cpu_swap_space: float, - # cache_dtype: str) -> tuple[int, int]: - # """Profile the model on-device to determine the maximum number of KV - # blocks that can be allocated. - - # Returns a tuple[num_device_blocks, num_cpu_blocks], where - # num_device_blocks refers to the number of blocks in the "active" KV - # cache (e.g. where blocks are appended to), and num_cpu_blocks refers - # to the number of blocks in the "passive" KV cache (e.g. where blocks - # are swapped to). - - # Examples: - # - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. - # - A future CPUExecutor can return [num_cpu_blocks, 0] or - # [num_cpu_blocks, num_swap_cpu_blocks]. - # """ - # raise NotImplementedError + @abstractmethod + def profile_num_available_blocks(self) -> tuple[int, int]: + """Profile the model on-device to determine the maximum number of KV + blocks that can be allocated. + + Returns a tuple[num_device_blocks, num_cpu_blocks], where + num_device_blocks refers to the number of blocks in the "active" KV + cache (e.g. where blocks are appended to), and num_cpu_blocks refers + to the number of blocks in the "passive" KV cache (e.g. where blocks + are swapped to). + + Examples: + - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. + - A future CPUExecutor can return [num_cpu_blocks, 0] or + [num_cpu_blocks, num_swap_cpu_blocks]. + """ + raise NotImplementedError - #@abstractmethod - #def init_cache(self, cache_config: CacheConfig) -> None: - # """Given a fully-specified cache config, initialize the KV cache. This - # is separate from init_workers as profiling may be required to determine - # the maxmimum allowed KV cache size. - # """ - # raise NotImplementedError + @abstractmethod + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Given a fully-specified cache config, initialize the KV cache. This + is separate from init_workers as profiling may be required to determine + the maxmimum allowed KV cache size. 
+ """ + raise NotImplementedError @abstractmethod def execute_model(self, From 85760d63461700ff0a25f8bf53d2825d8d976d41 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:04:09 -0700 Subject: [PATCH 009/109] wip --- vllm/config.py | 2 +- vllm/engine/llm_engine.py | 23 +---------------------- vllm/executor/cpu_executor.py | 25 ++++++++++++++++++------- vllm/executor/gpu_executor.py | 11 +++++++++-- vllm/executor/neuron_executor.py | 22 ++++++++++++++++++++-- vllm/executor/utils.py | 14 ++++++++++++++ 6 files changed, 63 insertions(+), 34 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 7a137006d0b58..735462d1eba69 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -367,7 +367,7 @@ def metrics_info(self): # metrics info return {key: str(value) for key, value in self.__dict__.items()} - def shallow_copy(self): + def shallow_copy2(self): cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f030e7ebf6797..f8cb7b0f38a11 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -118,7 +118,7 @@ def __init__( self.model_executor = executor_class( model_config=model_config, - cache_config=cache_config.shallow_copy(), + cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, device_config=device_config, @@ -136,15 +136,9 @@ def __init__( f"{forced_num_gpu_blocks=}") num_gpu_blocks = forced_num_gpu_blocks - raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - logger.info( - f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) # If usage stat is enabled, collect relevant info. @@ -855,18 +849,3 @@ def list_loras(self) -> List[int]: def check_health(self) -> None: self.model_executor.check_health() - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 7b3cc784c98e5..38d5fa0032c5b 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -35,7 +35,7 @@ def __init__(self, model_config: ModelConfig, cache_config: CacheConfig, # Instantiate the worker and load the model to CPU. 
self._init_worker() - self._init_cache() + #self._init_cache() def _init_worker(self): from vllm.worker.cpu_worker import CPUWorker @@ -60,13 +60,29 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def _init_cache(self) -> None: + def profile_num_available_blocks(self) -> tuple[int, int]: num_cpu_blocks = self.driver_worker.get_cpu_cache_block_num( block_size=self.cache_config.block_size, cache_space=self.cache_config.cpu_kvcache_space_bytes, cache_dtype=self.cache_config.cache_dtype, ) + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + num_gpu_blocks = num_cpu_blocks + num_cpu_blocks = 0 + return num_gpu_blocks, num_cpu_blocks + + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + assert num_cpu_blocks == 0 + num_cpu_blocks = num_gpu_blocks + num_gpu_blocks = 0 + self.cache_config.num_gpu_blocks = num_cpu_blocks + self.cache_config.num_cpu_blocks = 0 + logger.info(f"# CPU blocks: {num_cpu_blocks}") if num_cpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " @@ -82,11 +98,6 @@ def _init_cache(self) -> None: "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " "initializing the engine.") - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - self.cache_config.num_gpu_blocks = num_cpu_blocks # type: ignore - self.cache_config.num_cpu_blocks = 0 # type: ignore - # Initialize the cache. self.driver_worker.init_cache_engine(cache_config=self.cache_config) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 0156b735c7b29..afec559a59fb6 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -4,7 +4,7 @@ ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid +from vllm.executor.utils import check_block_size_valid, raise_if_cache_size_invalid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -56,7 +56,7 @@ def _init_worker(self): parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, device_config=self.device_config, - cache_config=self.cache_config.shallow_copy(), + cache_config=self.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, @@ -74,6 +74,13 @@ def profile_num_available_blocks(self) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + logger.info( + f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}" + ) + + raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index c0af058cb90b5..53441d8ecca05 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -36,8 +36,8 @@ def __init__( # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. 
- self.cache_config.num_gpu_blocks = self.scheduler_config.max_num_seqs - self.cache_config.num_cpu_blocks = 0 + #self.cache_config.num_gpu_blocks = self.scheduler_config.max_num_seqs + #self.cache_config.num_cpu_blocks = 0 # Instantiate the worker and load the model to the device. self._init_worker() @@ -54,6 +54,24 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() + # TODO change name + def profile_num_available_blocks(self) -> tuple[int, int]: + # Set the number of GPU blocks to be the same as the maximum number of + # sequences that can be processed in a single batch. This is equivalent + # to schedule without PagedAttention. + num_gpu_blocks = self.scheduler_config.max_num_seqs + + # Swap not yet supported with Neuron backend. + num_cpu_blocks = 0 + + return num_gpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + assert num_cpu_blocks == 0 + assert num_gpu_blocks == self.scheduler_config.max_num_seqs + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py index 666ab4b2927db..89fe04434062f 100644 --- a/vllm/executor/utils.py +++ b/vllm/executor/utils.py @@ -12,3 +12,17 @@ def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: f"stored in KV cache ({max_seq_len}). Try increasing " "`gpu_memory_utilization` or decreasing `max_model_len` when " "initializing the engine.") + +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") From 408b29d318da99c8bb277df38341dc41ebf98655 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:07:02 -0700 Subject: [PATCH 010/109] wip --- vllm/executor/ray_gpu_executor.py | 36 +++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 24b3a8c18d920..80ec36e9a5d93 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -10,7 +10,7 @@ VisionLanguageConfig) from vllm.engine.ray_utils import RayWorkerVllm, ray from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid +from vllm.executor.utils import check_block_size_valid, raise_if_cache_size_invalid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -66,7 +66,7 @@ def __init__( self._init_workers_ray(placement_group) # Profile the memory usage and initialize the cache. - self._init_cache() + #self._init_cache() self.forward_dag = None if USE_RAY_COMPILED_DAG: @@ -256,6 +256,38 @@ def _init_cache(self) -> None: # if enforce_eager is False. 
self._run_workers("warm_up_model") + def profile_num_available_blocks(self) -> tuple[int, int]: + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], From 3149a03d2780f241c80b365a10b1d39e2af90abf Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:08:06 -0700 Subject: [PATCH 011/109] wip --- vllm/config.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 735462d1eba69..e27c8eb4fd257 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -367,21 +367,6 @@ def metrics_info(self): # metrics info return {key: str(value) for key, value in self.__dict__.items()} - def shallow_copy2(self): - cache_config = CacheConfig( - block_size=self.block_size, - gpu_memory_utilization=self.gpu_memory_utilization, - swap_space=self.swap_space_bytes // _GB, - cache_dtype=self.cache_dtype, - forced_num_gpu_blocks=self.forced_num_gpu_blocks, - sliding_window=self.sliding_window, - enable_prefix_caching=self.enable_prefix_caching - ) - - cache_config.num_gpu_blocks = self.num_gpu_blocks - cache_config.num_cpu_blocks = self.num_cpu_blocks - return cache_config - def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( From 0c32e0a793489d8f3b557f747e22bdb27bedb85f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:18:44 -0700 Subject: [PATCH 012/109] wip --- vllm/executor/gpu_executor.py | 2 +- vllm/worker/worker.py | 11 ++++++++--- vllm/worker/worker_base.py | 8 +------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index afec559a59fb6..6619cdc15a179 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -85,7 +85,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.cache_config.num_cpu_blocks = num_cpu_blocks # Initialize the cache. - self.driver_worker.init_cache_engine(cache_config=self.cache_config) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) # Warm up the model. This includes capturing the model into CUDA graph # if enforce_eager is False. 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 8176529ba7bea..b6955fa678bf7 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,9 +19,10 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner +from vllm.worker.worker_base import WorkerBase -class Worker: +class Worker(WorkerBase): """A worker class that executes (a partition of) the model on a GPU. Each worker is associated with a single GPU. The worker is responsible for @@ -161,13 +162,17 @@ def profile_num_available_blocks( return num_gpu_blocks, num_cpu_blocks - def init_cache_engine(self, cache_config: CacheConfig) -> None: - self.cache_config = cache_config + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + #self.cache_config = cache_config self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) self.gpu_cache = self.cache_engine.gpu_cache self.model_runner.set_block_size(self.cache_engine.block_size) + def warm_up_model(self) -> None: if not self.model_config.enforce_eager: self.model_runner.capture_model(self.gpu_cache) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c12b876451ca3..7db8cc0fe591c 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -35,7 +35,7 @@ def profile_num_available_blocks(self, block_size: int, raise NotImplementedError @abstractmethod - def init_cache(self, cache_config: CacheConfig) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Given a fully-specified cache config, initialize the KV cache. This is separate from init_workers as profiling may be required to determine the maxmimum allowed KV cache size. @@ -62,9 +62,3 @@ def remove_lora(self, lora_id: int) -> bool: @abstractmethod def list_loras(self) -> List[int]: raise NotImplementedError - - @abstractmethod - def check_health(self) -> None: - """Checks if the executor is healthy. If not, it should raise an - exception.""" - raise NotImplementedError From f64d5b14196dd3f21bfad900ba0033fd6441343b Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:24:04 -0700 Subject: [PATCH 013/109] wip --- vllm/executor/neuron_executor.py | 13 ++++++++----- vllm/worker/neuron_worker.py | 25 +++++++++++++++++++++++-- vllm/worker/worker_base.py | 5 +---- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 53441d8ecca05..c9ecca885ee2e 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -25,7 +25,7 @@ def __init__( speculative_config: Optional[SpeculativeConfig], ) -> None: self.model_config = model_config - self.cache_config = cache_config + #self.cache_config = cache_config assert lora_config is None, "LoRA is not supported for Neuron backend." self.parallel_config = parallel_config self.scheduler_config = scheduler_config @@ -56,6 +56,8 @@ def _init_worker(self): # TODO change name def profile_num_available_blocks(self) -> tuple[int, int]: + return self.driver_worker.profile_num_available_blocks() + # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. 
@@ -67,10 +69,11 @@ def profile_num_available_blocks(self) -> tuple[int, int]: return num_gpu_blocks, num_cpu_blocks def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - assert num_cpu_blocks == 0 - assert num_gpu_blocks == self.scheduler_config.max_num_seqs - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + #assert num_cpu_blocks == 0 + #assert num_gpu_blocks == self.scheduler_config.max_num_seqs + #self.cache_config.num_gpu_blocks = num_gpu_blocks + #self.cache_config.num_cpu_blocks = num_cpu_blocks def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 0ae067aafb29b..3f39808bf4ac9 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -5,13 +5,14 @@ import torch.distributed from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig, - SchedulerConfig) + SchedulerConfig, CacheConfig) from vllm.model_executor import set_random_seed from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.neuron_model_runner import NeuronModelRunner +from vllm.worker.worker_base import WorkerBase -class NeuronWorker: +class NeuronWorker(WorkerBase): """A worker class that executes the model on a group of neuron cores. """ @@ -21,11 +22,13 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, ) -> None: self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config + self.cache_config = cache_config self.model_runner = NeuronModelRunner(model_config, parallel_config, scheduler_config, device_config) @@ -37,6 +40,24 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() + # TODO change name + def profile_num_available_blocks(self) -> tuple[int, int]: + # Set the number of GPU blocks to be the same as the maximum number of + # sequences that can be processed in a single batch. This is equivalent + # to schedule without PagedAttention. + num_gpu_blocks = self.scheduler_config.max_num_seqs + + # Swap not yet supported with Neuron backend. + num_cpu_blocks = 0 + + return num_gpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + assert num_cpu_blocks == 0 + assert num_gpu_blocks == self.scheduler_config.max_num_seqs + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7db8cc0fe591c..9c37459ed344e 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -14,10 +14,7 @@ def init_device(self) -> None: raise NotImplementedError @abstractmethod - def profile_num_available_blocks(self, block_size: int, - gpu_memory_utilization: float, - cpu_swap_space: float, - cache_dtype: str) -> tuple[int, int]: + def profile_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. 
From 7207f0c368b43f1e8edbcde55e729f781dafb549 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:28:18 -0700 Subject: [PATCH 014/109] wip --- vllm/worker/cpu_worker.py | 43 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 262ed9abd36b7..e9dff4a6bf5da 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -17,6 +17,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.model_runner import ModelRunner +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -112,7 +113,7 @@ def get_cache_block_size( return dtype_size * total -class CPUWorker: +class CPUWorker(WorkerBase): """A worker class that executes (a partition of) the model on a CPU socket. Each worker is associated with a single CPU socket. The worker is @@ -167,6 +168,46 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() + def profile_num_available_blocks(self) -> tuple[int, int]: + num_cpu_blocks = self.get_cpu_cache_block_num( + block_size=self.cache_config.block_size, + cache_space=self.cache_config.cpu_kvcache_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + num_gpu_blocks = num_cpu_blocks + num_cpu_blocks = 0 + return num_gpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + assert num_cpu_blocks == 0 + num_cpu_blocks = num_gpu_blocks + num_gpu_blocks = 0 + self.cache_config.num_gpu_blocks = num_cpu_blocks + self.cache_config.num_cpu_blocks = 0 + + logger.info(f"# CPU blocks: {num_cpu_blocks}") + if num_cpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " + "initializing the engine.") + + max_seq_len = self.cache_config.block_size * num_cpu_blocks + if self.model_config.max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({self.model_config.max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " + "initializing the engine.") + + # Initialize the cache. + self.init_cache_engine(cache_config=self.cache_config) + def get_cpu_cache_block_num( self, block_size: int, From 0c4df0b14612af9b8cd63097bfe6e543bf365e97 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 15:05:43 -0700 Subject: [PATCH 015/109] wip --- vllm/worker/cpu_worker.py | 2 +- vllm/worker/worker_base.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index e9dff4a6bf5da..13f8f050c6eac 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -113,7 +113,7 @@ def get_cache_block_size( return dtype_size * total -class CPUWorker(WorkerBase): +class CPUWorker(LoraNotSupportedWorkerBase): """A worker class that executes (a partition of) the model on a CPU socket. Each worker is associated with a single CPU socket. 
The worker is diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 9c37459ed344e..4ba985c0a39b1 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -59,3 +59,13 @@ def remove_lora(self, lora_id: int) -> bool: @abstractmethod def list_loras(self) -> List[int]: raise NotImplementedError + +class LoraNotSupportedWorkerBase(WorkerBase); + def add_lora(self, lora_request: LoRARequest) -> bool: + raise ValueError(f"{type(self)} does not support LoRA") + + def remove_lora(self, lora_id: int) -> bool: + raise ValueError(f"{type(self)} does not support LoRA") + + def list_loras(self) -> List[int]: + raise ValueError(f"{type(self)} does not support LoRA") From 2e355e7e02c095280878e93a8865443834adf0f5 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 15:07:13 -0700 Subject: [PATCH 016/109] wip --- vllm/executor/cpu_executor.py | 6 +++--- vllm/worker/worker_base.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 38d5fa0032c5b..44fdddc3f9426 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -115,13 +115,13 @@ def execute_model(self, return output def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError("LoRA is not implemented for cpu backend.") + return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for cpu backend.") + return self.driver_worker.remove_lora(lora_id) def list_loras(self) -> List[int]: - raise NotImplementedError("LoRA is not implemented for cpu backend.") + return self.driver_worker.list_loras() def check_health(self) -> None: # CPUExecutor will always be healthy as long as diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 4ba985c0a39b1..5fd381c5b9538 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -60,7 +60,7 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> List[int]: raise NotImplementedError -class LoraNotSupportedWorkerBase(WorkerBase); +class LoraNotSupportedWorkerBase(WorkerBase): def add_lora(self, lora_request: LoRARequest) -> bool: raise ValueError(f"{type(self)} does not support LoRA") From edb7f6281f065a0d6f89852876152199b75519aa Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 15:07:59 -0700 Subject: [PATCH 017/109] wip --- vllm/worker/worker_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 5fd381c5b9538..cf611eb2b88aa 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -60,6 +60,7 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> List[int]: raise NotImplementedError + class LoraNotSupportedWorkerBase(WorkerBase): def add_lora(self, lora_request: LoRARequest) -> bool: raise ValueError(f"{type(self)} does not support LoRA") From 48bb3e9b0340160b394c5a754e0ef39a08ffdff6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 15:11:48 -0700 Subject: [PATCH 018/109] wip --- vllm/executor/neuron_executor.py | 9 +++------ vllm/worker/cpu_worker.py | 2 +- vllm/worker/neuron_worker.py | 4 ++-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index c9ecca885ee2e..2626a5d816728 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -89,16 +89,13 @@ def 
execute_model(self, return output def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError( - "LoRA is not implemented for neuron backend.") + return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRA is not implemented for neuron backend.") + return self.driver_worker.remove_lora(lora_request) def list_loras(self) -> List[int]: - raise NotImplementedError( - "LoRA is not implemented for neuron backend.") + return self.driver_worker.list_loras(lora_request) def check_health(self) -> None: # NeuronExecutor will always be healthy as long as diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 13f8f050c6eac..848f36d15abd7 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -17,7 +17,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase logger = init_logger(__name__) diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 3f39808bf4ac9..16e9a128d024c 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -9,10 +9,10 @@ from vllm.model_executor import set_random_seed from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.neuron_model_runner import NeuronModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase -class NeuronWorker(WorkerBase): +class NeuronWorker(LoraNotSupportedWorkerBase): """A worker class that executes the model on a group of neuron cores. """ From 7b390444dfb97d6639b035601e941edc7952b2d1 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 16:57:36 -0700 Subject: [PATCH 019/109] fix test --- tests/spec_decode/test_spec_decode_worker.py | 22 ++++++---------- tests/spec_decode/utils.py | 3 ++- vllm/spec_decode/spec_decode_worker.py | 27 ++++++++------------ vllm/worker/cache_engine.py | 10 +++----- vllm/worker/worker.py | 10 +++----- 5 files changed, 27 insertions(+), 45 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 87d3716ca98d7..d4c15d9aea50e 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -512,7 +512,7 @@ def test_init_device(): @torch.inference_mode() -def test_init_cache_engine(): +def test_initialize_cache(): """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer workers. 
""" @@ -526,11 +526,12 @@ def test_init_cache_engine(): metrics_collector) cache_config = MagicMock() + + kwargs = {"num_gpu_blocks":1024, "num_cpu_blocks": 1023} + worker.initialize_cache(**kwargs) - worker.init_cache_engine(cache_config) - - draft_worker.init_cache_engine.assert_called_once_with(cache_config) - target_worker.init_cache_engine.assert_called_once_with(cache_config) + draft_worker.initialize_cache.assert_called_once_with(**kwargs) + target_worker.initialize_cache.assert_called_once_with(**kwargs) @pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) @@ -561,17 +562,10 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, metrics_collector) - # These values do not directly impact the adjusted block size calculation, - # so they can be fixed. - gpu_memory_utilization = 0.9 - cpu_swap_space = 100 - block_size = 16 - num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks( - block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto") + num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks() - target_worker.profile_num_available_blocks.assert_called_once_with( - block_size, gpu_memory_utilization, cpu_swap_space, "auto") + target_worker.profile_num_available_blocks.assert_called_once() assert num_cpu_blocks == available_cpu_blocks assert num_gpu_blocks == split_num_cache_blocks_evenly( diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 5ef1cc28253e9..5c78b3b780d86 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -117,6 +117,7 @@ def create_worker(cls: type, parallel_config=engine_config.parallel_config, scheduler_config=engine_config.scheduler_config, device_config=engine_config.device_config, + cache_config=engine_config.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, @@ -128,7 +129,7 @@ def create_worker(cls: type, engine_config.cache_config.num_gpu_blocks = num_gpu_blocks engine_config.cache_config.num_cpu_blocks = 0 - worker.init_cache_engine(engine_config.cache_config) + worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) worker.warm_up_model() return worker diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 59f9d5b5107f3..659acc6620bc1 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,6 @@ import torch -from vllm.config import CacheConfig from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceGroupOutput, SequenceOutput) @@ -15,9 +14,10 @@ from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, split_batch_by_proposal_len) from vllm.worker.worker import Worker +from vllm.worker.worker_base import LoraNotSupportedWorkerBase -class SpecDecodeWorker: +class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. 
Speculative decoding reduces decoding per-token latency by using a proposal @@ -94,10 +94,7 @@ def init_device(self) -> None: device=self.device, vocab_size=self._vocab_size) - def profile_num_available_blocks(self, block_size: int, - gpu_memory_utilization: float, - cpu_swap_space: int, - cache_dtype: str) -> Tuple[int, int]: + def profile_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of cache blocks to use. This is done by profiling the scorer model (which is typically the @@ -105,28 +102,24 @@ def profile_num_available_blocks(self, block_size: int, scorer cache is divided evenly between the proposer and scorer model KV, such that the number of blocks is equal in both KV caches. """ - num_gpu_blocks, num_cpu_blocks = ( - self.scorer_worker.profile_num_available_blocks( - block_size, gpu_memory_utilization, cpu_swap_space, - cache_dtype)) + num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.profile_num_available_blocks()) scorer_cache_block_size_bytes = ( - self.scorer_worker.get_cache_block_size_bytes( - block_size, cache_dtype)) + self.scorer_worker.get_cache_block_size_bytes()) proposer_cache_block_size_bytes = ( - self.proposer_worker.get_cache_block_size_bytes( - block_size, cache_dtype)) + self.proposer_worker.get_cache_block_size_bytes()) new_num_gpu_blocks = split_num_cache_blocks_evenly( scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, num_gpu_blocks) return new_num_gpu_blocks, num_cpu_blocks - def init_cache_engine(self, cache_config: CacheConfig): + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Initialize the cache engine of the scorer and proposer workers. + TODO """ - self.scorer_worker.init_cache_engine(cache_config) - self.proposer_worker.init_cache_engine(cache_config) + self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) @torch.inference_mode() def execute_model( diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 27d1727cd16a3..011fc69c4b1cd 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -80,10 +80,8 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) - @staticmethod def get_cache_block_size( - block_size: int, - cache_dtype: str, + self, model_config: ModelConfig, parallel_config: ParallelConfig, ) -> int: @@ -91,13 +89,13 @@ def get_cache_block_size( num_heads = model_config.get_num_kv_heads(parallel_config) num_layers = model_config.get_num_layers(parallel_config) - key_cache_block = block_size * num_heads * head_size + key_cache_block = self.block_size * num_heads * head_size value_cache_block = key_cache_block total = num_layers * (key_cache_block + value_cache_block) - if cache_dtype == "auto": + if self.cache_dtype == "auto": dtype = model_config.dtype else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] + dtype = STR_DTYPE_TO_TORCH_DTYPE[self.cache_dtype] dtype_size = _get_dtype_size(dtype) return dtype_size * total diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b6955fa678bf7..5914079f713ae 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -147,8 +147,7 @@ def profile_num_available_blocks( "Error in memory profiling. 
This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") - cache_block_size = self.get_cache_block_size_bytes( - block_size, cache_dtype) + cache_block_size = self.get_cache_block_size_bytes() num_gpu_blocks = int( (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size) @@ -250,13 +249,10 @@ def max_model_len(self) -> int: def vocab_size(self) -> int: return self.model_runner.vocab_size - def get_cache_block_size_bytes(self, block_size: int, - cache_dtype: str) -> int: + def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. """ - return CacheEngine.get_cache_block_size(block_size, cache_dtype, - self.model_config, - self.parallel_config) + return self.cache_config.get_cache_block_size(self.model_config, self.parallel_config) def init_distributed_environment( From 9e5f2fbbeef1eb6aaf1a066546186767ad13928d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 17:03:04 -0700 Subject: [PATCH 020/109] fix test --- tests/worker/test_swap.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 5d6ba51ea0f06..bf89eec62b4db 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -22,6 +22,7 @@ def test_swap() -> None: parallel_config=engine_config.parallel_config, scheduler_config=engine_config.scheduler_config, device_config=engine_config.device_config, + cache_config=engine_config.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, From 1a3e26ed81acce3c0cb5982f53525b6f7e8334c2 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 17:05:07 -0700 Subject: [PATCH 021/109] fix test --- tests/lora/test_worker.py | 1 + tests/worker/test_swap.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 60aa90fe4ee8a..11370b3ea1c6b 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -27,6 +27,7 @@ def test_worker_apply_lora(sql_lora_files): parallel_config=ParallelConfig(1, 1, False), scheduler_config=SchedulerConfig(32, 32, 32), device_config=DeviceConfig("cuda"), + cache_config=CacheConfig(block_size=16, gpu_memory_utilization=1., swap_space=0, cache_dtype="auto"), local_rank=0, rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index bf89eec62b4db..7b58416257b88 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -32,7 +32,7 @@ def test_swap() -> None: # Initialize the worker. worker.init_device() worker.load_model() - worker.init_cache_engine(engine_config.cache_config) + worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) worker.warm_up_model() # Randomly initialize the cache. 
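Note: the block-splitting arithmetic exercised by test_profile_num_available_blocks above divides the scorer-only block budget so that the proposer and scorer KV caches end up with the same number of blocks. The standalone sketch below mirrors that intent; it is an illustration rather than a copy of vLLM's split_num_cache_blocks_evenly, and the function name split_blocks_evenly is made up for this note.

def split_blocks_evenly(scorer_block_bytes: int,
                        proposer_block_bytes: int,
                        total_scorer_only_blocks: int) -> int:
    # If the scorer alone could fit `total_scorer_only_blocks` blocks, then
    # giving both caches the same count N must satisfy
    #   N * (scorer_block_bytes + proposer_block_bytes)
    #       <= total_scorer_only_blocks * scorer_block_bytes
    # so N is the floor of the ratio below.
    return (total_scorer_only_blocks * scorer_block_bytes) // (
        scorer_block_bytes + proposer_block_bytes)


# Example: 2048 scorer-only blocks and a proposer whose per-block KV size is
# a quarter of the scorer's leave 1638 blocks for each model.
assert split_blocks_evenly(4096, 1024, 2048) == 1638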
From cd2015c9a548f56d91b94951d13e30c378d55cd7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:02:03 -0700 Subject: [PATCH 022/109] fix test --- vllm/worker/cache_engine.py | 9 +++++---- vllm/worker/worker.py | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 011fc69c4b1cd..c34ee0648626b 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -80,8 +80,9 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) + @staticmethod def get_cache_block_size( - self, + cache_config: CacheConfig, model_config: ModelConfig, parallel_config: ParallelConfig, ) -> int: @@ -89,13 +90,13 @@ def get_cache_block_size( num_heads = model_config.get_num_kv_heads(parallel_config) num_layers = model_config.get_num_layers(parallel_config) - key_cache_block = self.block_size * num_heads * head_size + key_cache_block = cache_config.block_size * num_heads * head_size value_cache_block = key_cache_block total = num_layers * (key_cache_block + value_cache_block) - if self.cache_dtype == "auto": + if cache_config.cache_dtype == "auto": dtype = model_config.dtype else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[self.cache_dtype] + dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] dtype_size = _get_dtype_size(dtype) return dtype_size * total diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 5914079f713ae..1e051697fa7ed 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -252,7 +252,8 @@ def vocab_size(self) -> int: def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. """ - return self.cache_config.get_cache_block_size(self.model_config, self.parallel_config) + return CacheEngine.get_cache_block_size( + self.cache_config, self.model_config, self.parallel_config) def init_distributed_environment( From d92603494025e923ea37629486d2255bdda53222 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:07:07 -0700 Subject: [PATCH 023/109] fix --- vllm/executor/ray_gpu_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 80ec36e9a5d93..0855aaec47e72 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -172,7 +172,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", rank, distributed_init_method, lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, + #kv_cache_dtype=kv_cache_dtype, )) # Initialize the driver worker with the Worker class. 
@@ -188,7 +188,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, - kv_cache_dtype=kv_cache_dtype, + #kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) From 607f7e22c5b5999809655dadc8f25b5651a0c107 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:10:57 -0700 Subject: [PATCH 024/109] fix --- vllm/executor/ray_gpu_executor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 0855aaec47e72..8f7c4d341562b 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -272,6 +272,8 @@ def profile_num_available_blocks(self) -> tuple[int, int]: num_gpu_blocks = min(b[0] for b in num_blocks) num_cpu_blocks = min(b[1] for b in num_blocks) + return num_gpu_blocks, num_cpu_blocks + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, From e127bb7094a8ad04173167906c9874441578bbc9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:11:41 -0700 Subject: [PATCH 025/109] fix --- vllm/executor/ray_gpu_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 8f7c4d341562b..bb93e438c0431 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -283,7 +283,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_cpu_blocks = num_cpu_blocks # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) + self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) # Warm up the model. This includes capturing the model into CUDA graph # if enforce_eager is False. From deaa8b059e48b9e229237bf2c6b55cc1368b1fdf Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:13:28 -0700 Subject: [PATCH 026/109] fix --- vllm/executor/cpu_executor.py | 39 ++--------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 44fdddc3f9426..40f366f987f0f 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -61,45 +61,10 @@ def _init_worker(self): self.driver_worker.load_model() def profile_num_available_blocks(self) -> tuple[int, int]: - num_cpu_blocks = self.driver_worker.get_cpu_cache_block_num( - block_size=self.cache_config.block_size, - cache_space=self.cache_config.cpu_kvcache_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - num_gpu_blocks = num_cpu_blocks - num_cpu_blocks = 0 - return num_gpu_blocks, num_cpu_blocks - + return self.driver_worker.profile_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - assert num_cpu_blocks == 0 - num_cpu_blocks = num_gpu_blocks - num_gpu_blocks = 0 - self.cache_config.num_gpu_blocks = num_cpu_blocks - self.cache_config.num_cpu_blocks = 0 - - logger.info(f"# CPU blocks: {num_cpu_blocks}") - if num_cpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. 
" - "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " - "initializing the engine.") - - max_seq_len = self.cache_config.block_size * num_cpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " - "initializing the engine.") - - # Initialize the cache. - self.driver_worker.init_cache_engine(cache_config=self.cache_config) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], From 7817d61d25379b5b1d787926c1cb9858042b7159 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:15:07 -0700 Subject: [PATCH 027/109] clean --- vllm/executor/cpu_executor.py | 1 - vllm/executor/ray_gpu_executor.py | 115 +++++++++++++++--------------- vllm/worker/cpu_worker.py | 6 +- 3 files changed, 59 insertions(+), 63 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 40f366f987f0f..3c1b2b5e21e8a 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -35,7 +35,6 @@ def __init__(self, model_config: ModelConfig, cache_config: CacheConfig, # Instantiate the worker and load the model to CPU. self._init_worker() - #self._init_cache() def _init_worker(self): from vllm.worker.cpu_worker import CPUWorker diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index bb93e438c0431..8ffbc64a2cfcc 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -65,9 +65,6 @@ def __init__( # Create the parallel GPU workers. self._init_workers_ray(placement_group) - # Profile the memory usage and initialize the cache. - #self._init_cache() - self.forward_dag = None if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() @@ -199,62 +196,62 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers, ) - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. - - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. 
- num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks - - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") + #def _init_cache(self) -> None: + # """Profiles the memory usage and initializes the KV cache. + + # The engine will first conduct a profiling of the existing memory usage. + # Then, it calculate the maximum possible number of GPU and CPU blocks + # that can be allocated with the remaining free memory. + # More details can be found in the + # :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + # from class :class:`~vllm.worker.Worker`. + + # Afterwards, as there may be multiple workers, + # we take the minimum number of blocks across all workers + # to ensure this can be applied to all of them. + + # Finally, the engine will initialize the KV cache + # with the calculated number of blocks. + + # .. tip:: + # You may limit the usage of GPU memory + # by adjusting the `gpu_memory_utilization` parameter. + # """ + # # Get the maximum number of blocks that can be allocated on GPU and CPU. + # num_blocks = self._run_workers( + # "profile_num_available_blocks", + # block_size=self.cache_config.block_size, + # gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + # cpu_swap_space=self.cache_config.swap_space_bytes, + # cache_dtype=self.cache_config.cache_dtype, + # ) + + # # Since we use a shared centralized controller, we take the minimum + # # number of blocks across all workers to make sure all the memory + # # operators can be applied to all workers. + # num_gpu_blocks = min(b[0] for b in num_blocks) + # num_cpu_blocks = min(b[1] for b in num_blocks) + + # if self.cache_config.forced_num_gpu_blocks is not None: + # forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + # logger.info(f"Replacing profiled {num_gpu_blocks=} with " + # f"{forced_num_gpu_blocks=}") + # num_gpu_blocks = forced_num_gpu_blocks + + # logger.info(f"# GPU blocks: {num_gpu_blocks}, " + # f"# CPU blocks: {num_cpu_blocks}") + + # check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + # self.model_config.max_model_len) + + # self.cache_config.num_gpu_blocks = num_gpu_blocks + # self.cache_config.num_cpu_blocks = num_cpu_blocks + + # # Initialize the cache. + # self._run_workers("init_cache_engine", cache_config=self.cache_config) + # # Warm up the model. This includes capturing the model into CUDA graph + # # if enforce_eager is False. + # self._run_workers("warm_up_model") def profile_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 848f36d15abd7..1d14bb0bd6d2c 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -155,7 +155,7 @@ def __init__( kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by - # self.init_cache_engine(). + # initialize_cache. self.cache_config = None self.cache_engine = None self.cpu_cache = None @@ -206,7 +206,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: "initializing the engine.") # Initialize the cache. - self.init_cache_engine(cache_config=self.cache_config) + self._init_cache_engine(cache_config=self.cache_config) def get_cpu_cache_block_num( self, @@ -228,7 +228,7 @@ def get_cpu_cache_block_num( return num_cpu_blocks - def init_cache_engine(self, cache_config: CacheConfig) -> None: + def _init_cache_engine(self, cache_config: CacheConfig) -> None: self.cache_config = cache_config self.cache_engine = CPUCacheEngine(self.cache_config, self.model_config, From 99823a34607920537920b71c6221d7e3b285cca0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:28:19 -0700 Subject: [PATCH 028/109] clean --- tests/spec_decode/test_spec_decode_worker.py | 2 +- vllm/executor/gpu_executor.py | 20 +++++++++++------- vllm/executor/neuron_executor.py | 14 ------------- vllm/executor/ray_gpu_executor.py | 22 ++++++++++++-------- vllm/worker/worker.py | 9 +++++++- 5 files changed, 34 insertions(+), 33 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index d4c15d9aea50e..ff7beff40dedb 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -513,7 +513,7 @@ def test_init_device(): @torch.inference_mode() def test_initialize_cache(): - """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer + """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer workers. """ draft_worker = mock_worker(cls=MultiStepWorker) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 6619cdc15a179..9fb7b0df00aa8 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -79,17 +79,21 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: f"# CPU blocks: {num_cpu_blocks}" ) - raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + #return - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - # Initialize the cache. - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + #raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + + #self.cache_config.num_gpu_blocks = num_gpu_blocks + #self.cache_config.num_cpu_blocks = num_cpu_blocks + + ## Initialize the cache. + #self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self.driver_worker.warm_up_model() + ## Warm up the model. This includes capturing the model into CUDA graph + ## if enforce_eager is False. 
+ #self.driver_worker.warm_up_model() def execute_model(self, diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 2626a5d816728..d8cda2ee461c3 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -58,22 +58,8 @@ def _init_worker(self): def profile_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.profile_num_available_blocks() - # Set the number of GPU blocks to be the same as the maximum number of - # sequences that can be processed in a single batch. This is equivalent - # to schedule without PagedAttention. - num_gpu_blocks = self.scheduler_config.max_num_seqs - - # Swap not yet supported with Neuron backend. - num_cpu_blocks = 0 - - return num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - #assert num_cpu_blocks == 0 - #assert num_gpu_blocks == self.scheduler_config.max_num_seqs - #self.cache_config.num_gpu_blocks = num_gpu_blocks - #self.cache_config.num_cpu_blocks = num_cpu_blocks def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 8ffbc64a2cfcc..a2b571242c6ec 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -273,18 +273,22 @@ def profile_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) + self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks + #return - # Initialize the cache. - self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + #raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, + # self.model_config.max_model_len) + + #self.cache_config.num_gpu_blocks = num_gpu_blocks + #self.cache_config.num_cpu_blocks = num_cpu_blocks + + ## Initialize the cache. + #self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") + ## Warm up the model. This includes capturing the model into CUDA graph + ## if enforce_eager is False. 
+ #self._run_workers("warm_up_model") def execute_model(self, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 1e051697fa7ed..d84d11021a856 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -163,9 +163,16 @@ def profile_num_available_blocks( def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - #self.cache_config = cache_config + + self._init_cache_engine() + self.warm_up_model() + + def _init_cache_engine(self): + assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) self.gpu_cache = self.cache_engine.gpu_cache From 849bfe911f99a761f9163e5c2496d10fb33e416a Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:34:52 -0700 Subject: [PATCH 029/109] fix --- vllm/executor/cpu_executor.py | 9 +++++---- vllm/executor/gpu_executor.py | 17 +++-------------- vllm/executor/ray_gpu_executor.py | 18 +++--------------- vllm/worker/cpu_worker.py | 3 ++- vllm/worker/worker.py | 4 +++- 5 files changed, 16 insertions(+), 35 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 3c1b2b5e21e8a..e17bdf34a98d9 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -45,10 +45,11 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) self.driver_worker = CPUWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 9fb7b0df00aa8..889f1079efc92 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -74,26 +74,15 @@ def profile_num_available_blocks(self) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. logger.info( f"# GPU blocks: {num_gpu_blocks}, " f"# CPU blocks: {num_cpu_blocks}" ) self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - #return - - - #raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) - - #self.cache_config.num_gpu_blocks = num_gpu_blocks - #self.cache_config.num_cpu_blocks = num_cpu_blocks - - ## Initialize the cache. - #self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - ## Warm up the model. This includes capturing the model into CUDA graph - ## if enforce_eager is False. 
- #self.driver_worker.warm_up_model() def execute_model(self, diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index a2b571242c6ec..b39d552d62dd1 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -273,22 +273,10 @@ def profile_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - - #return - - #raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, - # self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks - #self.cache_config.num_gpu_blocks = num_gpu_blocks - #self.cache_config.num_cpu_blocks = num_cpu_blocks - - ## Initialize the cache. - #self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - - ## Warm up the model. This includes capturing the model into CUDA graph - ## if enforce_eager is False. - #self._run_workers("warm_up_model") + self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) def execute_model(self, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 1d14bb0bd6d2c..9decc83af6a97 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -128,6 +128,7 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, local_rank: int, rank: int, distributed_init_method: str, @@ -139,6 +140,7 @@ def __init__( self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config + self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -156,7 +158,6 @@ def __init__( is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
- self.cache_config = None self.cache_engine = None self.cpu_cache = None diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d84d11021a856..d7fdaf6d1f883 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,9 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import WorkerBase, raise_if_cache_size_invalid + +# TODO move raise_if_cache_size_invalid class Worker(WorkerBase): From 951ba8597dc08994b1484f9f49b226acd8bc373e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:42:46 -0700 Subject: [PATCH 030/109] fix --- tests/spec_decode/utils.py | 1 - tests/worker/test_swap.py | 1 - vllm/executor/ray_gpu_executor.py | 39 +++---------------------------- vllm/worker/worker.py | 7 +++--- 4 files changed, 7 insertions(+), 41 deletions(-) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 5c78b3b780d86..0916d3d494211 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -130,7 +130,6 @@ def create_worker(cls: type, engine_config.cache_config.num_gpu_blocks = num_gpu_blocks engine_config.cache_config.num_cpu_blocks = 0 worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - worker.warm_up_model() return worker diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 7b58416257b88..b35bf583ecb46 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -33,7 +33,6 @@ def test_swap() -> None: worker.init_device() worker.load_model() worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - worker.warm_up_model() # Randomly initialize the cache. gpu_cache = worker.cache_engine.gpu_cache diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index b39d552d62dd1..e7a52b5830f61 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -196,7 +196,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers, ) - #def _init_cache(self) -> None: # """Profiles the memory usage and initializes the KV cache. # The engine will first conduct a profiling of the existing memory usage. @@ -217,41 +216,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # You may limit the usage of GPU memory # by adjusting the `gpu_memory_utilization` parameter. # """ - # # Get the maximum number of blocks that can be allocated on GPU and CPU. - # num_blocks = self._run_workers( - # "profile_num_available_blocks", - # block_size=self.cache_config.block_size, - # gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - # cpu_swap_space=self.cache_config.swap_space_bytes, - # cache_dtype=self.cache_config.cache_dtype, - # ) - - # # Since we use a shared centralized controller, we take the minimum - # # number of blocks across all workers to make sure all the memory - # # operators can be applied to all workers. 
- # num_gpu_blocks = min(b[0] for b in num_blocks) - # num_cpu_blocks = min(b[1] for b in num_blocks) - - # if self.cache_config.forced_num_gpu_blocks is not None: - # forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - # logger.info(f"Replacing profiled {num_gpu_blocks=} with " - # f"{forced_num_gpu_blocks=}") - # num_gpu_blocks = forced_num_gpu_blocks - - # logger.info(f"# GPU blocks: {num_gpu_blocks}, " - # f"# CPU blocks: {num_cpu_blocks}") - - # check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - # self.model_config.max_model_len) - - # self.cache_config.num_gpu_blocks = num_gpu_blocks - # self.cache_config.num_cpu_blocks = num_cpu_blocks - - # # Initialize the cache. - # self._run_workers("init_cache_engine", cache_config=self.cache_config) - # # Warm up the model. This includes capturing the model into CUDA graph - # # if enforce_eager is False. - # self._run_workers("warm_up_model") def profile_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. @@ -269,6 +233,9 @@ def profile_num_available_blocks(self) -> tuple[int, int]: num_gpu_blocks = min(b[0] for b in num_blocks) num_cpu_blocks = min(b[1] for b in num_blocks) + # logger.info(f"# GPU blocks: {num_gpu_blocks}, " + # f"# CPU blocks: {num_cpu_blocks}") + return num_gpu_blocks, num_cpu_blocks diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d7fdaf6d1f883..3dd233159d9b4 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,8 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker_base import WorkerBase, raise_if_cache_size_invalid +from vllm.worker.worker_base import WorkerBase +from vllm.executor.utils import raise_if_cache_size_invalid # TODO move raise_if_cache_size_invalid @@ -171,7 +172,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_cpu_blocks = num_cpu_blocks self._init_cache_engine() - self.warm_up_model() + self._warm_up_model() def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None @@ -181,7 +182,7 @@ def _init_cache_engine(self): self.model_runner.set_block_size(self.cache_engine.block_size) - def warm_up_model(self) -> None: + def _warm_up_model(self) -> None: if not self.model_config.enforce_eager: self.model_runner.capture_model(self.gpu_cache) # Reset the seed to ensure that the random state is not affected by From 38948df55a2f20c18a57647115709cf3ece6d0ec Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:49:22 -0700 Subject: [PATCH 031/109] speed up cpu test --- tests/conftest.py | 6 +++++- tests/spec_decode/test_batch_expansion.py | 3 +++ tests/spec_decode/test_spec_decode_worker.py | 8 ++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5c409c8cd5ee5..e00f3eb871e37 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,11 +56,15 @@ def cleanup(): @pytest.fixture() -def should_do_global_cleanup_after_test() -> bool: +def should_do_global_cleanup_after_test(request) -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. This can provide a ~10x speedup for non-GPU unit tests since they don't need to initialize torch. 
""" + + if request.node.get_closest_marker("skip_global_cleanup"): + return False + return True diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 80a960acf0be5..43cfd78ddb0cc 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -7,6 +7,7 @@ @pytest.mark.parametrize('num_target_seq_ids', [100]) +@pytest.mark.skip_global_cleanup def test_create_target_seq_id_iterator(num_target_seq_ids: int): """Verify all new sequence ids are greater than all input seq ids. @@ -27,6 +28,7 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): @pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): """Verify correct tokens are selected for scoring. """ @@ -53,6 +55,7 @@ def test_get_token_ids_to_score(k: int): @pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.skip_global_cleanup def test_create_single_target_seq_group_metadata(k: int): """Verify correct creation of a batch-expanded seq group metadata. """ diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index ff7beff40dedb..038de6a48d7a1 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -487,7 +487,7 @@ def test_empty_input_batch(k: int, batch_size: int): **execute_model_data.to_dict()) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_init_device(): """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as well as other GPU initialization. @@ -511,7 +511,7 @@ def test_init_device(): rejection_sampler.init_gpu_tensors.assert_called_once() -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_initialize_cache(): """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer workers. 
@@ -538,7 +538,7 @@ def test_initialize_cache(): @pytest.mark.parametrize('available_cpu_blocks', [500]) @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_profile_num_available_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, @@ -578,7 +578,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096, 2 * 2 * 8192]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int): From 397ec77d77db76d757841ba18da48128d9f918eb Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:54:35 -0700 Subject: [PATCH 032/109] wip --- vllm/worker/cpu_worker.py | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 9decc83af6a97..65f90fcbd86bd 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -170,11 +170,12 @@ def load_model(self): self.model_runner.load_model() def profile_num_available_blocks(self) -> tuple[int, int]: - num_cpu_blocks = self.get_cpu_cache_block_num( - block_size=self.cache_config.block_size, - cache_space=self.cache_config.cpu_kvcache_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) + # For CPU device, the block number will be calculated based on the + # cpu_kvcache_space. + cache_block_size = CPUCacheEngine.get_cache_block_size( + self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) + num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // cache_block_size) + num_cpu_blocks = max(num_cpu_blocks, 0) # Note: To reuse the cache management procedure, # use cpu cache as 'gpu cache'. @@ -209,26 +210,6 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: # Initialize the cache. self._init_cache_engine(cache_config=self.cache_config) - def get_cpu_cache_block_num( - self, - block_size: int, - cache_space: int, - cache_dtype: str, - ) -> int: - """ - Args: - block_size: The size of the cache block. - cache_space: The size of the CPU KV cache space in bytes. - """ - # For CPU device, the block number will be calculated based on the - # cpu_kvcache_space. 
- cache_block_size = CPUCacheEngine.get_cache_block_size( - block_size, cache_dtype, self.model_config, self.parallel_config) - num_cpu_blocks = int(cache_space // cache_block_size) - num_cpu_blocks = max(num_cpu_blocks, 0) - - return num_cpu_blocks - def _init_cache_engine(self, cache_config: CacheConfig) -> None: self.cache_config = cache_config self.cache_engine = CPUCacheEngine(self.cache_config, From 23382b955b1a84c99a3ec169f14f05d0d2d3c4fe Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:59:27 -0700 Subject: [PATCH 033/109] wip --- vllm/executor/cpu_executor.py | 1 + vllm/worker/cpu_worker.py | 6 ++---- vllm/worker/worker.py | 10 ++-------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index e17bdf34a98d9..c307d08ae0d72 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -64,6 +64,7 @@ def profile_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.profile_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + logger.info(f"# CPU blocks: {num_cpu_blocks}") self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 65f90fcbd86bd..781501dc610d9 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -192,7 +192,6 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_cpu_blocks self.cache_config.num_cpu_blocks = 0 - logger.info(f"# CPU blocks: {num_cpu_blocks}") if num_cpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " @@ -208,10 +207,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: "initializing the engine.") # Initialize the cache. - self._init_cache_engine(cache_config=self.cache_config) + self._init_cache_engine() - def _init_cache_engine(self, cache_config: CacheConfig) -> None: - self.cache_config = cache_config + def _init_cache_engine(self) -> None: self.cache_engine = CPUCacheEngine(self.cache_config, self.model_config, self.parallel_config, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 3dd233159d9b4..2fc89635112a2 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -125,12 +125,6 @@ def profile_num_available_blocks( gpu_memory_utilization: The fraction of the total GPU memory to use. cpu_swap_space: The size of the CPU swap space in bytes. """ - - block_size = self.cache_config.block_size - gpu_memory_utilization = self.cache_config.gpu_memory_utilization - cpu_swap_space = self.cache_config.swap_space_bytes - cache_dtype = self.cache_config.cache_dtype - # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
torch.cuda.empty_cache() @@ -152,9 +146,9 @@ def profile_num_available_blocks( cache_block_size = self.get_cache_block_size_bytes() num_gpu_blocks = int( - (total_gpu_memory * gpu_memory_utilization - peak_memory) // + (total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // cache_block_size) - num_cpu_blocks = int(cpu_swap_space // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) if self.model_runner.lora_manager: From 7a0294cd0e47b618ac170275dc69fde532a0992d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:03:02 -0700 Subject: [PATCH 034/109] clean --- vllm/engine/llm_engine.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 23a952b4101a1..155b65e74434b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -127,19 +127,7 @@ def __init__( speculative_config=speculative_config, ) - # TODO cleanup location - num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() - - if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) + self._initialize_kv_caches() # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): @@ -192,6 +180,20 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) + def _initialize_kv_caches(self) -> None: + num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() + + if self.cache_config.forced_num_gpu_blocks is not None: + forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + logger.info(f"Replacing profiled {num_gpu_blocks=} with " + f"{forced_num_gpu_blocks=}") + num_gpu_blocks = forced_num_gpu_blocks + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) + @classmethod def from_engine_args( cls, From dcdca688de21f994faa24dafd9ac6cb9455f2461 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:08:38 -0700 Subject: [PATCH 035/109] wip --- vllm/executor/ray_gpu_executor.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e7a52b5830f61..637581b53f1f5 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -233,13 +233,17 @@ def profile_num_available_blocks(self) -> tuple[int, int]: num_gpu_blocks = min(b[0] for b in num_blocks) num_cpu_blocks = min(b[1] for b in num_blocks) - # logger.info(f"# GPU blocks: {num_gpu_blocks}, " - # f"# CPU blocks: {num_cpu_blocks}") - return num_gpu_blocks, num_cpu_blocks def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. 
+ logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks From ed58af224b35e516d7e8ff3316a744d2d4c9f4c3 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:15:43 -0700 Subject: [PATCH 036/109] remove --- tests/spec_decode/test_spec_decode_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 038de6a48d7a1..c4dfbb5dc00c8 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -487,7 +487,6 @@ def test_empty_input_batch(k: int, batch_size: int): **execute_model_data.to_dict()) -@pytest.mark.skip_global_cleanup def test_init_device(): """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as well as other GPU initialization. From df8688e0cad205a4690a4d0f680ccf994959b350 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:19:12 -0700 Subject: [PATCH 037/109] Revert "more test speedup" This reverts commit 4c486f9bb4fc3b90efc1765ba46f4a666d1c9339. --- tests/conftest.py | 6 +----- tests/spec_decode/test_batch_expansion.py | 3 --- tests/spec_decode/test_spec_decode_worker.py | 5 +++-- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e00f3eb871e37..5c409c8cd5ee5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,15 +56,11 @@ def cleanup(): @pytest.fixture() -def should_do_global_cleanup_after_test(request) -> bool: +def should_do_global_cleanup_after_test() -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. This can provide a ~10x speedup for non-GPU unit tests since they don't need to initialize torch. """ - - if request.node.get_closest_marker("skip_global_cleanup"): - return False - return True diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 43cfd78ddb0cc..80a960acf0be5 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -7,7 +7,6 @@ @pytest.mark.parametrize('num_target_seq_ids', [100]) -@pytest.mark.skip_global_cleanup def test_create_target_seq_id_iterator(num_target_seq_ids: int): """Verify all new sequence ids are greater than all input seq ids. @@ -28,7 +27,6 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): @pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): """Verify correct tokens are selected for scoring. """ @@ -55,7 +53,6 @@ def test_get_token_ids_to_score(k: int): @pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.skip_global_cleanup def test_create_single_target_seq_group_metadata(k: int): """Verify correct creation of a batch-expanded seq group metadata. """ diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index c4dfbb5dc00c8..c7b11f7bbf684 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -487,6 +487,7 @@ def test_empty_input_batch(k: int, batch_size: int): **execute_model_data.to_dict()) +@torch.inference_mode() def test_init_device(): """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as well as other GPU initialization. 
@@ -537,7 +538,7 @@ def test_initialize_cache(): @pytest.mark.parametrize('available_cpu_blocks', [500]) @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@pytest.mark.skip_global_cleanup +@torch.inference_mode() def test_profile_num_available_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, @@ -577,7 +578,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096, 2 * 2 * 8192]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@pytest.mark.skip_global_cleanup +@torch.inference_mode() def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int): From 55a5203484b1861ca91ecf661decb771d6c5603d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:19:38 -0700 Subject: [PATCH 038/109] wip --- tests/spec_decode/test_spec_decode_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index c7b11f7bbf684..8d33fa2f1e387 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -511,7 +511,6 @@ def test_init_device(): rejection_sampler.init_gpu_tensors.assert_called_once() -@pytest.mark.skip_global_cleanup def test_initialize_cache(): """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer workers. From 55d083bf1a761f20e4cf089283a5657282e118e7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:19:56 -0700 Subject: [PATCH 039/109] wip --- tests/spec_decode/test_spec_decode_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 8d33fa2f1e387..218704b4224ab 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -510,7 +510,7 @@ def test_init_device(): metrics_collector.init_gpu_tensors.assert_called_once() rejection_sampler.init_gpu_tensors.assert_called_once() - +@torch.inference_mode() def test_initialize_cache(): """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer workers. From 0814d245e896309ce6ca85214e391c9e99225dc3 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:28:55 -0700 Subject: [PATCH 040/109] wip --- vllm/entrypoints/llm.py | 1 - vllm/executor/executor_base.py | 8 -------- vllm/executor/gpu_executor.py | 1 - vllm/executor/neuron_executor.py | 7 ------- 4 files changed, 17 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b079d7c117d84..5777e8179a1c1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -109,7 +109,6 @@ def __init__( disable_custom_all_reduce=disable_custom_all_reduce, **kwargs, ) - self.llm_engine = LLMEngine.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) self.request_counter = Counter() diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 9dd372156b9ff..b575d238696f2 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -32,14 +32,6 @@ def __init__( raise NotImplementedError - #@abstractmethod - #def init_workers(self) -> None: - # """Initialize workers, such as loading the model or preparing on-device - # tensors. 
- # """ - # raise NotImplementedError - - @abstractmethod def profile_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 889f1079efc92..f138258ec83ac 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -62,7 +62,6 @@ def _init_worker(self): distributed_init_method=distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, - #kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) self.driver_worker.init_device() diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index d8cda2ee461c3..5290bbd8a82c8 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -25,7 +25,6 @@ def __init__( speculative_config: Optional[SpeculativeConfig], ) -> None: self.model_config = model_config - #self.cache_config = cache_config assert lora_config is None, "LoRA is not supported for Neuron backend." self.parallel_config = parallel_config self.scheduler_config = scheduler_config @@ -33,12 +32,6 @@ def __init__( assert (not speculative_config ), "Speculative decoding not yet supported for Neuron backend." - # Set the number of GPU blocks to be the same as the maximum number of - # sequences that can be processed in a single batch. This is equivalent - # to schedule without PagedAttention. - #self.cache_config.num_gpu_blocks = self.scheduler_config.max_num_seqs - #self.cache_config.num_cpu_blocks = 0 - # Instantiate the worker and load the model to the device. self._init_worker() From b18d00c6c0d7dbbda13768e12122f3e958b61667 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:35:18 -0700 Subject: [PATCH 041/109] rename profile_num_available_blocks to get_max_allowed_kv_blocks --- tests/spec_decode/test_spec_decode_worker.py | 8 ++++---- vllm/engine/llm_engine.py | 2 +- vllm/executor/cpu_executor.py | 4 ++-- vllm/executor/executor_base.py | 2 +- vllm/executor/gpu_executor.py | 4 ++-- vllm/executor/neuron_executor.py | 4 ++-- vllm/executor/ray_gpu_executor.py | 6 +++--- vllm/spec_decode/spec_decode_worker.py | 4 ++-- vllm/worker/cpu_worker.py | 2 +- vllm/worker/neuron_worker.py | 2 +- vllm/worker/worker.py | 2 +- vllm/worker/worker_base.py | 2 +- 12 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 218704b4224ab..e1dc33e8babcf 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -538,7 +538,7 @@ def test_initialize_cache(): @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) @torch.inference_mode() -def test_profile_num_available_blocks(available_gpu_blocks: int, +def test_get_max_allowed_kv_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int): @@ -552,7 +552,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.profile_num_available_blocks.return_value = ( + target_worker.get_max_allowed_kv_blocks.return_value = ( available_gpu_blocks, available_cpu_blocks) target_worker.get_cache_block_size_bytes.return_value = ( 
target_cache_block_size_bytes) @@ -562,9 +562,9 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, metrics_collector) - num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks() + num_gpu_blocks, num_cpu_blocks = worker.get_max_allowed_kv_blocks() - target_worker.profile_num_available_blocks.assert_called_once() + target_worker.get_max_allowed_kv_blocks.assert_called_once() assert num_cpu_blocks == available_cpu_blocks assert num_gpu_blocks == split_num_cache_blocks_evenly( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 155b65e74434b..4974cca23c484 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,7 +181,7 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: - num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() + num_gpu_blocks, num_cpu_blocks = self.model_executor.get_max_allowed_kv_blocks() if self.cache_config.forced_num_gpu_blocks is not None: forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index c307d08ae0d72..42f773e1defa4 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -60,8 +60,8 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def profile_num_available_blocks(self) -> tuple[int, int]: - return self.driver_worker.profile_num_available_blocks() + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + return self.driver_worker.get_max_allowed_kv_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: logger.info(f"# CPU blocks: {num_cpu_blocks}") diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index b575d238696f2..5953aa3f4bdeb 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -33,7 +33,7 @@ def __init__( @abstractmethod - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. 
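Every executor has to satisfy this contract, and when an executor drives several workers the per-worker answers must first be reduced to a single pair that fits on every rank. A sketch of that reduction over per-worker (gpu, cpu) tuples; the min-reduction mirrors what the Ray executor does above, while the helper itself is illustrative:

```python
from typing import List, Tuple


def reduce_block_counts(per_worker: List[Tuple[int, int]]) -> Tuple[int, int]:
    """Take the component-wise minimum so the agreed-upon cache size can be
    allocated on every worker."""
    num_gpu_blocks = min(gpu for gpu, _ in per_worker)
    num_cpu_blocks = min(cpu for _, cpu in per_worker)
    return num_gpu_blocks, num_cpu_blocks


# e.g. three tensor-parallel workers with slightly different free memory:
assert reduce_block_counts([(980, 512), (1024, 512), (1001, 500)]) == (980, 500)
```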
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index f138258ec83ac..f30ec45d3e4ea 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -68,8 +68,8 @@ def _init_worker(self): self.driver_worker.load_model() - def profile_num_available_blocks(self) -> Tuple[int, int]: - return self.driver_worker.profile_num_available_blocks() + def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: + return self.driver_worker.get_max_allowed_kv_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 5290bbd8a82c8..82487a065d693 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -48,8 +48,8 @@ def _init_worker(self): self.driver_worker.load_model() # TODO change name - def profile_num_available_blocks(self) -> tuple[int, int]: - return self.driver_worker.profile_num_available_blocks() + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + return self.driver_worker.get_max_allowed_kv_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 637581b53f1f5..ca84485af0ca6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -202,7 +202,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Then, it calculate the maximum possible number of GPU and CPU blocks # that can be allocated with the remaining free memory. # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + # :meth:`~vllm.worker.worker.Worker.get_max_allowed_kv_blocks` method # from class :class:`~vllm.worker.Worker`. # Afterwards, as there may be multiple workers, @@ -217,10 +217,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # by adjusting the `gpu_memory_utilization` parameter. # """ - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. num_blocks = self._run_workers( - "profile_num_available_blocks", + "get_max_allowed_kv_blocks", block_size=self.cache_config.block_size, gpu_memory_utilization=self.cache_config.gpu_memory_utilization, cpu_swap_space=self.cache_config.swap_space_bytes, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 659acc6620bc1..863eccb47216e 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -94,7 +94,7 @@ def init_device(self) -> None: device=self.device, vocab_size=self._vocab_size) - def profile_num_available_blocks(self) -> Tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: """Determine the number of cache blocks to use. This is done by profiling the scorer model (which is typically the @@ -102,7 +102,7 @@ def profile_num_available_blocks(self) -> Tuple[int, int]: scorer cache is divided evenly between the proposer and scorer model KV, such that the number of blocks is equal in both KV caches. 
""" - num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.profile_num_available_blocks()) + num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.get_max_allowed_kv_blocks()) scorer_cache_block_size_bytes = ( self.scorer_worker.get_cache_block_size_bytes()) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 781501dc610d9..db238e81a5f6c 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -169,7 +169,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. cache_block_size = CPUCacheEngine.get_cache_block_size( diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 16e9a128d024c..7ba8c2c754e39 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -41,7 +41,7 @@ def load_model(self): self.model_runner.load_model() # TODO change name - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 2fc89635112a2..a4aea636a4d9a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -113,7 +113,7 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def profile_num_available_blocks( + def get_max_allowed_kv_blocks( self, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index cf611eb2b88aa..1708795b01767 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -14,7 +14,7 @@ def init_device(self) -> None: raise NotImplementedError @abstractmethod - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. 
From 8fb7b9a45812e16939539f0c155503fde0b0ad1c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:37:58 -0700 Subject: [PATCH 042/109] rename again --- tests/spec_decode/test_spec_decode_worker.py | 8 ++++---- vllm/engine/llm_engine.py | 2 +- vllm/executor/cpu_executor.py | 4 ++-- vllm/executor/executor_base.py | 2 +- vllm/executor/gpu_executor.py | 4 ++-- vllm/executor/neuron_executor.py | 5 ++--- vllm/executor/ray_gpu_executor.py | 6 +++--- vllm/spec_decode/spec_decode_worker.py | 4 ++-- vllm/worker/cpu_worker.py | 2 +- vllm/worker/neuron_worker.py | 3 +-- vllm/worker/worker.py | 2 +- vllm/worker/worker_base.py | 2 +- 12 files changed, 21 insertions(+), 23 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index e1dc33e8babcf..511d600199a01 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -538,7 +538,7 @@ def test_initialize_cache(): @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) @torch.inference_mode() -def test_get_max_allowed_kv_blocks(available_gpu_blocks: int, +def test_determine_num_available_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int): @@ -552,7 +552,7 @@ def test_get_max_allowed_kv_blocks(available_gpu_blocks: int, rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.get_max_allowed_kv_blocks.return_value = ( + target_worker.determine_num_available_blocks.return_value = ( available_gpu_blocks, available_cpu_blocks) target_worker.get_cache_block_size_bytes.return_value = ( target_cache_block_size_bytes) @@ -562,9 +562,9 @@ def test_get_max_allowed_kv_blocks(available_gpu_blocks: int, metrics_collector) - num_gpu_blocks, num_cpu_blocks = worker.get_max_allowed_kv_blocks() + num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks() - target_worker.get_max_allowed_kv_blocks.assert_called_once() + target_worker.determine_num_available_blocks.assert_called_once() assert num_cpu_blocks == available_cpu_blocks assert num_gpu_blocks == split_num_cache_blocks_evenly( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4974cca23c484..ad037cf2e79bb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,7 +181,7 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: - num_gpu_blocks, num_cpu_blocks = self.model_executor.get_max_allowed_kv_blocks() + num_gpu_blocks, num_cpu_blocks = self.model_executor.determine_num_available_blocks() if self.cache_config.forced_num_gpu_blocks is not None: forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 42f773e1defa4..b78f6d993453d 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -60,8 +60,8 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: - return self.driver_worker.get_max_allowed_kv_blocks() + def determine_num_available_blocks(self) -> tuple[int, int]: + return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: logger.info(f"# CPU blocks: 
{num_cpu_blocks}") diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 5953aa3f4bdeb..757549bdedbeb 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -33,7 +33,7 @@ def __init__( @abstractmethod - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index f30ec45d3e4ea..e586cf810d78c 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -68,8 +68,8 @@ def _init_worker(self): self.driver_worker.load_model() - def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: - return self.driver_worker.get_max_allowed_kv_blocks() + def determine_num_available_blocks(self) -> Tuple[int, int]: + return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 82487a065d693..b907fd472704c 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -47,9 +47,8 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - # TODO change name - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: - return self.driver_worker.get_max_allowed_kv_blocks() + def determine_num_available_blocks(self) -> tuple[int, int]: + return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index ca84485af0ca6..ca851dfc462b8 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -202,7 +202,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Then, it calculate the maximum possible number of GPU and CPU blocks # that can be allocated with the remaining free memory. # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.get_max_allowed_kv_blocks` method + # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method # from class :class:`~vllm.worker.Worker`. # Afterwards, as there may be multiple workers, @@ -217,10 +217,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # by adjusting the `gpu_memory_utilization` parameter. # """ - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. num_blocks = self._run_workers( - "get_max_allowed_kv_blocks", + "determine_num_available_blocks", block_size=self.cache_config.block_size, gpu_memory_utilization=self.cache_config.gpu_memory_utilization, cpu_swap_space=self.cache_config.swap_space_bytes, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 863eccb47216e..5f03b1edc07a6 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -94,7 +94,7 @@ def init_device(self) -> None: device=self.device, vocab_size=self._vocab_size) - def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of cache blocks to use. 
This is done by profiling the scorer model (which is typically the @@ -102,7 +102,7 @@ def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: scorer cache is divided evenly between the proposer and scorer model KV, such that the number of blocks is equal in both KV caches. """ - num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.get_max_allowed_kv_blocks()) + num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.determine_num_available_blocks()) scorer_cache_block_size_bytes = ( self.scorer_worker.get_cache_block_size_bytes()) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index db238e81a5f6c..5250c15330e4d 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -169,7 +169,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. cache_block_size = CPUCacheEngine.get_cache_block_size( diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 7ba8c2c754e39..dab70d884db4e 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -40,8 +40,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - # TODO change name - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index a4aea636a4d9a..10396262101be 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -113,7 +113,7 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def get_max_allowed_kv_blocks( + def determine_num_available_blocks( self, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 1708795b01767..6bb605d954e16 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -14,7 +14,7 @@ def init_device(self) -> None: raise NotImplementedError @abstractmethod - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. From 3bb9e6f187d4745168005f9e995b7d45375a5429 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:39:57 -0700 Subject: [PATCH 043/109] rename --- tests/core/block/e2e/test_correctness.py | 6 +++--- vllm/config.py | 6 +++--- vllm/engine/arg_utils.py | 6 +++--- vllm/engine/llm_engine.py | 10 +++++----- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 5a7f828456e2d..94b65401e1dd4 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -16,7 +16,7 @@ # Allow only 5 sequences of ~1024 tokens in worst case. 
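These overrides cap the KV cache far below what profiling would report, which is how the tests force swapping and preemption. The arithmetic behind the `5 * (64 + 1)` value just below, plus how the knob is passed, is sketched here; the exact invocation is illustrative, and the flag is the one added to the engine args in this commit:

```python
# 1024 tokens per sequence / 16 tokens per block = 64 blocks per sequence,
# plus one spare block each, for 5 concurrent sequences:
block_size = 16
blocks_per_seq = 1024 // block_size                    # 64
num_gpu_blocks_override = 5 * (blocks_per_seq + 1)     # 325

# Passed as an engine argument, e.g. LLM(..., num_gpu_blocks_override=325),
# or on the command line as --num-gpu-blocks-override 325.
```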
"block_size": 16, - "forced_num_gpu_blocks": 5 * (64 + 1), + "num_gpu_blocks_override": 5 * (64 + 1), }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{ @@ -162,14 +162,14 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, # Allow only 2 sequences of ~128 tokens in worst case. # Note 8 = 128/block_size - "forced_num_gpu_blocks": 2 * (8 + 1), + "num_gpu_blocks_override": 2 * (8 + 1), }, { "block_size": 8, # Allow only 2 sequences of ~128 tokens in worst case. # Note 16 = 128/block_size - "forced_num_gpu_blocks": 2 * (16 + 1), + "num_gpu_blocks_override": 2 * (16 + 1), } ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{ diff --git a/vllm/config.py b/vllm/config.py index e27c8eb4fd257..5730997f639db 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -334,7 +334,7 @@ class CacheConfig: vLLM execution. swap_space: Size of the CPU swap space per GPU (in GiB). cache_dtype: Data type for kv cache storage. - forced_num_gpu_blocks: Number of GPU blocks to use. This overrides the + num_gpu_blocks_override: Number of GPU blocks to use. This overrides the profiled num_gpu_blocks if specified. Does nothing if None. """ @@ -344,14 +344,14 @@ def __init__( gpu_memory_utilization: float, swap_space: int, cache_dtype: str, - forced_num_gpu_blocks: Optional[int] = None, + num_gpu_blocks_override: Optional[int] = None, sliding_window: Optional[int] = None, enable_prefix_caching: bool = False, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB - self.forced_num_gpu_blocks = forced_num_gpu_blocks + self.num_gpu_blocks_override = num_gpu_blocks_override self.cache_dtype = cache_dtype self.sliding_window = sliding_window self.enable_prefix_caching = enable_prefix_caching diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a6197942645e4..d4b573992c06c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -55,7 +55,7 @@ class EngineArgs: max_cpu_loras: Optional[int] = None device: str = 'auto' ray_workers_use_nsight: bool = False - forced_num_gpu_blocks: Optional[int] = None + num_gpu_blocks_override: Optional[int] = None num_lookahead_slots: int = 0 # Related to Vision-language models such as llava @@ -246,7 +246,7 @@ def add_cli_args( 'the model executor, which can range from 0 to 1.' 
'If unspecified, will use the default value of 0.9.') parser.add_argument( - '--forced-num-gpu-blocks', + '--num-gpu-blocks-override', type=int, default=None, help='If specified, ignore GPU profiling result and use this number' @@ -426,7 +426,7 @@ def create_engine_config(self, ) -> EngineConfig: cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - self.forced_num_gpu_blocks, + self.num_gpu_blocks_override, model_config.get_sliding_window(), self.enable_prefix_caching) parallel_config = ParallelConfig( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ad037cf2e79bb..2e50dff02a014 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -183,11 +183,11 @@ def __init__( def _initialize_kv_caches(self) -> None: num_gpu_blocks, num_cpu_blocks = self.model_executor.determine_num_available_blocks() - if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks + if self.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override + logger.info(f"Overriding {num_gpu_blocks=} with " + f"{num_gpu_blocks_override=}") + num_gpu_blocks = num_gpu_blocks_override self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks From edad09c2627c558a1b0567ff832fb4b7dd753499 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:44:20 -0700 Subject: [PATCH 044/109] wip --- vllm/engine/llm_engine.py | 1 - vllm/executor/gpu_executor.py | 1 - vllm/executor/ray_gpu_executor.py | 28 ---------------------- vllm/executor/utils.py | 28 ---------------------- vllm/worker/worker.py | 39 ++++++++++++++++++++++++++++--- 5 files changed, 36 insertions(+), 61 deletions(-) delete mode 100644 vllm/executor/utils.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2e50dff02a014..d2f3f3aae42cd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -228,7 +228,6 @@ def from_engine_args( log_stats=not engine_args.disable_log_stats, usage_context=usage_context, ) - return engine def __reduce__(self): diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index e586cf810d78c..4c936fb81f2a6 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -4,7 +4,6 @@ ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid, raise_if_cache_size_invalid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index ca851dfc462b8..3647a46ef5277 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -10,7 +10,6 @@ VisionLanguageConfig) from vllm.engine.ray_utils import RayWorkerVllm, ray from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid, raise_if_cache_size_invalid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -169,7 +168,6 @@ def 
_init_workers_ray(self, placement_group: "PlacementGroup", rank, distributed_init_method, lora_config=lora_config, - #kv_cache_dtype=kv_cache_dtype, )) # Initialize the driver worker with the Worker class. @@ -185,7 +183,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, - #kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) @@ -196,35 +193,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers, ) - # """Profiles the memory usage and initializes the KV cache. - - # The engine will first conduct a profiling of the existing memory usage. - # Then, it calculate the maximum possible number of GPU and CPU blocks - # that can be allocated with the remaining free memory. - # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method - # from class :class:`~vllm.worker.Worker`. - - # Afterwards, as there may be multiple workers, - # we take the minimum number of blocks across all workers - # to ensure this can be applied to all of them. - - # Finally, the engine will initialize the KV cache - # with the calculated number of blocks. - - # .. tip:: - # You may limit the usage of GPU memory - # by adjusting the `gpu_memory_utilization` parameter. - # """ - def determine_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. num_blocks = self._run_workers( "determine_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, ) # Since we use a shared centralized controller, we take the minimum diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py deleted file mode 100644 index 89fe04434062f..0000000000000 --- a/vllm/executor/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# TODO -def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). 
Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 10396262101be..7c0af623be984 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -20,9 +20,6 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner from vllm.worker.worker_base import WorkerBase -from vllm.executor.utils import raise_if_cache_size_invalid - -# TODO move raise_if_cache_size_invalid class Worker(WorkerBase): @@ -125,6 +122,27 @@ def determine_num_available_blocks( gpu_memory_utilization: The fraction of the total GPU memory to use. cpu_swap_space: The size of the CPU swap space in bytes. """ + + # """Profiles the memory usage and initializes the KV cache. + + # The engine will first conduct a profiling of the existing memory usage. + # Then, it calculate the maximum possible number of GPU and CPU blocks + # that can be allocated with the remaining free memory. + # More details can be found in the + # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method + # from class :class:`~vllm.worker.Worker`. + + # Afterwards, as there may be multiple workers, + # we take the minimum number of blocks across all workers + # to ensure this can be applied to all of them. + + # Finally, the engine will initialize the KV cache + # with the calculated number of blocks. + + # .. tip:: + # You may limit the usage of GPU memory + # by adjusting the `gpu_memory_utilization` parameter. + # """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() @@ -327,3 +345,18 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): f"{compute_capability[0]}.{compute_capability[1]}. " "You can use float16 instead by explicitly setting the" "`dtype` flag in CLI, for example: --dtype=half.") + + +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). 
Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") From f93c845872d250f6137f92ee2660baac43972433 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:45:24 -0700 Subject: [PATCH 045/109] wip --- vllm/executor/gpu_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 4c936fb81f2a6..066502f9dc543 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, @@ -67,7 +67,7 @@ def _init_worker(self): self.driver_worker.load_model() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.determine_num_available_blocks() From d2d22186f05b061e131c3737174fcf49e06d7976 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:51:24 -0700 Subject: [PATCH 046/109] wip --- vllm/worker/cpu_worker.py | 8 +++++-- vllm/worker/neuron_worker.py | 3 +++ vllm/worker/worker.py | 43 ++++++++++++++---------------------- vllm/worker/worker_base.py | 4 ++++ 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 5250c15330e4d..4e51a8f10f4f1 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -172,8 +172,7 @@ def load_model(self): def determine_num_available_blocks(self) -> tuple[int, int]: # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. - cache_block_size = CPUCacheEngine.get_cache_block_size( - self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) + cache_block_size = self.get_cache_block_size_bytes() num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // cache_block_size) num_cpu_blocks = max(num_cpu_blocks, 0) @@ -299,3 +298,8 @@ def init_distributed_environment(self) -> None: ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) + + def get_cache_block_size_bytes(self) -> int: + return CPUCacheEngine.get_cache_block_size( + self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) + diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index dab70d884db4e..28bd10db72e55 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -70,3 +70,6 @@ def execute_model( output = self.model_runner.execute_model(seq_group_metadata_list) return output + + def get_cache_block_size_bytes(self) -> int: + raise NotImplementedError diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7c0af623be984..77cf5c180a27e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -42,7 +42,6 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, - #kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, ) -> None: self.model_config = model_config @@ -73,8 +72,7 @@ def __init__( is_driver_worker=is_driver_worker, vision_language_config=vision_language_config) # Uninitialized cache engine. Will be initialized by - # self.init_cache_engine(). 
- #self.cache_config = None + # initialize_cache. self.cache_engine = None self.gpu_cache = None @@ -115,34 +113,28 @@ def determine_num_available_blocks( ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. - - Args: - # TODO - block_size: The size of the cache block. - gpu_memory_utilization: The fraction of the total GPU memory to use. - cpu_swap_space: The size of the CPU swap space in bytes. """ - # """Profiles the memory usage and initializes the KV cache. + # """Profiles the memory usage and initializes the KV cache. - # The engine will first conduct a profiling of the existing memory usage. - # Then, it calculate the maximum possible number of GPU and CPU blocks - # that can be allocated with the remaining free memory. - # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method - # from class :class:`~vllm.worker.Worker`. + # The engine will first conduct a profiling of the existing memory usage. + # Then, it calculate the maximum possible number of GPU and CPU blocks + # that can be allocated with the remaining free memory. + # More details can be found in the + # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method + # from class :class:`~vllm.worker.Worker`. - # Afterwards, as there may be multiple workers, - # we take the minimum number of blocks across all workers - # to ensure this can be applied to all of them. + # Afterwards, as there may be multiple workers, + # we take the minimum number of blocks across all workers + # to ensure this can be applied to all of them. - # Finally, the engine will initialize the KV cache - # with the calculated number of blocks. + # Finally, the engine will initialize the KV cache + # with the calculated number of blocks. - # .. tip:: - # You may limit the usage of GPU memory - # by adjusting the `gpu_memory_utilization` parameter. - # """ + # .. tip:: + # You may limit the usage of GPU memory + # by adjusting the `gpu_memory_utilization` parameter. + # """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
torch.cuda.empty_cache() @@ -173,7 +165,6 @@ def determine_num_available_blocks( self.model_runner.remove_all_loras() gc.collect() torch.cuda.empty_cache() - return num_gpu_blocks, num_cpu_blocks diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 6bb605d954e16..42de84ab68f24 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -48,6 +48,10 @@ def execute_model(self, """Executes one model step on the given sequences.""" raise NotImplementedError + @abstractmethod + def get_cache_block_size_bytes() -> int: + raise NotImplementedError + @abstractmethod def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError From 2f960e7d7d0a9c7c6af8ee931a61c8368608e94d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:55:04 -0700 Subject: [PATCH 047/109] lint --- tests/lora/test_worker.py | 7 +++-- tests/spec_decode/test_spec_decode_worker.py | 12 ++++----- tests/spec_decode/utils.py | 4 ++- tests/worker/test_swap.py | 4 ++- vllm/engine/llm_engine.py | 7 ++--- vllm/executor/cpu_executor.py | 3 ++- vllm/executor/executor_base.py | 8 +++--- vllm/executor/gpu_executor.py | 9 ++----- vllm/executor/neuron_executor.py | 7 ++--- vllm/executor/ray_gpu_executor.py | 14 +++++----- vllm/spec_decode/spec_decode_worker.py | 12 ++++++--- vllm/worker/cpu_worker.py | 10 ++++--- vllm/worker/neuron_worker.py | 3 ++- vllm/worker/worker.py | 28 +++++++++++--------- vllm/worker/worker_base.py | 9 ++++--- 15 files changed, 74 insertions(+), 63 deletions(-) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 11370b3ea1c6b..3fd7d000d31b8 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -4,7 +4,7 @@ from unittest.mock import patch from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig) + SchedulerConfig, CacheConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker @@ -27,7 +27,10 @@ def test_worker_apply_lora(sql_lora_files): parallel_config=ParallelConfig(1, 1, False), scheduler_config=SchedulerConfig(32, 32, 32), device_config=DeviceConfig("cuda"), - cache_config=CacheConfig(block_size=16, gpu_memory_utilization=1., swap_space=0, cache_dtype="auto"), + cache_config=CacheConfig(block_size=16, + gpu_memory_utilization=1., + swap_space=0, + cache_dtype="auto"), local_rank=0, rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 511d600199a01..3c513e5d881f5 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -510,6 +510,7 @@ def test_init_device(): metrics_collector.init_gpu_tensors.assert_called_once() rejection_sampler.init_gpu_tensors.assert_called_once() + @torch.inference_mode() def test_initialize_cache(): """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer @@ -524,9 +525,7 @@ def test_initialize_cache(): worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, metrics_collector) - cache_config = MagicMock() - - kwargs = {"num_gpu_blocks":1024, "num_cpu_blocks": 1023} + kwargs = {"num_gpu_blocks": 1024, "num_cpu_blocks": 1023} worker.initialize_cache(**kwargs) draft_worker.initialize_cache.assert_called_once_with(**kwargs) @@ -539,9 +538,9 @@ def test_initialize_cache(): @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) 
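The `2 * 2 * 4096`-style constants in the parametrize decorators above are hand-computed cache-block sizes (they read as key-and-value times a 2-byte dtype times a hidden size). For a real model the per-block figure follows the usual KV accounting; the formula and the Llama-7B-like numbers below are illustrative, not lifted from the diff:

```python
def cache_block_size_bytes(block_size: int, num_layers: int,
                           num_kv_heads: int, head_size: int,
                           dtype_bytes: int) -> int:
    """Bytes needed for one KV-cache block: keys and values for block_size
    tokens across every layer."""
    per_token_per_layer = 2 * num_kv_heads * head_size * dtype_bytes  # K and V
    return block_size * num_layers * per_token_per_layer


# 16-token blocks, 32 layers, 32 KV heads of size 128, fp16:
print(cache_block_size_bytes(16, 32, 32, 128, 2))  # -> 8388608 (8 MiB/block)
```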
@torch.inference_mode() def test_determine_num_available_blocks(available_gpu_blocks: int, - available_cpu_blocks: int, - target_cache_block_size_bytes: int, - draft_kv_size_bytes: int): + available_cpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): """Verify SpecDecodeWorker correctly profiles num available GPU blocks. Specifically, it should run profiling in the scorer worker, and then evenly split the blocks between proposer and scorer worker. @@ -561,7 +560,6 @@ def test_determine_num_available_blocks(available_gpu_blocks: int, worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, metrics_collector) - num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks() target_worker.determine_num_available_blocks.assert_called_once() diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 0916d3d494211..4637826f254d6 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -129,7 +129,9 @@ def create_worker(cls: type, engine_config.cache_config.num_gpu_blocks = num_gpu_blocks engine_config.cache_config.num_cpu_blocks = 0 - worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) + worker.initialize_cache( + num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, + num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) return worker diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index b35bf583ecb46..893637d92f859 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -32,7 +32,9 @@ def test_swap() -> None: # Initialize the worker. worker.init_device() worker.load_model() - worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) + worker.initialize_cache( + num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, + num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) # Randomly initialize the cache. 
gpu_cache = worker.cache_engine.gpu_cache diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d2f3f3aae42cd..57be4835e5bed 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,12 +181,13 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: - num_gpu_blocks, num_cpu_blocks = self.model_executor.determine_num_available_blocks() - + num_gpu_blocks, num_cpu_blocks = (self.model_executor.determine_num_available_blocks( + )) + if self.cache_config.num_gpu_blocks_override is not None: num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override logger.info(f"Overriding {num_gpu_blocks=} with " - f"{num_gpu_blocks_override=}") + f"{num_gpu_blocks_override=}") num_gpu_blocks = num_gpu_blocks_override self.cache_config.num_gpu_blocks = num_gpu_blocks diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index b78f6d993453d..f44667f5112cc 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -63,7 +63,8 @@ def _init_worker(self): def determine_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: logger.info(f"# CPU blocks: {num_cpu_blocks}") self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 757549bdedbeb..63c3766b6221d 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,6 +1,5 @@ from abc import ABC, abstractmethod from typing import Dict, List, Optional -from dataclasses import dataclass from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, @@ -31,7 +30,6 @@ def __init__( ) -> None: raise NotImplementedError - @abstractmethod def determine_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV @@ -50,12 +48,12 @@ def determine_num_available_blocks(self) -> tuple[int, int]: """ raise NotImplementedError - @abstractmethod - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: """Given a fully-specified cache config, initialize the KV cache. This is separate from init_workers as profiling may be required to determine - the maxmimum allowed KV cache size. + the maximum allowed KV cache size. """ raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 066502f9dc543..caedea97dc6d4 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -66,23 +66,18 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def determine_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: # NOTE: This is logged in the executor because there can be >1 worker # with other executors. We could log in the engine level, but work # remains to abstract away the device for non-GPU configurations. 
- logger.info( - f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}" - ) + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index b907fd472704c..d9f52adc49f66 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -50,7 +50,8 @@ def _init_worker(self): def determine_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, @@ -70,10 +71,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - return self.driver_worker.remove_lora(lora_request) + return self.driver_worker.remove_lora(lora_id) def list_loras(self) -> List[int]: - return self.driver_worker.list_loras(lora_request) + return self.driver_worker.list_loras() def check_health(self) -> None: # NeuronExecutor will always be healthy as long as diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 3647a46ef5277..e71f0a4b7b820 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -150,7 +150,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype # Initialize the actual workers with the Worker class. for rank, (worker, (node_id, _)) in enumerate( @@ -195,9 +194,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", def determine_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "determine_num_available_blocks", - ) + num_blocks = self._run_workers("determine_num_available_blocks", ) # Since we use a shared centralized controller, we take the minimum # number of blocks across all workers to make sure all the memory @@ -207,8 +204,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: # NOTE: We log here to avoid multiple logs when number of workers is # greater than one. 
We could log in the engine, but not all executors @@ -219,8 +216,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 5f03b1edc07a6..a13748fd94059 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -102,7 +102,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: scorer cache is divided evenly between the proposer and scorer model KV, such that the number of blocks is equal in both KV caches. """ - num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.determine_num_available_blocks()) + num_gpu_blocks, num_cpu_blocks = ( + self.scorer_worker.determine_num_available_blocks()) scorer_cache_block_size_bytes = ( self.scorer_worker.get_cache_block_size_bytes()) @@ -114,12 +115,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: num_gpu_blocks) return new_num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: """Initialize the cache engine of the scorer and proposer workers. TODO """ - self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) @torch.inference_mode() def execute_model( diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 4e51a8f10f4f1..bb611b4b173f8 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -173,7 +173,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. cache_block_size = self.get_cache_block_size_bytes() - num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // cache_block_size) + num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // + cache_block_size) num_cpu_blocks = max(num_cpu_blocks, 0) # Note: To reuse the cache management procedure, @@ -182,7 +183,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: num_cpu_blocks = 0 return num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: # Note: To reuse the cache management procedure, # use cpu cache as 'gpu cache'. 
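# Sketch only: the even scorer/proposer split performed by
# SpecDecodeWorker.determine_num_available_blocks in the hunk above. The
# profiled GPU memory is re-divided so both KV caches hold the same number
# of blocks; the block sizes below are invented for illustration.
def split_blocks_evenly(scorer_block_bytes: int,
                        proposer_block_bytes: int,
                        num_gpu_blocks: int) -> int:
    return (scorer_block_bytes * num_gpu_blocks) // (
        scorer_block_bytes + proposer_block_bytes)

# e.g. 16 MiB scorer blocks and 1 MiB proposer blocks: 1000 scorer-only
# blocks become 941 blocks in each of the two caches.
assert split_blocks_evenly(16, 1, 1000) == 941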
assert num_cpu_blocks == 0 @@ -301,5 +303,5 @@ def init_distributed_environment(self) -> None: def get_cache_block_size_bytes(self) -> int: return CPUCacheEngine.get_cache_block_size( - self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) - + self.cache_config.block_size, self.cache_config.cache_dtype, + self.model_config, self.parallel_config) diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 28bd10db72e55..d862600c5c934 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -51,7 +51,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: return num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: assert num_cpu_blocks == 0 assert num_gpu_blocks == self.scheduler_config.max_num_seqs self.cache_config.num_gpu_blocks = num_gpu_blocks diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 77cf5c180a27e..4a273347927af 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -108,9 +108,7 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def determine_num_available_blocks( - self, - ) -> Tuple[int, int]: + def determine_num_available_blocks(self, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. """ @@ -156,9 +154,10 @@ def determine_num_available_blocks( cache_block_size = self.get_cache_block_size_bytes() num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // - cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) if self.model_runner.lora_manager: @@ -167,9 +166,11 @@ def determine_num_available_blocks( torch.cuda.empty_cache() return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.model_config.max_model_len) self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks @@ -184,7 +185,6 @@ def _init_cache_engine(self): self.gpu_cache = self.cache_engine.gpu_cache self.model_runner.set_block_size(self.cache_engine.block_size) - def _warm_up_model(self) -> None: if not self.model_config.enforce_eager: self.model_runner.capture_model(self.gpu_cache) @@ -265,8 +265,9 @@ def vocab_size(self) -> int: def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. 
""" - return CacheEngine.get_cache_block_size( - self.cache_config, self.model_config, self.parallel_config) + return CacheEngine.get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) def init_distributed_environment( @@ -338,7 +339,8 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): "`dtype` flag in CLI, for example: --dtype=half.") -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, + max_model_len) -> None: if num_gpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " "Try increasing `gpu_memory_utilization` when " diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 42de84ab68f24..4675dbd4b3149 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,11 +1,12 @@ from abc import ABC, abstractmethod -from typing import Dict, List, Optional +from typing import Dict, List from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata class WorkerBase(ABC): + @abstractmethod def init_device(self) -> None: """Initialize device state, such as loading the model or other on-device @@ -32,10 +33,11 @@ def determine_num_available_blocks(self) -> tuple[int, int]: raise NotImplementedError @abstractmethod - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: """Given a fully-specified cache config, initialize the KV cache. This is separate from init_workers as profiling may be required to determine - the maxmimum allowed KV cache size. + the maximum allowed KV cache size. """ raise NotImplementedError @@ -66,6 +68,7 @@ def list_loras(self) -> List[int]: class LoraNotSupportedWorkerBase(WorkerBase): + def add_lora(self, lora_request: LoRARequest) -> bool: raise ValueError(f"{type(self)} does not support LoRA") From 68552e105c997892ff2ea65128025bd1c90f5fb0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 20:06:31 -0700 Subject: [PATCH 048/109] wip --- vllm/engine/llm_engine.py | 4 ++-- vllm/worker/worker.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 57be4835e5bed..fc2d476cf5343 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,8 +181,8 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: - num_gpu_blocks, num_cpu_blocks = (self.model_executor.determine_num_available_blocks( - )) + num_gpu_blocks, num_cpu_blocks = ( + self.model_executor.determine_num_available_blocks()) if self.cache_config.num_gpu_blocks_override is not None: num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 4a273347927af..24c5ab6ff6c19 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -115,11 +115,14 @@ def determine_num_available_blocks(self, ) -> Tuple[int, int]: # """Profiles the memory usage and initializes the KV cache. - # The engine will first conduct a profiling of the existing memory usage. - # Then, it calculate the maximum possible number of GPU and CPU blocks + # The engine will first conduct a profiling of the existing memory + # usage. + # Then, it calculate the maximum possible number of GPU and CPU + # blocks # that can be allocated with the remaining free memory. 
# More details can be found in the - # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method + # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` + # method # from class :class:`~vllm.worker.Worker`. # Afterwards, as there may be multiple workers, From 42983ba1617aab7aa9b6ab5fb0a90e71d0b7c7c9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 20:09:35 -0700 Subject: [PATCH 049/109] import order --- tests/lora/test_worker.py | 4 ++-- vllm/worker/neuron_worker.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 3fd7d000d31b8..54594690f7922 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,8 +3,8 @@ import tempfile from unittest.mock import patch -from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, CacheConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index d862600c5c934..d37cd048031dc 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -4,8 +4,8 @@ import torch import torch.distributed -from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig, - SchedulerConfig, CacheConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig) from vllm.model_executor import set_random_seed from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.neuron_model_runner import NeuronModelRunner From 2d5dbb89378d94c025cf34ac6e7b9ba4126aa738 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 20:37:11 -0700 Subject: [PATCH 050/109] fix --- tests/worker/test_swap.py | 4 ++-- vllm/executor/ray_gpu_executor.py | 3 +++ vllm/spec_decode/spec_decode_worker.py | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 893637d92f859..8edb1cf05c08e 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -11,8 +11,8 @@ def test_swap() -> None: dtype="half", load_format="dummy") engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 100 - engine_config.cache_config.num_cpu_blocks = 100 + engine_config.cache_config.num_gpu_blocks = 1000 + engine_config.cache_config.num_cpu_blocks = 1000 # Create the worker. distributed_init_method = get_distributed_init_method( diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e71f0a4b7b820..1175a400fdc68 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -150,6 +150,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) lora_config = copy.deepcopy(self.lora_config) + cache_config = copy.deepcopy(self.cache_config) # Initialize the actual workers with the Worker class. 
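# Sketch only: the block-count arithmetic from the
# Worker.determine_num_available_blocks hunk earlier in the series, replayed
# with made-up numbers; nothing here is profiled from a real device.
GiB = 1024**3
total_gpu_memory = 80 * GiB            # assume an 80 GiB GPU
gpu_memory_utilization = 0.90          # assumed utilization setting
peak_memory = 20 * GiB                 # assumed profiling result
cache_block_size = 16 * 2**20          # assume 16 MiB per KV block
swap_space_bytes = 4 * GiB             # assumed CPU swap space

num_gpu_blocks = max(
    int((total_gpu_memory * gpu_memory_utilization - peak_memory)
        // cache_block_size), 0)
num_cpu_blocks = max(int(swap_space_bytes // cache_block_size), 0)
# -> 3328 GPU blocks and 256 swappable CPU blocks under these assumptions.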
for rank, (worker, (node_id, _)) in enumerate( @@ -163,6 +164,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", parallel_config, scheduler_config, device_config, + cache_config, local_rank, rank, distributed_init_method, @@ -177,6 +179,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", self.parallel_config, self.scheduler_config, self.device_config, + self.cache_config, driver_local_rank, driver_rank, distributed_init_method, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a13748fd94059..180dea26c0d6e 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -348,6 +348,9 @@ def rank(self): def device(self): return self.scorer_worker.device + def get_cache_block_size_bytes(self): + raise NotImplementedError + def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, proposer_cache_block_size_bytes: int, From ae2f7e6b6b97cf0847712e99da7c0ce3e8a92447 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 22:46:43 -0700 Subject: [PATCH 051/109] docstrings --- vllm/engine/llm_engine.py | 5 +++ vllm/executor/cpu_executor.py | 8 +++++ vllm/executor/executor_base.py | 28 +++++++---------- vllm/executor/gpu_executor.py | 5 +++ vllm/executor/neuron_executor.py | 5 +++ vllm/executor/ray_gpu_executor.py | 43 ++++++++++++++++---------- vllm/spec_decode/spec_decode_worker.py | 8 ++++- vllm/worker/cpu_worker.py | 25 +++++++++++++-- vllm/worker/neuron_worker.py | 15 +++++++++ vllm/worker/worker.py | 42 +++++++++---------------- vllm/worker/worker_base.py | 36 +++++++++++---------- 11 files changed, 142 insertions(+), 78 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fc2d476cf5343..1db6c740733a5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,6 +181,11 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: + """Initialize the KV cache in the worker(s). + + The workers will determine the number of blocks in both the GPU cache + and the swap CPU cache. + """ num_gpu_blocks, num_cpu_blocks = ( self.model_executor.determine_num_available_blocks()) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index f44667f5112cc..2bf97338da0ed 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -61,10 +61,18 @@ def _init_worker(self): self.driver_worker.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. logger.info(f"# CPU blocks: {num_cpu_blocks}") self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 63c3766b6221d..c18edd75d7a4d 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -32,28 +32,24 @@ def __init__( @abstractmethod def determine_num_available_blocks(self) -> tuple[int, int]: - """Profile the model on-device to determine the maximum number of KV - blocks that can be allocated. 
- - Returns a tuple[num_device_blocks, num_cpu_blocks], where - num_device_blocks refers to the number of blocks in the "active" KV - cache (e.g. where blocks are appended to), and num_cpu_blocks refers - to the number of blocks in the "passive" KV cache (e.g. where blocks - are swapped to). - - Examples: - - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. - - A future CPUExecutor can return [num_cpu_blocks, 0] or - [num_cpu_blocks, num_swap_cpu_blocks]. + """Determine the number of available blocks for the GPU KV cache and + swappable CPU KV cache. + + Normally, this should simply delegate to the underlying Worker. Some + ExecutorBase may require modification of the result, e.g. to ensure the + selected cache sizes are compatible with all workers. + + Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + are blocks that are "active" on the device and can be appended to. + num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + appended to. """ raise NotImplementedError @abstractmethod def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - """Given a fully-specified cache config, initialize the KV cache. This - is separate from init_workers as profiling may be required to determine - the maximum allowed KV cache size. + """Initialize the KV cache with the given size in blocks. """ raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index caedea97dc6d4..80ca5cb7367c5 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -67,9 +67,14 @@ def _init_worker(self): self.driver_worker.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ # NOTE: This is logged in the executor because there can be >1 worker # with other executors. We could log in the engine level, but work # remains to abstract away the device for non-GPU configurations. diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index d9f52adc49f66..57436a85cfa27 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -48,10 +48,15 @@ def _init_worker(self): self.driver_worker.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache by invoking the underlying worker. 
+ """ self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 1175a400fdc68..a508d1e8fe600 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -160,14 +160,14 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", local_rank = node_workers[node_id].index(rank) worker.init_worker.remote( lambda rank=rank, local_rank=local_rank: Worker( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - local_rank, - rank, - distributed_init_method, + model_config=model_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + cache_config=cache_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, lora_config=lora_config, )) @@ -175,14 +175,14 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", driver_rank = 0 driver_local_rank = node_workers[driver_node_id].index(driver_rank) self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - self.cache_config, - driver_local_rank, - driver_rank, - distributed_init_method, + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=driver_local_rank, + rank=driver_rank, + distributed_init_method=distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, is_driver_worker=True, @@ -196,6 +196,15 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", ) def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks. + + This invokes `determine_num_available_blocks` on each worker and takes + the min of the results, guaranteeing that the selected cache sizes are + compatible with all workers. + + Returns: + - tuple[num_gpu_blocks, num_cpu_blocks] + """ # Get the maximum number of blocks that can be allocated on GPU and CPU. num_blocks = self._run_workers("determine_num_available_blocks", ) @@ -209,6 +218,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache in all workers. + """ # NOTE: We log here to avoid multiple logs when number of workers is # greater than one. We could log in the engine, but not all executors diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 180dea26c0d6e..885bf537568e3 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -118,7 +118,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Initialize the cache engine of the scorer and proposer workers. - TODO """ self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) @@ -349,6 +348,13 @@ def device(self): return self.scorer_worker.device def get_cache_block_size_bytes(self): + """Return the size of a cache block in bytes. + + This function is only used to compose workers within a SpecDecodeWorker. + We leave composing a SpecDecodeWorker within a SpecDecodeWorker + undefined for now, although it could be implemented in the future. 
+ See https://arxiv.org/abs/2308.04623. + """ raise NotImplementedError diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index bb611b4b173f8..bd67f9f8850ac 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -170,6 +170,16 @@ def load_model(self): self.model_runner.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of blocks available for the KV cache. + + This determines how many KV blocks can fit into the configured CPU + KV cache space. + + Note that since vLLM assumes a block resides on GPU if it can be + modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0. + This allows us to reuse the scheduler of vLLM without generalizing it + to different devices. + """ # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. cache_block_size = self.get_cache_block_size_bytes() @@ -185,11 +195,20 @@ def determine_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache. Currently, swappable CPU memory is not + supported. + + Since this worker does not support GPUs, we use the num_gpu_blocks to + determine how many non-swappable CPU blocks to allocate. + """ + assert (num_cpu_blocks == 0 + ), f"{type(self)} does not support swappable cache" + # Note: To reuse the cache management procedure, # use cpu cache as 'gpu cache'. - assert num_cpu_blocks == 0 num_cpu_blocks = num_gpu_blocks - num_gpu_blocks = 0 + del num_gpu_blocks + self.cache_config.num_gpu_blocks = num_cpu_blocks self.cache_config.num_cpu_blocks = 0 @@ -302,6 +321,8 @@ def init_distributed_environment(self) -> None: parallel_config.pipeline_parallel_size) def get_cache_block_size_bytes(self) -> int: + """Return the size in bytes of a single KV cache block. + """ return CPUCacheEngine.get_cache_block_size( self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index d37cd048031dc..6136d50d0c068 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -41,6 +41,12 @@ def load_model(self): self.model_runner.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks. + + Swapping is not yet supported, so always return num_cpu_blocks=0. + + We configure num_gpu_blocks to be equal to max_num_seqs. + """ # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. @@ -53,8 +59,13 @@ def determine_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache. + """ + + # Different values are not tested. assert num_cpu_blocks == 0 assert num_gpu_blocks == self.scheduler_config.max_num_seqs + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks @@ -73,4 +84,8 @@ def execute_model( return output def get_cache_block_size_bytes(self) -> int: + """Determine the size in bytes of a cache block. + + This is required for speculative decoding; it is not yet implemented. 
+ """ raise NotImplementedError diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 24c5ab6ff6c19..b46229c5b6943 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -108,34 +108,18 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def determine_num_available_blocks(self, ) -> Tuple[int, int]: - """Profiles the peak memory usage of the model and returns the maximum - number of GPU and CPU cache blocks that can be allocated. - """ + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. - # """Profiles the memory usage and initializes the KV cache. - - # The engine will first conduct a profiling of the existing memory - # usage. - # Then, it calculate the maximum possible number of GPU and CPU - # blocks - # that can be allocated with the remaining free memory. - # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` - # method - # from class :class:`~vllm.worker.Worker`. - - # Afterwards, as there may be multiple workers, - # we take the minimum number of blocks across all workers - # to ensure this can be applied to all of them. - - # Finally, the engine will initialize the KV cache - # with the calculated number of blocks. - - # .. tip:: - # You may limit the usage of GPU memory - # by adjusting the `gpu_memory_utilization` parameter. - # """ + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() @@ -171,6 +155,10 @@ def determine_num_available_blocks(self, ) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Allocate GPU and CPU KV cache with the specified number of blocks. + + This also warms up the model, which may record CUDA graphs. + """ raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 4675dbd4b3149..e3027c406ffeb 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -6,6 +6,9 @@ class WorkerBase(ABC): + """Worker interface that allows vLLM to cleanly separate implementations for + different hardware. + """ @abstractmethod def init_device(self) -> None: @@ -16,28 +19,23 @@ def init_device(self) -> None: @abstractmethod def determine_num_available_blocks(self) -> tuple[int, int]: - """Profile the model on-device to determine the maximum number of KV - blocks that can be allocated. - - Returns a tuple[num_device_blocks, num_cpu_blocks], where - num_device_blocks refers to the number of blocks in the "active" KV - cache (e.g. where blocks are appended to), and num_cpu_blocks refers - to the number of blocks in the "passive" KV cache (e.g. where blocks - are swapped to). - - Examples: - - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. - - A future CPUExecutor can return [num_cpu_blocks, 0] or - [num_cpu_blocks, num_swap_cpu_blocks]. + """Determine the number of available blocks for the GPU KV cache and + swappable CPU KV cache. 
+ + The implementation may run profiling or other heuristics to determine + the size of caches. + + Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + are blocks that are "active" on the device and can be appended to. + num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + appended to. """ raise NotImplementedError @abstractmethod def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - """Given a fully-specified cache config, initialize the KV cache. This - is separate from init_workers as profiling may be required to determine - the maximum allowed KV cache size. + """Initialize the KV cache with the given size in blocks. """ raise NotImplementedError @@ -52,6 +50,9 @@ def execute_model(self, @abstractmethod def get_cache_block_size_bytes() -> int: + """Return the size of a single cache block, in bytes. Used in + speculative decoding. + """ raise NotImplementedError @abstractmethod @@ -68,6 +69,9 @@ def list_loras(self) -> List[int]: class LoraNotSupportedWorkerBase(WorkerBase): + """Partial implementation of WorkerBase that raises exceptions when LoRA + methods are invoked. + """ def add_lora(self, lora_request: LoRARequest) -> bool: raise ValueError(f"{type(self)} does not support LoRA") From fa8705de390cc727acc5a094abbba2f070de27dd Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sat, 6 Apr 2024 22:29:36 -0700 Subject: [PATCH 052/109] wip --- vllm/executor/gpu_executor.py | 71 +++++++++++++++++++++++++- vllm/spec_decode/spec_decode_worker.py | 4 ++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 80ca5cb7367c5..ac7e4c5dda744 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -33,14 +33,81 @@ def __init__( self.scheduler_config = scheduler_config self.device_config = device_config self.vision_language_config = vision_language_config + self.speculative_config = speculative_config - assert (not speculative_config - ), "Speculative decoding not yet supported for GPU backend" + #assert (not speculative_config + # ), "Speculative decoding not yet supported for GPU backend" # Instantiate the worker and load the model to GPU. 
self._init_worker() def _init_worker(self): + if self.speculative_config is None: + self._init_non_spec_worker() + else: + self._init_spec_worker() + + def _init_spec_worker(self): + from vllm.worker.worker import Worker + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.spec_decode.multi_step_worker import MultiStepWorker + + #from vllm.worker.multi_step_worker import MultiStepWorker # pylint: disable=import-outside-toplevel + #from vllm.worker.single_tp_worker import SingleTpWorker # pylint: disable=import-outside-toplevel + #from vllm.worker.draft_target_worker import DraftTargetWorker # pylint: disable=import-outside-toplevel + + #scheduler_config: "SchedulerConfig" = worker_kwargs.pop( + # "scheduler_config") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + + target_worker = Worker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + + from vllm.spec_decode.multi_step_worker import MultiStepWorker + draft_worker = MultiStepWorker( + model_config=self.speculative_config.draft_model_config, + parallel_config=self.speculative_config.draft_parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.model_executor.layers.rejection_sampler import RejectionSampler + spec_decode_worker = SpecDecodeWorker( + proposer_worker=draft_worker, + scorer_worker=target_worker, + rejection_sampler=RejectionSampler(), + ) + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + self.driver_worker = spec_decode_worker + + self.driver_worker.init_device() + #self.driver_worker.load_model() + + def _init_non_spec_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker from vllm.worker.worker import Worker diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 885bf537568e3..d555f27650e19 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -87,6 +87,10 @@ def init_device(self) -> None: self.scorer_worker.init_device() self.proposer_worker.init_device() + # TODO separate from init_device? 
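# Sketch only: what the MultiStepWorker wired in above does conceptually --
# run the draft model k steps autoregressively to propose candidate tokens.
# `draft_step` is a hypothetical single-step callable; the real worker
# operates on SequenceGroupMetadata and SamplerOutput objects instead of
# plain token lists.
def propose_tokens(draft_step, token_ids, k: int):
    proposals = []
    for _ in range(k):
        next_token = draft_step(token_ids + proposals)
        proposals.append(next_token)
    return proposals

# e.g. a toy draft model that always proposes the next integer:
assert propose_tokens(lambda ids: ids[-1] + 1, [7], 3) == [8, 9, 10]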
+ self.scorer_worker.load_model() + self.proposer_worker.load_model() + self._metrics.init_gpu_tensors(self.rank) self.rejection_sampler.init_gpu_tensors(self.rank) self.scorer = BatchExpansionTop1Scorer( From 84953210e527c011704974435ae1b61ed7296a26 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sat, 6 Apr 2024 22:36:27 -0700 Subject: [PATCH 053/109] wip --- tests/spec_decode/e2e/test_correctness.py | 3 +++ vllm/engine/llm_engine.py | 10 ++++++---- vllm/executor/gpu_executor.py | 5 ++++- vllm/spec_decode/spec_decode_worker.py | 9 +++++---- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index b5a6fcb7900a3..c427fbc7a05bb 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -11,6 +11,9 @@ "speculative_model": "facebook/opt-125m", "num_speculative_tokens": 5, + # Skip cuda graph recording for fast test. + "enforce_eager": True, + # Required for spec decode. "use_v2_block_manager": True }]) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1c639af696544..9ca809f51d0f8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -709,12 +709,14 @@ def step(self) -> List[RequestOutput]: if not scheduler_outputs.is_empty(): output = self.model_executor.execute_model( - seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, - scheduler_outputs.blocks_to_swap_out, - scheduler_outputs.blocks_to_copy) + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, + blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, + blocks_to_copy=scheduler_outputs.blocks_to_copy, + num_lookahead_slots=scheduler_outputs.num_lookahead_slots) else: output = [] - + return self._process_model_outputs(output, scheduler_outputs) def do_log_stats(self) -> None: diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index ac7e4c5dda744..80ec79ba3c3c6 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -154,12 +154,15 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int, + ) -> SamplerOutput: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, + num_lookahead_slots=num_lookahead_slots, ) return output diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index d555f27650e19..a2c9a9944af5b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -135,7 +135,7 @@ def execute_model( blocks_to_swap_in: Optional[Dict[int, int]], blocks_to_swap_out: Optional[Dict[int, int]], blocks_to_copy: Optional[Dict[int, List[int]]], - num_spec_tokens: int, + num_lookahead_slots: int, ) -> List[SamplerOutput]: """Perform speculative decoding on the input batch. """ @@ -146,7 +146,7 @@ def execute_model( # If no spec tokens, call the proposer and scorer workers normally. # Used for prefill. 
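# Sketch only: the three phases that _run_speculative_decoding_step drives
# with real workers. `propose`, `score` and `rejection_sample` are
# hypothetical stand-ins for the proposer worker, the batch-expansion scorer
# and the RejectionSampler.
def speculative_step(propose, score, rejection_sample, k: int):
    proposals = propose(k)              # draft model: k tokens per sequence
    proposal_scores = score(proposals)  # target model: one scoring pass
    # Accepted ids are typically [batch, k + 1]; rejected slots come back
    # as -1, matching the -1 filtering in the engine hunks later on.
    return rejection_sample(proposals, proposal_scores)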
- if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0: + if num_lookahead_slots == 0 or len(seq_group_metadata_list) == 0: return self._run_no_spec( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, @@ -159,7 +159,7 @@ def execute_model( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - k=num_spec_tokens, + k=num_lookahead_slots, ) @nvtx_range("spec_decode_worker._run_no_spec") @@ -180,7 +180,8 @@ def _run_no_spec( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - return_python_output=False) + #return_python_output=False + ) sampler_output = self.scorer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, From b63975bd45ea1a1770a8c742dc732b91e6f3cbf9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 00:06:56 -0700 Subject: [PATCH 054/109] wip --- tests/spec_decode/e2e/test_correctness.py | 18 ++-- vllm/core/scheduler.py | 14 +-- vllm/engine/llm_engine.py | 121 +++++++++++++++++++++- vllm/model_executor/layers/sampler.py | 8 +- vllm/spec_decode/batch_expansion.py | 3 +- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 4 +- 7 files changed, 145 insertions(+), 25 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index c427fbc7a05bb..782bd9d0cecbd 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -21,14 +21,14 @@ @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_config(test_llm_generator): - output_len = 1024 + output_len = 128 temperature = 0.0 prompts = [ "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", + #"The president of the United States is", + #"The capital of France is", + #"The future of AI is", ] sampling_params = SamplingParams( @@ -37,11 +37,11 @@ def test_spec_decode_config(test_llm_generator): temperature=temperature, ) - with pytest.raises( - AssertionError, - match="Speculative decoding not yet supported for GPU backend"): - get_token_ids_from_llm_generator(test_llm_generator, prompts, - sampling_params) + #with pytest.raises( + # AssertionError, + # match="Speculative decoding not yet supported for GPU backend"): + get_token_ids_from_llm_generator(test_llm_generator, prompts, + sampling_params) def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 0ae53f9374960..e176848c04909 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -753,9 +753,10 @@ def _schedule_default(self) -> SchedulerOutputs: blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy, swapped_in.blocks_to_copy), ignored_seq_groups=prefills.ignored_seq_groups, - num_lookahead_slots=(prefills.num_lookahead_slots + - running_scheduled.num_lookahead_slots + - swapped_in.num_lookahead_slots), + num_lookahead_slots=running_scheduled.num_lookahead_slots, + #num_lookahead_slots=(prefills.num_lookahead_slots + + # running_scheduled.num_lookahead_slots + + # swapped_in.num_lookahead_slots), ) def _schedule_chunked_prefill(self): @@ -842,9 +843,10 @@ def _schedule_chunked_prefill(self): blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy, swapped_in.blocks_to_copy), ignored_seq_groups=prefills.ignored_seq_groups, - 
num_lookahead_slots=(prefills.num_lookahead_slots + - running_scheduled.num_lookahead_slots + - swapped_in.num_lookahead_slots), + num_lookahead_slots=running_scheduled.num_lookahead_slots, + #num_lookahead_slots=(prefills.num_lookahead_slots + + # running_scheduled.num_lookahead_slots + + # swapped_in.num_lookahead_slots), ) def _schedule(self) -> SchedulerOutputs: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9ca809f51d0f8..1bd4129090c2d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -626,14 +626,38 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: + + + if not isinstance(output, list): + all_output = [output] + else: + all_output = output + + scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups + + # Organize list of sampler output by sequence group. + output_by_sequence_group: List[List[SequenceGroupOutputs]] = [ + [] for _ in scheduled_seq_groups + ] + for step in output: + for i, sequence_group_output in enumerate(step): + output_by_sequence_group[i].append(sequence_group_output) + now = time.time() + # Update the scheduled sequence groups with the model outputs. - scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output): + for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): + seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - self._process_sequence_group_outputs(seq_group, outputs) + + assert len(outputs) > 0 + # TODO can spec decode go through second path? + if len(outputs) > 1: + self._process_sequence_group_outputs_multi_step(seq_group, outputs) + else: + self._process_sequence_group_outputs(seq_group, outputs[0]) # Free the finished sequence groups. self.scheduler.free_finished_seq_groups() @@ -654,6 +678,91 @@ def _process_model_outputs( self.stat_logger.log(self._get_stats(scheduler_outputs)) return request_outputs + def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): + seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + + assert seqs + #if not seqs: + # return [] + + assert len(seqs) == 1, ("Beam search not supported in speculative " + "decoding.") + seq = seqs[0] + + # Since there's only one sequence per sequence group, we can take the + # first sample. + samples = [outputs[step].samples[0] for step in range(len(outputs))] + + # -1 means the output token is not valid (eg. due to spec decode + # rejecting tokens). + valid_samples = [ + sample for sample in samples if sample.output_token != -1 + ] + + # Draft target worker pads all outputs with -1 to have same length. + output_token_ids = [sample.output_token for sample in valid_samples] + #successes = [sample.success for sample in samples] + + ## Truncate to max_tokens if necessary. + #remaining_tokens = seq_group.sampling_params.max_tokens - ( + # seq.get_output_len() + len(output_token_ids)) + #if remaining_tokens < 0: + # valid_samples = valid_samples[:remaining_tokens] + # output_token_ids = output_token_ids[:remaining_tokens] + + ## Truncate any tokens after EOS. This is required as spec decode + ## generates tokens in fixed blocks, which may go beyond the EOS token. 
+ #if not seq_group.sampling_params.ignore_eos: + # eos_token_id = self.tokenizer.get_lora_tokenizer( + # seq.lora_request).eos_token_id + # # Avoiding .index calls as exception throwing in the happy path + # # is expensive. + # for i in range(len(output_token_ids)): + # if output_token_ids[i] == eos_token_id: + # output_token_ids = output_token_ids[:i + 1] + # valid_samples = valid_samples[:i + 1] + # break + + #output_logprobs = [sample.logprobs for sample in valid_samples] + + ## Use the last sample for the sequence as it will have + ## the speculation and num_unprocessed_tokens for all the + ## previous samples (they are cumulative when it comes + ## to those two attributes). + #speculation = valid_samples[-1].speculation + #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens + + for output_token_id in output_token_ids: + from vllm.sequence import Logprob + seq.append_token_id( + token_id=output_token_id, + logprobs={output_token_id: Logprob(0.0)}, + ) + print(f'Appended token id {output_token_id=}') + + #seq.append_token_ids(output_token_ids, + # output_logprobs, + # ) + # #num_unprocessed_tokens=num_unprocessed_tokens) + ##seq.set_last_speculation(speculation) + + #if not all(successes): + # seq.set_status_to_failed() + + #if decode: + # self._decode_sequence(seq, + # seq_group.sampling_params, + # token_ids=seq.get_token_ids(), + # unseen_token_ids=output_token_ids, + # prefix_offset=seq.prefix_offset, + # read_offset=seq.read_offset) + #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, + # output_token_ids) + # TODO pass output token ids + self._check_stop(seq, seq_group.sampling_params) + if seq.is_finished(): + self.scheduler.free_seq(seq) + def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. @@ -804,9 +913,11 @@ def _check_stop(self, seq: Sequence, if seq.get_len() > self.scheduler_config.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return - + + breakpoint() # Check if the sequence has reached max_tokens. 
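# Sketch only: the regrouping done in _process_model_outputs above. The
# executor now returns one SamplerOutput per decoding step, and the engine
# transposes them to per-sequence-group lists before appending tokens;
# plain lists stand in for the SamplerOutput/SequenceGroupOutput objects.
def group_by_sequence(steps):
    grouped = [[] for _ in steps[0]]    # grouped[seq_group][step]
    for step in steps:
        for i, seq_group_output in enumerate(step):
            grouped[i].append(seq_group_output)
    return grouped

# Rejected speculative positions are reported as -1 and skipped.
def valid_token_ids(sampled_ids):
    return [token_id for token_id in sampled_ids if token_id != -1]

assert group_by_sequence([[1, 2], [3, 4]]) == [[1, 3], [2, 4]]
assert valid_token_ids([5, 6, -1, -1]) == [5, 6]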
- if seq.get_output_len() == sampling_params.max_tokens: + if seq.get_output_len() >= sampling_params.max_tokens: + # TODO should cap block seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cb1480de03e3a..4f0cc4405e814 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -684,4 +684,10 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return SamplerOutput(outputs=sampler_output) + + return SamplerOutput( + outputs=sampler_output, + # TODO + sampled_token_probs=torch.empty((len(sampler_output), 50_272), device='cuda', dtype=torch.float32), + sampled_token_ids=torch.empty((len(sampler_output), 1), device='cuda', dtype=torch.long), + ) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index e0b75837e8a39..89be25252c2c6 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -83,7 +83,8 @@ def score_proposals( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - return_python_output=False) + #return_python_output=False + ) all_tokens, all_probs = self._contract_batch( original_bs=len(seq_group_metadata_list), diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 73b6e201c67a9..c817f54d7fe3c 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -340,7 +340,7 @@ def _merge_outputs( return proposal_tokens, proposal_probs, proposal_lens sampler_output = maybe_sampler_output - + proposal_tokens, proposal_probs = sampler_output_to_torch( sampler_output) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a2c9a9944af5b..85667a6c3dd49 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -5,7 +5,7 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, - SequenceGroupOutput, SequenceOutput) + SequenceGroupOutput, SequenceOutput, Logprob) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) @@ -316,7 +316,7 @@ def _create_output_sampler_list( parent_seq_id=seq_id, output_token=token_id, # TODO Add verifier logprobs. - logprobs={token_id: 0.0}, + logprobs={token_id: Logprob(0.0)}, ) ], prompt_logprobs=None, From cb23e8ca4e6ff3c667b44e9ce4f179f629740008 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 00:07:10 -0700 Subject: [PATCH 055/109] wip --- vllm/engine/llm_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1bd4129090c2d..15ef7df26b0b0 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -914,7 +914,6 @@ def _check_stop(self, seq: Sequence, seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return - breakpoint() # Check if the sequence has reached max_tokens. 
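# Sketch only: the shapes behind the placeholder tensors added to
# SamplerOutput in the sampler.py hunk above. The values are dummies; real
# probabilities and ids come from the sampler, and the vocabulary size
# depends on the model rather than the hard-coded constant marked TODO.
import torch

batch_size, vocab_size = 4, 32_000
sampled_token_probs = torch.zeros(batch_size, vocab_size)          # [batch, vocab]
sampled_token_ids = torch.zeros(batch_size, 1, dtype=torch.long)   # [batch, 1]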
if seq.get_output_len() >= sampling_params.max_tokens: # TODO should cap block From 143ca28e5de41f1d32e730bc3e9da2a954a2024e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 00:14:02 -0700 Subject: [PATCH 056/109] wip --- vllm/executor/cpu_executor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 2bf97338da0ed..835ba18ab756a 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -80,7 +80,8 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int) -> SamplerOutput: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, From d8d4725d3365e25c67cbb115e5a437fd7e574fd0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 13:41:20 -0700 Subject: [PATCH 057/109] fix --- tests/spec_decode/e2e/test_correctness.py | 7 +++++-- vllm/model_executor/layers/sampler.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 782bd9d0cecbd..fc5640d23ab56 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -7,10 +7,13 @@ "common_llm_kwargs", [{ # Use a small model for a fast test. - "model": "facebook/opt-125m", - "speculative_model": "facebook/opt-125m", + "model": "JackFram/llama-68m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + # Skip real loading for fast test. + "load_format": "dummy", + # Skip cuda graph recording for fast test. "enforce_eager": True, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 4f0cc4405e814..9540a3d89bd81 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -688,6 +688,6 @@ def _build_sampler_output( return SamplerOutput( outputs=sampler_output, # TODO - sampled_token_probs=torch.empty((len(sampler_output), 50_272), device='cuda', dtype=torch.float32), + sampled_token_probs=torch.empty((len(sampler_output), 32_000), device='cuda', dtype=torch.float32), sampled_token_ids=torch.empty((len(sampler_output), 1), device='cuda', dtype=torch.long), ) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 85667a6c3dd49..f665c3b72219c 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -15,7 +15,9 @@ split_batch_by_proposal_len) from vllm.worker.worker import Worker from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.logger import init_logger +logger = init_logger(__name__) class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. @@ -144,6 +146,8 @@ def execute_model( "speculative decoding " "requires non-None seq_group_metadata_list") + logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") + # If no spec tokens, call the proposer and scorer workers normally. # Used for prefill. 
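# Sketch only: how the new num_lookahead_slots argument can be handled by a
# backend that does not support speculative lookahead (compare the
# cpu_executor.py hunk above and the worker.py assertion added later in the
# series). The class below is a hypothetical stand-in, not vLLM code.
class NoLookaheadExecutor:

    def __init__(self, driver_worker):
        self.driver_worker = driver_worker

    def execute_model(self, seq_group_metadata_list, blocks_to_swap_in,
                      blocks_to_swap_out, blocks_to_copy,
                      num_lookahead_slots: int = 0):
        assert num_lookahead_slots == 0, (
            "this executor does not support lookahead slots")
        return self.driver_worker.execute_model(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_swap_out=blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy)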
if num_lookahead_slots == 0 or len(seq_group_metadata_list) == 0: @@ -174,6 +178,7 @@ def _run_no_spec( proposer and scorer model so that the KV cache is consistent between the two. """ + logger.info("run proposer worker no spec") self.proposer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, @@ -183,6 +188,7 @@ def _run_no_spec( #return_python_output=False ) + logger.info("run target worker no spec") sampler_output = self.scorer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, @@ -214,11 +220,14 @@ def _run_speculative_decoding_step( sequence. """ + logger.info("get spec proposals") # Generate proposals using draft worker. proposals = self.proposer_worker.get_spec_proposals( seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, k) + #logger.info(f"score proposals {proposals=}") + logger.info(f"score proposals") proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, blocks_to_swap_in, @@ -228,9 +237,11 @@ def _run_speculative_decoding_step( proposals, ) + logger.info("verify proposals") accepted_token_ids = self._verify_tokens(seq_group_metadata_list, proposal_scores, proposals, k) + logger.info("create output list") return self._create_output_sampler_list(seq_group_metadata_list, accepted_token_ids, k) From b2728e03de0703d9e479bd9e0e4aa3f158f426f6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:03:53 -0700 Subject: [PATCH 058/109] wip --- tests/spec_decode/e2e/test_correctness.py | 54 +++++++++++++++++++++- vllm/spec_decode/spec_decode_worker.py | 55 ++++++++++++++++++++++- vllm/worker/worker.py | 3 ++ 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index fc5640d23ab56..28a88a750edb1 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -20,10 +20,14 @@ # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "tensor_parallel_size": 1, + }, +]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_config(test_llm_generator): +def test_spec_decode(test_llm_generator): output_len = 128 temperature = 0.0 @@ -46,6 +50,51 @@ def test_spec_decode_config(test_llm_generator): get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + "model": "JackFram/llama-68m", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Skip real loading for fast test. + "load_format": "dummy", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + # Expect failure as spec decode not supported by + # Ray backend. 
+ "tensor_parallel_size": 2, + }, +]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_xfail(test_llm_generator): + output_len = 128 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + with pytest.raises( + AssertionError, + match="Speculative decoding not yet supported for "): + get_token_ids_from_llm_generator(test_llm_generator, prompts, + sampling_params) def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): for llm in llm_generator: @@ -54,3 +103,4 @@ def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): del llm return token_ids + diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index f665c3b72219c..3802ed42f786f 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -19,6 +19,60 @@ logger = init_logger(__name__) +def create_spec_decode_worker(): + + from vllm.worker.worker import Worker + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.spec_decode.multi_step_worker import MultiStepWorker + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + + target_worker = Worker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + + from vllm.spec_decode.multi_step_worker import MultiStepWorker + draft_worker = MultiStepWorker( + model_config=self.speculative_config.draft_model_config, + parallel_config=self.speculative_config.draft_parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.model_executor.layers.rejection_sampler import RejectionSampler + spec_decode_worker = SpecDecodeWorker( + proposer_worker=draft_worker, + scorer_worker=target_worker, + rejection_sampler=RejectionSampler(), + ) + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + self.driver_worker = spec_decode_worker + + self.driver_worker.init_device() + #self.driver_worker.load_model() + class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. 
@@ -226,7 +280,6 @@ def _run_speculative_decoding_step( seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, k) - #logger.info(f"score proposals {proposals=}") logger.info(f"score proposals") proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b46229c5b6943..5d9a9acd763e7 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -205,7 +205,10 @@ def execute_model( blocks_to_swap_in: Optional[Dict[int, int]] = None, blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, + num_lookahead_slots: int = 0, ) -> Optional[SamplerOutput]: + assert (num_lookahead_slots == 0), "worker does not support lookahead slots" + if self.is_driver_worker: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) From 6250f6cf32842de588edfe58f93e942a64cfd5b6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:12:50 -0700 Subject: [PATCH 059/109] assertion --- tests/spec_decode/e2e/test_correctness.py | 26 ++++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 28a88a750edb1..92076d88ea836 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -1,4 +1,5 @@ import pytest +from itertools import cycle from vllm import SamplingParams @@ -26,30 +27,39 @@ }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("batch_size", [1, 10]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode(test_llm_generator): +def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): + """Run generation with speculative decoding on a batch. Verify the number + of output tokens is equal to the expected number. + """ output_len = 128 temperature = 0.0 prompts = [ "Hello, my name is", - #"The president of the United States is", - #"The capital of France is", - #"The future of AI is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ] + prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] + sampling_params = SamplingParams( max_tokens=output_len, ignore_eos=True, temperature=temperature, ) - #with pytest.raises( - # AssertionError, - # match="Speculative decoding not yet supported for GPU backend"): - get_token_ids_from_llm_generator(test_llm_generator, prompts, + batch_token_ids = get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) + # Expect a generation for each prompt in the batch. + assert len(batch_token_ids) == len(prompts) + + # TODO(cadedaniel) check for equality once block truncation is implemented. + assert all(len(token_ids) >= output_len for token_ids in batch_token_ids) + @pytest.mark.parametrize( "common_llm_kwargs", [{ From a930755de760545726cfcc9de5fc8d51a4b6fb71 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:18:19 -0700 Subject: [PATCH 060/109] fix --- vllm/model_executor/layers/sampler.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 9540a3d89bd81..7c7148b12229f 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -78,8 +78,15 @@ def forward( # Get the logprobs query results. 
prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) + + breakpoint() + + return _build_sampler_output(sample_results, sampling_metadata, - prompt_logprobs, sample_logprobs) + prompt_logprobs, sample_logprobs, + sampled_token_probs=probs, + sampled_token_ids=torch.empty((len(sampling_metadata.seq_groups), 1), device=probs.device, dtype=torch.long), + ) def _get_bin_counts_and_mask( @@ -668,6 +675,8 @@ def _build_sampler_output( sampling_metadata: SamplingMetadata, prompt_logprobs: List[Optional[PromptLogprobs]], sample_logprobs: List[SampleLogprobs], + sampled_token_ids: Optional[torch.Tensor] = None, + sampled_token_probs: Optional[torch.Tensor] = None, ) -> SamplerOutput: sampler_output = [] for (seq_group, sample_result, group_prompt_logprobs, @@ -687,7 +696,6 @@ def _build_sampler_output( return SamplerOutput( outputs=sampler_output, - # TODO - sampled_token_probs=torch.empty((len(sampler_output), 32_000), device='cuda', dtype=torch.float32), - sampled_token_ids=torch.empty((len(sampler_output), 1), device='cuda', dtype=torch.long), + sampled_token_probs=sampled_token_probs, + sampled_token_ids=sampled_token_ids, ) From 5b896a3fe4e9614ee2557a9361cb381f88eeb15d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:18:43 -0700 Subject: [PATCH 061/109] fix --- vllm/model_executor/layers/sampler.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 7c7148b12229f..71807b25834a7 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -79,9 +79,7 @@ def forward( prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - breakpoint() - - + # TODO gate by config return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs, sampled_token_probs=probs, From bb43b530ce2eeecaa29a8108dc17e0f24b80b099 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:19:23 -0700 Subject: [PATCH 062/109] lint --- tests/spec_decode/e2e/test_correctness.py | 29 +++++++++++++---------- vllm/engine/llm_engine.py | 11 +++++---- vllm/executor/gpu_executor.py | 19 ++++++++------- vllm/model_executor/layers/sampler.py | 14 +++++++---- vllm/spec_decode/batch_expansion.py | 2 +- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 10 ++++---- vllm/worker/worker.py | 3 ++- 8 files changed, 52 insertions(+), 38 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 92076d88ea836..36a66ea2ec389 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -51,8 +51,9 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): temperature=temperature, ) - batch_token_ids = get_token_ids_from_llm_generator(test_llm_generator, prompts, - sampling_params) + batch_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + prompts, + sampling_params) # Expect a generation for each prompt in the batch. assert len(batch_token_ids) == len(prompts) @@ -60,6 +61,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): # TODO(cadedaniel) check for equality once block truncation is implemented. 
assert all(len(token_ids) >= output_len for token_ids in batch_token_ids) + @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -77,13 +79,15 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - # Expect failure as spec decode not supported by - # Ray backend. - "tensor_parallel_size": 2, - }, -]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + # Expect failure as spec decode not supported by + # Ray backend. + "tensor_parallel_size": 2, + }, + ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_xfail(test_llm_generator): @@ -100,12 +104,12 @@ def test_spec_decode_xfail(test_llm_generator): temperature=temperature, ) - with pytest.raises( - AssertionError, - match="Speculative decoding not yet supported for "): + with pytest.raises(AssertionError, + match="Speculative decoding not yet supported for "): get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) + def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): for llm in llm_generator: outputs = llm.generate(prompts, sampling_params, use_tqdm=True) @@ -113,4 +117,3 @@ def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): del llm return token_ids - diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 15ef7df26b0b0..1ca447890d4ca 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -627,7 +627,6 @@ def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: - if not isinstance(output, list): all_output = [output] else: @@ -646,7 +645,8 @@ def _process_model_outputs( now = time.time() # Update the scheduled sequence groups with the model outputs. - for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): + for scheduled_seq_group, outputs in zip(scheduled_seq_groups, + output_by_sequence_group): seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( @@ -655,7 +655,8 @@ def _process_model_outputs( assert len(outputs) > 0 # TODO can spec decode go through second path? if len(outputs) > 1: - self._process_sequence_group_outputs_multi_step(seq_group, outputs) + self._process_sequence_group_outputs_multi_step( + seq_group, outputs) else: self._process_sequence_group_outputs(seq_group, outputs[0]) @@ -825,7 +826,7 @@ def step(self) -> List[RequestOutput]: num_lookahead_slots=scheduler_outputs.num_lookahead_slots) else: output = [] - + return self._process_model_outputs(output, scheduler_outputs) def do_log_stats(self) -> None: @@ -913,7 +914,7 @@ def _check_stop(self, seq: Sequence, if seq.get_len() > self.scheduler_config.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return - + # Check if the sequence has reached max_tokens. 
if seq.get_output_len() >= sampling_params.max_tokens: # TODO should cap block diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 80ec79ba3c3c6..60c9a9ca3c788 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -75,7 +75,7 @@ def _init_spec_worker(self): vision_language_config=self.vision_language_config, is_driver_worker=True, ) - + from vllm.spec_decode.multi_step_worker import MultiStepWorker draft_worker = MultiStepWorker( model_config=self.speculative_config.draft_model_config, @@ -90,7 +90,7 @@ def _init_spec_worker(self): vision_language_config=self.vision_language_config, is_driver_worker=True, ) - + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.model_executor.layers.rejection_sampler import RejectionSampler spec_decode_worker = SpecDecodeWorker( @@ -150,13 +150,14 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_lookahead_slots: int, - ) -> SamplerOutput: + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int, + ) -> SamplerOutput: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 71807b25834a7..5c1017207878b 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -80,11 +80,17 @@ def forward( logprobs, sampling_metadata, sample_results) # TODO gate by config - return _build_sampler_output(sample_results, sampling_metadata, - prompt_logprobs, sample_logprobs, + return _build_sampler_output( + sample_results, + sampling_metadata, + prompt_logprobs, + sample_logprobs, sampled_token_probs=probs, - sampled_token_ids=torch.empty((len(sampling_metadata.seq_groups), 1), device=probs.device, dtype=torch.long), - ) + sampled_token_ids=torch.empty( + (len(sampling_metadata.seq_groups), 1), + device=probs.device, + dtype=torch.long), + ) def _get_bin_counts_and_mask( diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 89be25252c2c6..6be8c843cf7a1 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -84,7 +84,7 @@ def score_proposals( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, #return_python_output=False - ) + ) all_tokens, all_probs = self._contract_batch( original_bs=len(seq_group_metadata_list), diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index c817f54d7fe3c..73b6e201c67a9 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -340,7 +340,7 @@ def _merge_outputs( return proposal_tokens, proposal_probs, proposal_lens sampler_output = maybe_sampler_output - + proposal_tokens, proposal_probs = sampler_output_to_torch( sampler_output) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3802ed42f786f..12a70d402e98f 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ 
b/vllm/spec_decode/spec_decode_worker.py @@ -19,8 +19,9 @@ logger = init_logger(__name__) + def create_spec_decode_worker(): - + from vllm.worker.worker import Worker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker @@ -41,7 +42,7 @@ def create_spec_decode_worker(): vision_language_config=self.vision_language_config, is_driver_worker=True, ) - + from vllm.spec_decode.multi_step_worker import MultiStepWorker draft_worker = MultiStepWorker( model_config=self.speculative_config.draft_model_config, @@ -56,7 +57,7 @@ def create_spec_decode_worker(): vision_language_config=self.vision_language_config, is_driver_worker=True, ) - + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.model_executor.layers.rejection_sampler import RejectionSampler spec_decode_worker = SpecDecodeWorker( @@ -73,6 +74,7 @@ def create_spec_decode_worker(): self.driver_worker.init_device() #self.driver_worker.load_model() + class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. @@ -240,7 +242,7 @@ def _run_no_spec( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, #return_python_output=False - ) + ) logger.info("run target worker no spec") sampler_output = self.scorer_worker.execute_model( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 5d9a9acd763e7..941c062081290 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -207,7 +207,8 @@ def execute_model( blocks_to_copy: Optional[Dict[int, List[int]]] = None, num_lookahead_slots: int = 0, ) -> Optional[SamplerOutput]: - assert (num_lookahead_slots == 0), "worker does not support lookahead slots" + assert (num_lookahead_slots == 0 + ), "worker does not support lookahead slots" if self.is_driver_worker: assert seq_group_metadata_list is not None From cde3160fdd542b80abba0d9855c98d8a12d959ac Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:57:45 -0700 Subject: [PATCH 063/109] fix --- vllm/executor/gpu_executor.py | 2 +- vllm/model_executor/layers/sampler.py | 11 ++++++----- vllm/sequence.py | 10 ++++++++++ vllm/spec_decode/batch_expansion.py | 10 +++++++++- vllm/spec_decode/multi_step_worker.py | 10 +++++++++- vllm/spec_decode/util.py | 7 +++++++ 6 files changed, 42 insertions(+), 8 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 60c9a9ca3c788..ac445cd51a7e4 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -96,7 +96,7 @@ def _init_spec_worker(self): spec_decode_worker = SpecDecodeWorker( proposer_worker=draft_worker, scorer_worker=target_worker, - rejection_sampler=RejectionSampler(), + rejection_sampler=RejectionSampler(strict_mode=True), ) assert self.parallel_config.world_size == 1, ( diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c1017207878b..135bc13e8d7c7 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -85,11 +85,12 @@ def forward( sampling_metadata, prompt_logprobs, sample_logprobs, - sampled_token_probs=probs, - sampled_token_ids=torch.empty( - (len(sampling_metadata.seq_groups), 1), - device=probs.device, - dtype=torch.long), + #sampled_token_probs=probs, + ## TODO + #sampled_token_ids=torch.empty( + # (len(sampling_metadata.seq_groups), 1), + # device=probs.device, + # dtype=torch.long), ) diff --git a/vllm/sequence.py b/vllm/sequence.py index 576bbe8c4f6c4..223a7cf80232f 
100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -686,3 +686,13 @@ def __len__(self): def __eq__(self, other: object): return isinstance(other, self.__class__) and self.outputs == other.outputs + + def __repr__(self) -> str: + """Show the shape of a tensor instead of its values to reduce noise. + """ + sampled_token_probs_repr = ("None" if self.sampled_token_probs is None else self.sampled_token_probs.shape) + sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else self.sampled_token_ids.shape) + return (f"SamplerOutput(outputs={self.outputs}, " + f"sampled_token_probs={sampled_token_probs_repr}, " + f"sampled_token_ids={sampled_token_ids_repr}, " + f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 6be8c843cf7a1..701324c16dfe5 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -8,7 +8,7 @@ SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, sampler_output_to_torch, - split_batch_by_proposal_len) + split_batch_by_proposal_len, mock_device_tensors) from vllm.worker.worker import Worker SeqId = int @@ -143,6 +143,14 @@ def _contract_batch(self, original_bs: int, This maps the scores of speculative tokens back to their original sequences. """ + + mock_device_tensors( + sampler_output=target_sampler_output, + batch_size=len(non_spec_indices) + num_scoring_tokens, + vocab_size=self._vocab_size, + device=self._device, + ) + (target_token_ids, target_probs, non_spec_target_token_ids, non_spec_target_probs) = self._split_scoring_output( target_sampler_output, num_scoring_tokens) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 73b6e201c67a9..262bab1626495 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) -from vllm.spec_decode.util import sampler_output_to_torch +from vllm.spec_decode.util import (sampler_output_to_torch, mock_device_tensors) from vllm.worker.worker import Worker @@ -341,6 +341,14 @@ def _merge_outputs( sampler_output = maybe_sampler_output + for step_output in sampler_output: + mock_device_tensors( + sampler_output=step_output, + batch_size=len(proposal_lens), + vocab_size=self._vocab_size, + device=self._device, + ) + proposal_tokens, proposal_probs = sampler_output_to_torch( sampler_output) diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 406568a4bc08c..234ed9e44f4e4 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -82,6 +82,13 @@ def sampler_output_to_torch( return sampled_token_ids, sampled_token_probs +def mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, vocab_size: int, device: str) -> None: + assert sampler_output.sampled_token_probs is None + assert sampler_output.sampled_token_ids is None + + sampler_output.sampled_token_probs = torch.nn.functional.softmax(torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) + sampler_output.sampled_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size,), dtype=torch.long, device=device) + @contextmanager def nvtx_range(msg, *args, **kwargs): """ From dd8aeff307f7c035b7db4a5184d00172cad6c3e9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 15:00:34 
-0700 Subject: [PATCH 064/109] fix --- vllm/engine/llm_engine.py | 1 - vllm/sequence.py | 9 +++-- vllm/spec_decode/batch_expansion.py | 3 +- vllm/spec_decode/multi_step_worker.py | 5 ++- vllm/spec_decode/spec_decode_worker.py | 55 -------------------------- vllm/spec_decode/util.py | 14 +++++-- 6 files changed, 22 insertions(+), 65 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1ca447890d4ca..9d65ec1a2faa6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -739,7 +739,6 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): token_id=output_token_id, logprobs={output_token_id: Logprob(0.0)}, ) - print(f'Appended token id {output_token_id=}') #seq.append_token_ids(output_token_ids, # output_logprobs, diff --git a/vllm/sequence.py b/vllm/sequence.py index 223a7cf80232f..fa51483301a3d 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -690,9 +690,12 @@ def __eq__(self, other: object): def __repr__(self) -> str: """Show the shape of a tensor instead of its values to reduce noise. """ - sampled_token_probs_repr = ("None" if self.sampled_token_probs is None else self.sampled_token_probs.shape) - sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else self.sampled_token_ids.shape) - return (f"SamplerOutput(outputs={self.outputs}, " + sampled_token_probs_repr = ("None" if self.sampled_token_probs is None + else self.sampled_token_probs.shape) + sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else + self.sampled_token_ids.shape) + return ( + f"SamplerOutput(outputs={self.outputs}, " f"sampled_token_probs={sampled_token_probs_repr}, " f"sampled_token_ids={sampled_token_ids_repr}, " f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 701324c16dfe5..bba3c4733e4ff 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -8,7 +8,8 @@ SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, sampler_output_to_torch, - split_batch_by_proposal_len, mock_device_tensors) + split_batch_by_proposal_len, + mock_device_tensors) from vllm.worker.worker import Worker SeqId = int diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 262bab1626495..0ac189a7baccb 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,8 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) -from vllm.spec_decode.util import (sampler_output_to_torch, mock_device_tensors) +from vllm.spec_decode.util import (sampler_output_to_torch, + mock_device_tensors) from vllm.worker.worker import Worker @@ -343,7 +344,7 @@ def _merge_outputs( for step_output in sampler_output: mock_device_tensors( - sampler_output=step_output, + sampler_output=step_output, batch_size=len(proposal_lens), vocab_size=self._vocab_size, device=self._device, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 12a70d402e98f..3e33371edadf0 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -20,61 +20,6 @@ logger = init_logger(__name__) -def create_spec_decode_worker(): - - from vllm.worker.worker import Worker - from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker - from 
vllm.spec_decode.multi_step_worker import MultiStepWorker - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - - target_worker = Worker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - is_driver_worker=True, - ) - - from vllm.spec_decode.multi_step_worker import MultiStepWorker - draft_worker = MultiStepWorker( - model_config=self.speculative_config.draft_model_config, - parallel_config=self.speculative_config.draft_parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - is_driver_worker=True, - ) - - from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker - from vllm.model_executor.layers.rejection_sampler import RejectionSampler - spec_decode_worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - rejection_sampler=RejectionSampler(), - ) - - assert self.parallel_config.world_size == 1, ( - "GPUExecutor only supports single GPU.") - - self.driver_worker = spec_decode_worker - - self.driver_worker.init_device() - #self.driver_worker.load_model() - - class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 234ed9e44f4e4..7129f47d65f6a 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -82,12 +82,20 @@ def sampler_output_to_torch( return sampled_token_ids, sampled_token_probs -def mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, vocab_size: int, device: str) -> None: +def mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, + vocab_size: int, device: str) -> None: assert sampler_output.sampled_token_probs is None assert sampler_output.sampled_token_ids is None - sampler_output.sampled_token_probs = torch.nn.functional.softmax(torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) - sampler_output.sampled_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size,), dtype=torch.long, device=device) + sampler_output.sampled_token_probs = torch.nn.functional.softmax( + torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), + dim=-1) + sampler_output.sampled_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, ), + dtype=torch.long, + device=device) + @contextmanager def nvtx_range(msg, *args, **kwargs): From 46e48474ab355254f4d831b86f2b3303abde0d22 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 15:10:22 -0700 Subject: [PATCH 065/109] test --- tests/spec_decode/e2e/test_correctness.py | 8 +++++--- vllm/engine/llm_engine.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 36a66ea2ec389..a1df4dccbe3b7 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -9,8 +9,6 @@ [{ # Use a small model for a fast test. 
"model": "JackFram/llama-68m", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, # Skip real loading for fast test. "load_format": "dummy", @@ -23,7 +21,11 @@ }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [ { - "tensor_parallel_size": 1, + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + # No spec decode. }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9d65ec1a2faa6..a08a883539a98 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -627,7 +627,7 @@ def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: - if not isinstance(output, list): + if self.speculative_config is None: all_output = [output] else: all_output = output @@ -638,7 +638,7 @@ def _process_model_outputs( output_by_sequence_group: List[List[SequenceGroupOutputs]] = [ [] for _ in scheduled_seq_groups ] - for step in output: + for step in all_output: for i, sequence_group_output in enumerate(step): output_by_sequence_group[i].append(sequence_group_output) From 8454edc8bf13cb04936b7f552f7e6ec368a6693f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 15:41:02 -0700 Subject: [PATCH 066/109] test fixes --- tests/spec_decode/test_spec_decode_worker.py | 14 +++++++------- vllm/engine/llm_engine.py | 2 +- vllm/executor/ray_gpu_executor.py | 3 ++- vllm/worker/worker.py | 2 -- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 47aff8f575413..bd06d5b17d07b 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -37,7 +37,7 @@ def test_correctly_calls_draft_model(k: int, batch_size: int): execute_model_data, _, _ = create_batch(batch_size, k) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) call_args_list = draft_worker.get_spec_proposals.call_args_list assert len(call_args_list) == 1 @@ -102,7 +102,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): target_worker.execute_model.side_effect = ValueError(exception_secret) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) seen_contexts = [] @@ -195,7 +195,7 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): rejection_sampler.side_effect = ValueError(exception_secret) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) assert len(rejection_sampler.call_args_list) == 1 args, _ = rejection_sampler.call_args_list[0] @@ -283,7 +283,7 @@ def test_correctly_formats_output(k: int, batch_size: int): rejection_sampler.return_value = rejection_sampler_output output = worker.execute_model(**execute_model_data.to_dict(), - num_spec_tokens=k) + num_lookahead_slots=k) expected_output = create_sampler_output_list( rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)]) @@ -400,7 +400,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): 
mock_rejsample_metrics) output = worker.execute_model(**execute_model_data.to_dict(), - num_spec_tokens=k) + num_lookahead_slots=k) assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics call_args_list = ( @@ -435,7 +435,7 @@ def test_k_equals_zero(k: int, batch_size: int): batch_size, k, prev_output_token_len=0) out = worker.execute_model(**execute_model_data.to_dict(), - num_spec_tokens=k) + num_lookahead_slots=k) assert len(out) == 1, f"expected only one token output when {k=}" assert out[0].probs is None, "expect gpu tensor references to be None" @@ -474,7 +474,7 @@ def test_empty_input_batch(k: int, batch_size: int): batch_size, k, prev_output_token_len=0) out = worker.execute_model(**execute_model_data.to_dict(), - num_spec_tokens=k) + num_lookahead_slots=k) assert len(out) == 1, f"expected only one token output when {k=}" assert out[0].probs is None, "expect gpu tensor references to be None" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a08a883539a98..e47af8dfcf9e9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -915,7 +915,7 @@ def _check_stop(self, seq: Sequence, return # Check if the sequence has reached max_tokens. - if seq.get_output_len() >= sampling_params.max_tokens: + if seq.get_output_len() >= int(sampling_params.max_tokens): # TODO should cap block seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index a508d1e8fe600..226183855708d 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -238,7 +238,8 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int = 0) -> SamplerOutput: all_outputs = self._run_workers( "execute_model", driver_kwargs={ diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 941c062081290..cb30f658482bd 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -207,8 +207,6 @@ def execute_model( blocks_to_copy: Optional[Dict[int, List[int]]] = None, num_lookahead_slots: int = 0, ) -> Optional[SamplerOutput]: - assert (num_lookahead_slots == 0 - ), "worker does not support lookahead slots" if self.is_driver_worker: assert seq_group_metadata_list is not None From 819e65695455e9d63e4ed306f313b1d96f6b2c9a Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 15:41:22 -0700 Subject: [PATCH 067/109] lint --- tests/spec_decode/e2e/test_correctness.py | 20 +++++++++++--------- tests/spec_decode/test_spec_decode_worker.py | 9 ++++++--- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index a1df4dccbe3b7..d8b09ce5b77a0 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -19,15 +19,17 @@ # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - { - # No spec decode. - }, -]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + # No spec decode. 
+ }, + ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [1, 10]) @pytest.mark.parametrize("seed", [1]) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index bd06d5b17d07b..3725924ea89ce 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -37,7 +37,8 @@ def test_correctly_calls_draft_model(k: int, batch_size: int): execute_model_data, _, _ = create_batch(batch_size, k) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) + worker.execute_model(**execute_model_data.to_dict(), + num_lookahead_slots=k) call_args_list = draft_worker.get_spec_proposals.call_args_list assert len(call_args_list) == 1 @@ -102,7 +103,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int): target_worker.execute_model.side_effect = ValueError(exception_secret) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) + worker.execute_model(**execute_model_data.to_dict(), + num_lookahead_slots=k) seen_contexts = [] @@ -195,7 +197,8 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): rejection_sampler.side_effect = ValueError(exception_secret) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) + worker.execute_model(**execute_model_data.to_dict(), + num_lookahead_slots=k) assert len(rejection_sampler.call_args_list) == 1 args, _ = rejection_sampler.call_args_list[0] From d0fbe47bdb778b9ba32bda2b0d9a621d9ecd1134 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:01:35 -0700 Subject: [PATCH 068/109] clean --- vllm/model_executor/layers/sampler.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 135bc13e8d7c7..bed915faf3fbd 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -79,19 +79,7 @@ def forward( prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - # TODO gate by config - return _build_sampler_output( - sample_results, - sampling_metadata, - prompt_logprobs, - sample_logprobs, - #sampled_token_probs=probs, - ## TODO - #sampled_token_ids=torch.empty( - # (len(sampling_metadata.seq_groups), 1), - # device=probs.device, - # dtype=torch.long), - ) + return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs) def _get_bin_counts_and_mask( @@ -699,8 +687,4 @@ def _build_sampler_output( sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return SamplerOutput( - outputs=sampler_output, - sampled_token_probs=sampled_token_probs, - sampled_token_ids=sampled_token_ids, - ) + return SamplerOutput(outputs=sampler_output) From 5445af6ddf43cf9b1b82dc53260627e455d0ae81 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:45:19 -0700 Subject: [PATCH 069/109] refactor out beam search model processor --- vllm/engine/llm_engine.py | 537 ++++++++++--------- vllm/engine/output_processor/__init__.py | 0 vllm/engine/output_processor/beam_search.py | 321 +++++++++++ vllm/engine/output_processor/block_decode.py | 186 +++++++ vllm/engine/output_processor/interfaces.py | 36 ++ 5 files changed, 817 insertions(+), 
263 deletions(-) create mode 100644 vllm/engine/output_processor/__init__.py create mode 100644 vllm/engine/output_processor/beam_search.py create mode 100644 vllm/engine/output_processor/block_decode.py create mode 100644 vllm/engine/output_processor/interfaces.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e47af8dfcf9e9..1ac73bc874def 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -25,6 +25,7 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) from vllm.utils import Counter +from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -180,6 +181,14 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) + self.output_processor = SequenceGroupOutputProcessor.create_output_processor( + self.scheduler_config, + self.detokenizer, + self.scheduler, + self.seq_counter, + self.get_tokenizer_for_seq, + ) + def _initialize_kv_caches(self) -> None: """Initialize the KV cache in the worker(s). @@ -449,179 +458,179 @@ def _check_beam_search_early_stopping( eos_token_id=best_running_seq.eos_token_id)) return current_worst_score >= highest_attainable_score - def _process_sequence_group_outputs(self, seq_group: SequenceGroup, - outputs: SequenceGroupOutput) -> None: - - # Process prompt logprobs - prompt_logprobs = outputs.prompt_logprobs - if prompt_logprobs is not None and seq_group.sampling_params.detokenize: - self.detokenizer.decode_prompt_logprobs_inplace( - seq_group, prompt_logprobs) - seq_group.prompt_logprobs = prompt_logprobs - - # Process samples - samples = outputs.samples - parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - existing_finished_seqs = seq_group.get_finished_seqs() - parent_child_dict = { - parent_seq.seq_id: [] - for parent_seq in parent_seqs - } - for sample in samples: - parent_child_dict[sample.parent_seq_id].append(sample) - # List of (child, parent) - child_seqs: List[Tuple[Sequence, Sequence]] = [] - - # Process the child samples for each parent sequence - for parent in parent_seqs: - child_samples: List[SequenceOutput] = parent_child_dict[ - parent.seq_id] - if len(child_samples) == 0: - # This parent sequence has no children samples. Remove - # the parent sequence from the sequence group since it will - # not be used in the future iterations. - parent.status = SequenceStatus.FINISHED_ABORTED - seq_group.remove(parent.seq_id) - self.scheduler.free_seq(parent) - continue - # Fork the parent sequence if there are multiple child samples. - for child_sample in child_samples[:-1]: - new_child_seq_id = next(self.seq_counter) - child = parent.fork(new_child_seq_id) - child.append_token_id(child_sample.output_token, - child_sample.logprobs) - child_seqs.append((child, parent)) - # Continue the parent sequence for the last child sample. - # We reuse the parent sequence here to reduce redundant memory - # copies, especially when using non-beam search sampling methods. 
- last_child_sample = child_samples[-1] - parent.append_token_id(last_child_sample.output_token, - last_child_sample.logprobs) - child_seqs.append((parent, parent)) - - for seq, _ in child_seqs: - if seq_group.sampling_params.detokenize: - self.detokenizer.decode_sequence_inplace( - seq, seq_group.sampling_params) - self._check_stop(seq, seq_group.sampling_params) - - # Non-beam search case - if not seq_group.sampling_params.use_beam_search: - # For newly created child sequences, add them to the sequence group - # and fork them in block manager if they are not finished. - for seq, parent in child_seqs: - if seq is not parent: - seq_group.add(seq) - if not seq.is_finished(): - self.scheduler.fork_seq(parent, seq) - - # Free the finished and selected parent sequences' memory in block - # manager. Keep them in the sequence group as candidate output. - # NOTE: we need to fork the new sequences before freeing the - # old sequences. - for seq, parent in child_seqs: - if seq is parent and seq.is_finished(): - self.scheduler.free_seq(seq) - return - - # Beam search case - # Select the child sequences to keep in the sequence group. - selected_child_seqs = [] - unselected_child_seqs = [] - beam_width = seq_group.sampling_params.best_of - length_penalty = seq_group.sampling_params.length_penalty - - # Select the newly finished sequences with the highest scores - # to replace existing finished sequences. - # Tuple of (seq, parent, is_new) - existing_finished_seqs = [(seq, None, False) - for seq in existing_finished_seqs] - new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs - if seq.is_finished()] - all_finished_seqs = existing_finished_seqs + new_finished_seqs - # Sort the finished sequences by their scores. - all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - reverse=True) - for seq, parent, is_new in all_finished_seqs[:beam_width]: - if is_new: - # A newly generated child sequence finishes and has a high - # score, so we will add it into the sequence group. - selected_child_seqs.append((seq, parent)) - for seq, parent, is_new in all_finished_seqs[beam_width:]: - if is_new: - # A newly generated child sequence finishes but has a low - # score, so we will not add it into the sequence group. - # Additionally, if this sequence is a continuation of a - # parent sequence, we will need remove the parent sequence - # from the sequence group. - unselected_child_seqs.append((seq, parent)) - else: - # An existing finished sequence has a low score, so we will - # remove it from the sequence group. - seq_group.remove(seq.seq_id) - - # select the top beam_width sequences from the running - # sequences for the next iteration to continue the beam - # search. - running_child_seqs = [(seq, parent) for seq, parent in child_seqs - if not seq.is_finished()] - # Sort the running sequences by their scores. - running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - reverse=True) - - # Check if we can stop the beam search. - if len(running_child_seqs) == 0: - # No running sequences, stop the beam search. - stop_beam_search = True - elif len(all_finished_seqs) < beam_width: - # Not enough finished sequences, continue the beam search. 
- stop_beam_search = False - else: - # Check the early stopping criteria - best_running_seq = running_child_seqs[0][0] - current_worst_seq = all_finished_seqs[beam_width - 1][0] - stop_beam_search = self._check_beam_search_early_stopping( - seq_group.sampling_params.early_stopping, - seq_group.sampling_params, best_running_seq, current_worst_seq) - - if stop_beam_search: - # Stop the beam search and remove all the running sequences from - # the sequence group. - unselected_child_seqs.extend(running_child_seqs) - else: - # Continue the beam search and select the top beam_width sequences - # to continue the beam search. - selected_child_seqs.extend(running_child_seqs[:beam_width]) - # The remaining running sequences will not be used in the next - # iteration. Again, if these sequences are continuations of - # parent sequences, we will need to remove the parent sequences - # from the sequence group. - unselected_child_seqs.extend(running_child_seqs[beam_width:]) - - # For newly created child sequences, add them to the sequence group - # and fork them in block manager if they are not finished. - for seq, parent in selected_child_seqs: - if seq is not parent: - seq_group.add(seq) - if not seq.is_finished(): - self.scheduler.fork_seq(parent, seq) - - # Free the finished and selected parent sequences' memory in block - # manager. Keep them in the sequence group as candidate output. - for seq, parent in selected_child_seqs: - if seq is parent and seq.is_finished(): - self.scheduler.free_seq(seq) - - # Remove the unselected parent sequences from the sequence group and - # free their memory in block manager. - for seq, parent in unselected_child_seqs: - if seq is parent: - # Remove the parent sequence if it is not selected for next - # iteration - seq_group.remove(seq.seq_id) - self.scheduler.free_seq(seq) + #def _process_sequence_group_outputs(self, seq_group: SequenceGroup, + # outputs: SequenceGroupOutput) -> None: + + # # Process prompt logprobs + # prompt_logprobs = outputs.prompt_logprobs + # if prompt_logprobs is not None and seq_group.sampling_params.detokenize: + # self.detokenizer.decode_prompt_logprobs_inplace( + # seq_group, prompt_logprobs) + # seq_group.prompt_logprobs = prompt_logprobs + + # # Process samples + # samples = outputs.samples + # parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + # existing_finished_seqs = seq_group.get_finished_seqs() + # parent_child_dict = { + # parent_seq.seq_id: [] + # for parent_seq in parent_seqs + # } + # for sample in samples: + # parent_child_dict[sample.parent_seq_id].append(sample) + # # List of (child, parent) + # child_seqs: List[Tuple[Sequence, Sequence]] = [] + + # # Process the child samples for each parent sequence + # for parent in parent_seqs: + # child_samples: List[SequenceOutput] = parent_child_dict[ + # parent.seq_id] + # if len(child_samples) == 0: + # # This parent sequence has no children samples. Remove + # # the parent sequence from the sequence group since it will + # # not be used in the future iterations. + # parent.status = SequenceStatus.FINISHED_ABORTED + # seq_group.remove(parent.seq_id) + # self.scheduler.free_seq(parent) + # continue + # # Fork the parent sequence if there are multiple child samples. 
+ # for child_sample in child_samples[:-1]: + # new_child_seq_id = next(self.seq_counter) + # child = parent.fork(new_child_seq_id) + # child.append_token_id(child_sample.output_token, + # child_sample.logprobs) + # child_seqs.append((child, parent)) + # # Continue the parent sequence for the last child sample. + # # We reuse the parent sequence here to reduce redundant memory + # # copies, especially when using non-beam search sampling methods. + # last_child_sample = child_samples[-1] + # parent.append_token_id(last_child_sample.output_token, + # last_child_sample.logprobs) + # child_seqs.append((parent, parent)) + + # for seq, _ in child_seqs: + # if seq_group.sampling_params.detokenize: + # self.detokenizer.decode_sequence_inplace( + # seq, seq_group.sampling_params) + # self._check_stop(seq, seq_group.sampling_params) + + # # Non-beam search case + # if not seq_group.sampling_params.use_beam_search: + # # For newly created child sequences, add them to the sequence group + # # and fork them in block manager if they are not finished. + # for seq, parent in child_seqs: + # if seq is not parent: + # seq_group.add(seq) + # if not seq.is_finished(): + # self.scheduler.fork_seq(parent, seq) + + # # Free the finished and selected parent sequences' memory in block + # # manager. Keep them in the sequence group as candidate output. + # # NOTE: we need to fork the new sequences before freeing the + # # old sequences. + # for seq, parent in child_seqs: + # if seq is parent and seq.is_finished(): + # self.scheduler.free_seq(seq) + # return + + # # Beam search case + # # Select the child sequences to keep in the sequence group. + # selected_child_seqs = [] + # unselected_child_seqs = [] + # beam_width = seq_group.sampling_params.best_of + # length_penalty = seq_group.sampling_params.length_penalty + + # # Select the newly finished sequences with the highest scores + # # to replace existing finished sequences. + # # Tuple of (seq, parent, is_new) + # existing_finished_seqs = [(seq, None, False) + # for seq in existing_finished_seqs] + # new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs + # if seq.is_finished()] + # all_finished_seqs = existing_finished_seqs + new_finished_seqs + # # Sort the finished sequences by their scores. + # all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( + # length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), + # reverse=True) + # for seq, parent, is_new in all_finished_seqs[:beam_width]: + # if is_new: + # # A newly generated child sequence finishes and has a high + # # score, so we will add it into the sequence group. + # selected_child_seqs.append((seq, parent)) + # for seq, parent, is_new in all_finished_seqs[beam_width:]: + # if is_new: + # # A newly generated child sequence finishes but has a low + # # score, so we will not add it into the sequence group. + # # Additionally, if this sequence is a continuation of a + # # parent sequence, we will need remove the parent sequence + # # from the sequence group. + # unselected_child_seqs.append((seq, parent)) + # else: + # # An existing finished sequence has a low score, so we will + # # remove it from the sequence group. + # seq_group.remove(seq.seq_id) + + # # select the top beam_width sequences from the running + # # sequences for the next iteration to continue the beam + # # search. + # running_child_seqs = [(seq, parent) for seq, parent in child_seqs + # if not seq.is_finished()] + # # Sort the running sequences by their scores. 
+ # running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( + # length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), + # reverse=True) + + # # Check if we can stop the beam search. + # if len(running_child_seqs) == 0: + # # No running sequences, stop the beam search. + # stop_beam_search = True + # elif len(all_finished_seqs) < beam_width: + # # Not enough finished sequences, continue the beam search. + # stop_beam_search = False + # else: + # # Check the early stopping criteria + # best_running_seq = running_child_seqs[0][0] + # current_worst_seq = all_finished_seqs[beam_width - 1][0] + # stop_beam_search = self._check_beam_search_early_stopping( + # seq_group.sampling_params.early_stopping, + # seq_group.sampling_params, best_running_seq, current_worst_seq) + + # if stop_beam_search: + # # Stop the beam search and remove all the running sequences from + # # the sequence group. + # unselected_child_seqs.extend(running_child_seqs) + # else: + # # Continue the beam search and select the top beam_width sequences + # # to continue the beam search. + # selected_child_seqs.extend(running_child_seqs[:beam_width]) + # # The remaining running sequences will not be used in the next + # # iteration. Again, if these sequences are continuations of + # # parent sequences, we will need to remove the parent sequences + # # from the sequence group. + # unselected_child_seqs.extend(running_child_seqs[beam_width:]) + + # # For newly created child sequences, add them to the sequence group + # # and fork them in block manager if they are not finished. + # for seq, parent in selected_child_seqs: + # if seq is not parent: + # seq_group.add(seq) + # if not seq.is_finished(): + # self.scheduler.fork_seq(parent, seq) + + # # Free the finished and selected parent sequences' memory in block + # # manager. Keep them in the sequence group as candidate output. + # for seq, parent in selected_child_seqs: + # if seq is parent and seq.is_finished(): + # self.scheduler.free_seq(seq) + + # # Remove the unselected parent sequences from the sequence group and + # # free their memory in block manager. + # for seq, parent in unselected_child_seqs: + # if seq is parent: + # # Remove the parent sequence if it is not selected for next + # # iteration + # seq_group.remove(seq.seq_id) + # self.scheduler.free_seq(seq) def _process_model_outputs( self, output: SamplerOutput, @@ -651,14 +660,16 @@ def _process_model_outputs( seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) + + self.output_processor.process_outputs(seq_group, outputs) - assert len(outputs) > 0 - # TODO can spec decode go through second path? - if len(outputs) > 1: - self._process_sequence_group_outputs_multi_step( - seq_group, outputs) - else: - self._process_sequence_group_outputs(seq_group, outputs[0]) + #assert len(outputs) > 0 + ## TODO can spec decode go through second path? + #if len(outputs) > 1: + # self._process_sequence_group_outputs_multi_step( + # seq_group, outputs) + #else: + # self._process_sequence_group_outputs(seq_group, outputs[0]) # Free the finished sequence groups. 
self.scheduler.free_finished_seq_groups() @@ -679,89 +690,89 @@ def _process_model_outputs( self.stat_logger.log(self._get_stats(scheduler_outputs)) return request_outputs - def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): - seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - - assert seqs - #if not seqs: - # return [] - - assert len(seqs) == 1, ("Beam search not supported in speculative " - "decoding.") - seq = seqs[0] - - # Since there's only one sequence per sequence group, we can take the - # first sample. - samples = [outputs[step].samples[0] for step in range(len(outputs))] - - # -1 means the output token is not valid (eg. due to spec decode - # rejecting tokens). - valid_samples = [ - sample for sample in samples if sample.output_token != -1 - ] - - # Draft target worker pads all outputs with -1 to have same length. - output_token_ids = [sample.output_token for sample in valid_samples] - #successes = [sample.success for sample in samples] - - ## Truncate to max_tokens if necessary. - #remaining_tokens = seq_group.sampling_params.max_tokens - ( - # seq.get_output_len() + len(output_token_ids)) - #if remaining_tokens < 0: - # valid_samples = valid_samples[:remaining_tokens] - # output_token_ids = output_token_ids[:remaining_tokens] - - ## Truncate any tokens after EOS. This is required as spec decode - ## generates tokens in fixed blocks, which may go beyond the EOS token. - #if not seq_group.sampling_params.ignore_eos: - # eos_token_id = self.tokenizer.get_lora_tokenizer( - # seq.lora_request).eos_token_id - # # Avoiding .index calls as exception throwing in the happy path - # # is expensive. - # for i in range(len(output_token_ids)): - # if output_token_ids[i] == eos_token_id: - # output_token_ids = output_token_ids[:i + 1] - # valid_samples = valid_samples[:i + 1] - # break - - #output_logprobs = [sample.logprobs for sample in valid_samples] - - ## Use the last sample for the sequence as it will have - ## the speculation and num_unprocessed_tokens for all the - ## previous samples (they are cumulative when it comes - ## to those two attributes). - #speculation = valid_samples[-1].speculation - #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens - - for output_token_id in output_token_ids: - from vllm.sequence import Logprob - seq.append_token_id( - token_id=output_token_id, - logprobs={output_token_id: Logprob(0.0)}, - ) - - #seq.append_token_ids(output_token_ids, - # output_logprobs, - # ) - # #num_unprocessed_tokens=num_unprocessed_tokens) - ##seq.set_last_speculation(speculation) - - #if not all(successes): - # seq.set_status_to_failed() - - #if decode: - # self._decode_sequence(seq, - # seq_group.sampling_params, - # token_ids=seq.get_token_ids(), - # unseen_token_ids=output_token_ids, - # prefix_offset=seq.prefix_offset, - # read_offset=seq.read_offset) - #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, - # output_token_ids) - # TODO pass output token ids - self._check_stop(seq, seq_group.sampling_params) - if seq.is_finished(): - self.scheduler.free_seq(seq) + #def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): + # seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + + # assert seqs + # #if not seqs: + # # return [] + + # assert len(seqs) == 1, ("Beam search not supported in speculative " + # "decoding.") + # seq = seqs[0] + + # # Since there's only one sequence per sequence group, we can take the + # # first sample. 
+ # samples = [outputs[step].samples[0] for step in range(len(outputs))] + + # # -1 means the output token is not valid (eg. due to spec decode + # # rejecting tokens). + # valid_samples = [ + # sample for sample in samples if sample.output_token != -1 + # ] + + # # Draft target worker pads all outputs with -1 to have same length. + # output_token_ids = [sample.output_token for sample in valid_samples] + # #successes = [sample.success for sample in samples] + + # ## Truncate to max_tokens if necessary. + # #remaining_tokens = seq_group.sampling_params.max_tokens - ( + # # seq.get_output_len() + len(output_token_ids)) + # #if remaining_tokens < 0: + # # valid_samples = valid_samples[:remaining_tokens] + # # output_token_ids = output_token_ids[:remaining_tokens] + + # ## Truncate any tokens after EOS. This is required as spec decode + # ## generates tokens in fixed blocks, which may go beyond the EOS token. + # #if not seq_group.sampling_params.ignore_eos: + # # eos_token_id = self.tokenizer.get_lora_tokenizer( + # # seq.lora_request).eos_token_id + # # # Avoiding .index calls as exception throwing in the happy path + # # # is expensive. + # # for i in range(len(output_token_ids)): + # # if output_token_ids[i] == eos_token_id: + # # output_token_ids = output_token_ids[:i + 1] + # # valid_samples = valid_samples[:i + 1] + # # break + + # #output_logprobs = [sample.logprobs for sample in valid_samples] + + # ## Use the last sample for the sequence as it will have + # ## the speculation and num_unprocessed_tokens for all the + # ## previous samples (they are cumulative when it comes + # ## to those two attributes). + # #speculation = valid_samples[-1].speculation + # #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens + + # for output_token_id in output_token_ids: + # from vllm.sequence import Logprob + # seq.append_token_id( + # token_id=output_token_id, + # logprobs={output_token_id: Logprob(0.0)}, + # ) + + # #seq.append_token_ids(output_token_ids, + # # output_logprobs, + # # ) + # # #num_unprocessed_tokens=num_unprocessed_tokens) + # ##seq.set_last_speculation(speculation) + + # #if not all(successes): + # # seq.set_status_to_failed() + + # #if decode: + # # self._decode_sequence(seq, + # # seq_group.sampling_params, + # # token_ids=seq.get_token_ids(), + # # unseen_token_ids=output_token_ids, + # # prefix_offset=seq.prefix_offset, + # # read_offset=seq.read_offset) + # #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, + # # output_token_ids) + # # TODO pass output token ids + # self._check_stop(seq, seq_group.sampling_params) + # if seq.is_finished(): + # self.scheduler.free_seq(seq) def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. 
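The files added below introduce a SequenceGroupOutputProcessor abstraction so that the per-step output handling commented out above can move out of LLMEngine. A minimal sketch of the intended wiring, using only the names visible in this series (create_output_processor, process_outputs); the parameters passed in and the helper function names are illustrative assumptions, not the final engine signature:

    from vllm.engine.output_processor.interfaces import (
        SequenceGroupOutputProcessor)

    def build_output_processor(scheduler_config, detokenizer, scheduler,
                               seq_counter, get_tokenizer_for_seq):
        # Factory picks the beam-search processor when
        # scheduler_config.num_lookahead_slots == 0, otherwise the
        # block-decode processor used by speculative decoding.
        return SequenceGroupOutputProcessor.create_output_processor(
            scheduler_config,
            detokenizer,
            scheduler,
            seq_counter,
            get_tokenizer_for_seq,
        )

    def handle_step_outputs(output_processor, seq_group, outputs):
        # The engine no longer branches on len(outputs); it hands the full
        # list of per-step SequenceGroupOutputs to the processor.
        output_processor.process_outputs(seq_group, outputs)
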
diff --git a/vllm/engine/output_processor/__init__.py b/vllm/engine/output_processor/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py new file mode 100644 index 0000000000000..5f823b5c5c729 --- /dev/null +++ b/vllm/engine/output_processor/beam_search.py @@ -0,0 +1,321 @@ +import time +from typing import Iterable, List, Optional, Tuple, Type, Union + +from transformers import PreTrainedTokenizer + +import vllm +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, + VisionLanguageConfig) +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.ray_utils import initialize_ray_cluster +from vllm.executor.executor_base import ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, + SequenceGroup, SequenceGroupOutput, SequenceOutput, + SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, + get_tokenizer_group) +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) +from vllm.utils import Counter +from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor + +logger = init_logger(__name__) + + +class BeamSearchOutputProcessor(SequenceGroupOutputProcessor): + + def __init__( + self, + scheduler_config: SchedulerConfig, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ): + self.scheduler_config = scheduler_config + self.detokenizer = detokenizer + self.scheduler = scheduler + self.seq_counter = seq_counter + self.get_tokenizer_for_seq = get_tokenizer_for_seq + + def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + assert (len(outputs) == 1), f"{type(self)} does not support multiple outputs per step" + return self._process_sequence_group_outputs(sequence_group, outputs[0]) + + def _check_beam_search_early_stopping( + self, + early_stopping: Union[bool, str], + sampling_params: SamplingParams, + best_running_seq: Sequence, + current_worst_seq: Sequence, + ) -> bool: + assert sampling_params.use_beam_search + length_penalty = sampling_params.length_penalty + if early_stopping is True: + return True + + current_worst_score = current_worst_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=current_worst_seq.eos_token_id) + if early_stopping is False: + highest_attainable_score = best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id) + else: + assert early_stopping == "never" + if length_penalty > 0.0: + # If length_penalty > 0.0, beam search will prefer longer + # sequences. The highest attainable score calculation is + # based on the longest possible sequence length in this case. 
+ max_possible_length = max( + best_running_seq.get_prompt_len() + + sampling_params.max_tokens, + self.scheduler_config.max_model_len) + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id, + seq_len=max_possible_length)) + else: + # Otherwise, beam search will prefer shorter sequences. The + # highest attainable score calculation is based on the current + # sequence length. + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id)) + return current_worst_score >= highest_attainable_score + + def _process_sequence_group_outputs(self, seq_group: SequenceGroup, + outputs: SequenceGroupOutput) -> None: + + # Process prompt logprobs + prompt_logprobs = outputs.prompt_logprobs + if prompt_logprobs is not None and seq_group.sampling_params.detokenize: + self.detokenizer.decode_prompt_logprobs_inplace( + seq_group, prompt_logprobs) + seq_group.prompt_logprobs = prompt_logprobs + + # Process samples + samples = outputs.samples + parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + existing_finished_seqs = seq_group.get_finished_seqs() + parent_child_dict = { + parent_seq.seq_id: [] + for parent_seq in parent_seqs + } + for sample in samples: + parent_child_dict[sample.parent_seq_id].append(sample) + # List of (child, parent) + child_seqs: List[Tuple[Sequence, Sequence]] = [] + + # Process the child samples for each parent sequence + for parent in parent_seqs: + child_samples: List[SequenceOutput] = parent_child_dict[ + parent.seq_id] + if len(child_samples) == 0: + # This parent sequence has no children samples. Remove + # the parent sequence from the sequence group since it will + # not be used in the future iterations. + parent.status = SequenceStatus.FINISHED_ABORTED + seq_group.remove(parent.seq_id) + self.scheduler.free_seq(parent) + continue + # Fork the parent sequence if there are multiple child samples. + for child_sample in child_samples[:-1]: + new_child_seq_id = next(self.seq_counter) + child = parent.fork(new_child_seq_id) + child.append_token_id(child_sample.output_token, + child_sample.logprobs) + child_seqs.append((child, parent)) + # Continue the parent sequence for the last child sample. + # We reuse the parent sequence here to reduce redundant memory + # copies, especially when using non-beam search sampling methods. + last_child_sample = child_samples[-1] + parent.append_token_id(last_child_sample.output_token, + last_child_sample.logprobs) + child_seqs.append((parent, parent)) + + for seq, _ in child_seqs: + if seq_group.sampling_params.detokenize: + self.detokenizer.decode_sequence_inplace( + seq, seq_group.sampling_params) + self._check_stop(seq, seq_group.sampling_params) + + # Non-beam search case + if not seq_group.sampling_params.use_beam_search: + # For newly created child sequences, add them to the sequence group + # and fork them in block manager if they are not finished. + for seq, parent in child_seqs: + if seq is not parent: + seq_group.add(seq) + if not seq.is_finished(): + self.scheduler.fork_seq(parent, seq) + + # Free the finished and selected parent sequences' memory in block + # manager. Keep them in the sequence group as candidate output. + # NOTE: we need to fork the new sequences before freeing the + # old sequences. 
+ for seq, parent in child_seqs: + if seq is parent and seq.is_finished(): + self.scheduler.free_seq(seq) + return + + # Beam search case + # Select the child sequences to keep in the sequence group. + selected_child_seqs = [] + unselected_child_seqs = [] + beam_width = seq_group.sampling_params.best_of + length_penalty = seq_group.sampling_params.length_penalty + + # Select the newly finished sequences with the highest scores + # to replace existing finished sequences. + # Tuple of (seq, parent, is_new) + existing_finished_seqs = [(seq, None, False) + for seq in existing_finished_seqs] + new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs + if seq.is_finished()] + all_finished_seqs = existing_finished_seqs + new_finished_seqs + # Sort the finished sequences by their scores. + all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), + reverse=True) + for seq, parent, is_new in all_finished_seqs[:beam_width]: + if is_new: + # A newly generated child sequence finishes and has a high + # score, so we will add it into the sequence group. + selected_child_seqs.append((seq, parent)) + for seq, parent, is_new in all_finished_seqs[beam_width:]: + if is_new: + # A newly generated child sequence finishes but has a low + # score, so we will not add it into the sequence group. + # Additionally, if this sequence is a continuation of a + # parent sequence, we will need remove the parent sequence + # from the sequence group. + unselected_child_seqs.append((seq, parent)) + else: + # An existing finished sequence has a low score, so we will + # remove it from the sequence group. + seq_group.remove(seq.seq_id) + + # select the top beam_width sequences from the running + # sequences for the next iteration to continue the beam + # search. + running_child_seqs = [(seq, parent) for seq, parent in child_seqs + if not seq.is_finished()] + # Sort the running sequences by their scores. + running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), + reverse=True) + + # Check if we can stop the beam search. + if len(running_child_seqs) == 0: + # No running sequences, stop the beam search. + stop_beam_search = True + elif len(all_finished_seqs) < beam_width: + # Not enough finished sequences, continue the beam search. + stop_beam_search = False + else: + # Check the early stopping criteria + best_running_seq = running_child_seqs[0][0] + current_worst_seq = all_finished_seqs[beam_width - 1][0] + stop_beam_search = self._check_beam_search_early_stopping( + seq_group.sampling_params.early_stopping, + seq_group.sampling_params, best_running_seq, current_worst_seq) + + if stop_beam_search: + # Stop the beam search and remove all the running sequences from + # the sequence group. + unselected_child_seqs.extend(running_child_seqs) + else: + # Continue the beam search and select the top beam_width sequences + # to continue the beam search. + selected_child_seqs.extend(running_child_seqs[:beam_width]) + # The remaining running sequences will not be used in the next + # iteration. Again, if these sequences are continuations of + # parent sequences, we will need to remove the parent sequences + # from the sequence group. + unselected_child_seqs.extend(running_child_seqs[beam_width:]) + + # For newly created child sequences, add them to the sequence group + # and fork them in block manager if they are not finished. 
+ for seq, parent in selected_child_seqs: + if seq is not parent: + seq_group.add(seq) + if not seq.is_finished(): + self.scheduler.fork_seq(parent, seq) + + # Free the finished and selected parent sequences' memory in block + # manager. Keep them in the sequence group as candidate output. + for seq, parent in selected_child_seqs: + if seq is parent and seq.is_finished(): + self.scheduler.free_seq(seq) + + # Remove the unselected parent sequences from the sequence group and + # free their memory in block manager. + for seq, parent in unselected_child_seqs: + if seq is parent: + # Remove the parent sequence if it is not selected for next + # iteration + seq_group.remove(seq.seq_id) + self.scheduler.free_seq(seq) + + def _check_stop(self, seq: Sequence, + sampling_params: SamplingParams) -> None: + """Stop the finished sequences.""" + # Check if the sequence has reached max_model_len. + if seq.get_len() > self.scheduler_config.max_model_len: + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the sequence has reached max_tokens. + if seq.get_output_len() >= int(sampling_params.max_tokens): + # TODO should cap block + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the minimum number of tokens has been generated yet; + # skip the stop string/token checks if not + if seq.get_output_len() < sampling_params.min_tokens: + return + + if sampling_params.detokenize: + for stop_str in sampling_params.stop: + if seq.output_text.endswith(stop_str): + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = stop_str + return + last_token_id = seq.get_last_token_id() + if last_token_id in sampling_params.stop_token_ids: + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + last_token_id) + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = last_token_id + return + + # Check if the sequence has generated the EOS token. + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): + seq.status = SequenceStatus.FINISHED_STOPPED + return + + def _finalize_sequence(self, seq: Sequence, + sampling_params: SamplingParams, + stop_string: str) -> None: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): + # Truncate the output text so that the stop string is + # not included in the output. 
+ seq.output_text = seq.output_text[:-len(stop_string)] diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py new file mode 100644 index 0000000000000..f11520d3a7e90 --- /dev/null +++ b/vllm/engine/output_processor/block_decode.py @@ -0,0 +1,186 @@ +import time +from typing import Iterable, List, Optional, Tuple, Type, Union + +from transformers import PreTrainedTokenizer + +import vllm +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, + VisionLanguageConfig) +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.ray_utils import initialize_ray_cluster +from vllm.executor.executor_base import ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, + SequenceGroup, SequenceGroupOutput, SequenceOutput, + SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, + get_tokenizer_group) +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) +from vllm.utils import Counter +from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor + +logger = init_logger(__name__) + + +class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): + + def __init__( + self, + scheduler_config: SchedulerConfig, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ): + self.scheduler_config = scheduler_config + self.detokenizer = detokenizer + self.scheduler = scheduler + self.seq_counter = seq_counter + self.get_tokenizer_for_seq = get_tokenizer_for_seq + + def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + return self._process_sequence_group_outputs_multi_step(sequence_group, outputs) + + def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): + seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + + assert seqs + #if not seqs: + # return [] + + assert len(seqs) == 1, ("Beam search not supported in speculative " + "decoding.") + seq = seqs[0] + + # Since there's only one sequence per sequence group, we can take the + # first sample. + samples = [outputs[step].samples[0] for step in range(len(outputs))] + + # -1 means the output token is not valid (eg. due to spec decode + # rejecting tokens). + valid_samples = [ + sample for sample in samples if sample.output_token != -1 + ] + + # Draft target worker pads all outputs with -1 to have same length. + output_token_ids = [sample.output_token for sample in valid_samples] + #successes = [sample.success for sample in samples] + + ## Truncate to max_tokens if necessary. + #remaining_tokens = seq_group.sampling_params.max_tokens - ( + # seq.get_output_len() + len(output_token_ids)) + #if remaining_tokens < 0: + # valid_samples = valid_samples[:remaining_tokens] + # output_token_ids = output_token_ids[:remaining_tokens] + + ## Truncate any tokens after EOS. This is required as spec decode + ## generates tokens in fixed blocks, which may go beyond the EOS token. 
+ #if not seq_group.sampling_params.ignore_eos: + # eos_token_id = self.tokenizer.get_lora_tokenizer( + # seq.lora_request).eos_token_id + # # Avoiding .index calls as exception throwing in the happy path + # # is expensive. + # for i in range(len(output_token_ids)): + # if output_token_ids[i] == eos_token_id: + # output_token_ids = output_token_ids[:i + 1] + # valid_samples = valid_samples[:i + 1] + # break + + #output_logprobs = [sample.logprobs for sample in valid_samples] + + ## Use the last sample for the sequence as it will have + ## the speculation and num_unprocessed_tokens for all the + ## previous samples (they are cumulative when it comes + ## to those two attributes). + #speculation = valid_samples[-1].speculation + #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens + + for output_token_id in output_token_ids: + from vllm.sequence import Logprob + seq.append_token_id( + token_id=output_token_id, + logprobs={output_token_id: Logprob(0.0)}, + ) + + #seq.append_token_ids(output_token_ids, + # output_logprobs, + # ) + # #num_unprocessed_tokens=num_unprocessed_tokens) + ##seq.set_last_speculation(speculation) + + #if not all(successes): + # seq.set_status_to_failed() + + #if decode: + # self._decode_sequence(seq, + # seq_group.sampling_params, + # token_ids=seq.get_token_ids(), + # unseen_token_ids=output_token_ids, + # prefix_offset=seq.prefix_offset, + # read_offset=seq.read_offset) + #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, + # output_token_ids) + # TODO pass output token ids + self._check_stop(seq, seq_group.sampling_params) + if seq.is_finished(): + self.scheduler.free_seq(seq) + + def _check_stop(self, seq: Sequence, + sampling_params: SamplingParams) -> None: + """Stop the finished sequences.""" + # Check if the sequence has reached max_model_len. + if seq.get_len() > self.scheduler_config.max_model_len: + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the sequence has reached max_tokens. + if seq.get_output_len() >= int(sampling_params.max_tokens): + # TODO should cap block + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the minimum number of tokens has been generated yet; + # skip the stop string/token checks if not + if seq.get_output_len() < sampling_params.min_tokens: + return + + if sampling_params.detokenize: + for stop_str in sampling_params.stop: + if seq.output_text.endswith(stop_str): + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = stop_str + return + last_token_id = seq.get_last_token_id() + if last_token_id in sampling_params.stop_token_ids: + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + last_token_id) + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = last_token_id + return + + # Check if the sequence has generated the EOS token. + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): + seq.status = SequenceStatus.FINISHED_STOPPED + return + + def _finalize_sequence(self, seq: Sequence, + sampling_params: SamplingParams, + stop_string: str) -> None: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): + # Truncate the output text so that the stop string is + # not included in the output. 
+ seq.output_text = seq.output_text[:-len(stop_string)] diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py new file mode 100644 index 0000000000000..4d1da960dc41d --- /dev/null +++ b/vllm/engine/output_processor/interfaces.py @@ -0,0 +1,36 @@ +from abc import ABC, abstractmethod +from vllm.config import SchedulerConfig +from vllm.sequence import SequenceGroup, SequenceGroupOutput + +class SequenceGroupOutputProcessor(ABC): + + @staticmethod + def create_output_processor( + scheduler_config: SchedulerConfig, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ): + if scheduler_config.num_lookahead_slots == 0: + from vllm.engine.output_processor.beam_search import BeamSearchOutputProcessor + return BeamSearchOutputProcessor( + scheduler_config, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ) + else: + from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor + return BlockDecodeOutputProcessor( + scheduler_config, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ) + + @abstractmethod + def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + pass From 632b439541021309fbc0f83b78210532e1a94606 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:46:14 -0700 Subject: [PATCH 070/109] fix --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1ac73bc874def..60b0f46b2318d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -926,7 +926,7 @@ def _check_stop(self, seq: Sequence, return # Check if the sequence has reached max_tokens. - if seq.get_output_len() >= int(sampling_params.max_tokens): + if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): # TODO should cap block seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return From 26e7368e95f824fdce6cac30f476529d270ac6ed Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:53:01 -0700 Subject: [PATCH 071/109] dedup stop check --- vllm/engine/llm_engine.py | 104 +++++++------- vllm/engine/output_processor/beam_search.py | 140 ++++++------------- vllm/engine/output_processor/block_decode.py | 57 +------- vllm/engine/output_processor/interfaces.py | 3 + vllm/engine/output_processor/stop_checker.py | 89 ++++++++++++ 5 files changed, 194 insertions(+), 199 deletions(-) create mode 100644 vllm/engine/output_processor/stop_checker.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 60b0f46b2318d..570b5eff581d3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -26,6 +26,7 @@ usage_message) from vllm.utils import Counter from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor +from vllm.engine.output_processor.stop_checker import StopChecker logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -187,6 +188,7 @@ def __init__( self.scheduler, self.seq_counter, self.get_tokenizer_for_seq, + stop_checker=StopChecker(scheduler, self.get_tokenizer_for_seq), ) def _initialize_kv_caches(self) -> None: @@ -917,57 +919,57 @@ def _get_stats(self, time_e2e_requests=time_e2e_requests, ) - def _check_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - """Stop the finished sequences.""" - # Check if the sequence has reached max_model_len. 
- if seq.get_len() > self.scheduler_config.max_model_len: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): - # TODO should cap block - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - if sampling_params.detokenize: - for stop_str in sampling_params.stop: - if seq.output_text.endswith(stop_str): - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - last_token_id = seq.get_last_token_id() - if last_token_id in sampling_params.stop_token_ids: - stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - seq.status = SequenceStatus.FINISHED_STOPPED - return - - def _finalize_sequence(self, seq: Sequence, - sampling_params: SamplingParams, - stop_string: str) -> None: - if sampling_params.include_stop_str_in_output: - return - - if stop_string and seq.output_text.endswith(stop_string): - # Truncate the output text so that the stop string is - # not included in the output. - seq.output_text = seq.output_text[:-len(stop_string)] + #def _check_stop(self, seq: Sequence, + # sampling_params: SamplingParams) -> None: + # """Stop the finished sequences.""" + # # Check if the sequence has reached max_model_len. + # if seq.get_len() > self.scheduler_config.max_model_len: + # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + # return + + # # Check if the sequence has reached max_tokens. + # if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): + # # TODO should cap block + # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + # return + + # # Check if the minimum number of tokens has been generated yet; + # # skip the stop string/token checks if not + # if seq.get_output_len() < sampling_params.min_tokens: + # return + + # if sampling_params.detokenize: + # for stop_str in sampling_params.stop: + # if seq.output_text.endswith(stop_str): + # self._finalize_sequence(seq, sampling_params, stop_str) + # seq.status = SequenceStatus.FINISHED_STOPPED + # seq.stop_reason = stop_str + # return + # last_token_id = seq.get_last_token_id() + # if last_token_id in sampling_params.stop_token_ids: + # stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + # last_token_id) + # self._finalize_sequence(seq, sampling_params, stop_str) + # seq.status = SequenceStatus.FINISHED_STOPPED + # seq.stop_reason = last_token_id + # return + + # # Check if the sequence has generated the EOS token. 
+ # if ((not sampling_params.ignore_eos) + # and seq.get_last_token_id() == seq.eos_token_id): + # seq.status = SequenceStatus.FINISHED_STOPPED + # return + + #def _finalize_sequence(self, seq: Sequence, + # sampling_params: SamplingParams, + # stop_string: str) -> None: + # if sampling_params.include_stop_str_in_output: + # return + + # if stop_string and seq.output_text.endswith(stop_string): + # # Truncate the output text so that the stop string is + # # not included in the output. + # seq.output_text = seq.output_text[:-len(stop_string)] def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 5f823b5c5c729..c9ded11711515 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -39,61 +39,19 @@ def __init__( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ): self.scheduler_config = scheduler_config self.detokenizer = detokenizer self.scheduler = scheduler self.seq_counter = seq_counter self.get_tokenizer_for_seq = get_tokenizer_for_seq + self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: assert (len(outputs) == 1), f"{type(self)} does not support multiple outputs per step" return self._process_sequence_group_outputs(sequence_group, outputs[0]) - def _check_beam_search_early_stopping( - self, - early_stopping: Union[bool, str], - sampling_params: SamplingParams, - best_running_seq: Sequence, - current_worst_seq: Sequence, - ) -> bool: - assert sampling_params.use_beam_search - length_penalty = sampling_params.length_penalty - if early_stopping is True: - return True - - current_worst_score = current_worst_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=current_worst_seq.eos_token_id) - if early_stopping is False: - highest_attainable_score = best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id) - else: - assert early_stopping == "never" - if length_penalty > 0.0: - # If length_penalty > 0.0, beam search will prefer longer - # sequences. The highest attainable score calculation is - # based on the longest possible sequence length in this case. - max_possible_length = max( - best_running_seq.get_prompt_len() + - sampling_params.max_tokens, - self.scheduler_config.max_model_len) - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id, - seq_len=max_possible_length)) - else: - # Otherwise, beam search will prefer shorter sequences. The - # highest attainable score calculation is based on the current - # sequence length. 
- highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id)) - return current_worst_score >= highest_attainable_score - def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: @@ -148,7 +106,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if seq_group.sampling_params.detokenize: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) - self._check_stop(seq, seq_group.sampling_params) + self.stop_checker.check_stop(seq, seq_group.sampling_params) # Non-beam search case if not seq_group.sampling_params.use_beam_search: @@ -268,54 +226,46 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, seq_group.remove(seq.seq_id) self.scheduler.free_seq(seq) - def _check_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - """Stop the finished sequences.""" - # Check if the sequence has reached max_model_len. - if seq.get_len() > self.scheduler_config.max_model_len: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if seq.get_output_len() >= int(sampling_params.max_tokens): - # TODO should cap block - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - if sampling_params.detokenize: - for stop_str in sampling_params.stop: - if seq.output_text.endswith(stop_str): - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - last_token_id = seq.get_last_token_id() - if last_token_id in sampling_params.stop_token_ids: - stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - seq.status = SequenceStatus.FINISHED_STOPPED - return - - def _finalize_sequence(self, seq: Sequence, - sampling_params: SamplingParams, - stop_string: str) -> None: - if sampling_params.include_stop_str_in_output: - return + def _check_beam_search_early_stopping( + self, + early_stopping: Union[bool, str], + sampling_params: SamplingParams, + best_running_seq: Sequence, + current_worst_seq: Sequence, + ) -> bool: + assert sampling_params.use_beam_search + length_penalty = sampling_params.length_penalty + if early_stopping is True: + return True - if stop_string and seq.output_text.endswith(stop_string): - # Truncate the output text so that the stop string is - # not included in the output. - seq.output_text = seq.output_text[:-len(stop_string)] + current_worst_score = current_worst_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=current_worst_seq.eos_token_id) + if early_stopping is False: + highest_attainable_score = best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id) + else: + assert early_stopping == "never" + if length_penalty > 0.0: + # If length_penalty > 0.0, beam search will prefer longer + # sequences. 
The highest attainable score calculation is + # based on the longest possible sequence length in this case. + max_possible_length = max( + best_running_seq.get_prompt_len() + + sampling_params.max_tokens, + self.scheduler_config.max_model_len) + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id, + seq_len=max_possible_length)) + else: + # Otherwise, beam search will prefer shorter sequences. The + # highest attainable score calculation is based on the current + # sequence length. + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id)) + return current_worst_score >= highest_attainable_score diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index f11520d3a7e90..90ad03df32dda 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -39,12 +39,14 @@ def __init__( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ): self.scheduler_config = scheduler_config self.detokenizer = detokenizer self.scheduler = scheduler self.seq_counter = seq_counter self.get_tokenizer_for_seq = get_tokenizer_for_seq + self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: return self._process_sequence_group_outputs_multi_step(sequence_group, outputs) @@ -129,58 +131,7 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, # output_token_ids) # TODO pass output token ids - self._check_stop(seq, seq_group.sampling_params) + self.stop_checker.check_stop(seq, seq_group.sampling_params) + if seq.is_finished(): self.scheduler.free_seq(seq) - - def _check_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - """Stop the finished sequences.""" - # Check if the sequence has reached max_model_len. - if seq.get_len() > self.scheduler_config.max_model_len: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if seq.get_output_len() >= int(sampling_params.max_tokens): - # TODO should cap block - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - if sampling_params.detokenize: - for stop_str in sampling_params.stop: - if seq.output_text.endswith(stop_str): - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - last_token_id = seq.get_last_token_id() - if last_token_id in sampling_params.stop_token_ids: - stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if the sequence has generated the EOS token. 
- if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - seq.status = SequenceStatus.FINISHED_STOPPED - return - - def _finalize_sequence(self, seq: Sequence, - sampling_params: SamplingParams, - stop_string: str) -> None: - if sampling_params.include_stop_str_in_output: - return - - if stop_string and seq.output_text.endswith(stop_string): - # Truncate the output text so that the stop string is - # not included in the output. - seq.output_text = seq.output_text[:-len(stop_string)] diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 4d1da960dc41d..d2368fc811a00 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -11,6 +11,7 @@ def create_output_processor( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ): if scheduler_config.num_lookahead_slots == 0: from vllm.engine.output_processor.beam_search import BeamSearchOutputProcessor @@ -20,6 +21,7 @@ def create_output_processor( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ) else: from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor @@ -29,6 +31,7 @@ def create_output_processor( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ) @abstractmethod diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py new file mode 100644 index 0000000000000..feeef1c0f24ab --- /dev/null +++ b/vllm/engine/output_processor/stop_checker.py @@ -0,0 +1,89 @@ +import time +from typing import Iterable, List, Optional, Tuple, Type, Union + +from transformers import PreTrainedTokenizer + +import vllm +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, + VisionLanguageConfig) +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.ray_utils import initialize_ray_cluster +from vllm.executor.executor_base import ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, + SequenceGroup, SequenceGroupOutput, SequenceOutput, + SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, + get_tokenizer_group) +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) +from vllm.utils import Counter +from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor + +logger = init_logger(__name__) +_LOCAL_LOGGING_INTERVAL_SEC = 5 + +class StopChecker: + + def __init__(self, scheduler, get_tokenizer_for_seq): + self.scheduler = scheduler + self.get_tokenizer_for_seq = get_tokenizer_for_seq + + def check_stop(self, seq: Sequence, + sampling_params: SamplingParams) -> None: + """Stop the finished sequences.""" + # Check if the sequence has reached max_model_len. + if seq.get_len() > self.scheduler_config.max_model_len: + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the sequence has reached max_tokens. 
+ if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): + # TODO should cap block + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the minimum number of tokens has been generated yet; + # skip the stop string/token checks if not + if seq.get_output_len() < sampling_params.min_tokens: + return + + if sampling_params.detokenize: + for stop_str in sampling_params.stop: + if seq.output_text.endswith(stop_str): + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = stop_str + return + last_token_id = seq.get_last_token_id() + if last_token_id in sampling_params.stop_token_ids: + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + last_token_id) + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = last_token_id + return + + # Check if the sequence has generated the EOS token. + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): + seq.status = SequenceStatus.FINISHED_STOPPED + return + + def _finalize_sequence(self, seq: Sequence, + sampling_params: SamplingParams, + stop_string: str) -> None: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): + # Truncate the output text so that the stop string is + # not included in the output. + seq.output_text = seq.output_text[:-len(stop_string)] From 06e7c01d3867439289e8f5958cf1bc00be0c305a Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:55:20 -0700 Subject: [PATCH 072/109] wip --- vllm/engine/llm_engine.py | 6 +++++- vllm/engine/output_processor/stop_checker.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 570b5eff581d3..036709a414c2e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -188,7 +188,11 @@ def __init__( self.scheduler, self.seq_counter, self.get_tokenizer_for_seq, - stop_checker=StopChecker(scheduler, self.get_tokenizer_for_seq), + stop_checker=StopChecker( + self.scheduler, + self.scheduler_config, + self.get_tokenizer_for_seq, + ), ) def _initialize_kv_caches(self) -> None: diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index feeef1c0f24ab..cc6655b7aaa71 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -32,8 +32,9 @@ class StopChecker: - def __init__(self, scheduler, get_tokenizer_for_seq): + def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.scheduler = scheduler + self.scheduler_config = scheduler_config self.get_tokenizer_for_seq = get_tokenizer_for_seq def check_stop(self, seq: Sequence, From 184a52c166ec6eeb75dfedbb544c65188322ece7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:56:18 -0700 Subject: [PATCH 073/109] del --- vllm/engine/llm_engine.py | 352 -------------------------------------- 1 file changed, 352 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 036709a414c2e..2be4a260f1647 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -420,224 +420,6 @@ def has_unfinished_requests(self) -> bool: """Returns True if there are unfinished requests.""" return self.scheduler.has_unfinished_seqs() - def _check_beam_search_early_stopping( - self, - 
early_stopping: Union[bool, str], - sampling_params: SamplingParams, - best_running_seq: Sequence, - current_worst_seq: Sequence, - ) -> bool: - assert sampling_params.use_beam_search - length_penalty = sampling_params.length_penalty - if early_stopping is True: - return True - - current_worst_score = current_worst_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=current_worst_seq.eos_token_id) - if early_stopping is False: - highest_attainable_score = best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id) - else: - assert early_stopping == "never" - if length_penalty > 0.0: - # If length_penalty > 0.0, beam search will prefer longer - # sequences. The highest attainable score calculation is - # based on the longest possible sequence length in this case. - max_possible_length = max( - best_running_seq.get_prompt_len() + - sampling_params.max_tokens, - self.scheduler_config.max_model_len) - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id, - seq_len=max_possible_length)) - else: - # Otherwise, beam search will prefer shorter sequences. The - # highest attainable score calculation is based on the current - # sequence length. - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id)) - return current_worst_score >= highest_attainable_score - - #def _process_sequence_group_outputs(self, seq_group: SequenceGroup, - # outputs: SequenceGroupOutput) -> None: - - # # Process prompt logprobs - # prompt_logprobs = outputs.prompt_logprobs - # if prompt_logprobs is not None and seq_group.sampling_params.detokenize: - # self.detokenizer.decode_prompt_logprobs_inplace( - # seq_group, prompt_logprobs) - # seq_group.prompt_logprobs = prompt_logprobs - - # # Process samples - # samples = outputs.samples - # parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - # existing_finished_seqs = seq_group.get_finished_seqs() - # parent_child_dict = { - # parent_seq.seq_id: [] - # for parent_seq in parent_seqs - # } - # for sample in samples: - # parent_child_dict[sample.parent_seq_id].append(sample) - # # List of (child, parent) - # child_seqs: List[Tuple[Sequence, Sequence]] = [] - - # # Process the child samples for each parent sequence - # for parent in parent_seqs: - # child_samples: List[SequenceOutput] = parent_child_dict[ - # parent.seq_id] - # if len(child_samples) == 0: - # # This parent sequence has no children samples. Remove - # # the parent sequence from the sequence group since it will - # # not be used in the future iterations. - # parent.status = SequenceStatus.FINISHED_ABORTED - # seq_group.remove(parent.seq_id) - # self.scheduler.free_seq(parent) - # continue - # # Fork the parent sequence if there are multiple child samples. - # for child_sample in child_samples[:-1]: - # new_child_seq_id = next(self.seq_counter) - # child = parent.fork(new_child_seq_id) - # child.append_token_id(child_sample.output_token, - # child_sample.logprobs) - # child_seqs.append((child, parent)) - # # Continue the parent sequence for the last child sample. - # # We reuse the parent sequence here to reduce redundant memory - # # copies, especially when using non-beam search sampling methods. 
- # last_child_sample = child_samples[-1] - # parent.append_token_id(last_child_sample.output_token, - # last_child_sample.logprobs) - # child_seqs.append((parent, parent)) - - # for seq, _ in child_seqs: - # if seq_group.sampling_params.detokenize: - # self.detokenizer.decode_sequence_inplace( - # seq, seq_group.sampling_params) - # self._check_stop(seq, seq_group.sampling_params) - - # # Non-beam search case - # if not seq_group.sampling_params.use_beam_search: - # # For newly created child sequences, add them to the sequence group - # # and fork them in block manager if they are not finished. - # for seq, parent in child_seqs: - # if seq is not parent: - # seq_group.add(seq) - # if not seq.is_finished(): - # self.scheduler.fork_seq(parent, seq) - - # # Free the finished and selected parent sequences' memory in block - # # manager. Keep them in the sequence group as candidate output. - # # NOTE: we need to fork the new sequences before freeing the - # # old sequences. - # for seq, parent in child_seqs: - # if seq is parent and seq.is_finished(): - # self.scheduler.free_seq(seq) - # return - - # # Beam search case - # # Select the child sequences to keep in the sequence group. - # selected_child_seqs = [] - # unselected_child_seqs = [] - # beam_width = seq_group.sampling_params.best_of - # length_penalty = seq_group.sampling_params.length_penalty - - # # Select the newly finished sequences with the highest scores - # # to replace existing finished sequences. - # # Tuple of (seq, parent, is_new) - # existing_finished_seqs = [(seq, None, False) - # for seq in existing_finished_seqs] - # new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs - # if seq.is_finished()] - # all_finished_seqs = existing_finished_seqs + new_finished_seqs - # # Sort the finished sequences by their scores. - # all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - # length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - # reverse=True) - # for seq, parent, is_new in all_finished_seqs[:beam_width]: - # if is_new: - # # A newly generated child sequence finishes and has a high - # # score, so we will add it into the sequence group. - # selected_child_seqs.append((seq, parent)) - # for seq, parent, is_new in all_finished_seqs[beam_width:]: - # if is_new: - # # A newly generated child sequence finishes but has a low - # # score, so we will not add it into the sequence group. - # # Additionally, if this sequence is a continuation of a - # # parent sequence, we will need remove the parent sequence - # # from the sequence group. - # unselected_child_seqs.append((seq, parent)) - # else: - # # An existing finished sequence has a low score, so we will - # # remove it from the sequence group. - # seq_group.remove(seq.seq_id) - - # # select the top beam_width sequences from the running - # # sequences for the next iteration to continue the beam - # # search. - # running_child_seqs = [(seq, parent) for seq, parent in child_seqs - # if not seq.is_finished()] - # # Sort the running sequences by their scores. - # running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - # length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - # reverse=True) - - # # Check if we can stop the beam search. - # if len(running_child_seqs) == 0: - # # No running sequences, stop the beam search. - # stop_beam_search = True - # elif len(all_finished_seqs) < beam_width: - # # Not enough finished sequences, continue the beam search. 
- # stop_beam_search = False - # else: - # # Check the early stopping criteria - # best_running_seq = running_child_seqs[0][0] - # current_worst_seq = all_finished_seqs[beam_width - 1][0] - # stop_beam_search = self._check_beam_search_early_stopping( - # seq_group.sampling_params.early_stopping, - # seq_group.sampling_params, best_running_seq, current_worst_seq) - - # if stop_beam_search: - # # Stop the beam search and remove all the running sequences from - # # the sequence group. - # unselected_child_seqs.extend(running_child_seqs) - # else: - # # Continue the beam search and select the top beam_width sequences - # # to continue the beam search. - # selected_child_seqs.extend(running_child_seqs[:beam_width]) - # # The remaining running sequences will not be used in the next - # # iteration. Again, if these sequences are continuations of - # # parent sequences, we will need to remove the parent sequences - # # from the sequence group. - # unselected_child_seqs.extend(running_child_seqs[beam_width:]) - - # # For newly created child sequences, add them to the sequence group - # # and fork them in block manager if they are not finished. - # for seq, parent in selected_child_seqs: - # if seq is not parent: - # seq_group.add(seq) - # if not seq.is_finished(): - # self.scheduler.fork_seq(parent, seq) - - # # Free the finished and selected parent sequences' memory in block - # # manager. Keep them in the sequence group as candidate output. - # for seq, parent in selected_child_seqs: - # if seq is parent and seq.is_finished(): - # self.scheduler.free_seq(seq) - - # # Remove the unselected parent sequences from the sequence group and - # # free their memory in block manager. - # for seq, parent in unselected_child_seqs: - # if seq is parent: - # # Remove the parent sequence if it is not selected for next - # # iteration - # seq_group.remove(seq.seq_id) - # self.scheduler.free_seq(seq) - def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: @@ -696,89 +478,6 @@ def _process_model_outputs( self.stat_logger.log(self._get_stats(scheduler_outputs)) return request_outputs - #def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): - # seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - - # assert seqs - # #if not seqs: - # # return [] - - # assert len(seqs) == 1, ("Beam search not supported in speculative " - # "decoding.") - # seq = seqs[0] - - # # Since there's only one sequence per sequence group, we can take the - # # first sample. - # samples = [outputs[step].samples[0] for step in range(len(outputs))] - - # # -1 means the output token is not valid (eg. due to spec decode - # # rejecting tokens). - # valid_samples = [ - # sample for sample in samples if sample.output_token != -1 - # ] - - # # Draft target worker pads all outputs with -1 to have same length. - # output_token_ids = [sample.output_token for sample in valid_samples] - # #successes = [sample.success for sample in samples] - - # ## Truncate to max_tokens if necessary. - # #remaining_tokens = seq_group.sampling_params.max_tokens - ( - # # seq.get_output_len() + len(output_token_ids)) - # #if remaining_tokens < 0: - # # valid_samples = valid_samples[:remaining_tokens] - # # output_token_ids = output_token_ids[:remaining_tokens] - - # ## Truncate any tokens after EOS. This is required as spec decode - # ## generates tokens in fixed blocks, which may go beyond the EOS token. 
- # #if not seq_group.sampling_params.ignore_eos: - # # eos_token_id = self.tokenizer.get_lora_tokenizer( - # # seq.lora_request).eos_token_id - # # # Avoiding .index calls as exception throwing in the happy path - # # # is expensive. - # # for i in range(len(output_token_ids)): - # # if output_token_ids[i] == eos_token_id: - # # output_token_ids = output_token_ids[:i + 1] - # # valid_samples = valid_samples[:i + 1] - # # break - - # #output_logprobs = [sample.logprobs for sample in valid_samples] - - # ## Use the last sample for the sequence as it will have - # ## the speculation and num_unprocessed_tokens for all the - # ## previous samples (they are cumulative when it comes - # ## to those two attributes). - # #speculation = valid_samples[-1].speculation - # #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens - - # for output_token_id in output_token_ids: - # from vllm.sequence import Logprob - # seq.append_token_id( - # token_id=output_token_id, - # logprobs={output_token_id: Logprob(0.0)}, - # ) - - # #seq.append_token_ids(output_token_ids, - # # output_logprobs, - # # ) - # # #num_unprocessed_tokens=num_unprocessed_tokens) - # ##seq.set_last_speculation(speculation) - - # #if not all(successes): - # # seq.set_status_to_failed() - - # #if decode: - # # self._decode_sequence(seq, - # # seq_group.sampling_params, - # # token_ids=seq.get_token_ids(), - # # unseen_token_ids=output_token_ids, - # # prefix_offset=seq.prefix_offset, - # # read_offset=seq.read_offset) - # #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, - # # output_token_ids) - # # TODO pass output token ids - # self._check_stop(seq, seq_group.sampling_params) - # if seq.is_finished(): - # self.scheduler.free_seq(seq) def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. @@ -923,57 +622,6 @@ def _get_stats(self, time_e2e_requests=time_e2e_requests, ) - #def _check_stop(self, seq: Sequence, - # sampling_params: SamplingParams) -> None: - # """Stop the finished sequences.""" - # # Check if the sequence has reached max_model_len. - # if seq.get_len() > self.scheduler_config.max_model_len: - # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - # return - - # # Check if the sequence has reached max_tokens. - # if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): - # # TODO should cap block - # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - # return - - # # Check if the minimum number of tokens has been generated yet; - # # skip the stop string/token checks if not - # if seq.get_output_len() < sampling_params.min_tokens: - # return - - # if sampling_params.detokenize: - # for stop_str in sampling_params.stop: - # if seq.output_text.endswith(stop_str): - # self._finalize_sequence(seq, sampling_params, stop_str) - # seq.status = SequenceStatus.FINISHED_STOPPED - # seq.stop_reason = stop_str - # return - # last_token_id = seq.get_last_token_id() - # if last_token_id in sampling_params.stop_token_ids: - # stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - # last_token_id) - # self._finalize_sequence(seq, sampling_params, stop_str) - # seq.status = SequenceStatus.FINISHED_STOPPED - # seq.stop_reason = last_token_id - # return - - # # Check if the sequence has generated the EOS token. 
- # if ((not sampling_params.ignore_eos) - # and seq.get_last_token_id() == seq.eos_token_id): - # seq.status = SequenceStatus.FINISHED_STOPPED - # return - - #def _finalize_sequence(self, seq: Sequence, - # sampling_params: SamplingParams, - # stop_string: str) -> None: - # if sampling_params.include_stop_str_in_output: - # return - - # if stop_string and seq.output_text.endswith(stop_string): - # # Truncate the output text so that the stop string is - # # not included in the output. - # seq.output_text = seq.output_text[:-len(stop_string)] def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) From 34468fe8af84d0a2bd313e9b4dc06582e17c1458 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:57:51 -0700 Subject: [PATCH 074/109] rename --- vllm/engine/output_processor/beam_search.py | 2 +- vllm/engine/output_processor/block_decode.py | 2 +- vllm/engine/output_processor/stop_checker.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index c9ded11711515..829c5ecd78399 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -106,7 +106,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if seq_group.sampling_params.detokenize: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) - self.stop_checker.check_stop(seq, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params) # Non-beam search case if not seq_group.sampling_params.use_beam_search: diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 90ad03df32dda..44b4efba63726 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -131,7 +131,7 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, # output_token_ids) # TODO pass output token ids - self.stop_checker.check_stop(seq, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params) if seq.is_finished(): self.scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index cc6655b7aaa71..82973e3042021 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -37,7 +37,7 @@ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.scheduler_config = scheduler_config self.get_tokenizer_for_seq = get_tokenizer_for_seq - def check_stop(self, seq: Sequence, + def maybe_stop_sequence(self, seq: Sequence, sampling_params: SamplingParams) -> None: """Stop the finished sequences.""" # Check if the sequence has reached max_model_len. 
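The renamed maybe_stop_sequence above is the entry point both output processors share. A minimal sketch of the expected call pattern after appending one sampled token (the helper name append_and_maybe_stop and its arguments are illustrative, not part of the patch):

from vllm.sequence import Logprob


def append_and_maybe_stop(seq, sampling_params, scheduler, stop_checker,
                          token_id: int) -> None:
    # Illustrative only: append one sampled token, let the stop checker
    # update seq.status, then release the sequence once it has finished.
    seq.append_token_id(token_id, {token_id: Logprob(0.0)})
    stop_checker.maybe_stop_sequence(seq, sampling_params)
    if seq.is_finished():
        scheduler.free_seq(seq)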
From 208c4671593534e9a2f9ed7f64da80c5a74a4fb4 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:10:05 -0700 Subject: [PATCH 075/109] wip --- vllm/engine/llm_engine.py | 23 +++++------------------ vllm/engine/output_processor/util.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 18 deletions(-) create mode 100644 vllm/engine/output_processor/util.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2be4a260f1647..86ba020236273 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -27,6 +27,7 @@ from vllm.utils import Counter from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.engine.output_processor.util import create_output_by_sequence_group logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -424,6 +425,9 @@ def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: + now = time.time() + + # TODO if self.speculative_config is None: all_output = [output] else: @@ -431,34 +435,17 @@ def _process_model_outputs( scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - # Organize list of sampler output by sequence group. - output_by_sequence_group: List[List[SequenceGroupOutputs]] = [ - [] for _ in scheduled_seq_groups - ] - for step in all_output: - for i, sequence_group_output in enumerate(step): - output_by_sequence_group[i].append(sequence_group_output) - - now = time.time() + output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=all_output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): - seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) self.output_processor.process_outputs(seq_group, outputs) - #assert len(outputs) > 0 - ## TODO can spec decode go through second path? - #if len(outputs) > 1: - # self._process_sequence_group_outputs_multi_step( - # seq_group, outputs) - #else: - # self._process_sequence_group_outputs(seq_group, outputs[0]) - # Free the finished sequence groups. 
self.scheduler.free_finished_seq_groups() diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py new file mode 100644 index 0000000000000..1fcd651deef15 --- /dev/null +++ b/vllm/engine/output_processor/util.py @@ -0,0 +1,12 @@ +from vllm.sequence import SequenceGroupOutput, SamplerOutput +from typing import List + +def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], num_seq_groups: int): + output_by_sequence_group = [ + [] for _ in range(num_seq_groups) + ] + for step in sampler_outputs: + for i, sequence_group_output in enumerate(step): + output_by_sequence_group[i].append(sequence_group_output) + + return output_by_sequence_group From 3c6abcc564bafc242316797ccbed1e10db54dff7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:14:22 -0700 Subject: [PATCH 076/109] wip --- vllm/engine/llm_engine.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 86ba020236273..72af9c3da9f7f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -423,7 +423,8 @@ def has_unfinished_requests(self) -> bool: def _process_model_outputs( self, output: SamplerOutput, - scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: + scheduled_seq_groups: List[SequenceGroup], + ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]: now = time.time() @@ -433,8 +434,6 @@ def _process_model_outputs( else: all_output = output - scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=all_output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. @@ -456,13 +455,9 @@ def _process_model_outputs( seq_group.maybe_set_first_token_time(now) request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) - for seq_group in scheduler_outputs.ignored_seq_groups: + for seq_group in ignored_seq_groups: request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) - - # Log stats. - if self.log_stats: - self.stat_logger.log(self._get_stats(scheduler_outputs)) return request_outputs @@ -529,7 +524,13 @@ def step(self) -> List[RequestOutput]: else: output = [] - return self._process_model_outputs(output, scheduler_outputs) + request_outputs = self._process_model_outputs(output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups) + + # Log stats. 
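# A minimal sketch (illustrative placeholder strings, not real
# SequenceGroupOutput objects) of what create_output_by_sequence_group, added
# above in vllm/engine/output_processor/util.py, produces: it transposes the
# per-step sampler outputs into per-sequence-group lists.
#
#   sampler_outputs = [["g0_step0", "g1_step0"],   # step 0
#                      ["g0_step1", "g1_step1"]]   # step 1
#   create_output_by_sequence_group(sampler_outputs, num_seq_groups=2)
#   # -> [["g0_step0", "g0_step1"], ["g1_step0", "g1_step1"]]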
+ if self.log_stats: + self.stat_logger.log(self._get_stats(scheduler_outputs)) + + return request_outputs def do_log_stats(self) -> None: """Forced log when no requests active.""" From bbbcef70d603ab791ecc62336a56ef25b1566d33 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:27:24 -0700 Subject: [PATCH 077/109] wip --- tests/spec_decode/e2e/test_correctness.py | 2 +- vllm/engine/llm_engine.py | 11 +++-------- vllm/executor/cpu_executor.py | 2 +- vllm/executor/executor_base.py | 5 +++-- vllm/executor/gpu_executor.py | 2 +- vllm/spec_decode/multi_step_worker.py | 2 ++ vllm/spec_decode/spec_decode_worker.py | 2 ++ vllm/worker/cpu_worker.py | 8 +++++--- vllm/worker/neuron_worker.py | 9 ++++++--- vllm/worker/worker.py | 9 ++++++--- vllm/worker/worker_base.py | 5 +++-- 11 files changed, 33 insertions(+), 24 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index d8b09ce5b77a0..eb6d1e1c5ddd5 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -89,7 +89,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): { # Expect failure as spec decode not supported by # Ray backend. - "tensor_parallel_size": 2, + "worker_use_ray": True, }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 72af9c3da9f7f..bce36ddccc816 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -422,19 +422,14 @@ def has_unfinished_requests(self) -> bool: return self.scheduler.has_unfinished_seqs() def _process_model_outputs( - self, output: SamplerOutput, + self, + output: List[SamplerOutput], scheduled_seq_groups: List[SequenceGroup], ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]: now = time.time() - # TODO - if self.speculative_config is None: - all_output = [output] - else: - all_output = output - - output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=all_output, num_seq_groups=len(scheduled_seq_groups)) + output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. 
for scheduled_seq_group, outputs in zip(scheduled_seq_groups, diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 835ba18ab756a..f308f91494757 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -81,7 +81,7 @@ def execute_model(self, blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]], - num_lookahead_slots: int) -> SamplerOutput: + num_lookahead_slots: int) -> List[SamplerOutput]: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index c18edd75d7a4d..23927c113744c 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -58,8 +58,9 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: - """Executes one model step on the given sequences.""" + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int) -> List[SamplerOutput]: + """Executes at least one model step on the given sequences.""" raise NotImplementedError @abstractmethod diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index ac445cd51a7e4..90a534dc1271a 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -157,7 +157,7 @@ def execute_model( blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]], num_lookahead_slots: int, - ) -> SamplerOutput: + ) -> List[SamplerOutput]: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 0ac189a7baccb..4cdbe09234557 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -70,6 +70,8 @@ def execute_model_multi_step( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) + assert (len(model_output) == 1), "composing multistep workers not supported" + model_output = model_output[0] self._append_new_tokens(model_output, copied_seq_group_metadata_list) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3e33371edadf0..894377c9421e8 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -196,6 +196,8 @@ def _run_no_spec( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) + assert len(sampler_output) == 1, "expected single output from scorer worker" + sampler_output = sampler_output[0] # Clear device tensors from sampler output. This reduces communication # overhead when the engine runs in a different process than the workers. 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index bd67f9f8850ac..09a37c25783a1 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -257,7 +257,7 @@ def execute_model( blocks_to_swap_in: Optional[Dict[int, int]] = None, blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, - ) -> Optional[SamplerOutput]: + ) -> List[SamplerOutput]: if self.is_driver_worker: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) @@ -280,11 +280,13 @@ def execute_model( # If there is no input, we don't need to execute the model. if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list, self.cpu_cache) - return output + + # CPU worker only supports single-step execution. + return [output] def init_distributed_environment(self) -> None: """Initialize the distributed environment.""" diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 6136d50d0c068..d0f01b893bc62 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -73,15 +73,18 @@ def initialize_cache(self, num_gpu_blocks: int, def execute_model( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Optional[SamplerOutput]: + ) -> List[SamplerOutput]: num_seq_groups = len(seq_group_metadata_list) # If there is no input, we don't need to execute the model. if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list) - return output + + # Neuron worker only supports single-step output. Wrap the output in a + # list to conform to interface. + return [output] def get_cache_block_size_bytes(self) -> int: """Determine the size in bytes of a cache block. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index cb30f658482bd..95e62b9e6a757 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -206,7 +206,7 @@ def execute_model( blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, num_lookahead_slots: int = 0, - ) -> Optional[SamplerOutput]: + ) -> List[SamplerOutput]: if self.is_driver_worker: assert seq_group_metadata_list is not None @@ -232,11 +232,14 @@ def execute_model( # If there is no input, we don't need to execute the model. if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache) - return output + + # Worker only supports single-step execution. Wrap the output in a list + # to conform to interface. 
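With execute_model now returning a list of SamplerOutput (one entry per model step), single-step callers unwrap the result explicitly, as the scorer hunk above does. A minimal sketch of that calling convention, assuming a hypothetical single-step worker object:

from typing import Dict, List, Optional

from vllm.sequence import SamplerOutput, SequenceGroupMetadata


def run_single_step(
        worker,  # assumed to implement the WorkerBase interface
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]]) -> Optional[SamplerOutput]:
    outputs = worker.execute_model(
        seq_group_metadata_list=seq_group_metadata_list,
        blocks_to_swap_in=blocks_to_swap_in,
        blocks_to_swap_out=blocks_to_swap_out,
        blocks_to_copy=blocks_to_copy,
    )
    # A single-step worker returns an empty list when given no sequences,
    # otherwise exactly one SamplerOutput.
    assert len(outputs) <= 1, "expected a single-step worker"
    return outputs[0] if outputs else None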
+ return [output] def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index e3027c406ffeb..1481a4c2eef4c 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -44,8 +44,9 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: - """Executes one model step on the given sequences.""" + blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]: + """Executes at least one model step on the given sequences, unless no + sequences are provided.""" raise NotImplementedError @abstractmethod From b58762d4fa0f64eb29af5a649650d6293c5d988f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:29:07 -0700 Subject: [PATCH 078/109] fix --- vllm/spec_decode/batch_expansion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index bba3c4733e4ff..f7bac45861a7b 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -86,6 +86,8 @@ def score_proposals( blocks_to_copy=blocks_to_copy, #return_python_output=False ) + assert len(target_sampler_output) == 1, "expected single-step output" + target_sampler_output = target_sampler_output[0] all_tokens, all_probs = self._contract_batch( original_bs=len(seq_group_metadata_list), From 8b500d404b81b10857f75503e312ecf44ee9dd9f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:43:04 -0700 Subject: [PATCH 079/109] wip --- vllm/engine/output_processor/block_decode.py | 67 ++++++-------------- 1 file changed, 18 insertions(+), 49 deletions(-) diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 44b4efba63726..3fb2b7ee3235b 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -18,7 +18,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) + SequenceStatus, Logprob) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -49,17 +49,10 @@ def __init__( self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: - return self._process_sequence_group_outputs_multi_step(sequence_group, outputs) + seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) - def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): - seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - - assert seqs - #if not seqs: - # return [] - - assert len(seqs) == 1, ("Beam search not supported in speculative " - "decoding.") + assert seqs, "expected running sequences" + assert len(seqs) == 1, ("Beam search not supported in block decoding.") seq = seqs[0] # Since there's only one sequence per sequence group, we can take the @@ -71,21 +64,23 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): valid_samples = [ sample for sample in samples if sample.output_token != -1 ] + assert valid_samples + + self._process_seq_outputs(seq, valid_samples, sequence_group.sampling_params) - # Draft target worker pads all outputs 
with -1 to have same length. + def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput], sampling_params: SamplingParams) -> None: output_token_ids = [sample.output_token for sample in valid_samples] - #successes = [sample.success for sample in samples] - ## Truncate to max_tokens if necessary. - #remaining_tokens = seq_group.sampling_params.max_tokens - ( - # seq.get_output_len() + len(output_token_ids)) - #if remaining_tokens < 0: - # valid_samples = valid_samples[:remaining_tokens] - # output_token_ids = output_token_ids[:remaining_tokens] + # Truncate to max_tokens if necessary. + remaining_tokens = sampling_params.max_tokens - ( + seq.get_output_len() + len(output_token_ids)) + if remaining_tokens < 0: + valid_samples = valid_samples[:remaining_tokens] + output_token_ids = output_token_ids[:remaining_tokens] ## Truncate any tokens after EOS. This is required as spec decode ## generates tokens in fixed blocks, which may go beyond the EOS token. - #if not seq_group.sampling_params.ignore_eos: + #if not sampling_params.ignore_eos: # eos_token_id = self.tokenizer.get_lora_tokenizer( # seq.lora_request).eos_token_id # # Avoiding .index calls as exception throwing in the happy path @@ -96,42 +91,16 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): # valid_samples = valid_samples[:i + 1] # break - #output_logprobs = [sample.logprobs for sample in valid_samples] - - ## Use the last sample for the sequence as it will have - ## the speculation and num_unprocessed_tokens for all the - ## previous samples (they are cumulative when it comes - ## to those two attributes). - #speculation = valid_samples[-1].speculation - #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens - for output_token_id in output_token_ids: - from vllm.sequence import Logprob seq.append_token_id( token_id=output_token_id, + # TODO emit logprobs in block decoding. 
logprobs={output_token_id: Logprob(0.0)}, ) - #seq.append_token_ids(output_token_ids, - # output_logprobs, - # ) - # #num_unprocessed_tokens=num_unprocessed_tokens) - ##seq.set_last_speculation(speculation) - - #if not all(successes): - # seq.set_status_to_failed() - - #if decode: - # self._decode_sequence(seq, - # seq_group.sampling_params, - # token_ids=seq.get_token_ids(), - # unseen_token_ids=output_token_ids, - # prefix_offset=seq.prefix_offset, - # read_offset=seq.read_offset) - #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, - # output_token_ids) + # TODO detokenize # TODO pass output token ids - self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, sampling_params) if seq.is_finished(): self.scheduler.free_seq(seq) From 782ce22d604291a64ac6dce3efbb9b4c662c0557 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 18:26:30 -0700 Subject: [PATCH 080/109] unit tests for block decode --- tests/core/utils.py | 16 +- .../output_processor/test_block_decode.py | 238 ++++++++++++++++++ vllm/engine/output_processor/beam_search.py | 2 - vllm/engine/output_processor/block_decode.py | 27 +- vllm/engine/output_processor/interfaces.py | 5 +- 5 files changed, 262 insertions(+), 26 deletions(-) create mode 100644 tests/engine/output_processor/test_block_decode.py diff --git a/tests/core/utils.py b/tests/core/utils.py index fbbdb07cb8e6e..d9d2eeaee1b96 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,5 +1,5 @@ import time -from typing import Optional, Tuple +from typing import Optional, Tuple, Iterable from vllm import SamplingParams from vllm.lora.request import LoRARequest @@ -31,14 +31,18 @@ def create_dummy_prompt( def create_seq_group( - seq_prompt_len=1024, - seq_output_lens=(128, ), - request_id='0', - seq_id_start=0, + seq_prompt_len: int=1024, + seq_output_lens: Iterable[int]=(128, ), + request_id: str='0', + seq_id_start: int=0, + sampling_params: Optional[SamplingParams] = None ) -> SequenceGroup: assert len(seq_output_lens) > 0 + if sampling_params is None: + sampling_params = SamplingParams() + prompt_token_ids = [0] * seq_prompt_len seqs = [] @@ -60,7 +64,7 @@ def create_seq_group( seq_group = SequenceGroup( request_id=request_id, seqs=seqs, - sampling_params=SamplingParams(), + sampling_params=sampling_params, arrival_time=time.time(), ) diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_block_decode.py new file mode 100644 index 0000000000000..aae184c164473 --- /dev/null +++ b/tests/engine/output_processor/test_block_decode.py @@ -0,0 +1,238 @@ +import pytest +from unittest.mock import MagicMock +import random + +from transformers import PreTrainedTokenizer + +from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.core.scheduler import Scheduler +from vllm.utils import Counter +from vllm.sequence import SequenceStatus, SequenceGroupOutput, SequenceOutput, Logprob +from vllm.sampling_params import SamplingParams +from tests.core.utils import create_seq_group + +@pytest.mark.parametrize("seq_output_len", [128]) +@pytest.mark.parametrize("num_new_tokens", [1, 12]) +@pytest.mark.skip_global_cleanup +def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): + detokenizer = MagicMock(spec=Detokenizer) + scheduler = MagicMock(spec=Scheduler) + stop_checker = 
MagicMock(spec=StopChecker) + seq_counter = Counter() + + output_processor = BlockDecodeOutputProcessor( + detokenizer=detokenizer, + scheduler=scheduler, + seq_counter=seq_counter, + get_tokenizer_for_seq=lambda _: mock_tokenizer(), + stop_checker=stop_checker, + ) + + seq_group = create_seq_group( + seq_prompt_len=1024, + seq_output_lens=[seq_output_len], + sampling_params=SamplingParams( + max_tokens=seq_output_len + num_new_tokens, + ), + ) + + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + new_token_ids = list(range(num_new_tokens)) + + outputs = [SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids] + + assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids + output_processor.process_outputs(seq_group, outputs) + assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids + +@pytest.mark.parametrize("seq_prompt_len", [1024]) +@pytest.mark.parametrize("seq_output_len", [128]) +@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) +@pytest.mark.parametrize("max_tokens", [128 + 3]) +@pytest.mark.skip_global_cleanup +def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, max_tokens: int): + detokenizer = MagicMock(spec=Detokenizer) + scheduler = MagicMock(spec=Scheduler) + stop_checker = MagicMock(spec=StopChecker) + seq_counter = Counter() + + output_processor = BlockDecodeOutputProcessor( + detokenizer=detokenizer, + scheduler=scheduler, + seq_counter=seq_counter, + get_tokenizer_for_seq=lambda _: mock_tokenizer(), + stop_checker=stop_checker, + ) + + seq_group = create_seq_group( + seq_prompt_len=seq_prompt_len, + seq_output_lens=[seq_output_len], + sampling_params=SamplingParams( + max_tokens=max_tokens, + ), + ) + + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + new_token_ids = list(range(num_new_tokens)) + + outputs = [SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids] + + assert seq.get_len() == seq_prompt_len + seq_output_len + output_processor.process_outputs(seq_group, outputs) + + # Expect the processed sequence to not go over max tokens in len. + assert seq.get_len() == seq_prompt_len + max_tokens + + # Expect the correct tokens were appended. 
+ expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] + assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + +@pytest.mark.parametrize("seq_prompt_len", [1024]) +@pytest.mark.parametrize("seq_output_len", [128]) +@pytest.mark.parametrize("num_new_tokens", [12]) +@pytest.mark.parametrize("seed", list(range(6))) +@pytest.mark.skip_global_cleanup +def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): + random.seed(seed) + detokenizer = MagicMock(spec=Detokenizer) + scheduler = MagicMock(spec=Scheduler) + stop_checker = MagicMock(spec=StopChecker) + seq_counter = Counter() + + eos_token_id = 100 + + output_processor = BlockDecodeOutputProcessor( + detokenizer=detokenizer, + scheduler=scheduler, + seq_counter=seq_counter, + get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), + stop_checker=stop_checker, + ) + + seq_group = create_seq_group( + seq_prompt_len=seq_prompt_len, + seq_output_lens=[seq_output_len], + sampling_params=SamplingParams( + # Ensure enough space. + max_tokens=seq_output_len + num_new_tokens, + ), + ) + + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + new_token_ids = list(range(num_new_tokens)) + assert eos_token_id not in new_token_ids + eos_index = random.randint(0, len(new_token_ids) - 1) + new_token_ids[eos_index] = eos_token_id + + outputs = [SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids] + + assert seq.get_len() == seq_prompt_len + seq_output_len + output_processor.process_outputs(seq_group, outputs) + + # Expect the processed sequence to not go beyond provided eos. + assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) + + # Expect the correct tokens were appended. + expected_appended_tokens = new_token_ids[:eos_index+1] + assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + +@pytest.mark.parametrize("seq_prompt_len", [1024]) +@pytest.mark.parametrize("seq_output_len", [128]) +@pytest.mark.parametrize("num_new_tokens", [12]) +@pytest.mark.parametrize("seed", list(range(6))) +@pytest.mark.skip_global_cleanup +def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): + random.seed(seed) + detokenizer = MagicMock(spec=Detokenizer) + scheduler = MagicMock(spec=Scheduler) + stop_checker = MagicMock(spec=StopChecker) + seq_counter = Counter() + + eos_token_id = 100 + + output_processor = BlockDecodeOutputProcessor( + detokenizer=detokenizer, + scheduler=scheduler, + seq_counter=seq_counter, + get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), + stop_checker=stop_checker, + ) + + seq_group = create_seq_group( + seq_prompt_len=seq_prompt_len, + seq_output_lens=[seq_output_len], + sampling_params=SamplingParams( + # Ensure enough space. 
+ max_tokens=seq_output_len + num_new_tokens, + ignore_eos=True, + ), + ) + + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + new_token_ids = list(range(num_new_tokens)) + assert eos_token_id not in new_token_ids + eos_index = random.randint(0, len(new_token_ids) - 1) + new_token_ids[eos_index] = eos_token_id + + outputs = [SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids] + + assert seq.get_len() == seq_prompt_len + seq_output_len + output_processor.process_outputs(seq_group, outputs) + + # Expect the processed sequence to go beyond eos. + assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens + + # Expect the correct tokens were appended. + expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - seq_output_len] + assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + +def mock_tokenizer(eos_token_id=1000): + tokenizer = MagicMock(spec=PreTrainedTokenizer) + tokenizer.eos_token_id = eos_token_id + return tokenizer diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 829c5ecd78399..827142bd4bf52 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -38,14 +38,12 @@ def __init__( detokenizer, scheduler, seq_counter, - get_tokenizer_for_seq, stop_checker, ): self.scheduler_config = scheduler_config self.detokenizer = detokenizer self.scheduler = scheduler self.seq_counter = seq_counter - self.get_tokenizer_for_seq = get_tokenizer_for_seq self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 3fb2b7ee3235b..06d3ee9306ef7 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -34,21 +34,19 @@ class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): def __init__( self, - scheduler_config: SchedulerConfig, detokenizer, scheduler, seq_counter, get_tokenizer_for_seq, stop_checker, ): - self.scheduler_config = scheduler_config self.detokenizer = detokenizer self.scheduler = scheduler self.seq_counter = seq_counter self.get_tokenizer_for_seq = get_tokenizer_for_seq self.stop_checker = stop_checker - def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) assert seqs, "expected running sequences" @@ -78,18 +76,17 @@ def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput valid_samples = valid_samples[:remaining_tokens] output_token_ids = output_token_ids[:remaining_tokens] - ## Truncate any tokens after EOS. This is required as spec decode - ## generates tokens in fixed blocks, which may go beyond the EOS token. - #if not sampling_params.ignore_eos: - # eos_token_id = self.tokenizer.get_lora_tokenizer( - # seq.lora_request).eos_token_id - # # Avoiding .index calls as exception throwing in the happy path - # # is expensive. 
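# A rough illustration (assumed token ids, not taken from the diff) of the
# EOS truncation that the activated code below performs: block decoding
# samples tokens in fixed-size blocks, so anything emitted after the first
# EOS token has to be dropped before it is appended to the sequence.
#
#   eos_token_id = 2
#   output_token_ids = [5, 7, 2, 9, 11]
#   # after truncation: [5, 7, 2]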
- # for i in range(len(output_token_ids)): - # if output_token_ids[i] == eos_token_id: - # output_token_ids = output_token_ids[:i + 1] - # valid_samples = valid_samples[:i + 1] - # break + # Truncate any tokens after EOS. This is required as spec decode + # generates tokens in fixed blocks, which may go beyond the EOS token. + if not sampling_params.ignore_eos: + eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id + # Avoiding .index calls as exception throwing in the happy path + # is expensive. + for i in range(len(output_token_ids)): + if output_token_ids[i] == eos_token_id: + output_token_ids = output_token_ids[:i + 1] + valid_samples = valid_samples[:i + 1] + break for output_token_id in output_token_ids: seq.append_token_id( diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index d2368fc811a00..8a7e27645b4df 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from vllm.config import SchedulerConfig from vllm.sequence import SequenceGroup, SequenceGroupOutput +from typing import List class SequenceGroupOutputProcessor(ABC): @@ -20,13 +21,11 @@ def create_output_processor( detokenizer, scheduler, seq_counter, - get_tokenizer_for_seq, stop_checker, ) else: from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor return BlockDecodeOutputProcessor( - scheduler_config, detokenizer, scheduler, seq_counter, @@ -35,5 +34,5 @@ def create_output_processor( ) @abstractmethod - def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: pass From 3062e1cbeb11d66a8904d05c6ef935784caf44ef Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 18:34:53 -0700 Subject: [PATCH 081/109] stop token ids --- vllm/engine/output_processor/beam_search.py | 2 +- vllm/engine/output_processor/block_decode.py | 3 +-- vllm/engine/output_processor/stop_checker.py | 20 ++++++++++++-------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 827142bd4bf52..2b5657d37ccd7 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -104,7 +104,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if seq_group.sampling_params.detokenize: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) - self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params, [seq.get_last_token_id()]) # Non-beam search case if not seq_group.sampling_params.use_beam_search: diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 06d3ee9306ef7..e218fa99b0e6d 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -96,8 +96,7 @@ def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput ) # TODO detokenize - # TODO pass output token ids - self.stop_checker.maybe_stop_sequence(seq, sampling_params) + self.stop_checker.maybe_stop_sequence(seq, sampling_params, new_token_ids=output_token_ids) if seq.is_finished(): self.scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py 
b/vllm/engine/output_processor/stop_checker.py index 82973e3042021..4d8f3730e9f6a 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -38,7 +38,7 @@ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.get_tokenizer_for_seq = get_tokenizer_for_seq def maybe_stop_sequence(self, seq: Sequence, - sampling_params: SamplingParams) -> None: + sampling_params: SamplingParams, new_token_ids: List[int]) -> None: """Stop the finished sequences.""" # Check if the sequence has reached max_model_len. if seq.get_len() > self.scheduler_config.max_model_len: @@ -46,8 +46,7 @@ def maybe_stop_sequence(self, seq: Sequence, return # Check if the sequence has reached max_tokens. - if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): - # TODO should cap block + if seq.get_output_len() == sampling_params.max_tokens: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return @@ -63,18 +62,23 @@ def maybe_stop_sequence(self, seq: Sequence, seq.status = SequenceStatus.FINISHED_STOPPED seq.stop_reason = stop_str return - last_token_id = seq.get_last_token_id() - if last_token_id in sampling_params.stop_token_ids: + + # Determine if any stop_token_ids are in new_token_ids. + intersection = set(new_token_ids).intersection(sampling_params.stop_token_ids) + if intersection: + # Get arbitrary token id that caused the stop. + stop_token_id = next(iter(intersection)) + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) + stop_token_id) self._finalize_sequence(seq, sampling_params, stop_str) seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id + seq.stop_reason = stop_token_id return # Check if the sequence has generated the EOS token. 
if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): + and seq.eos_token_id in new_token_ids): seq.status = SequenceStatus.FINISHED_STOPPED return From fba3b300f66e047750eb3a392e0b2f3aee0e0cd8 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 18:35:16 -0700 Subject: [PATCH 082/109] format --- tests/core/utils.py | 11 +- .../output_processor/test_block_decode.py | 136 ++++++++++-------- vllm/engine/llm_engine.py | 14 +- vllm/engine/output_processor/beam_search.py | 12 +- vllm/engine/output_processor/block_decode.py | 20 ++- vllm/engine/output_processor/interfaces.py | 6 +- vllm/engine/output_processor/stop_checker.py | 7 +- vllm/engine/output_processor/util.py | 10 +- vllm/model_executor/layers/sampler.py | 3 +- vllm/spec_decode/multi_step_worker.py | 3 +- vllm/spec_decode/spec_decode_worker.py | 3 +- vllm/worker/worker_base.py | 10 +- 12 files changed, 134 insertions(+), 101 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index d9d2eeaee1b96..39f8e507d0f1d 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -31,12 +31,11 @@ def create_dummy_prompt( def create_seq_group( - seq_prompt_len: int=1024, - seq_output_lens: Iterable[int]=(128, ), - request_id: str='0', - seq_id_start: int=0, - sampling_params: Optional[SamplingParams] = None -) -> SequenceGroup: + seq_prompt_len: int = 1024, + seq_output_lens: Iterable[int] = (128, ), + request_id: str = '0', + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: assert len(seq_output_lens) > 0 diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_block_decode.py index aae184c164473..f426f1d32d7a6 100644 --- a/tests/engine/output_processor/test_block_decode.py +++ b/tests/engine/output_processor/test_block_decode.py @@ -13,6 +13,7 @@ from vllm.sampling_params import SamplingParams from tests.core.utils import create_seq_group + @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [1, 12]) @pytest.mark.skip_global_cleanup @@ -33,37 +34,40 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): seq_group = create_seq_group( seq_prompt_len=1024, seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - max_tokens=seq_output_len + num_new_tokens, - ), + sampling_params=SamplingParams(max_tokens=seq_output_len + + num_new_tokens, ), ) - + seq = seq_group.get_seqs()[0] seq.status = SequenceStatus.RUNNING new_token_ids = list(range(num_new_tokens)) - outputs = [SequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids] + outputs = [ + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids + ] assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids output_processor.process_outputs(seq_group, outputs) assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids + @pytest.mark.parametrize("seq_prompt_len", [1024]) @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) @pytest.mark.parametrize("max_tokens", [128 + 3]) @pytest.mark.skip_global_cleanup -def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_output_len: 
int, max_tokens: int): +def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, + seq_output_len: int, max_tokens: int): detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) stop_checker = MagicMock(spec=StopChecker) @@ -80,26 +84,26 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_outpu seq_group = create_seq_group( seq_prompt_len=seq_prompt_len, seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - max_tokens=max_tokens, - ), + sampling_params=SamplingParams(max_tokens=max_tokens, ), ) - + seq = seq_group.get_seqs()[0] seq.status = SequenceStatus.RUNNING new_token_ids = list(range(num_new_tokens)) - outputs = [SequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids] + outputs = [ + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids + ] assert seq.get_len() == seq_prompt_len + seq_output_len output_processor.process_outputs(seq_group, outputs) @@ -109,14 +113,17 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_outpu # Expect the correct tokens were appended. expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] - assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + assert seq.get_token_ids( + )[-len(expected_appended_tokens):] == expected_appended_tokens + @pytest.mark.parametrize("seq_prompt_len", [1024]) @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [12]) @pytest.mark.parametrize("seed", list(range(6))) @pytest.mark.skip_global_cleanup -def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): +def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, + seq_output_len: int, seed: int): random.seed(seed) detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) @@ -138,10 +145,9 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_out seq_output_lens=[seq_output_len], sampling_params=SamplingParams( # Ensure enough space. 
- max_tokens=seq_output_len + num_new_tokens, - ), + max_tokens=seq_output_len + num_new_tokens, ), ) - + seq = seq_group.get_seqs()[0] seq.status = SequenceStatus.RUNNING @@ -150,16 +156,18 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_out eos_index = random.randint(0, len(new_token_ids) - 1) new_token_ids[eos_index] = eos_token_id - outputs = [SequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids] + outputs = [ + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids + ] assert seq.get_len() == seq_prompt_len + seq_output_len output_processor.process_outputs(seq_group, outputs) @@ -168,15 +176,18 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_out assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:eos_index+1] - assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + expected_appended_tokens = new_token_ids[:eos_index + 1] + assert seq.get_token_ids( + )[-len(expected_appended_tokens):] == expected_appended_tokens + @pytest.mark.parametrize("seq_prompt_len", [1024]) @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [12]) @pytest.mark.parametrize("seed", list(range(6))) @pytest.mark.skip_global_cleanup -def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): +def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, + seq_output_len: int, seed: int): random.seed(seed) detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) @@ -202,7 +213,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_outp ignore_eos=True, ), ) - + seq = seq_group.get_seqs()[0] seq.status = SequenceStatus.RUNNING @@ -211,16 +222,18 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_outp eos_index = random.randint(0, len(new_token_ids) - 1) new_token_ids[eos_index] = eos_token_id - outputs = [SequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids] + outputs = [ + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids + ] assert seq.get_len() == seq_prompt_len + seq_output_len output_processor.process_outputs(seq_group, outputs) @@ -229,8 +242,11 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_outp assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens # Expect the correct tokens were appended. 
- expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - seq_output_len] - assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - + seq_output_len] + assert seq.get_token_ids( + )[-len(expected_appended_tokens):] == expected_appended_tokens + def mock_tokenizer(eos_token_id=1000): tokenizer = MagicMock(spec=PreTrainedTokenizer) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index bce36ddccc816..9936eb18c0320 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -422,14 +422,14 @@ def has_unfinished_requests(self) -> bool: return self.scheduler.has_unfinished_seqs() def _process_model_outputs( - self, - output: List[SamplerOutput], + self, output: List[SamplerOutput], scheduled_seq_groups: List[SequenceGroup], ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]: now = time.time() - output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) + output_by_sequence_group = create_output_by_sequence_group( + sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs in zip(scheduled_seq_groups, @@ -437,7 +437,7 @@ def _process_model_outputs( seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - + self.output_processor.process_outputs(seq_group, outputs) # Free the finished sequence groups. @@ -455,7 +455,6 @@ def _process_model_outputs( request_outputs.append(request_output) return request_outputs - def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. @@ -519,7 +518,9 @@ def step(self) -> List[RequestOutput]: else: output = [] - request_outputs = self._process_model_outputs(output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups) + request_outputs = self._process_model_outputs( + output, scheduler_outputs.scheduled_seq_groups, + scheduler_outputs.ignored_seq_groups) # Log stats. 
if self.log_stats: @@ -605,7 +606,6 @@ def _get_stats(self, time_e2e_requests=time_e2e_requests, ) - def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 2b5657d37ccd7..94af809e26738 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -31,7 +31,7 @@ class BeamSearchOutputProcessor(SequenceGroupOutputProcessor): - + def __init__( self, scheduler_config: SchedulerConfig, @@ -46,8 +46,10 @@ def __init__( self.seq_counter = seq_counter self.stop_checker = stop_checker - def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: - assert (len(outputs) == 1), f"{type(self)} does not support multiple outputs per step" + def process_outputs(self, sequence_group: SequenceGroup, + outputs: List[SequenceGroupOutput]) -> None: + assert (len(outputs) == 1 + ), f"{type(self)} does not support multiple outputs per step" return self._process_sequence_group_outputs(sequence_group, outputs[0]) def _process_sequence_group_outputs(self, seq_group: SequenceGroup, @@ -104,7 +106,9 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if seq_group.sampling_params.detokenize: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) - self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params, [seq.get_last_token_id()]) + self.stop_checker.maybe_stop_sequence(seq, + seq_group.sampling_params, + [seq.get_last_token_id()]) # Non-beam search case if not seq_group.sampling_params.use_beam_search: diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index e218fa99b0e6d..3b6a60e857fa0 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -31,7 +31,7 @@ class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): - + def __init__( self, detokenizer, @@ -46,7 +46,8 @@ def __init__( self.get_tokenizer_for_seq = get_tokenizer_for_seq self.stop_checker = stop_checker - def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + def process_outputs(self, sequence_group: SequenceGroup, + outputs: List[SequenceGroupOutput]) -> None: seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) assert seqs, "expected running sequences" @@ -64,14 +65,17 @@ def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceG ] assert valid_samples - self._process_seq_outputs(seq, valid_samples, sequence_group.sampling_params) + self._process_seq_outputs(seq, valid_samples, + sequence_group.sampling_params) - def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput], sampling_params: SamplingParams) -> None: + def _process_seq_outputs(self, seq: Sequence, + valid_samples: List[SequenceOutput], + sampling_params: SamplingParams) -> None: output_token_ids = [sample.output_token for sample in valid_samples] # Truncate to max_tokens if necessary. 
- remaining_tokens = sampling_params.max_tokens - ( - seq.get_output_len() + len(output_token_ids)) + remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() + + len(output_token_ids)) if remaining_tokens < 0: valid_samples = valid_samples[:remaining_tokens] output_token_ids = output_token_ids[:remaining_tokens] @@ -96,7 +100,9 @@ def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput ) # TODO detokenize - self.stop_checker.maybe_stop_sequence(seq, sampling_params, new_token_ids=output_token_ids) + self.stop_checker.maybe_stop_sequence(seq, + sampling_params, + new_token_ids=output_token_ids) if seq.is_finished(): self.scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 8a7e27645b4df..2b931a0b2f41b 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -3,8 +3,9 @@ from vllm.sequence import SequenceGroup, SequenceGroupOutput from typing import List + class SequenceGroupOutputProcessor(ABC): - + @staticmethod def create_output_processor( scheduler_config: SchedulerConfig, @@ -34,5 +35,6 @@ def create_output_processor( ) @abstractmethod - def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + def process_outputs(self, sequence_group: SequenceGroup, + outputs: List[SequenceGroupOutput]) -> None: pass diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 4d8f3730e9f6a..3f03373f2698a 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -30,6 +30,7 @@ logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 + class StopChecker: def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): @@ -38,7 +39,8 @@ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.get_tokenizer_for_seq = get_tokenizer_for_seq def maybe_stop_sequence(self, seq: Sequence, - sampling_params: SamplingParams, new_token_ids: List[int]) -> None: + sampling_params: SamplingParams, + new_token_ids: List[int]) -> None: """Stop the finished sequences.""" # Check if the sequence has reached max_model_len. if seq.get_len() > self.scheduler_config.max_model_len: @@ -64,7 +66,8 @@ def maybe_stop_sequence(self, seq: Sequence, return # Determine if any stop_token_ids are in new_token_ids. - intersection = set(new_token_ids).intersection(sampling_params.stop_token_ids) + intersection = set(new_token_ids).intersection( + sampling_params.stop_token_ids) if intersection: # Get arbitrary token id that caused the stop. 
stop_token_id = next(iter(intersection)) diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 1fcd651deef15..b49bbb2fab327 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,12 +1,12 @@ from vllm.sequence import SequenceGroupOutput, SamplerOutput from typing import List -def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], num_seq_groups: int): - output_by_sequence_group = [ - [] for _ in range(num_seq_groups) - ] + +def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], + num_seq_groups: int): + output_by_sequence_group = [[] for _ in range(num_seq_groups)] for step in sampler_outputs: for i, sequence_group_output in enumerate(step): - output_by_sequence_group[i].append(sequence_group_output) + output_by_sequence_group[i].append(sequence_group_output) return output_by_sequence_group diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index bed915faf3fbd..be970e56b6119 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -79,7 +79,8 @@ def forward( prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs) + return _build_sampler_output(sample_results, sampling_metadata, + prompt_logprobs, sample_logprobs) def _get_bin_counts_and_mask( diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4cdbe09234557..85060ccf2b15e 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -70,7 +70,8 @@ def execute_model_multi_step( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) - assert (len(model_output) == 1), "composing multistep workers not supported" + assert (len(model_output) == 1 + ), "composing multistep workers not supported" model_output = model_output[0] self._append_new_tokens(model_output, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 894377c9421e8..b9824937a9441 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -196,7 +196,8 @@ def _run_no_spec( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) - assert len(sampler_output) == 1, "expected single output from scorer worker" + assert len( + sampler_output) == 1, "expected single output from scorer worker" sampler_output = sampler_output[0] # Clear device tensors from sampler output. 
This reduces communication diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 1481a4c2eef4c..d5d3ffda1f431 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -40,11 +40,11 @@ def initialize_cache(self, num_gpu_blocks: int, raise NotImplementedError @abstractmethod - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]: + def execute_model( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, + int], + blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]: """Executes at least one model step on the given sequences, unless no sequences are provided.""" raise NotImplementedError From bda141fe4dca51b53edf0bafb97882155b2b6839 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 18:56:05 -0700 Subject: [PATCH 083/109] fixing spec tests --- tests/spec_decode/test_multi_step_worker.py | 5 +++-- tests/spec_decode/test_spec_decode_worker.py | 16 +++++++++++----- tests/spec_decode/utils.py | 4 ++-- vllm/engine/async_llm_engine.py | 2 +- vllm/spec_decode/batch_expansion.py | 4 ++-- vllm/spec_decode/multi_step_worker.py | 5 +++-- vllm/spec_decode/spec_decode_worker.py | 3 +-- vllm/spec_decode/util.py | 17 ++++++++++------- 8 files changed, 33 insertions(+), 23 deletions(-) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index f4d44108b47c2..f9840d6157c39 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -125,7 +125,7 @@ def test_same_output_for_single_step(): zero_kv_cache(worker.cache_engine) set_random_seed(seed) expected_output = worker.execute_model( - **single_step_execute_model_data.to_dict(), ) + **single_step_execute_model_data.to_dict(), )[0] actual_token_ids = [ output.samples[0].output_token for output in actual_output @@ -219,7 +219,7 @@ def test_same_output_for_multi_step(): continuations=continuations, final_seq_lens=final_seq_lens)) - single_step_output.append( + single_step_output.extend( worker.execute_model(**execute_model_data.to_dict(), )) # Append output tokens to new sequence data. @@ -352,6 +352,7 @@ def test_draft_proposals_no_speculations(): @torch.inference_mode() +#@pytest.skip("Broken because output is padded.") def test_draft_proposals_mixed_k(): """Verify DraftModelTop1Proposer correctly handles case some sequences can speculate and some can't. 
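The hunks above change WorkerBase.execute_model to return List[SamplerOutput] rather than a single SamplerOutput; that is why the single-step test now indexes [0] and the multi-step test accumulates results with extend instead of append. The Python sketch below illustrates that calling pattern only. It is not part of the patches: run_steps, worker, execute_model_data and num_steps are hypothetical names, and only the execute_model(**...) call, the to_dict() helper and the SamplerOutput import are taken from the diffs.

from typing import List

from vllm.sequence import SamplerOutput


def run_steps(worker, execute_model_data, num_steps: int) -> List[SamplerOutput]:
    """Collect sampler outputs from a worker whose execute_model returns a list."""
    collected: List[SamplerOutput] = []
    for _ in range(num_steps):
        # Each call may now yield one or more SamplerOutputs (one per model
        # step executed by the worker), so flatten with extend.
        step_outputs = worker.execute_model(**execute_model_data.to_dict())
        collected.extend(step_outputs)
    # A single-step worker is the special case: collected[0] is its only output.
    return collected
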
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 3725924ea89ce..889712fb9360f 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -12,6 +12,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) +from vllm.sequence import SamplerOutput from .utils import (ExecuteModelData, create_batch, create_sampler_output_list, mock_worker) @@ -191,7 +192,7 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): target_output = create_sampler_output_list(target_token_ids, target_token_probs) - target_worker.execute_model.return_value = target_output[0] + target_worker.execute_model.return_value = [target_output[0]] exception_secret = 'artifical stop' rejection_sampler.side_effect = ValueError(exception_secret) @@ -271,7 +272,7 @@ def test_correctly_formats_output(k: int, batch_size: int): target_output = create_sampler_output_list(target_token_ids, target_token_probs) - target_worker.execute_model.return_value = target_output[0] + target_worker.execute_model.return_value = [target_output[0]] rejection_sampler_output = torch.randint(low=0, high=vocab_size, @@ -340,6 +341,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' target_worker.device = 'cuda' @@ -383,7 +385,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): target_output = create_sampler_output_list(target_token_ids, target_token_probs) - target_worker.execute_model.return_value = target_output[0] + target_worker.execute_model.return_value = [target_output[0]] rejection_sampler_output = torch.randint(low=0, high=vocab_size, @@ -426,6 +428,8 @@ def test_k_equals_zero(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) + target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + draft_worker.device = 'cuda' target_worker.device = 'cuda' @@ -446,7 +450,7 @@ def test_k_equals_zero(k: int, batch_size: int): 0].sampled_tokens is None, "expect gpu tensor references to be None" draft_worker.execute_model.assert_called_once_with( - **execute_model_data.to_dict(), return_python_output=False) + **execute_model_data.to_dict()) target_worker.execute_model.assert_called_once_with( **execute_model_data.to_dict()) @@ -465,6 +469,8 @@ def test_empty_input_batch(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) + target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + draft_worker.device = 'cuda' target_worker.device = 'cuda' @@ -485,7 +491,7 @@ def test_empty_input_batch(k: int, batch_size: int): 0].sampled_tokens is None, "expect gpu tensor references to be None" draft_worker.execute_model.assert_called_once_with( - **execute_model_data.to_dict(), return_python_output=False) + **execute_model_data.to_dict()) target_worker.execute_model.assert_called_once_with( **execute_model_data.to_dict()) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 4637826f254d6..3914af945eff0 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -10,7 +10,7 
@@ from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, SamplerOutput, SequenceData, SequenceGroupMetadata, SequenceGroupOutput, - SequenceOutput) + SequenceOutput, Logprob) from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker @@ -211,7 +211,7 @@ def create_sampler_output_list( SequenceOutput( output_token=token_id, parent_seq_id=seq_ids[seq_index], - logprobs={token_id: 0}, + logprobs={token_id: Logprob(0)}, ) ], prompt_logprobs=None, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index f610495135121..3784845102475 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -217,7 +217,7 @@ async def step_async(self) -> List[RequestOutput]: else: output = [] - return self._process_model_outputs(output, scheduler_outputs) + return self._process_model_outputs(output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups) async def encode_request_async( self, diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index f7bac45861a7b..1011dd970ebc8 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -9,7 +9,7 @@ from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, sampler_output_to_torch, split_batch_by_proposal_len, - mock_device_tensors) + maybe_mock_device_tensors) from vllm.worker.worker import Worker SeqId = int @@ -147,7 +147,7 @@ def _contract_batch(self, original_bs: int, sequences. """ - mock_device_tensors( + maybe_mock_device_tensors( sampler_output=target_sampler_output, batch_size=len(non_spec_indices) + num_scoring_tokens, vocab_size=self._vocab_size, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 85060ccf2b15e..4182b8758465e 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -7,7 +7,7 @@ from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) from vllm.spec_decode.util import (sampler_output_to_torch, - mock_device_tensors) + maybe_mock_device_tensors) from vllm.worker.worker import Worker @@ -346,7 +346,7 @@ def _merge_outputs( sampler_output = maybe_sampler_output for step_output in sampler_output: - mock_device_tensors( + maybe_mock_device_tensors( sampler_output=step_output, batch_size=len(proposal_lens), vocab_size=self._vocab_size, @@ -364,6 +364,7 @@ def _merge_outputs( fill_value=-1, dtype=torch.long, device=self._device) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens entire_proposal_probs = torch.zeros(batch_size, *proposal_probs.shape[1:], diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b9824937a9441..c221f0421f537 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -196,8 +196,7 @@ def _run_no_spec( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) - assert len( - sampler_output) == 1, "expected single output from scorer worker" + assert len(sampler_output) == 1 sampler_output = sampler_output[0] # Clear device tensors from sampler output. 
This reduces communication diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 7129f47d65f6a..c47d5b8781535 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -82,19 +82,22 @@ def sampler_output_to_torch( return sampled_token_ids, sampled_token_probs -def mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, +def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, vocab_size: int, device: str) -> None: - assert sampler_output.sampled_token_probs is None - assert sampler_output.sampled_token_ids is None + values = [sampler_output.sampled_token_probs, sampler_output.sampled_token_ids] + assert all(v is None for v in values) or not any(v is None for v in values) + if not any(v is None for v in values): + return sampler_output.sampled_token_probs = torch.nn.functional.softmax( torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) + sampler_output.sampled_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, ), - dtype=torch.long, - device=device) + high=vocab_size, + size=(batch_size, ), + dtype=torch.long, + device=device) @contextmanager From 49865fba9be8aeb19735b3b08ec9a830bf9caee7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:05:55 -0700 Subject: [PATCH 084/109] lint --- vllm/engine/async_llm_engine.py | 4 +++- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/util.py | 16 +++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3784845102475..4bab116dcb145 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -217,7 +217,9 @@ async def step_async(self) -> List[RequestOutput]: else: output = [] - return self._process_model_outputs(output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups) + return self._process_model_outputs( + output, scheduler_outputs.scheduled_seq_groups, + scheduler_outputs.ignored_seq_groups) async def encode_request_async( self, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4182b8758465e..c79d79930a18c 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -364,7 +364,7 @@ def _merge_outputs( fill_value=-1, dtype=torch.long, device=self._device) - + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens entire_proposal_probs = torch.zeros(batch_size, *proposal_probs.shape[1:], diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index c47d5b8781535..efc54c4de4cf4 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -83,8 +83,10 @@ def sampler_output_to_torch( def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, - vocab_size: int, device: str) -> None: - values = [sampler_output.sampled_token_probs, sampler_output.sampled_token_ids] + vocab_size: int, device: str) -> None: + values = [ + sampler_output.sampled_token_probs, sampler_output.sampled_token_ids + ] assert all(v is None for v in values) or not any(v is None for v in values) if not any(v is None for v in values): return @@ -92,12 +94,12 @@ def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, sampler_output.sampled_token_probs = torch.nn.functional.softmax( torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) - + sampler_output.sampled_token_ids = torch.randint(low=0, - high=vocab_size, - 
size=(batch_size, ), - dtype=torch.long, - device=device) + high=vocab_size, + size=(batch_size, ), + dtype=torch.long, + device=device) @contextmanager From 1a17ed14a57c13def30b6d7e99236ffa92cdfb61 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:15:03 -0700 Subject: [PATCH 085/109] clean up gpu executor --- vllm/executor/gpu_executor.py | 70 +++++++++++--------------- vllm/spec_decode/spec_decode_worker.py | 9 ++++ 2 files changed, 37 insertions(+), 42 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 90a534dc1271a..18be6da10ce95 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -47,18 +47,37 @@ def _init_worker(self): else: self._init_spec_worker() + def _init_non_spec_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + from vllm.worker.worker import Worker + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = Worker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + self.driver_worker.init_device() + self.driver_worker.load_model() + def _init_spec_worker(self): from vllm.worker.worker import Worker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker - #from vllm.worker.multi_step_worker import MultiStepWorker # pylint: disable=import-outside-toplevel - #from vllm.worker.single_tp_worker import SingleTpWorker # pylint: disable=import-outside-toplevel - #from vllm.worker.draft_target_worker import DraftTargetWorker # pylint: disable=import-outside-toplevel - - #scheduler_config: "SchedulerConfig" = worker_kwargs.pop( - # "scheduler_config") - distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) @@ -76,7 +95,6 @@ def _init_spec_worker(self): is_driver_worker=True, ) - from vllm.spec_decode.multi_step_worker import MultiStepWorker draft_worker = MultiStepWorker( model_config=self.speculative_config.draft_model_config, parallel_config=self.speculative_config.draft_parallel_config, @@ -91,47 +109,15 @@ def _init_spec_worker(self): is_driver_worker=True, ) - from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker - from vllm.model_executor.layers.rejection_sampler import RejectionSampler - spec_decode_worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - rejection_sampler=RejectionSampler(strict_mode=True), - ) + spec_decode_worker = SpecDecodeWorker.from_workers(proposer_worker=draft_worker, scorer_worker=target_worker) assert self.parallel_config.world_size == 1, ( "GPUExecutor only supports single GPU.") self.driver_worker = spec_decode_worker + # Load model handled in spec decode worker. 
self.driver_worker.init_device() - #self.driver_worker.load_model() - - def _init_non_spec_worker(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker - - assert self.parallel_config.world_size == 1, ( - "GPUExecutor only supports single GPU.") - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = Worker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - is_driver_worker=True, - ) - self.driver_worker.init_device() - self.driver_worker.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available KV blocks by invoking the diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index c221f0421f537..91bc530084e70 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -48,6 +48,15 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. """ + @classmethod + def from_workers(cls, proposer_worker: MultiStepWorker, scorer_worker: WorkerBase) -> "SpecDecodeWorker": + return SpecDecodeWorker( + proposer_worker, + scorer_worker, + # TODO(cade) disable strict mode for speedup. + rejection_sampler=RejectionSampler(strict_mode=True), + ) + def __init__( self, proposer_worker: MultiStepWorker, From dea67bbd6fb1f0278ee4c605d8be77991c8657ae Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:16:16 -0700 Subject: [PATCH 086/109] wip --- vllm/spec_decode/batch_expansion.py | 4 ++-- vllm/spec_decode/spec_decode_worker.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 1011dd970ebc8..4dc34f1ab7c73 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -10,7 +10,7 @@ sampler_output_to_torch, split_batch_by_proposal_len, maybe_mock_device_tensors) -from vllm.worker.worker import Worker +from vllm.worker.worker_base import WorkerBase SeqId = int TargetSeqId = int @@ -32,7 +32,7 @@ class BatchExpansionTop1Scorer(SpeculativeScorer): of topk/tree. 
""" - def __init__(self, scorer_worker: Worker, device: str, vocab_size: int): + def __init__(self, scorer_worker: WorkerBase, device: str, vocab_size: int): self._scorer_worker = scorer_worker self._device = device self._vocab_size = vocab_size diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 91bc530084e70..e5b493c46c6cb 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -14,7 +14,7 @@ from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, split_batch_by_proposal_len) from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase from vllm.logger import init_logger logger = init_logger(__name__) @@ -60,7 +60,7 @@ def from_workers(cls, proposer_worker: MultiStepWorker, scorer_worker: WorkerBas def __init__( self, proposer_worker: MultiStepWorker, - scorer_worker: Worker, + scorer_worker: WorkerBase, rejection_sampler: RejectionSampler, metrics_collector: Optional[AsyncMetricsCollector] = None, ): From 189d7ebab4a783cb651fb339b2fba88fd8b1f019 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:17:59 -0700 Subject: [PATCH 087/109] fix --- tests/spec_decode/e2e/test_correctness.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index eb6d1e1c5ddd5..6b01936e81788 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -62,8 +62,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): # Expect a generation for each prompt in the batch. assert len(batch_token_ids) == len(prompts) - # TODO(cadedaniel) check for equality once block truncation is implemented. - assert all(len(token_ids) >= output_len for token_ids in batch_token_ids) + assert all(len(token_ids) == output_len for token_ids in batch_token_ids) @pytest.mark.parametrize( From a70a0408b12631ca00a78e7cbbcf1db7ef211f33 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:18:47 -0700 Subject: [PATCH 088/109] wip --- vllm/executor/gpu_executor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 18be6da10ce95..22cd2797282ee 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -35,9 +35,6 @@ def __init__( self.vision_language_config = vision_language_config self.speculative_config = speculative_config - #assert (not speculative_config - # ), "Speculative decoding not yet supported for GPU backend" - # Instantiate the worker and load the model to GPU. 
self._init_worker() From 3e1b8f5c17e8ac0a96a1ddc05300b4eeb1996e66 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:42:50 -0700 Subject: [PATCH 089/109] detokenization --- tests/spec_decode/e2e/test_correctness.py | 20 ++++++++++++++++---- vllm/engine/output_processor/block_decode.py | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 6b01936e81788..d2f07f729f5a8 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -1,13 +1,16 @@ import pytest from itertools import cycle +from typing import Tuple, List from vllm import SamplingParams +from transformers import AutoTokenizer @pytest.mark.parametrize( "common_llm_kwargs", [{ # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. "model": "JackFram/llama-68m", # Skip real loading for fast test. @@ -55,15 +58,23 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): temperature=temperature, ) - batch_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + batch_tokens, batch_token_ids = get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) # Expect a generation for each prompt in the batch. assert len(batch_token_ids) == len(prompts) + # Expect each generation to have expected number of tokens (note + # ignore_eos=True). assert all(len(token_ids) == output_len for token_ids in batch_token_ids) + # Expect detokenized string to match. + tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") + for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): + expected_tokens = tok.decode(actual_token_ids) + assert actual_tokens == expected_tokens + @pytest.mark.parametrize( "common_llm_kwargs", @@ -109,14 +120,15 @@ def test_spec_decode_xfail(test_llm_generator): with pytest.raises(AssertionError, match="Speculative decoding not yet supported for "): - get_token_ids_from_llm_generator(test_llm_generator, prompts, + get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) -def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): +def get_output_from_llm_generator(llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: for llm in llm_generator: outputs = llm.generate(prompts, sampling_params, use_tqdm=True) token_ids = [output.outputs[0].token_ids for output in outputs] + tokens = [output.outputs[0].text for output in outputs] del llm - return token_ids + return tokens, token_ids diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 3b6a60e857fa0..99963111e2190 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -98,8 +98,8 @@ def _process_seq_outputs(self, seq: Sequence, # TODO emit logprobs in block decoding. 
logprobs={output_token_id: Logprob(0.0)}, ) + self.detokenizer.decode_sequence_inplace(seq, sampling_params) - # TODO detokenize self.stop_checker.maybe_stop_sequence(seq, sampling_params, new_token_ids=output_token_ids) From b9777a6ea80e4d0340e406dfe0748a32d5d34138 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:48:20 -0700 Subject: [PATCH 090/109] lint --- tests/core/utils.py | 2 +- .../output_processor/test_block_decode.py | 16 +++++---- tests/spec_decode/e2e/test_correctness.py | 18 +++++----- tests/spec_decode/test_spec_decode_worker.py | 2 +- tests/spec_decode/utils.py | 2 +- vllm/engine/llm_engine.py | 33 ++++++++++--------- vllm/engine/output_processor/beam_search.py | 31 ++++------------- vllm/engine/output_processor/block_decode.py | 32 ++++-------------- vllm/engine/output_processor/interfaces.py | 11 +++++-- vllm/engine/output_processor/stop_checker.py | 27 ++------------- vllm/engine/output_processor/util.py | 3 +- vllm/executor/gpu_executor.py | 7 ++-- vllm/spec_decode/batch_expansion.py | 10 +++--- vllm/spec_decode/multi_step_worker.py | 4 +-- vllm/spec_decode/spec_decode_worker.py | 12 +++---- vllm/worker/neuron_worker.py | 2 +- 16 files changed, 81 insertions(+), 131 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 39f8e507d0f1d..22c1d3826dff4 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,5 +1,5 @@ import time -from typing import Optional, Tuple, Iterable +from typing import Iterable, Optional, Tuple from vllm import SamplingParams from vllm.lora.request import LoRARequest diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_block_decode.py index f426f1d32d7a6..87f451da7c292 100644 --- a/tests/engine/output_processor/test_block_decode.py +++ b/tests/engine/output_processor/test_block_decode.py @@ -1,17 +1,19 @@ -import pytest -from unittest.mock import MagicMock import random +from unittest.mock import MagicMock +import pytest from transformers import PreTrainedTokenizer -from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor +from tests.core.utils import create_seq_group +from vllm.core.scheduler import Scheduler +from vllm.engine.output_processor.block_decode import ( + BlockDecodeOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.sampling_params import SamplingParams +from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput, + SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.core.scheduler import Scheduler from vllm.utils import Counter -from vllm.sequence import SequenceStatus, SequenceGroupOutput, SequenceOutput, Logprob -from vllm.sampling_params import SamplingParams -from tests.core.utils import create_seq_group @pytest.mark.parametrize("seq_output_len", [128]) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index d2f07f729f5a8..fe543dfda552c 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -1,10 +1,11 @@ -import pytest from itertools import cycle -from typing import Tuple, List +from typing import List, Tuple -from vllm import SamplingParams +import pytest from transformers import AutoTokenizer +from vllm import SamplingParams + @pytest.mark.parametrize( "common_llm_kwargs", @@ -58,9 +59,8 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): temperature=temperature, ) - batch_tokens, batch_token_ids = 
get_output_from_llm_generator(test_llm_generator, - prompts, - sampling_params) + batch_tokens, batch_token_ids = get_output_from_llm_generator( + test_llm_generator, prompts, sampling_params) # Expect a generation for each prompt in the batch. assert len(batch_token_ids) == len(prompts) @@ -121,10 +121,12 @@ def test_spec_decode_xfail(test_llm_generator): with pytest.raises(AssertionError, match="Speculative decoding not yet supported for "): get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) + sampling_params) -def get_output_from_llm_generator(llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: +def get_output_from_llm_generator( + llm_generator, prompts, + sampling_params) -> Tuple[List[str], List[List[int]]]: for llm in llm_generator: outputs = llm.generate(prompts, sampling_params, use_tqdm=True) token_ids = [output.outputs[0].token_ids for output in outputs] diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 889712fb9360f..4470cee78eeda 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -6,13 +6,13 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed +from vllm.sequence import SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from vllm.sequence import SamplerOutput from .utils import (ExecuteModelData, create_batch, create_sampler_output_list, mock_worker) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 3914af945eff0..c428c4258c144 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -10,7 +10,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, SamplerOutput, SequenceData, SequenceGroupMetadata, SequenceGroupOutput, - SequenceOutput, Logprob) + SequenceOutput) from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9936eb18c0320..8c3786354f40d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,5 +1,5 @@ import time -from typing import Iterable, List, Optional, Tuple, Type, Union +from typing import Iterable, List, Optional, Type, Union from transformers import PreTrainedTokenizer @@ -10,6 +10,10 @@ from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.output_processor.interfaces import ( + SequenceGroupOutputProcessor) +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.engine.ray_utils import initialize_ray_cluster from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger @@ -17,17 +21,13 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) + SequenceGroup) 
from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) from vllm.utils import Counter -from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.engine.output_processor.util import create_output_by_sequence_group logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -183,18 +183,19 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) - self.output_processor = SequenceGroupOutputProcessor.create_output_processor( - self.scheduler_config, - self.detokenizer, - self.scheduler, - self.seq_counter, - self.get_tokenizer_for_seq, - stop_checker=StopChecker( - self.scheduler, + self.output_processor = ( + SequenceGroupOutputProcessor.create_output_processor( self.scheduler_config, + self.detokenizer, + self.scheduler, + self.seq_counter, self.get_tokenizer_for_seq, - ), - ) + stop_checker=StopChecker( + self.scheduler, + self.scheduler_config, + self.get_tokenizer_for_seq, + ), + )) def _initialize_kv_caches(self) -> None: """Initialize the KV cache in the worker(s). diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 94af809e26738..885a241f7b2d9 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -1,31 +1,12 @@ -import time -from typing import Iterable, List, Optional, Tuple, Type, Union +from typing import List, Tuple, Union -from transformers import PreTrainedTokenizer - -import vllm -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import initialize_ray_cluster -from vllm.executor.executor_base import ExecutorBase +from vllm.config import SchedulerConfig +from vllm.engine.output_processor.interfaces import ( + SequenceGroupOutputProcessor) from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams -from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, - get_tokenizer_group) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter -from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor +from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput, + SequenceOutput, SequenceStatus) logger = init_logger(__name__) diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 99963111e2190..f63ce7d0ef410 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -1,31 +1,11 @@ -import time -from typing import Iterable, List, Optional, Tuple, Type, Union - -from transformers import PreTrainedTokenizer - -import vllm 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import initialize_ray_cluster -from vllm.executor.executor_base import ExecutorBase +from typing import List + +from vllm.engine.output_processor.interfaces import ( + SequenceGroupOutputProcessor) from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams -from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus, Logprob) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, - get_tokenizer_group) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter -from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor +from vllm.sequence import (Logprob, Sequence, SequenceGroup, + SequenceGroupOutput, SequenceOutput, SequenceStatus) logger = init_logger(__name__) diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 2b931a0b2f41b..5596bc3f3d670 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod +from typing import List + from vllm.config import SchedulerConfig from vllm.sequence import SequenceGroup, SequenceGroupOutput -from typing import List class SequenceGroupOutputProcessor(ABC): @@ -16,7 +17,9 @@ def create_output_processor( stop_checker, ): if scheduler_config.num_lookahead_slots == 0: - from vllm.engine.output_processor.beam_search import BeamSearchOutputProcessor + # Importing here to avoid cycle. + from vllm.engine.output_processor.beam_search import ( + BeamSearchOutputProcessor) return BeamSearchOutputProcessor( scheduler_config, detokenizer, @@ -25,7 +28,9 @@ def create_output_processor( stop_checker, ) else: - from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor + # Importing here to avoid cycle. 
+ from vllm.engine.output_processor.block_decode import ( + BlockDecodeOutputProcessor) return BlockDecodeOutputProcessor( detokenizer, scheduler, diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 3f03373f2698a..b55e47ab3c128 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,31 +1,8 @@ -import time -from typing import Iterable, List, Optional, Tuple, Type, Union +from typing import List -from transformers import PreTrainedTokenizer - -import vllm -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import initialize_ray_cluster -from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams -from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, - get_tokenizer_group) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter -from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor +from vllm.sequence import Sequence, SequenceStatus logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index b49bbb2fab327..e4939b9be4457 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,6 +1,7 @@ -from vllm.sequence import SequenceGroupOutput, SamplerOutput from typing import List +from vllm.sequence import SamplerOutput + def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], num_seq_groups: int): diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 22cd2797282ee..b5e64843213a2 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -71,9 +71,9 @@ def _init_non_spec_worker(self): self.driver_worker.load_model() def _init_spec_worker(self): - from vllm.worker.worker import Worker - from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.worker.worker import Worker distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) @@ -106,7 +106,8 @@ def _init_spec_worker(self): is_driver_worker=True, ) - spec_decode_worker = SpecDecodeWorker.from_workers(proposer_worker=draft_worker, scorer_worker=target_worker) + spec_decode_worker = SpecDecodeWorker.from_workers( + proposer_worker=draft_worker, scorer_worker=target_worker) assert self.parallel_config.world_size == 1, ( "GPUExecutor only supports single GPU.") diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 4dc34f1ab7c73..6945877fbf34b 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -6,10 +6,9 @@ from vllm.sequence import 
SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, - sampler_output_to_torch, - split_batch_by_proposal_len, - maybe_mock_device_tensors) +from vllm.spec_decode.util import (get_all_seq_ids, maybe_mock_device_tensors, + nvtx_range, sampler_output_to_torch, + split_batch_by_proposal_len) from vllm.worker.worker_base import WorkerBase SeqId = int @@ -32,7 +31,8 @@ class BatchExpansionTop1Scorer(SpeculativeScorer): of topk/tree. """ - def __init__(self, scorer_worker: WorkerBase, device: str, vocab_size: int): + def __init__(self, scorer_worker: WorkerBase, device: str, + vocab_size: int): self._scorer_worker = scorer_worker self._device = device self._vocab_size = vocab_size diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index c79d79930a18c..6fdc3b294295d 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,8 +6,8 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) -from vllm.spec_decode.util import (sampler_output_to_torch, - maybe_mock_device_tensors) +from vllm.spec_decode.util import (maybe_mock_device_tensors, + sampler_output_to_torch) from vllm.worker.worker import Worker diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index e5b493c46c6cb..84aa562eba500 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,9 +3,10 @@ import torch +from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, - SequenceGroupOutput, SequenceOutput, Logprob) +from vllm.sequence import (Logprob, SamplerOutput, SequenceGroupMetadata, + SequenceGroupOutput, SequenceOutput) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) @@ -13,9 +14,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, split_batch_by_proposal_len) -from vllm.worker.worker import Worker from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase -from vllm.logger import init_logger logger = init_logger(__name__) @@ -49,7 +48,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): """ @classmethod - def from_workers(cls, proposer_worker: MultiStepWorker, scorer_worker: WorkerBase) -> "SpecDecodeWorker": + def from_workers(cls, proposer_worker: MultiStepWorker, + scorer_worker: WorkerBase) -> "SpecDecodeWorker": return SpecDecodeWorker( proposer_worker, scorer_worker, @@ -238,7 +238,7 @@ def _run_speculative_decoding_step( seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, k) - logger.info(f"score proposals") + logger.info("score proposals") proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, blocks_to_swap_in, diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index d0f01b893bc62..7472a795fb518 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,5 +1,5 @@ """A Neuron worker class.""" -from typing import List, Optional +from typing import List import torch import 
torch.distributed From 29b4f12dc07a1c4d5238d9e5cc6fe9211d57b4d9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:21:51 -0700 Subject: [PATCH 091/109] docstrings --- .../output_processor/test_block_decode.py | 17 ++++++++- tests/spec_decode/e2e/test_correctness.py | 7 +++- tests/spec_decode/test_multi_step_worker.py | 1 - tests/spec_decode/test_spec_decode_worker.py | 1 - vllm/core/scheduler.py | 6 --- vllm/engine/llm_engine.py | 9 ++++- vllm/engine/output_processor/beam_search.py | 28 +++++++++++--- vllm/engine/output_processor/block_decode.py | 36 +++++++++++++++--- vllm/engine/output_processor/interfaces.py | 37 +++++++++++++++---- vllm/engine/output_processor/stop_checker.py | 14 ++++--- vllm/engine/output_processor/util.py | 3 ++ vllm/executor/gpu_executor.py | 2 + vllm/model_executor/layers/sampler.py | 4 -- vllm/spec_decode/batch_expansion.py | 3 +- vllm/spec_decode/multi_step_worker.py | 3 +- vllm/spec_decode/spec_decode_worker.py | 3 +- vllm/spec_decode/util.py | 6 +++ 17 files changed, 137 insertions(+), 43 deletions(-) diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_block_decode.py index 87f451da7c292..c4a88d67cabc2 100644 --- a/tests/engine/output_processor/test_block_decode.py +++ b/tests/engine/output_processor/test_block_decode.py @@ -20,6 +20,11 @@ @pytest.mark.parametrize("num_new_tokens", [1, 12]) @pytest.mark.skip_global_cleanup def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): + """Verify block decoding appends token ids correctly. + + We append token ids and verify all the token ids were appended correctly. + Note that ignore_eos=True. + """ detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) stop_checker = MagicMock(spec=StopChecker) @@ -37,7 +42,8 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): seq_prompt_len=1024, seq_output_lens=[seq_output_len], sampling_params=SamplingParams(max_tokens=seq_output_len + - num_new_tokens, ), + num_new_tokens, + ignore_eos=True), ) seq = seq_group.get_seqs()[0] @@ -70,6 +76,9 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): @pytest.mark.skip_global_cleanup def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, max_tokens: int): + """Verify tokens after max_tokens are dropped and not appended to the + sequence. + """ detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) stop_checker = MagicMock(spec=StopChecker) @@ -126,6 +135,9 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, @pytest.mark.skip_global_cleanup def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): + """Verify the eos token id is included in the sequence, but subsequent + tokens are dropped (not appended to sequence). + """ random.seed(seed) detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) @@ -190,6 +202,9 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, @pytest.mark.skip_global_cleanup def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): + """When sampling parameters dictate that we should ignore the eos token id, + ensure all token ids are appended even if the eos token id is emitted. 
+ """ random.seed(seed) detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index fe543dfda552c..160510e6c0c0f 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -38,8 +38,9 @@ @pytest.mark.parametrize("batch_size", [1, 10]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): - """Run generation with speculative decoding on a batch. Verify the number - of output tokens is equal to the expected number. + """Run generation with speculative decoding on a batch. Verify the engine + generates the correct number of tokens (via ignore_eos=True), and that the + detokenization matches HF transformers. """ output_len = 128 temperature = 0.0 @@ -105,6 +106,8 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_xfail(test_llm_generator): + """Verify that speculative decoding with Ray fails. + """ output_len = 128 temperature = 0.0 diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index f9840d6157c39..d6edbab579afd 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -352,7 +352,6 @@ def test_draft_proposals_no_speculations(): @torch.inference_mode() -#@pytest.skip("Broken because output is padded.") def test_draft_proposals_mixed_k(): """Verify DraftModelTop1Proposer correctly handles case some sequences can speculate and some can't. diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 4470cee78eeda..0a3110775e2d6 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -341,7 +341,6 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - draft_worker.device = 'cuda' target_worker.device = 'cuda' diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index e176848c04909..db48a1f7f0d25 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -754,9 +754,6 @@ def _schedule_default(self) -> SchedulerOutputs: swapped_in.blocks_to_copy), ignored_seq_groups=prefills.ignored_seq_groups, num_lookahead_slots=running_scheduled.num_lookahead_slots, - #num_lookahead_slots=(prefills.num_lookahead_slots + - # running_scheduled.num_lookahead_slots + - # swapped_in.num_lookahead_slots), ) def _schedule_chunked_prefill(self): @@ -844,9 +841,6 @@ def _schedule_chunked_prefill(self): swapped_in.blocks_to_copy), ignored_seq_groups=prefills.ignored_seq_groups, num_lookahead_slots=running_scheduled.num_lookahead_slots, - #num_lookahead_slots=(prefills.num_lookahead_slots + - # running_scheduled.num_lookahead_slots + - # swapped_in.num_lookahead_slots), ) def _schedule(self) -> SchedulerOutputs: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8c3786354f40d..e6e75ee59c76d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -183,6 +183,8 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) + # Create sequence output processor, 
e.g. for beam search or + # speculative decoding. self.output_processor = ( SequenceGroupOutputProcessor.create_output_processor( self.scheduler_config, @@ -426,9 +428,15 @@ def _process_model_outputs( self, output: List[SamplerOutput], scheduled_seq_groups: List[SequenceGroup], ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]: + """Apply the model output to the sequences in the scheduled seq groups. + + Returns RequestOutputs that can be returned to the client. + """ now = time.time() + # Organize outputs by [sequence group][step] instead of + # [step][sequence group]. output_by_sequence_group = create_output_by_sequence_group( sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) @@ -438,7 +446,6 @@ def _process_model_outputs( seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - self.output_processor.process_outputs(seq_group, outputs) # Free the finished sequence groups. diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 885a241f7b2d9..330eeced21cff 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union +from typing import List, Tuple, Union, Iterable from vllm.config import SchedulerConfig from vllm.engine.output_processor.interfaces import ( @@ -7,19 +7,31 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.core.scheduler import Scheduler +from vllm.engine.output_processor.stop_checker import StopChecker logger = init_logger(__name__) class BeamSearchOutputProcessor(SequenceGroupOutputProcessor): + """SequenceGroupOutputProcessor which handles logic related to beam search + sequence management and coupled logic like detokenization and stop logic. + + This class is in charge of sorting out which sequences survive after beam + sampling. It manages forking and freeing of sequences. + + It does not support lookahead decoding, e.g. where the model generates >1 + token per scheduling invocation. + """ def __init__( self, scheduler_config: SchedulerConfig, - detokenizer, - scheduler, - seq_counter, - stop_checker, + detokenizer: Detokenizer, + scheduler: Scheduler, + seq_counter: Iterable[int], + stop_checker: StopChecker, ): self.scheduler_config = scheduler_config self.detokenizer = detokenizer @@ -29,6 +41,12 @@ def __init__( def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + """Append all new tokens to sequences in the sequence group. Fork any + surviving beam candidates; free any unsurviving ones. + + Invokes detokenizer to detokenize new tokens, and also marks sequences + as finished if they meet stop conditions. 
+ """ assert (len(outputs) == 1 ), f"{type(self)} does not support multiple outputs per step" return self._process_sequence_group_outputs(sequence_group, outputs[0]) diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index f63ce7d0ef410..8c9b3e25598f9 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -1,24 +1,39 @@ -from typing import List +from typing import List, Iterable, Callable from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) +from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) +from vllm.core.scheduler import Scheduler +from vllm.transformers_utils.detokenizer import Detokenizer +from transformers import PreTrainedTokenizer logger = init_logger(__name__) class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): + """SequenceGroupOutputProcessor which handles logic related to + detokenization and stopping conditions. Besides not supporting beam search, + this differs from BeamSearchOutputProcessor in that it supports lookahead + scheduling (where the model may generate >1 token per scheduler invocation). + + This allows it to support speculative decoding and cases where the model + runs more than once. We generalize these cases as "block decoding", where + the model emits a block of tokens at the same time. In this case, this class + is responsible for correctly appending all token ids to sequences and + detokenizing new token ids. + """ def __init__( self, - detokenizer, - scheduler, - seq_counter, - get_tokenizer_for_seq, - stop_checker, + detokenizer: Detokenizer, + scheduler: Scheduler, + seq_counter: Iterable[int], + get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer], + stop_checker: StopChecker, ): self.detokenizer = detokenizer self.scheduler = scheduler @@ -28,6 +43,15 @@ def __init__( def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + """Append new tokens in the outputs to sequences in the sequence group. + + This only supports sequence groups of size 1. It supports greater than + one new token per sequence. + + This applies logic like stop condition checking and detokenization, + including freeing finished sequences. It also handles cases where there + are tokens emitted after the EOS token. 
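# A minimal sketch of the input shape this method consumes, assuming a
# sequence group with a single running sequence (seq id 0) and two decode
# steps; the argument order follows the SequenceGroupOutput / SequenceOutput
# constructor calls used in sampler.py within this series:
example_step_outputs = [
    SequenceGroupOutput([SequenceOutput(0, 5, {5: Logprob(0.0)})], None),
    SequenceGroupOutput([SequenceOutput(0, 7, {7: Logprob(0.0)})], None),
]
# Passing such a list to process_outputs appends token 5 and then token 7,
# subject to the max_tokens and eos handling described above.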
+ """ seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) assert seqs, "expected running sequences" diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 5596bc3f3d670..1f940f2924064 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,21 +1,40 @@ from abc import ABC, abstractmethod -from typing import List +from typing import List, Callable, Iterable from vllm.config import SchedulerConfig -from vllm.sequence import SequenceGroup, SequenceGroupOutput +from vllm.sequence import SequenceGroup, SequenceGroupOutput, Sequence +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.core.scheduler import Scheduler +from vllm.engine.output_processor.stop_checker import StopChecker class SequenceGroupOutputProcessor(ABC): + """Interface for logic that processes new token ids in sequence groups, + managing detokenization, stop checking, and freeing/forking sequences with + the scheduler. + + This is highly coupled with the LLMEngine and should be seen as an extension + of it. The logic is separated out to simplify the LLMEngine class and to + allow a beam search implementation (which handles forking, etc) and a block + decode implementation (which handles decoding >1 token per step). + """ @staticmethod def create_output_processor( scheduler_config: SchedulerConfig, - detokenizer, - scheduler, - seq_counter, - get_tokenizer_for_seq, - stop_checker, + detokenizer: Detokenizer, + scheduler: Scheduler, + seq_counter: Iterable[int], + get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer], + stop_checker: "StopChecker", ): + """Create an output processor. + + This returns an output processor compatible with beam search if the + scheduler is not configured to scheduler lookahead slots. Otherwise, it + returns an output processor that is incompatible with beam search but + which supports decoding more than one token per scheduling invocation. + """ if scheduler_config.num_lookahead_slots == 0: # Importing here to avoid cycle. from vllm.engine.output_processor.beam_search import ( @@ -42,4 +61,8 @@ def create_output_processor( @abstractmethod def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + """Process new token ids for the sequence group. Handles logic such as + detokenization, stop checking, and freeing/forking sequences in the + scheduler. + """ pass diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index b55e47ab3c128..2a6c79d2dc026 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,14 +1,15 @@ from typing import List -from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.sequence import Sequence, SequenceStatus -logger = init_logger(__name__) -_LOCAL_LOGGING_INTERVAL_SEC = 5 - class StopChecker: + """LLMEngine helper class which separates out the logic involving stop + checking. This checks things such as: whether the eos token was emitted, + whether the max_tokens has been consumed, whether a stop string has been + emitted, or if we have exceeded the max model len. 
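# A minimal sketch of the checks this class performs, collapsed into one
# hypothetical predicate (stop strings, stop token ids, and min_tokens are
# omitted for brevity; the real implementation inspects the Sequence and
# SamplingParams objects directly):
def sketch_should_stop(seq_len, max_model_len, output_len, max_tokens,
                       last_token_id, eos_token_id, ignore_eos):
    if seq_len > max_model_len:
        return True  # exceeded the model's maximum context length
    if output_len == max_tokens:
        return True  # consumed the per-request token budget
    return (not ignore_eos) and last_token_id == eos_token_id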
+ """ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.scheduler = scheduler @@ -18,7 +19,9 @@ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): def maybe_stop_sequence(self, seq: Sequence, sampling_params: SamplingParams, new_token_ids: List[int]) -> None: - """Stop the finished sequences.""" + """Check if the sequences should be stopped. If so, mark it as finished. + """ + # Check if the sequence has reached max_model_len. if seq.get_len() > self.scheduler_config.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED @@ -36,6 +39,7 @@ def maybe_stop_sequence(self, seq: Sequence, if sampling_params.detokenize: for stop_str in sampling_params.stop: + # TODO(cade) Fix this for speculative decoding. if seq.output_text.endswith(stop_str): self._finalize_sequence(seq, sampling_params, stop_str) seq.status = SequenceStatus.FINISHED_STOPPED diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index e4939b9be4457..5fbb09a857a46 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -5,6 +5,9 @@ def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], num_seq_groups: int): + """Helper method which transforms a 2d list organized by + [step][sequence group] into [sequence group][step]. + """ output_by_sequence_group = [[] for _ in range(num_seq_groups)] for step in sampler_outputs: for i, sequence_group_output in enumerate(step): diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index b5e64843213a2..9330d754d5d74 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -71,6 +71,8 @@ def _init_non_spec_worker(self): self.driver_worker.load_model() def _init_spec_worker(self): + """Initialize a SpecDecodeWorker, using a draft model for proposals. + """ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.worker.worker import Worker diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index be970e56b6119..cb1480de03e3a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -78,7 +78,6 @@ def forward( # Get the logprobs query results. 
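# A minimal, self-contained illustration of the transpose performed by
# create_output_by_sequence_group in output_processor/util.py above
# (placeholder strings stand in for SequenceGroupOutput objects):
steps = [["g0_s0", "g1_s0"], ["g0_s1", "g1_s1"]]  # [step][sequence group]
by_group = [list(col) for col in zip(*steps)]     # [sequence group][step]
assert by_group == [["g0_s0", "g0_s1"], ["g1_s0", "g1_s1"]]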
prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs) @@ -669,8 +668,6 @@ def _build_sampler_output( sampling_metadata: SamplingMetadata, prompt_logprobs: List[Optional[PromptLogprobs]], sample_logprobs: List[SampleLogprobs], - sampled_token_ids: Optional[torch.Tensor] = None, - sampled_token_probs: Optional[torch.Tensor] = None, ) -> SamplerOutput: sampler_output = [] for (seq_group, sample_result, group_prompt_logprobs, @@ -687,5 +684,4 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return SamplerOutput(outputs=sampler_output) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 6945877fbf34b..88af1dd360155 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -84,7 +84,6 @@ def score_proposals( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - #return_python_output=False ) assert len(target_sampler_output) == 1, "expected single-step output" target_sampler_output = target_sampler_output[0] @@ -147,6 +146,8 @@ def _contract_batch(self, original_bs: int, sequences. """ + # We mock the device tensors until PR 7/9 is merged (e2e correctness). + # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer maybe_mock_device_tensors( sampler_output=target_sampler_output, batch_size=len(non_spec_indices) + num_scoring_tokens, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 6fdc3b294295d..ce63c329a40aa 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -345,6 +345,8 @@ def _merge_outputs( sampler_output = maybe_sampler_output + # We mock the device tensors until PR 7/9 is merged (e2e correctness). + # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer for step_output in sampler_output: maybe_mock_device_tensors( sampler_output=step_output, @@ -364,7 +366,6 @@ def _merge_outputs( fill_value=-1, dtype=torch.long, device=self._device) - entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens entire_proposal_probs = torch.zeros(batch_size, *proposal_probs.shape[1:], diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 84aa562eba500..be3af7be93864 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -99,7 +99,7 @@ def init_device(self) -> None: self.scorer_worker.init_device() self.proposer_worker.init_device() - # TODO separate from init_device? + # NOTE(cade): load_model is not part of the WorkerBase interface. 
self.scorer_worker.load_model() self.proposer_worker.load_model() @@ -195,7 +195,6 @@ def _run_no_spec( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - #return_python_output=False ) logger.info("run target worker no spec") diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index efc54c4de4cf4..85aee137dcbc9 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -84,13 +84,19 @@ def sampler_output_to_torch( def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, vocab_size: int, device: str) -> None: + """Helper method which mocks out the GPU tensors in SamplerOutput with dummy + values. This will be removed in PR 7/9. + https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer + """ values = [ sampler_output.sampled_token_probs, sampler_output.sampled_token_ids ] assert all(v is None for v in values) or not any(v is None for v in values) if not any(v is None for v in values): + # Do nothing if the tensors are already created (usually in unit tests). return + # Softmax to ensure valid probs. sampler_output.sampled_token_probs = torch.nn.functional.softmax( torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) From 42aa0bc45900b49ca5ae7878f90e371a123e0e66 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:30:23 -0700 Subject: [PATCH 092/109] fix --- vllm/engine/output_processor/beam_search.py | 6 +++--- vllm/engine/output_processor/block_decode.py | 7 ++++--- vllm/engine/output_processor/interfaces.py | 8 +++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 330eeced21cff..b0c0246b99359 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -1,15 +1,15 @@ -from typing import List, Tuple, Union, Iterable +from typing import Iterable, List, Tuple, Union from vllm.config import SchedulerConfig +from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) +from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.stop_checker import StopChecker logger = init_logger(__name__) diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 8c9b3e25598f9..e309b57af6ded 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -1,5 +1,8 @@ -from typing import List, Iterable, Callable +from typing import Callable, Iterable, List +from transformers import PreTrainedTokenizer + +from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker @@ -7,9 +10,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) -from vllm.core.scheduler import Scheduler from vllm.transformers_utils.detokenizer import 
Detokenizer -from transformers import PreTrainedTokenizer logger = init_logger(__name__) diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 1f940f2924064..26ec982cc13f1 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,11 +1,13 @@ from abc import ABC, abstractmethod -from typing import List, Callable, Iterable +from typing import Callable, Iterable, List + +from transformers import PreTrainedTokenizer from vllm.config import SchedulerConfig -from vllm.sequence import SequenceGroup, SequenceGroupOutput, Sequence -from vllm.transformers_utils.detokenizer import Detokenizer from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput +from vllm.transformers_utils.detokenizer import Detokenizer class SequenceGroupOutputProcessor(ABC): From 0ebd93b98f1c334aca3f4f4f6b651a7301a4f427 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:31:51 -0700 Subject: [PATCH 093/109] more spec test --- tests/spec_decode/e2e/test_correctness.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 160510e6c0c0f..c9665ee5bbc24 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,6 +26,10 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + }, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, From 33a3d7230b1e6f6a699b3863046494ecf5aca365 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:37:05 -0700 Subject: [PATCH 094/109] remove --- tests/spec_decode/e2e/test_correctness.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index c9665ee5bbc24..160510e6c0c0f 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,10 +26,6 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - }, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, From 15c942dfc8a49e294d803a1088bd8776bfd69aa2 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:37:29 -0700 Subject: [PATCH 095/109] wip --- tests/spec_decode/e2e/test_correctness.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 160510e6c0c0f..ac79f977ce399 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,6 +26,7 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ + # TODO(cade) handle output { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, From 063e34b370e0dcd8080faa3e397f303f0e4d3795 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 21:24:25 -0700 Subject: [PATCH 096/109] strip --- tests/spec_decode/e2e/test_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index ac79f977ce399..173f96c4de600 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ 
b/tests/spec_decode/e2e/test_correctness.py @@ -75,7 +75,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) - assert actual_tokens == expected_tokens + assert actual_tokens.strip() == expected_tokens.strip() @pytest.mark.parametrize( From 672a855bb1ca4a074a9158d79eb99253fe3b2540 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 22:57:11 -0700 Subject: [PATCH 097/109] print --- tests/spec_decode/e2e/test_correctness.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 173f96c4de600..d76dbc50c8725 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -75,6 +75,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) + print(f"{actual_token_ids=}") assert actual_tokens.strip() == expected_tokens.strip() From 8021b38ab38f85e187c6462fa804f8e55a18f8c2 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 8 Apr 2024 15:25:49 -0700 Subject: [PATCH 098/109] fix flaky test --- tests/spec_decode/e2e/test_correctness.py | 16 +++++++++++++--- vllm/spec_decode/util.py | 4 ++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index d76dbc50c8725..1041a5ddac122 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,17 +26,25 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ - # TODO(cade) handle output { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + }, { # No spec decode. }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [1, 10]) +@pytest.mark.parametrize("batch_size", [1]) +# NOTE: We should run more permutations of this test (more BS, more seeds). But +# because our spec decode generates gibberish token ids, the likelihood of +# emitting an invalid token combination is nontrivial. This causes divergence in +# behavior of vLLM detokenization vs. hf tokenizer, for example when two "utf- +# start" bytes are emitted. @pytest.mark.parametrize("seed", [1]) def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): """Run generation with speculative decoding on a batch. 
Verify the engine @@ -59,6 +67,8 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): max_tokens=output_len, ignore_eos=True, temperature=temperature, + skip_special_tokens=True, + spaces_between_special_tokens=False, ) batch_tokens, batch_token_ids = get_output_from_llm_generator( @@ -76,7 +86,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) print(f"{actual_token_ids=}") - assert actual_tokens.strip() == expected_tokens.strip() + assert actual_tokens == expected_tokens @pytest.mark.parametrize( diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 85aee137dcbc9..eb6d4ca1da8e6 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -101,8 +101,8 @@ def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) - sampler_output.sampled_token_ids = torch.randint(low=0, - high=vocab_size, + sampler_output.sampled_token_ids = torch.randint(low=10, + high=100, size=(batch_size, ), dtype=torch.long, device=device) From 8e93fff38628411da884e35290f547f42c6f3d27 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 8 Apr 2024 15:55:21 -0700 Subject: [PATCH 099/109] reduce output len --- tests/spec_decode/e2e/test_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 1041a5ddac122..c8b6cf0d7df74 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -51,7 +51,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): generates the correct number of tokens (via ignore_eos=True), and that the detokenization matches HF transformers. """ - output_len = 128 + output_len = 32 temperature = 0.0 prompts = [ From d06e9a482125150d7d94ac8095203e86481c4c55 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 8 Apr 2024 16:44:05 -0700 Subject: [PATCH 100/109] strip --- tests/spec_decode/e2e/test_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index c8b6cf0d7df74..a8ebd66841eb2 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -86,7 +86,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) print(f"{actual_token_ids=}") - assert actual_tokens == expected_tokens + assert actual_tokens.strip() == expected_tokens.strip() @pytest.mark.parametrize( From ca516aa614db261075c24780490e8b3d9767efed Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 8 Apr 2024 21:41:16 -0700 Subject: [PATCH 101/109] pr feedback --- vllm/worker/cpu_worker.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index bd67f9f8850ac..9debe3f0dfd12 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -207,11 +207,17 @@ def initialize_cache(self, num_gpu_blocks: int, # Note: To reuse the cache management procedure, # use cpu cache as 'gpu cache'. 
num_cpu_blocks = num_gpu_blocks - del num_gpu_blocks + self._validate_num_cpu_blocks(num_cpu_blocks) self.cache_config.num_gpu_blocks = num_cpu_blocks self.cache_config.num_cpu_blocks = 0 + # Initialize the cache. + self._init_cache_engine() + + def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None: + """Raise errors if the num_cpu_blocks is invalid. + """ if num_cpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " @@ -226,9 +232,6 @@ def initialize_cache(self, num_gpu_blocks: int, "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " "initializing the engine.") - # Initialize the cache. - self._init_cache_engine() - def _init_cache_engine(self) -> None: self.cache_engine = CPUCacheEngine(self.cache_config, self.model_config, From f6c7b2ecded9a7b7e9575aec2ca405d7ae3dd9a7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 11:59:09 -0700 Subject: [PATCH 102/109] Zhuohan offline pr feedback --- ...est_block_decode.py => test_multi_step.py} | 13 ++++---- vllm/engine/output_processor/interfaces.py | 25 ++++++++-------- .../{block_decode.py => multi_step.py} | 30 +++++++++++-------- .../{beam_search.py => single_step.py} | 19 +++++++----- 4 files changed, 46 insertions(+), 41 deletions(-) rename tests/engine/output_processor/{test_block_decode.py => test_multi_step.py} (96%) rename vllm/engine/output_processor/{block_decode.py => multi_step.py} (79%) rename vllm/engine/output_processor/{beam_search.py => single_step.py} (94%) diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_multi_step.py similarity index 96% rename from tests/engine/output_processor/test_block_decode.py rename to tests/engine/output_processor/test_multi_step.py index c4a88d67cabc2..6da3da091db78 100644 --- a/tests/engine/output_processor/test_block_decode.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -6,8 +6,7 @@ from tests.core.utils import create_seq_group from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.block_decode import ( - BlockDecodeOutputProcessor) +from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput, @@ -20,7 +19,7 @@ @pytest.mark.parametrize("num_new_tokens", [1, 12]) @pytest.mark.skip_global_cleanup def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): - """Verify block decoding appends token ids correctly. + """Verify multi-step decoding appends token ids correctly. We append token ids and verify all the token ids were appended correctly. Note that ignore_eos=True. 
@@ -30,7 +29,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): stop_checker = MagicMock(spec=StopChecker) seq_counter = Counter() - output_processor = BlockDecodeOutputProcessor( + output_processor = MultiStepOutputProcessor( detokenizer=detokenizer, scheduler=scheduler, seq_counter=seq_counter, @@ -84,7 +83,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, stop_checker = MagicMock(spec=StopChecker) seq_counter = Counter() - output_processor = BlockDecodeOutputProcessor( + output_processor = MultiStepOutputProcessor( detokenizer=detokenizer, scheduler=scheduler, seq_counter=seq_counter, @@ -146,7 +145,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, eos_token_id = 100 - output_processor = BlockDecodeOutputProcessor( + output_processor = MultiStepOutputProcessor( detokenizer=detokenizer, scheduler=scheduler, seq_counter=seq_counter, @@ -213,7 +212,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, eos_token_id = 100 - output_processor = BlockDecodeOutputProcessor( + output_processor = MultiStepOutputProcessor( detokenizer=detokenizer, scheduler=scheduler, seq_counter=seq_counter, diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 26ec982cc13f1..9ddac7a04cb36 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -16,9 +16,10 @@ class SequenceGroupOutputProcessor(ABC): the scheduler. This is highly coupled with the LLMEngine and should be seen as an extension - of it. The logic is separated out to simplify the LLMEngine class and to - allow a beam search implementation (which handles forking, etc) and a block - decode implementation (which handles decoding >1 token per step). + of it. The logic is separated to simplify the LLMEngine class and allow + separate implementations for single-step decoding (which supports beam + search sequence forking) and multi-step decoding (which does not support + beam search, but does support speculative decoding). """ @staticmethod @@ -32,16 +33,14 @@ def create_output_processor( ): """Create an output processor. - This returns an output processor compatible with beam search if the - scheduler is not configured to scheduler lookahead slots. Otherwise, it - returns an output processor that is incompatible with beam search but - which supports decoding more than one token per scheduling invocation. + This returns a single-step output processor if num_lookahead_slots is + zero, else returns a multi-step output processor. """ if scheduler_config.num_lookahead_slots == 0: # Importing here to avoid cycle. - from vllm.engine.output_processor.beam_search import ( - BeamSearchOutputProcessor) - return BeamSearchOutputProcessor( + from vllm.engine.output_processor.single_step import ( + SingleStepOutputProcessor) + return SingleStepOutputProcessor( scheduler_config, detokenizer, scheduler, @@ -50,9 +49,9 @@ def create_output_processor( ) else: # Importing here to avoid cycle. 
- from vllm.engine.output_processor.block_decode import ( - BlockDecodeOutputProcessor) - return BlockDecodeOutputProcessor( + from vllm.engine.output_processor.multi_step import ( + MultiStepOutputProcessor) + return MultiStepOutputProcessor( detokenizer, scheduler, seq_counter, diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/multi_step.py similarity index 79% rename from vllm/engine/output_processor/block_decode.py rename to vllm/engine/output_processor/multi_step.py index e309b57af6ded..6b01a94f59e42 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/multi_step.py @@ -15,17 +15,18 @@ logger = init_logger(__name__) -class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): +class MultiStepOutputProcessor(SequenceGroupOutputProcessor): """SequenceGroupOutputProcessor which handles logic related to - detokenization and stopping conditions. Besides not supporting beam search, - this differs from BeamSearchOutputProcessor in that it supports lookahead - scheduling (where the model may generate >1 token per scheduler invocation). - - This allows it to support speculative decoding and cases where the model - runs more than once. We generalize these cases as "block decoding", where - the model emits a block of tokens at the same time. In this case, this class - is responsible for correctly appending all token ids to sequences and - detokenizing new token ids. + detokenization and stopping conditions. It specializes to "multi-step + decoding", where vLLM's worker may generate multiple tokens per invocation. + This is currently mutually exclusive with advanced sampling techniques like + beam search, which motivates the separation of this logic from the single + step output processor. + + This class is responsible for things such as correctly appending all new + token ids to their sequence, detokenizing new token ids, truncating new + output tokens after an eos token, and correctly handling the case where the + number of new output tokens per sequence differs in a single batch. """ def __init__( @@ -56,7 +57,8 @@ def process_outputs(self, sequence_group: SequenceGroup, seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) assert seqs, "expected running sequences" - assert len(seqs) == 1, ("Beam search not supported in block decoding.") + assert len(seqs) == 1, ( + "Beam search not supported in multi-step decoding.") seq = seqs[0] # Since there's only one sequence per sequence group, we can take the @@ -86,7 +88,9 @@ def _process_seq_outputs(self, seq: Sequence, output_token_ids = output_token_ids[:remaining_tokens] # Truncate any tokens after EOS. This is required as spec decode - # generates tokens in fixed blocks, which may go beyond the EOS token. + # generates a fixed number of tokens without evaluating stopping + # conditions within the block. This can cause an eos token to be + # unintentionally ignored. if not sampling_params.ignore_eos: eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id # Avoiding .index calls as exception throwing in the happy path @@ -100,7 +104,7 @@ def _process_seq_outputs(self, seq: Sequence, for output_token_id in output_token_ids: seq.append_token_id( token_id=output_token_id, - # TODO emit logprobs in block decoding. + # TODO emit logprobs in multi-step decoding. 
logprobs={output_token_id: Logprob(0.0)}, ) self.detokenizer.decode_sequence_inplace(seq, sampling_params) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/single_step.py similarity index 94% rename from vllm/engine/output_processor/beam_search.py rename to vllm/engine/output_processor/single_step.py index b0c0246b99359..a642070dce600 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/single_step.py @@ -14,15 +14,18 @@ logger = init_logger(__name__) -class BeamSearchOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles logic related to beam search - sequence management and coupled logic like detokenization and stop logic. +class SingleStepOutputProcessor(SequenceGroupOutputProcessor): + """SequenceGroupOutputProcessor which handles "output processing" logic, + which happens after the model returns generated token ids and before + scheduling of the next batch. Output processing logic includes + detokenization, and determining if a sequence is finished (e.g. via max len + or eos token). - This class is in charge of sorting out which sequences survive after beam - sampling. It manages forking and freeing of sequences. - - It does not support lookahead decoding, e.g. where the model generates >1 - token per scheduling invocation. + The SingleStepOutputProcessor is specialized to the case where the model + emits at most a single token per invocation, which precludes configurations + such as speculative decoding or multi-step decoding. This enables beam + search sampling, which requires forking/finishing/freeing sequences in a way + that is currently difficult to schedule multiple steps ahead of time. """ def __init__( From 96f81c4abdb4157b68bd33db3ff07a7825e6695e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 12:18:07 -0700 Subject: [PATCH 103/109] lint --- vllm/spec_decode/spec_decode_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 29144f70ff6f8..be3af7be93864 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -18,6 +18,7 @@ logger = init_logger(__name__) + class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. 
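# A minimal sketch of one speculative decoding step as coordinated by this
# worker; `propose`, `score`, and `accept` are illustrative names only, not
# the real interfaces of the proposer worker, scorer worker, or rejection
# sampler:
#
#     proposals = proposer_worker.propose(batch, k)    # k draft tokens per seq
#     scores = scorer_worker.score(batch, proposals)   # target-model probs
#     accepted = rejection_sampler.accept(scores, proposals)
#     # each sequence gains between 1 and k + 1 tokens this step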
From de1691929e58af704c72b329c9e608d06f2d8320 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 10 Apr 2024 14:13:32 -0700 Subject: [PATCH 104/109] pr feedback --- vllm/engine/llm_engine.py | 3 +-- vllm/engine/output_processor/stop_checker.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e6e75ee59c76d..59add1faba441 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -193,8 +193,7 @@ def __init__( self.seq_counter, self.get_tokenizer_for_seq, stop_checker=StopChecker( - self.scheduler, - self.scheduler_config, + self.scheduler_config.max_model_len, self.get_tokenizer_for_seq, ), )) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 2a6c79d2dc026..37d53fa3c7faa 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,6 @@ -from typing import List +from typing import Callable, List + +from transformers import PreTrainedTokenizer from vllm.sampling_params import SamplingParams from vllm.sequence import Sequence, SequenceStatus @@ -11,9 +13,10 @@ class StopChecker: emitted, or if we have exceeded the max model len. """ - def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): - self.scheduler = scheduler - self.scheduler_config = scheduler_config + def __init__(self, max_model_len: int, + get_tokenizer_for_seq: Callable[[Sequence], + PreTrainedTokenizer]): + self.max_model_len = max_model_len self.get_tokenizer_for_seq = get_tokenizer_for_seq def maybe_stop_sequence(self, seq: Sequence, @@ -23,7 +26,7 @@ def maybe_stop_sequence(self, seq: Sequence, """ # Check if the sequence has reached max_model_len. - if seq.get_len() > self.scheduler_config.max_model_len: + if seq.get_len() > self.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return From 2a19f5e58f36efb090434adb57e55a411144669b Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 01:39:25 -0700 Subject: [PATCH 105/109] allow append empty tokens in block table --- tests/core/block/e2e/test_correctness.py | 65 ++++++++++++++++++++++++ vllm/core/block/block_table.py | 1 - 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 94b65401e1dd4..a403d442d7af9 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -229,6 +229,71 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.parametrize("common_llm_kwargs", [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + + # skip cuda graph creation for fast test. 
+ "enforce_eager": True, + + "enable_chunked_prefill": True, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, + }, +]) +@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [ + { + "use_v2_block_manager": False, + }, +]) +@pytest.mark.parametrize( + "test_llm_kwargs", [ + { + "use_v2_block_manager": True, + "num_lookahead_slots": 0, + }, + { + "use_v2_block_manager": True, + "num_lookahead_slots": 5, + }, +]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_chunked_prefill_block_manager_v2(baseline_llm_generator, test_llm_generator, batch_size): + output_len = 32 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + print('Getting token ids with BlockManagerV1') + baseline_token_ids = get_token_ids_from_llm_generator( + baseline_llm_generator, prompts, sampling_params) + + print('Getting token ids with BlockManagerV2') + test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + prompts, sampling_params) + + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, + test_token_ids): + assert expected_token_ids == actual_token_ids + + assert baseline_token_ids == test_token_ids + def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): for llm in llm_generator: diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index ba061bbc4fbcb..560267e55ea3a 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -104,7 +104,6 @@ def append_token_ids(self, token_ids (List[int]): The sequence of token IDs to be appended. """ assert self._is_allocated - assert token_ids, "can't append empty token ids" self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + num_lookahead_slots) From b6e9e826604123654224a5d598fd140c1cfedde5 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 02:58:43 -0700 Subject: [PATCH 106/109] rebase on stop string fixes --- vllm/engine/output_processor/multi_step.py | 15 +++-- vllm/engine/output_processor/stop_checker.py | 63 +------------------- 2 files changed, 13 insertions(+), 65 deletions(-) diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 6b01a94f59e42..bae903acda66c 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -101,17 +101,24 @@ def _process_seq_outputs(self, seq: Sequence, valid_samples = valid_samples[:i + 1] break + # Incrementally append tokens to the sequence, as if we had only one new + # token. for output_token_id in output_token_ids: seq.append_token_id( token_id=output_token_id, # TODO emit logprobs in multi-step decoding. 
logprobs={output_token_id: Logprob(0.0)}, ) - self.detokenizer.decode_sequence_inplace(seq, sampling_params) - self.stop_checker.maybe_stop_sequence(seq, - sampling_params, - new_token_ids=output_token_ids) + new_char_count = 0 + if sampling_params.detokenize: + new_char_count = self.detokenizer.decode_sequence_inplace(seq, sampling_params) + + self.stop_checker.maybe_stop_sequence(seq, + new_char_count=new_char_count, + sampling_params=sampling_params) + if seq.is_finished(): + break if seq.is_finished(): self.scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index f259b818748e4..93e2fe6ac17c6 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,4 @@ -from typing import Callable, List +from typing import Callable, List, Optional from transformers import PreTrainedTokenizer @@ -61,7 +61,7 @@ def maybe_stop_sequence(self, seq: Sequence, return # Check if the sequence has reached max_model_len. - if seq.get_len() > self.scheduler_config.max_model_len: + if seq.get_len() > self.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return @@ -101,62 +101,3 @@ def _check_stop_strings(seq: Sequence, new_char_count: int, seq.output_text = seq.output_text[:stop_index] return stop_str return None - # TODO spec decode - ## - # """Check if the sequences should be stopped. If so, mark it as finished. - # """ - - # # Check if the sequence has reached max_model_len. - # if seq.get_len() > self.max_model_len: - # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - # return - - # # Check if the sequence has reached max_tokens. - # if seq.get_output_len() == sampling_params.max_tokens: - # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - # return - - # # Check if the minimum number of tokens has been generated yet; - # # skip the stop string/token checks if not - # if seq.get_output_len() < sampling_params.min_tokens: - # return - - # if sampling_params.detokenize: - # for stop_str in sampling_params.stop: - # # TODO(cade) Fix this for speculative decoding. - # if seq.output_text.endswith(stop_str): - # self._finalize_sequence(seq, sampling_params, stop_str) - # seq.status = SequenceStatus.FINISHED_STOPPED - # seq.stop_reason = stop_str - # return - - # # Determine if any stop_token_ids are in new_token_ids. - # intersection = set(new_token_ids).intersection( - # sampling_params.stop_token_ids) - # if intersection: - # # Get arbitrary token id that caused the stop. - # stop_token_id = next(iter(intersection)) - - # stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - # stop_token_id) - # self._finalize_sequence(seq, sampling_params, stop_str) - # seq.status = SequenceStatus.FINISHED_STOPPED - # seq.stop_reason = stop_token_id - # return - - # # Check if the sequence has generated the EOS token. - # if ((not sampling_params.ignore_eos) - # and seq.eos_token_id in new_token_ids): - # seq.status = SequenceStatus.FINISHED_STOPPED - # return - - #def _finalize_sequence(self, seq: Sequence, - # sampling_params: SamplingParams, - # stop_string: str) -> None: - # if sampling_params.include_stop_str_in_output: - # return - - # if stop_string and seq.output_text.endswith(stop_string): - # # Truncate the output text so that the stop string is - # # not included in the output. 
- # seq.output_text = seq.output_text[:-len(stop_string)] From bf0c37cbbd2f0f034edbd77a6292d9ba3509bf19 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 03:00:13 -0700 Subject: [PATCH 107/109] test spec --- vllm/executor/gpu_executor.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 4fd9735669fd2..9268b646a18ab 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -13,13 +13,6 @@ class GPUExecutor(ExecutorBase): def _init_executor(self) -> None: - assert (not self.speculative_config - ), "Speculative decoding not yet supported for GPU backend" - - # Instantiate the worker and load the model to GPU. - self._init_worker() - - def _init_worker(self): if self.speculative_config is None: self._init_non_spec_worker() else: From a158256acb08f0c954feaf953590b0668d6f8904 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 03:07:16 -0700 Subject: [PATCH 108/109] lint & mypy --- tests/core/block/e2e/test_correctness.py | 34 +++++++++++--------- vllm/engine/output_processor/multi_step.py | 10 +++--- vllm/engine/output_processor/single_step.py | 3 +- vllm/engine/output_processor/stop_checker.py | 6 ++-- vllm/executor/gpu_executor.py | 2 ++ vllm/executor/neuron_executor.py | 5 ++- 6 files changed, 34 insertions(+), 26 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index a403d442d7af9..1015892b67a4f 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -229,27 +229,28 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.parametrize("common_llm_kwargs", [ - { - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 2, - "max_num_seqs": 2, - }, -]) -@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize( + "common_llm_kwargs", + [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + + # skip cuda graph creation for fast test. 
+ "enforce_eager": True, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, + }, + ]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [ { "use_v2_block_manager": False, }, ]) -@pytest.mark.parametrize( - "test_llm_kwargs", [ +@pytest.mark.parametrize("test_llm_kwargs", [ { "use_v2_block_manager": True, "num_lookahead_slots": 0, @@ -261,7 +262,8 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) -def test_chunked_prefill_block_manager_v2(baseline_llm_generator, test_llm_generator, batch_size): +def test_chunked_prefill_block_manager_v2(baseline_llm_generator, + test_llm_generator, batch_size): output_len = 32 temperature = 0.0 diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index bae903acda66c..50da0d35fcec1 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -112,11 +112,13 @@ def _process_seq_outputs(self, seq: Sequence, new_char_count = 0 if sampling_params.detokenize: - new_char_count = self.detokenizer.decode_sequence_inplace(seq, sampling_params) + new_char_count = self.detokenizer.decode_sequence_inplace( + seq, sampling_params) - self.stop_checker.maybe_stop_sequence(seq, - new_char_count=new_char_count, - sampling_params=sampling_params) + self.stop_checker.maybe_stop_sequence( + seq, + new_char_count=new_char_count, + sampling_params=sampling_params) if seq.is_finished(): break diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 3ded72db30921..1b7eb014f802b 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -110,7 +110,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, seq, seq_group.sampling_params) else: new_char_count = 0 - self.stop_checker.maybe_stop_sequence(seq, new_char_count, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, new_char_count, + seq_group.sampling_params) # Non-beam search case if not seq_group.sampling_params.use_beam_search: diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 93e2fe6ac17c6..66deb9b591746 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional +from typing import Callable, Optional from transformers import PreTrainedTokenizer @@ -19,10 +19,8 @@ def __init__(self, max_model_len: int, self.max_model_len = max_model_len self.get_tokenizer_for_seq = get_tokenizer_for_seq - def maybe_stop_sequence(self, seq: Sequence, - new_char_count: int, + def maybe_stop_sequence(self, seq: Sequence, new_char_count: int, sampling_params: SamplingParams) -> None: - """Stop the finished sequences. new_char_count is the number of chars added to the diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 9268b646a18ab..b7ab9481eb9f0 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -48,6 +48,8 @@ def _init_non_spec_worker(self): def _init_spec_worker(self): """Initialize a SpecDecodeWorker, using a draft model for proposals. 
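# A minimal sketch of the composition this method performs; constructor
# arguments are omitted because they are not fully shown here, and the
# keyword names are assumptions:
#
#     target_worker = Worker(...)           # full-size target model
#     draft_worker = MultiStepWorker(...)   # small draft model for proposals
#     spec_decode_worker = SpecDecodeWorker(proposer_worker=draft_worker,
#                                           scorer_worker=target_worker, ...)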
""" + assert self.speculative_config is not None + from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.worker.worker import Worker diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 273b17a927efd..7cc187e297c9f 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -48,10 +48,13 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int) -> List[SamplerOutput]: assert (blocks_to_swap_in == {} and blocks_to_swap_out == {} and blocks_to_copy == {}), ( "Cache operations are not supported for Neuron backend.") + assert num_lookahead_slots == 0, ( + "lookahead not supported for Neuron backend.") output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list) From 5a69f6c25ad51515fcc9d1e5ecc9d43fea3af89c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 03:15:31 -0700 Subject: [PATCH 109/109] doc --- tests/core/block/e2e/test_correctness.py | 3 +++ vllm/executor/gpu_executor.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 1015892b67a4f..0ee78a9b0a8ea 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -264,6 +264,9 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, @pytest.mark.parametrize("seed", [1]) def test_chunked_prefill_block_manager_v2(baseline_llm_generator, test_llm_generator, batch_size): + """Verify that chunked prefill works with BlockManagerV2, with and without + lookahead scheduling. + """ output_len = 32 temperature = 0.0 diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index b7ab9481eb9f0..962cac585bb25 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -13,6 +13,11 @@ class GPUExecutor(ExecutorBase): def _init_executor(self) -> None: + """Initialize the worker and load the model. + + If speculative decoding is enabled, we instead create the speculative + worker. + """ if self.speculative_config is None: self._init_non_spec_worker() else: