diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 3474bd3861598..ea767f4c3e264 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -35,7 +35,6 @@ jobs:
         mypy
         mypy tests --follow-imports skip
         mypy vllm/attention --follow-imports skip
-        mypy vllm/core --follow-imports skip
         mypy vllm/distributed --follow-imports skip
         mypy vllm/engine --follow-imports skip
         mypy vllm/executor --follow-imports skip
diff --git a/format.sh b/format.sh
index 9e0780870303d..2204b3ba59498 100755
--- a/format.sh
+++ b/format.sh
@@ -99,7 +99,6 @@ echo 'vLLM mypy:'
 mypy --follow-imports skip # Note that this is less strict than CI
 mypy tests --follow-imports skip
 mypy vllm/attention --follow-imports skip
-mypy vllm/core --follow-imports skip
 mypy vllm/distributed --follow-imports skip
 mypy vllm/engine --follow-imports skip
 mypy vllm/executor --follow-imports skip
diff --git a/pyproject.toml b/pyproject.toml
index bcedbb53ab887..22a25d9cf32e6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,6 +58,7 @@ files = [
     "vllm/adapter_commons",
     "vllm/assets",
     "vllm/entrypoints",
+    "vllm/core",
     "vllm/inputs",
     "vllm/logging",
     "vllm/multimodal",
diff --git a/vllm/block.py b/vllm/block.py
index 95286048d9115..47c381c19383b 100644
--- a/vllm/block.py
+++ b/vllm/block.py
@@ -1,9 +1,9 @@
 """Token blocks."""
-from typing import List, Optional
+from typing import TYPE_CHECKING, Iterator, List, Optional
 
 from vllm.utils import Device
 
-DEFAULT_LAST_ACCESSED_TIME = -1
+DEFAULT_LAST_ACCESSED_TIME: float = -1
 
 
 class PhysicalTokenBlock:
@@ -59,6 +59,11 @@ def __len__(self) -> int:
     def __getitem__(self, key):
         return self._blocks[key]
 
+    if TYPE_CHECKING:
+
+        def __iter__(self) -> Iterator[PhysicalTokenBlock]:
+            raise RuntimeError("Method should be automatically generated")
+
     def __setitem__(self, key, value):
         if isinstance(key, slice):
             blocks = value
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index c6330df2a485a..c87246c1c6d6a 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -132,7 +132,7 @@ def allocate_mutable_block(self, prev_block: Optional[Block],
 
     def allocate_immutable_blocks(self, prev_block: Optional[Block],
                                   block_token_ids: List[List[int]],
-                                  device: Optional[Device]) -> List[Block]:
+                                  device: Device) -> List[Block]:
         """Allocates a new group of immutable blocks with the provided block
         token IDs on the specified device.
 
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 0af04399a4b31..666723313c829 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -278,7 +278,7 @@ def __init__(
         # request ID
         self.cross_block_tables: Dict[str, BlockTable] = {}
 
-    def _get_seq_num_required_blocks(self, seq: Sequence) -> int:
+    def _get_seq_num_required_blocks(self, seq: Optional[Sequence]) -> int:
         return 0 if seq is None else seq.n_blocks
 
     def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
@@ -310,13 +310,14 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         return AllocStatus.LATER
 
     def _allocate_sequence(self, \
-                           seq: Sequence, \
+                           seq: Optional[Sequence], \
                            ref_count: int, \
                            is_encoder_decoder: bool = True) -> BlockTable:
         # Allocate new physical token blocks that will store the prompt tokens.
-        num_prompt_blocks = seq.n_blocks
+        num_prompt_blocks = self._get_seq_num_required_blocks(seq)
 
         block_table: BlockTable = BlockTable()
+        assert seq is not None
         for logical_idx in range(num_prompt_blocks):
             if (self.block_sliding_window is not None
                     and logical_idx >= self.block_sliding_window):
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 7d4919a0d94a8..7d2db43cb4602 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -120,8 +120,10 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         )
 
         if seq_group.is_encoder_decoder():
+            encoder_seq = seq_group.get_encoder_seq()
+            assert encoder_seq is not None
             num_required_blocks += BlockTable.get_num_required_blocks(
-                seq_group.get_encoder_seq().get_token_ids(),
+                encoder_seq.get_token_ids(),
                 block_size=self.block_size,
             )
 
@@ -189,7 +191,9 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group)
 
         if seq_group.is_encoder_decoder():
-            block_table = self._allocate_sequence(seq_group.get_encoder_seq())
+            encoder_seq = seq_group.get_encoder_seq()
+            assert encoder_seq is not None
+            block_table = self._allocate_sequence(encoder_seq)
             self.cross_block_tables[request_id] = block_table
 
     def can_append_slots(self, seq_group: SequenceGroup,
diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py
index 3d864a73f91d0..f16f66e99e7f8 100644
--- a/vllm/core/embedding_model_block_manager.py
+++ b/vllm/core/embedding_model_block_manager.py
@@ -77,8 +77,8 @@ def access_all_blocks_in_seq(
         pass
 
     def get_common_computed_block_ids(self,
-                                      seq_group: SequenceGroup) -> List[int]:
-        return None  # type: ignore
+                                      seq_group: List[Sequence]) -> List[int]:
+        return []
 
     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         pass
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 280d7b7e61e2c..de1988eead97b 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -221,10 +221,10 @@ class SchedulerSwappedInOutputs:
     """
     # Selected sequences that are going to be swapped in and is in a
     # decoding phase.
-    decode_seq_groups: List[SequenceGroup]
+    decode_seq_groups: List[ScheduledSequenceGroup]
     # Selected sequences that are going to be swapped in and in a prefill
     # phase. I.e., it means the prefill has been chunked.
-    prefill_seq_groups: List[SequenceGroup]
+    prefill_seq_groups: List[ScheduledSequenceGroup]
     # The blocks to swap in.
     blocks_to_swap_in: List[Tuple[int, int]]
     # The blocks to copy.
@@ -254,7 +254,7 @@ class SchedulerPrefillOutputs:
     to be recomputed from scratch.
     """
     # Selected sequences for prefill.
-    seq_groups: List[SequenceGroup]
+    seq_groups: List[ScheduledSequenceGroup]
     # Ignored sequence groups.
     ignored_seq_groups: List[SequenceGroup]
     num_lookahead_slots: int
@@ -289,7 +289,9 @@ def scheduler_running_outputs_builder():
 
 
 def scheduled_seq_group_builder():
-    return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
+    return ScheduledSequenceGroup(SequenceGroup("", [], -1),
+                                  token_chunk_size=0)
+    # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
 
 
 class Scheduler:
@@ -791,7 +793,7 @@ def _schedule_prefills(
             SchedulerPrefillOutputs.
         """
         ignored_seq_groups: List[SequenceGroup] = []
-        seq_groups: List[SequenceGroup] = []
+        seq_groups: List[ScheduledSequenceGroup] = []
 
         waiting_queue = self.waiting
 
@@ -1130,7 +1132,9 @@ def schedule(
 
             if seq_group.is_encoder_decoder():
                 # Encoder associated with SequenceGroup
-                encoder_seq_data = seq_group.get_encoder_seq().data
+                encoder_seq = seq_group.get_encoder_seq()
+                assert encoder_seq is not None
+                encoder_seq_data = encoder_seq.data
                 # Block table for cross-attention
                 # Also managed at SequenceGroup level
                 cross_block_table = self.block_manager.get_cross_block_table(