diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index a19be8525f902..5b2bad1476dc3 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -33,6 +33,7 @@ jobs: - name: Mypy run: | mypy vllm/attention --config-file pyproject.toml + mypy vllm/core --config-file pyproject.toml mypy vllm/distributed --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml @@ -42,9 +43,6 @@ jobs: mypy vllm/engine --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml mypy vllm/spec_decode --config-file pyproject.toml - mypy vllm/lora --config-file pyproject.toml mypy vllm/model_executor --config-file pyproject.toml - - # TODO(sang): Fix nested dir - mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml + mypy vllm/lora --config-file pyproject.toml diff --git a/format.sh b/format.sh index bd12e61d77806..49149afe41d04 100755 --- a/format.sh +++ b/format.sh @@ -95,7 +95,7 @@ echo 'vLLM yapf: Done' # Run mypy echo 'vLLM mypy:' mypy vllm/attention --config-file pyproject.toml -mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml +mypy vllm/core --config-file pyproject.toml mypy vllm/distributed --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index f1b65b2514f76..b0d9511fba521 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -40,7 +40,9 @@ def __init__( ): self._block_size = block_size self._allocator = block_allocator - self._blocks: Optional[List[Block]] = _blocks + if _blocks is None: + _blocks = [] + self._blocks: List[Block] = _blocks # Use helper method instead of directly calculating, as blocks # may not be allocated. @@ -104,7 +106,7 @@ def append_token_ids(self, token_ids (List[int]): The sequence of token IDs to be appended. """ assert self._is_allocated - assert self._blocks is not None + assert len(self._blocks) > 0 self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + num_lookahead_slots) @@ -141,6 +143,7 @@ def ensure_num_empty_slots(self, num_empty_slots: int) -> None: blocks_to_allocate = cdiv(slots_to_allocate, self._block_size) for _ in range(blocks_to_allocate): + assert len(self._blocks) > 0 self._blocks.append( self._allocator.allocate_mutable(prev_block=self._blocks[-1], device=device)) @@ -159,6 +162,7 @@ def fork(self) -> "BlockTable": the current instance. """ assert self._is_allocated + assert len(self._blocks) > 0 forked_blocks = self._allocator.fork(self._blocks[-1]) return BlockTable( block_size=self._block_size, @@ -177,10 +181,10 @@ def free(self) -> None: assert self._is_allocated for block in self._blocks: self._allocator.free(block) - self._blocks = None + self._blocks = [] @property - def physical_block_ids(self) -> List[int]: + def physical_block_ids(self) -> List[Optional[int]]: """Returns a list of physical block indices for the blocks in the BlockTable. @@ -235,7 +239,7 @@ def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block], def _get_all_token_ids(self) -> List[int]: # NOTE: This function is O(seq_len); use sparingly. - token_ids = [] + token_ids: List[int] = [] if not self._is_allocated: return token_ids @@ -247,7 +251,7 @@ def _get_all_token_ids(self) -> List[int]: @property def _is_allocated(self) -> bool: - return self._blocks is not None + return len(self._blocks) > 0 @property def _num_empty_slots(self) -> int: diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index f11234a0bf2dd..3f97a1210b096 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, Iterable, List, Optional +from typing import Dict, Iterable, List, Optional, Protocol from vllm.core.block.interfaces import Block, BlockAllocator @@ -7,7 +7,19 @@ RefCount = int -class RefCounter: +class RefCounterProtocol(Protocol): + + def incr(self, block_id: BlockId) -> RefCount: + raise NotImplementedError + + def decr(self, block_id: BlockId) -> RefCount: + raise NotImplementedError + + def get(self, block_id: BlockId) -> RefCount: + raise NotImplementedError + + +class RefCounter(RefCounterProtocol): """A class for managing reference counts for a set of block indices. The RefCounter class maintains a dictionary that maps block indices to their @@ -54,7 +66,7 @@ def as_readonly(self) -> "ReadOnlyRefCounter": return ReadOnlyRefCounter(self) -class ReadOnlyRefCounter: +class ReadOnlyRefCounter(RefCounterProtocol): """A read-only view of the RefCounter class. The ReadOnlyRefCounter class provides a read-only interface to access the @@ -96,7 +108,7 @@ class CopyOnWriteTracker: def __init__( self, - refcounter: RefCounter, + refcounter: RefCounterProtocol, allocator: BlockAllocator, ): self._copy_on_writes: Dict[BlockId, List[BlockId]] = defaultdict(list) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 23e1a4cf91266..d25d22cf52838 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,6 +1,6 @@ -from typing import Dict, List, Optional +from typing import Dict, FrozenSet, List, Optional -from vllm.core.block.interfaces import (Block, BlockAllocator, +from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator @@ -57,15 +57,15 @@ def create( cpu_block_ids = block_ids[num_gpu_blocks:] if allocator_type == "naive": - gpu_allocator = NaiveBlockAllocator( - create_block=NaiveBlock, + gpu_allocator: BlockAllocator = NaiveBlockAllocator( + create_block=NaiveBlock, # type: ignore num_blocks=num_gpu_blocks, block_size=block_size, block_ids=gpu_block_ids, ) - cpu_allocator = NaiveBlockAllocator( - create_block=NaiveBlock, + cpu_allocator: BlockAllocator = NaiveBlockAllocator( + create_block=NaiveBlock, # type: ignore num_blocks=num_cpu_blocks, block_size=block_size, block_ids=cpu_block_ids, @@ -105,13 +105,14 @@ def __init__( Device.GPU: gpu_block_allocator, } - self._block_ids_to_allocator = {} + self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} for _, allocator in self._allocators.items(): for block_id in allocator.all_block_ids: self._block_ids_to_allocator[block_id] = allocator - def allocate_mutable(self, prev_block: Optional[Block], - device: Device) -> Block: + def allocate_mutable(self, + prev_block: Optional[Block], + device: Optional[Device] = None) -> Block: """Allocates a new mutable block on the specified device. Args: @@ -122,10 +123,13 @@ def allocate_mutable(self, prev_block: Optional[Block], Returns: Block: The newly allocated mutable block. """ + assert device is not None return self._allocators[device].allocate_mutable(prev_block) - def allocate_immutable(self, prev_block: Optional[Block], - token_ids: List[int], device: Device) -> Block: + def allocate_immutable(self, + prev_block: Optional[Block], + token_ids: List[int], + device: Optional[Device] = None) -> Block: """Allocates a new immutable block with the provided token IDs on the specified device. @@ -140,6 +144,7 @@ def allocate_immutable(self, prev_block: Optional[Block], Block: The newly allocated immutable block containing the provided token IDs. """ + assert device is not None return self._allocators[device].allocate_immutable( prev_block, token_ids) @@ -149,7 +154,9 @@ def free(self, block: Block) -> None: Args: block (Block): The block to be freed. """ - allocator = self._block_ids_to_allocator[block.block_id] + block_id = block.block_id + assert block_id is not None + allocator = self._block_ids_to_allocator[block_id] return allocator.free(block) def fork(self, last_block: Block) -> List[Block]: @@ -163,19 +170,22 @@ def fork(self, last_block: Block) -> List[Block]: List[Block]: A new list of blocks that shares the same memory as the original sequence. """ - allocator = self._block_ids_to_allocator[last_block.block_id] + block_id = last_block.block_id + assert block_id is not None + allocator = self._block_ids_to_allocator[block_id] return allocator.fork(last_block) - def get_num_free_blocks(self, device: Device) -> int: + def get_num_free_blocks(self, device: Optional[Device] = None) -> int: """Returns the number of free blocks available on the specified device. Args: device (Device): The device for which to query the number of free - blocks. + blocks. AssertionError is raised if None is passed. Returns: int: The number of free blocks available on the specified device. """ + assert device is not None return self._allocators[device].get_num_free_blocks() def clear_copy_on_writes(self) -> Dict[int, List[int]]: @@ -210,5 +220,12 @@ def get_common_computed_block_ids( return self._allocators[device].get_common_computed_block_ids( seq_block_ids) - def all_block_ids(self) -> frozenset[int]: + @property + def all_block_ids(self) -> FrozenSet[int]: return frozenset(self._block_ids_to_allocator.keys()) + + def promote_to_immutable_block(self, block: Block) -> BlockId: + raise NotImplementedError + + def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: + raise NotImplementedError diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 440d6a4b04d3b..08d2f87301d92 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -3,6 +3,8 @@ from vllm.utils import Device +BlockId = int + class Block(ABC): @@ -15,6 +17,12 @@ def append_token_ids(self, token_ids: List[int]) -> None: def block_id(self) -> Optional[int]: pass + @block_id.setter + @abstractmethod + def block_id(self, value: Optional[int]) -> None: + """NOTE: Do not use this API outside Block.""" + self._block_id = value + @property @abstractmethod def token_ids(self) -> List[int]: @@ -35,6 +43,27 @@ def is_full(self) -> bool: def prev_block(self) -> Optional["Block"]: pass + @property + @abstractmethod + def computed(self) -> bool: + raise NotImplementedError + + @computed.setter + @abstractmethod + def computed(self, value) -> bool: + """Should be only used by PrefixCacingAllocator""" + raise NotImplementedError + + @property + @abstractmethod + def last_accessed(self) -> float: + raise NotImplementedError + + @last_accessed.setter + @abstractmethod + def last_accessed(self, last_accessed_ts: float): + raise NotImplementedError + class Factory(Protocol): @abstractmethod @@ -48,6 +77,17 @@ def __call__( ) -> "Block": pass + @property + @abstractmethod + def content_hash(self) -> Optional[int]: + """Return the content-based hash of the current block, or None if it is + not yet defined or not supported. + + For the content-based hash to be defined, the current block must be + full. + """ + return None + class BlockAllocator(ABC): @@ -57,7 +97,7 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: @abstractmethod def allocate_immutable(self, prev_block: Optional[Block], - token_ids: List[int], device: Device) -> Block: + token_ids: List[int]) -> Block: pass @abstractmethod @@ -69,7 +109,7 @@ def fork(self, last_block: Block) -> List[Block]: pass @abstractmethod - def get_num_free_blocks(self, device: Device) -> int: + def get_num_free_blocks(self) -> int: pass @property @@ -82,11 +122,12 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]: pass @abstractmethod - def mark_blocks_as_accessed(self) -> None: + def mark_blocks_as_accessed(self, block_ids: List[int], + now: float) -> None: pass @abstractmethod - def mark_blocks_as_computed(self) -> None: + def mark_blocks_as_computed(self, block_ids: List[int]) -> None: pass @abstractmethod @@ -94,21 +135,66 @@ def get_common_computed_block_ids( self, seq_block_ids: List[List[int]]) -> List[int]: pass + @abstractmethod + def cow_block_if_not_appendable(self, block: Block) -> Optional["BlockId"]: + """NOTE: This should not be used besides Block""" + pass + + @abstractmethod + def promote_to_immutable_block(self, block: Block) -> BlockId: + """NOTE: This should not be used besides Block""" + pass + class NoFreeBlocksError(ValueError): pass -class DeviceAwareBlockAllocator(BlockAllocator): +class DeviceAwareBlockAllocator(ABC): @abstractmethod - def allocate_mutable(self, prev_block: Optional[Block]) -> Block: + def allocate_mutable(self, + prev_block: Optional[Block], + device: Optional[Device] = None) -> Block: pass @abstractmethod - def allocate_immutable(self, prev_block: Optional[Block], - token_ids: List[int], device: Device) -> Block: + def allocate_immutable(self, + prev_block: Optional[Block], + token_ids: List[int], + device: Optional[Device] = None) -> Block: pass @abstractmethod - def get_num_free_blocks(self, device: Device) -> int: + def get_num_free_blocks(self, device: Optional[Device] = None) -> int: + pass + + @abstractmethod + def free(self, block: Block) -> None: + pass + + @abstractmethod + def fork(self, last_block: Block) -> List[Block]: + pass + + @property + @abstractmethod + def all_block_ids(self) -> FrozenSet[int]: + pass + + @abstractmethod + def clear_copy_on_writes(self) -> Dict[int, List[int]]: + pass + + @abstractmethod + def mark_blocks_as_accessed(self, block_ids: List[int], + now: float) -> None: + pass + + @abstractmethod + def mark_blocks_as_computed(self, block_ids: List[int]) -> None: + pass + + @abstractmethod + def get_common_computed_block_ids( + self, seq_block_ids: List[List[int]]) -> List[int]: pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index a0bf33912d935..10af129246889 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,10 +1,9 @@ -from typing import Dict, Iterable, List, Optional, Set +from typing import Dict, FrozenSet, Iterable, List, Optional, Set from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) -from vllm.core.block.interfaces import Block, BlockAllocator +from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device -BlockId = int Refcount = int @@ -49,8 +48,10 @@ def __init__( allocator=self, ) - def allocate_immutable(self, prev_block: Optional[Block], - token_ids: List[int]) -> Block: + def allocate_immutable(self, + prev_block: Optional[Block], + token_ids: List[int], + device: Optional[Device] = None) -> Block: """Allocates a new immutable block with the given token IDs, linked to the previous block. @@ -63,11 +64,14 @@ def allocate_immutable(self, prev_block: Optional[Block], Returns: Block: The newly allocated immutable block. """ + assert device is None block = self.allocate_mutable(prev_block=prev_block) block.append_token_ids(token_ids) return block - def allocate_mutable(self, prev_block: Optional[Block]) -> Block: + def allocate_mutable(self, + prev_block: Optional[Block], + device: Optional[Device] = None) -> Block: """Allocates a new mutable block, linked to the previous block. Args: @@ -78,6 +82,7 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: Returns: Block: The newly allocated mutable block. """ + assert device is None block_id = self._allocate_new_block_id() return self._create_block( prev_block=prev_block, @@ -88,6 +93,7 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: ) def free(self, block: Block) -> None: + assert block.block_id is not None self._free_block_id(block.block_id) # Mark the block as having no allocation. @@ -111,6 +117,7 @@ def fork(self, last_block: Block) -> List[Block]: for block in source_blocks: # Increment refcount for each block. + assert block.block_id is not None refcount = self._refcounter.incr(block.block_id) assert refcount != 1, "can't fork free'd block" @@ -126,7 +133,8 @@ def fork(self, last_block: Block) -> List[Block]: return forked_blocks - def get_num_free_blocks(self) -> int: + def get_num_free_blocks(self, device: Optional[Device] = None) -> int: + assert device is None return len(self._free_block_indices) def _allocate_new_block_id(self) -> BlockId: @@ -148,7 +156,7 @@ def refcounter(self): return self._refcounter @property - def all_block_ids(self): + def all_block_ids(self) -> FrozenSet[int]: return self._all_block_indices def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: @@ -200,6 +208,9 @@ def get_common_computed_block_ids( """ return [] + def promote_to_immutable_block(self, block: Block) -> BlockId: + raise NotImplementedError + class NaiveBlock(Block): """An implementation of the Block class that does not support prefix @@ -224,13 +235,13 @@ class NaiveBlock(Block): """ def __init__(self, - prev_block: Block, + prev_block: Optional[Block], token_ids: List[int], block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, _cow_target: Optional[Block] = None): - self._token_ids = [] + self._token_ids: List[int] = [] self._block_size = block_size self._prev_block = prev_block self._block_id = block_id @@ -256,6 +267,22 @@ def _append_token_ids_no_cow(self, token_ids: List[int]) -> None: assert self.num_empty_slots >= len(token_ids) self._token_ids.extend(token_ids) + @property + def computed(self) -> bool: + raise NotImplementedError + + @computed.setter + def computed(self, value) -> None: + raise NotImplementedError + + @property + def last_accessed(self) -> float: + raise NotImplementedError + + @last_accessed.setter + def last_accessed(self, last_accessed_ts: float): + raise NotImplementedError + @property def block_id(self) -> Optional[int]: return self._block_id @@ -276,9 +303,14 @@ def num_empty_slots(self) -> int: def token_ids(self) -> List[int]: return self._token_ids + @property def block_size(self) -> int: return self._block_size @property def prev_block(self) -> Optional["Block"]: return self._prev_block + + @property + def content_hash(self) -> Optional[int]: + return None diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 292a750146ae6..e9000c9bfff7f 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,16 +1,15 @@ """Token blocks.""" from itertools import takewhile from os.path import commonprefix -from typing import Dict, Iterable, List, Optional +from typing import Dict, FrozenSet, Iterable, List, Optional from vllm.core.block.common import (CopyOnWriteTracker, get_all_blocks_recursively) -from vllm.core.block.interfaces import Block, BlockAllocator +from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor PrefixHash = int -BlockId = int # By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME # so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME, @@ -38,7 +37,7 @@ def __init__( num_blocks: int, block_size: int, block_ids: Optional[Iterable[int]] = None, - eviction_policy: Optional[EvictionPolicy] = EvictionPolicy.LRU, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU, ): # A mapping of prefix hash to block index. All blocks which have a # prefix hash will be in this dict, even if they have refcount 0. @@ -49,7 +48,7 @@ def __init__( # An allocator for blocks that do not have prefix hashes. self._hashless_allocator = NaiveBlockAllocator( - create_block=self._create_block, + create_block=self._create_block, # type: ignore num_blocks=num_blocks, block_size=block_size, block_ids=block_ids, @@ -79,7 +78,7 @@ def _create_block( block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, - computed: Optional[bool] = False, + computed: bool = False, ) -> Block: # Bind block to self. allocator = self @@ -93,8 +92,10 @@ def _create_block( computed=computed, ) - def allocate_immutable(self, prev_block: Optional[Block], - token_ids: List[int]) -> Block: + def allocate_immutable(self, + prev_block: Optional[Block], + token_ids: List[int], + device: Optional[Device] = None) -> Block: """Allocates an immutable block with the given token IDs, reusing cached blocks if possible. @@ -105,6 +106,7 @@ def allocate_immutable(self, prev_block: Optional[Block], Returns: Block: The allocated immutable block. """ + assert device is None assert_prefix_caching_block_or_none(prev_block) block = self._create_block( @@ -127,16 +129,20 @@ def allocate_immutable(self, prev_block: Optional[Block], return block - def allocate_mutable(self, prev_block: Block) -> Block: + def allocate_mutable(self, + prev_block: Optional[Block], + device: Optional[Device] = None) -> Block: """Allocates a mutable block. If there are no free blocks, this will evict unused cached blocks. Args: prev_block (Block): The previous block in the sequence. + None is not allowed unlike it is super class. Returns: Block: The allocated mutable block. """ + assert device is None assert_prefix_caching_block_or_none(prev_block) try: @@ -144,6 +150,7 @@ def allocate_mutable(self, prev_block: Block) -> Block: prev_block=prev_block) assert block.block_id not in self._blocks + assert block.block_id is not None self._blocks[block.block_id] = block return block except BlockAllocator.NoFreeBlocksError: @@ -183,6 +190,7 @@ def allocate_mutable(self, prev_block: Block) -> Block: assert block.content_hash is None assert block.block_id not in self._blocks + assert block.block_id is not None self._blocks[block.block_id] = block return block @@ -225,6 +233,7 @@ def _free_block_id_for_block(self, block_id: BlockId, # We have fork case where block would get more than one ref, # so we cannot free it from tracking if ref cnt large than 1 if refcount <= 1: + assert block.block_id is not None del self._blocks[block.block_id] return self._hashless_allocator.free(block) @@ -233,6 +242,7 @@ def _free_block_id_for_block(self, block_id: BlockId, # If no longer used, add the block to the evictor. if refcount == 0: assert block.content_hash in self._cached_blocks + assert block.block_id is not None del self._blocks[block.block_id] self.evictor.add(block.block_id, block.content_hash, block.num_tokens_total, block.last_accessed) @@ -268,18 +278,18 @@ def fork(self, last_block: Block) -> List[Block]: return forked_blocks - def get_num_free_blocks(self) -> int: + def get_num_free_blocks(self, device: Optional[Device] = None) -> int: + assert device is None # The number of free blocks is the number of hashless free blocks # plus the number of blocks evictor could free from its list. return self._hashless_allocator.get_num_free_blocks( ) + self.evictor.num_blocks @property - def all_block_ids(self) -> frozenset[int]: + def all_block_ids(self) -> FrozenSet[int]: return self._hashless_allocator.all_block_ids - def promote_to_immutable_block(self, - block: "PrefixCachingBlock") -> BlockId: + def promote_to_immutable_block(self, block: Block) -> BlockId: """Once a mutable block is full, it can be promoted to an immutable block. This means that its content can be referenced by future blocks having the same prefix. @@ -289,7 +299,7 @@ def promote_to_immutable_block(self, block. Args: - block (PrefixCachingBlock): The mutable block to be promoted. + block: The mutable block to be promoted. Returns: BlockId: Either the original block index, or the block index of @@ -385,8 +395,11 @@ def get_common_computed_block_ids( takewhile(lambda block_id: self.block_is_computed(block_id), seq[:-1])) for seq in seq_block_ids ] - res = commonprefix([ids for ids in ids_list if ids != []]) - return res + # It returns a list of int although type annotation says list of string. + return commonprefix([ + ids for ids in ids_list # type: ignore + if ids != [] + ]) class PrefixCachingBlock(Block): @@ -403,7 +416,7 @@ class PrefixCachingBlock(Block): token_ids (List[int]): The initial token IDs to be stored in the block. block_size (int): The maximum number of token IDs that can be stored in the block. - prefix_caching_allocator (PrefixCachingBlockAllocator): The prefix + prefix_caching_allocator (BlockAllocator): The prefix caching block allocator associated with this block. block_id (Optional[int], optional): The physical block index of this block. Defaults to None. @@ -411,21 +424,25 @@ class PrefixCachingBlock(Block): def __init__( self, - prev_block: Optional["PrefixCachingBlock"], + prev_block: Optional[Block], token_ids: List[int], block_size: int, - prefix_caching_allocator: PrefixCachingBlockAllocator, + prefix_caching_allocator: BlockAllocator, block_id: Optional[int] = None, - computed: Optional[bool] = False, + computed: bool = False, ): + assert isinstance(prefix_caching_allocator, + PrefixCachingBlockAllocator), ( + "Currently this class is only tested with " + "PrefixCachingBlockAllocator.") assert_prefix_caching_block_or_none(prev_block) self._prev_block = prev_block self._cached_content_hash: Optional[int] = None self._cached_num_tokens_total: Optional[int] = None self._prefix_caching_allocator = prefix_caching_allocator - self.last_accessed = _DEFAULT_LAST_ACCESSED_TIME - self.computed = computed + self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME + self._computed = computed self._block = NaiveBlock( prev_block=prev_block, @@ -436,6 +453,22 @@ def __init__( _cow_target=self, ) + @property + def computed(self) -> bool: + return self._computed + + @computed.setter + def computed(self, value) -> None: + self._computed = value + + @property + def last_accessed(self) -> float: + return self._last_accessed + + @last_accessed.setter + def last_accessed(self, last_accessed_ts: float): + self._last_accessed = last_accessed_ts + def append_token_ids(self, token_ids: List[int]) -> None: """Appends the given token IDs to the block and registers the block as immutable if the block becomes full. @@ -483,7 +516,7 @@ def num_tokens_total(self) -> int: if self._cached_num_tokens_total is not None: return self._cached_num_tokens_total - _block = self + _block: Optional[Block] = self self._cached_num_tokens_total = 0 # TODO: current implement here take O(N^2), we expect future @@ -524,8 +557,10 @@ def content_hash(self) -> Optional[int]: return None is_first_block = self._prev_block is None - prev_block_hash = (None if is_first_block else - self._prev_block.content_hash) + prev_block_hash = ( + None if is_first_block else + self._prev_block.content_hash # type: ignore + ) # Previous block exists but does not yet have a hash. # Return no hash in this case. diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0857605e2d005..3fbd8b787cf6c 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -190,7 +190,7 @@ def get_block_table(self, seq: Sequence) -> List[int]: assert seq.seq_id in self.block_tables block_ids = self.block_tables[seq.seq_id].physical_block_ids assert all(b is not None for b in block_ids) - return block_ids + return block_ids # type: ignore def access_all_blocks_in_seq(self, seq: Sequence, now: float): # Update the last accessed time of all the blocks accessed @@ -204,7 +204,9 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): block_ids = [] for block_id in block_table.physical_block_ids: block_ids.append(block_id) - self.block_allocator.mark_blocks_as_accessed(block_ids, now) + self.block_allocator.mark_blocks_as_accessed( + block_ids, # type: ignore + now) def mark_blocks_as_computed(self, seq_group: SequenceGroup): # The only need for mark block as computed is for prefix caching, @@ -227,8 +229,9 @@ def get_common_computed_block_ids( seq_block_ids = [ self.block_tables[seq.seq_id].physical_block_ids for seq in seqs ] + # NOTE(sang): This assumes seq_block_ids doesn't contain any None. return self.block_allocator.get_common_computed_block_ids( - seq_block_ids) + seq_block_ids) # type: ignore def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: src_block_table = self.block_tables[parent_seq.seq_id] diff --git a/vllm/core/evictor_v2.py b/vllm/core/evictor_v2.py index b902a39263d14..57759b29347f4 100644 --- a/vllm/core/evictor_v2.py +++ b/vllm/core/evictor_v2.py @@ -32,15 +32,20 @@ def evict(self) -> Tuple[int, int]: @abstractmethod def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, - last_accessed: int): + last_accessed: float): """Adds block to the evictor, making it a candidate for eviction""" pass @abstractmethod - def update(self, block_id: int, last_accessed: int): + def update(self, block_id: int, last_accessed: float): """Update corresponding block's access time in metadata""" pass + @abstractmethod + def remove(self, block_id: int): + """Remove a given block id from the cache.""" + pass + @abstractproperty def num_blocks(self) -> int: pass @@ -55,7 +60,7 @@ class BlockMetaData(): """ def __init__(self, content_hash: int, num_hashed_tokens: int, - last_accessed: int): + last_accessed: float): self.content_hash = content_hash self.num_hashed_tokens = num_hashed_tokens self.last_accessed = last_accessed @@ -96,12 +101,12 @@ def evict(self) -> Tuple[int, int]: return evicted_block_id, evicted_block.content_hash def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, - last_accessed: int): + last_accessed: float): self.free_table[block_id] = BlockMetaData(content_hash, num_hashed_tokens, last_accessed) - def update(self, block_id: int, last_accessed: int): + def update(self, block_id: int, last_accessed: float): self.free_table[block_id].last_accessed = last_accessed def remove(self, block_id: int):