diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index c4c680e109a84..bcf08cda09f46 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -410,6 +410,123 @@ def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
         assert (len(res) == zero_point_blocks)
 
+    # Test case asserting that blocks promoted after the first immutable
+    # block are freed into the hashless allocator, while the first
+    # immutable block gets its refcount increased.
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [3])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(10)))
+    def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
+        random.seed(seed)
+
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        token_ids = list(range(block_size))
+
+        block = allocator.allocate_immutable(prev_block=None,
+                                             token_ids=token_ids)
+
+        assert allocator._refcounter.get(block.block_id) == 1
+        m = allocator.allocate_mutable(prev_block=None)
+
+        block_id = m.block_id
+        for i in range(block_size):
+            m.append_token_ids([i])
+        # After the block is promoted from mutable to immutable, if a block
+        # with the same content hash already exists, the promoted block
+        # shall be released into the hashless allocator, and the first
+        # immutable block's refcount shall be increased by 1.
+        assert m.block_id == block.block_id
+        assert block_id in allocator._hashless_allocator._free_block_indices
+        assert allocator._refcounter.get(block.block_id) == 2
+
+    # Test case mixing eviction and allocation, making sure they work
+    # as expected together.
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [3])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(10)))
+    def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
+        random.seed(seed)
+
+        all_blocks_list = [i for i in range(num_blocks)]
+        zero_ref = {i: 0 for i in range(num_blocks)}
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        token_ids = list(range(num_blocks * block_size))
+
+        # Now we have num_blocks free blocks in the hashless allocator; the
+        # internal tracking structures _blocks and _cached_blocks and the
+        # evictor are empty, and every block's refcount shall be 0.
+        assert list(allocator._hashless_allocator._free_block_indices
+                    ) == all_blocks_list
+        assert len(allocator._blocks.keys()) == 0
+        assert len(allocator._cached_blocks.values()) == 0
+        assert len(allocator.evictor.free_table.keys()) == 0
+        assert allocator._refcounter._refcounts == zero_ref
+
+        # Allocate immutable chains, each consisting of a single block.
+        new_block = []
+        for i in range(num_blocks):
+            block = allocator.allocate_immutable(
+                prev_block=None,
+                token_ids=token_ids[block_size * i:block_size * (i + 1)])
+            new_block.append(block)
+
+        # Free all blocks. Now all blocks shall be in the evictor,
+        # there shall be no tracking data left in _blocks,
+        # all blocks shall still be tracked in _cached_blocks,
+        # and all blocks' refcounts shall be zero.
+        for block in new_block:
+            allocator.free(block)
+
+        assert len(allocator._blocks.keys()) == 0
+        assert len(allocator._hashless_allocator._free_block_indices) == 0
+        assert list(allocator._cached_blocks.values()) == all_blocks_list
+        assert list(allocator.evictor.free_table.keys()) == all_blocks_list
+        assert allocator._refcounter._refcounts == zero_ref
+
+        # Allocate a mutable block: the first block shall be evicted,
+        # its content hash set to None, and its refcount set to 1.
+        mutable = allocator.allocate_mutable(prev_block=None)
+
+        assert mutable.block_id == 0
+        assert mutable.content_hash is None
+        assert 0 in allocator._blocks
+        assert allocator._refcounter.get(0) == 1
+        assert 0 not in allocator._cached_blocks
+        assert 0 not in allocator.evictor
+
+        # Since this mutable block has no hash yet, it shall be released
+        # into the hashless allocator.
+        allocator.free(mutable)
+
+        assert len(allocator._blocks.keys()) == 0
+        assert allocator._refcounter._refcounts == zero_ref
+        assert 0 not in allocator._cached_blocks
+        assert 0 not in allocator.evictor
+        assert 0 in allocator._hashless_allocator._free_block_indices
+
+        # When allocating an immutable block with the first block_size
+        # tokens, we shall get the free block from the hashless allocator,
+        # leaving no block in the hashless allocator.
+        block = allocator.allocate_immutable(prev_block=None,
+                                             token_ids=token_ids[:block_size])
+
+        assert block.block_id == 0
+        assert len(allocator._hashless_allocator._free_block_indices) == 0
+        assert 0 in allocator._blocks
+        assert 0 in allocator._cached_blocks.values()
+        assert allocator._refcounter.get(0) == 1
+        assert 0 not in allocator.evictor
+
+        # Allocate a mutable block again; it shall be popped from evictor.
+        mutable = allocator.allocate_mutable(prev_block=None)
+        assert len(allocator._hashless_allocator._free_block_indices) == 0
+        assert mutable.block_id not in allocator.evictor.free_table
+        assert allocator._refcounter.get(mutable.block_id) == 1
+
     # Test case where two last accessed times are equal
     @staticmethod
     @pytest.mark.parametrize("num_blocks", [1024])
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 882f301c1f697..4eb32f145b05b 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -160,21 +160,17 @@ def allocate_mutable(self,
         # If the evictor has blocks available for eviction, evict a block
         # and return it.
         if self.evictor.num_blocks > 0:
+            # Here we get an evicted block, which is only added into the
+            # evictor if its ref counter is 0. Since its content is about
+            # to change, we also need to remove it from the _cached_blocks
+            # tracking dict.
             block_id, content_hash_to_evict = self.evictor.evict()
-            # Here we may have scenario that several blocks have
-            # the same content hash, but due to the latter coming block
-            # is coming from mutable to immutable path, their physical
-            # block is added into evictor.
-            # However in this case, we shall not pop the _cached_blocks,
-            # as the same content is still used by others, which means
-            # we need to check ref before decide to pop the list.
+
             _block_id = self._cached_blocks[content_hash_to_evict]
-            refcount = self._refcounter.get(_block_id)
-            if refcount == 1:
-                self._cached_blocks.pop(content_hash_to_evict)
-                assert _block_id == block_id
+            assert self._refcounter.get(_block_id) == 0
+            assert _block_id == block_id
+
+            self._cached_blocks.pop(content_hash_to_evict)
 
             self._refcounter.incr(block_id)
@@ -199,7 +195,11 @@ def allocate_mutable(self,
 
     def _incr_refcount_cached_block(self, block: Block,
                                     block_id: BlockId) -> None:
-        # since block is already computed, mark it
+        # _incr_refcount_cached_block is now called from two places:
+        # allocate_immutable and promote_to_immutable_block, whenever the
+        # content hash hits an existing entry in _cached_blocks.
+        # In both cases, an already computed block with the same content
+        # exists and is now shared with this block, so mark it computed.
         block.computed = True
 
         refcount = self._refcounter.incr(block_id)
@@ -228,13 +228,19 @@ def _free_block_id_for_block(self, block_id: BlockId,
                                  block: Block) -> None:
         assert isinstance(block, PrefixCachingBlock)
 
-        if block.content_hash is None:
+        # If we come here from promote_to_immutable_block, then
+        # block.content_hash is never None.
+        # However, we still need to release the block holding the same
+        # content, so that the physical block can be reused.
+        if block.block_id != block_id or block.content_hash is None:
             refcount = self._refcounter.get(block_id)
             # We have fork case where block would get more than one ref,
             # so we cannot free it from tracking if ref cnt large than 1
-            if refcount <= 1:
-                assert block.block_id is not None
+            assert block.block_id is not None
+            refcount = self._refcounter.get(block.block_id)
+            if refcount == 1:
                 del self._blocks[block.block_id]
+
             return self._hashless_allocator.free(block)
 
         refcount = self._refcounter.decr(block_id)
@@ -317,7 +323,8 @@ def promote_to_immutable_block(self, block: Block) -> BlockId:
         if block.content_hash not in self._cached_blocks:
             self._cached_blocks[block.content_hash] = block.block_id
         else:
-            self._free_block_id_for_block(block.block_id, block)
+            self._free_block_id_for_block(
+                self._cached_blocks[block.content_hash], block)
             self._incr_refcount_cached_block(
                 block, self._cached_blocks[block.content_hash])