mirror of https://github.com/vllm-project/vllm
parent
b0925b3878
commit
49a3c8662b
|
@ -274,3 +274,90 @@ def test_reset():
|
|||
# Resetting block manager frees all allocated blocks.
|
||||
block_manager.reset()
|
||||
assert block_manager.get_num_free_gpu_blocks() == original_blocks
|
||||
|
||||
|
||||
def test_sliding_window_multi_seq():
    """
    Verify that block allocation and freeing behave correctly when
    multiple sequences share blocks under a sliding window whose
    capacity is smaller than the sequence lengths.
    """
    block_size = 1
    num_cpu_blocks = 8
    num_gpu_blocks = 8
    sliding_window = 2
    block_manager = BlockSpaceManager(
        block_size,
        num_cpu_blocks,
        num_gpu_blocks,
        sliding_window=sliding_window,
        watermark=0)

    # Bind the counter once; it is queried repeatedly below.
    free_gpu_blocks = block_manager.get_num_free_gpu_blocks
    assert free_gpu_blocks() == num_gpu_blocks

    # Allocate a prompt of length 3.
    parent = Sequence(1, "one two three", [0, 1, 2], block_size)
    seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(),
                              None)
    block_manager.allocate(seq_group)

    # The parent seq has len 3, but since sliding_window is 2 we will
    # use at most 2 blocks.
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window

    # Fork the prompt and copy the block table: forking must not
    # increase memory consumption.
    child = parent.fork(2)
    block_manager.fork(parent, child)
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window

    # Parent and child share all blocks right after the fork.
    assert block_manager.get_block_table(
        parent) == block_manager.get_block_table(child)

    # Append a token to the child. The block is shared, so copy on
    # write occurs: each seq now uses 2 blocks, only one of them shared.
    token_id = 4
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slot(child)
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window - 1

    # Append a token to the parent. No change in the free count,
    # because both sequences still share exactly one block.
    token_id = 5
    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slot(parent)
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window - 1

    block_table_parent = block_manager.get_block_table(parent)
    block_table_child = block_manager.get_block_table(child)
    assert block_table_parent != block_table_child
    # The second-to-last block is still shared between the sequences.
    assert block_table_parent[-2] == block_table_child[-2]

    # Free the parent: this drops the ref count of its two in-window
    # blocks by one. The block not shared with the child becomes free;
    # the child still consumes sliding_window blocks.
    block_manager.free(parent)
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window

    # Free the child: everything is released.
    block_manager.free(child)
    assert free_gpu_blocks() == num_gpu_blocks
|
||||
|
|
|
@ -312,7 +312,12 @@ class BlockSpaceManager:
|
|||
# Thus, it is always safe from OOM.
|
||||
src_block_table = self.block_tables[parent_seq.seq_id]
|
||||
self.block_tables[child_seq.seq_id] = src_block_table.copy()
|
||||
for block in src_block_table:
|
||||
# When using a sliding window, blocks will be eventually reused.
|
||||
# In this case the block tables will contain repeated blocks.
|
||||
# When forking, we must make sure that each block's `ref_count`
|
||||
# is only incremented by one, so we deduplicate them by wrapping
|
||||
# them in a set.
|
||||
for block in set(src_block_table):
|
||||
block.ref_count += 1
|
||||
|
||||
def _get_physical_blocks(
|
||||
|
@ -393,7 +398,15 @@ class BlockSpaceManager:
|
|||
return block_number_mapping
|
||||
|
||||
def _free_block_table(self, block_table: BlockTable) -> None:
|
||||
for block in set(block_table):
|
||||
# when using a sliding window, each seq will only use up
|
||||
# to `self.block_sliding_window` blocks. When freeing
|
||||
# the block table, we must make sure to not free blocks more
|
||||
# than once. If no sliding window is used, there is no block
|
||||
# reuse in the block table, so we must free all blocks.
|
||||
blocks_to_free = (block_table[-self.block_sliding_window:]
|
||||
if self.block_sliding_window is not None else
|
||||
block_table)
|
||||
for block in set(blocks_to_free):
|
||||
if block.device == Device.GPU:
|
||||
self.gpu_allocator.free(block)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue