Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplified partial preemption algorithm. #730

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 5 additions & 42 deletions src/cpp/src/block_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,52 +251,17 @@ class BlockManager {
return m_block_table[seq_id];
}

const size_t free_rightest_blocks(SequenceGroup::Ptr sequence_group) {
size_t blocks_released = 0;
const size_t free_group_partially(SequenceGroup::Ptr sequence_group, size_t num_required_blocks) {
size_t blocks_num = std::ceil(num_required_blocks / sequence_group->get_not_finished_sequences().size());
auto running_sequences = sequence_group->get_not_finished_sequences();
std::set<size_t> blocks_released_indices;
for (size_t idx = 0; idx < running_sequences.size(); ++idx) {
auto seq_id = running_sequences[idx]->get_id();
OPENVINO_ASSERT(m_block_table.count(seq_id) > 0, "Invalid sequence group.");
auto block_table = m_block_table[seq_id];
if (free_last_block(seq_id)) {
blocks_released++;
}
}
return blocks_released;
}

const bool free_group_partially_multiple_runnning_sequence(SequenceGroup::Ptr sequence_group, size_t num_required_blocks, size_t& phisical_blocks_released, size_t& logical_blocks_released) {
phisical_blocks_released = 0;
logical_blocks_released = 0;
while (num_required_blocks > phisical_blocks_released) {
size_t released_count = free_rightest_blocks(sequence_group);
logical_blocks_released += 1;
if (get_number_of_blocks_occupied_by_sequence(sequence_group) == 0) {
break;
}
phisical_blocks_released += released_count;
free_sequence_partially(seq_id, blocks_num);
}
return num_required_blocks <= phisical_blocks_released;
}

const bool free_group_partially_single_runnning_sequence(SequenceGroup::Ptr sequence_group, size_t num_required_blocks, size_t& phisical_blocks_released) {
auto sequences = sequence_group->get_not_finished_sequences();
OPENVINO_ASSERT(sequences.size() == 1);
auto running_sequence = sequences[0];
auto seq_id = running_sequence->get_id();
if (!has_block_table(seq_id)) {
// no blocks are allocated for this sequence, so it can't be preempted
return false;
}
auto block_table = get_block_table(seq_id);
auto prev_blocks_count = num_free_blocks();
free_sequence_partially_single_runnning_sequence(seq_id, num_required_blocks);

// calculate the number of released blocks
phisical_blocks_released = num_free_blocks() - prev_blocks_count;

return num_required_blocks <= phisical_blocks_released;
return blocks_num;
}

const size_t get_number_of_blocks_occupied_by_sequence(SequenceGroup::Ptr sequence_group) {
Expand Down Expand Up @@ -399,15 +364,13 @@ class BlockManager {
return block_table[block_idx]->is_free();
}

void free_sequence_partially_single_runnning_sequence(size_t seq_id, size_t block_num) {
// this method is applicable only for groups with single sequences
void free_sequence_partially(size_t seq_id, size_t block_num) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here during multiple sequences case, we suppose that we can free the same amount of blocks for each sequence, which can be wrong for beam search case...


auto block_table = m_block_table[seq_id];
OPENVINO_ASSERT(block_table.size() >= block_num);
for (size_t idx = 0; idx < block_num; idx++) {
size_t block_idx = m_block_table[seq_id].size() - idx - 1;
m_allocator.free(block_table[block_idx]);
OPENVINO_ASSERT(block_table[block_idx]->is_free());
}
m_block_table[seq_id].resize(m_block_table[seq_id].size() - block_num);

Expand Down
29 changes: 6 additions & 23 deletions src/cpp/src/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,31 +117,14 @@ class Scheduler {
return m_block_manager.num_free_blocks() > prev_blocks_count;
}

if (num_running_sequences > 1) {
size_t phisycal_blocks_released;
size_t logical_blocks_released;
m_block_manager.free_group_partially_multiple_runnning_sequence(sequence_group, blocks_needed, phisycal_blocks_released, logical_blocks_released);

// calculate the number of preempted tokens
auto tokens_in_last_block = processed_tokens % block_size;
if (tokens_in_last_block == 0) {
tokens_in_last_block = block_size;
}
preempted_tokens = tokens_in_last_block + std::max<size_t>((int)logical_blocks_released - 1, 0) * block_size;
size_t logical_blocks_released = m_block_manager.free_group_partially(sequence_group, blocks_needed);

// calculate the number of preempted tokens
auto tokens_in_last_block = processed_tokens % block_size;
if (tokens_in_last_block == 0) {
tokens_in_last_block = block_size;
}
else {
OPENVINO_ASSERT(num_running_sequences == 1);
size_t phisycal_blocks_released;
m_block_manager.free_group_partially_single_runnning_sequence(sequence_group, blocks_needed, phisycal_blocks_released);

// calculate the number of preempted tokens
auto tokens_in_last_block = processed_tokens % block_size;
if (tokens_in_last_block == 0) {
tokens_in_last_block = block_size;
}
preempted_tokens = tokens_in_last_block + std::max<size_t>((int)phisycal_blocks_released - 1, 0) * block_size;
}
preempted_tokens = tokens_in_last_block + std::max<size_t>((int)logical_blocks_released - 1, 0) * block_size;

// case when preemption requires preempt prompt tokens
if (!m_config.dynamic_split_fuse && processed_tokens - preempted_tokens < sequence_group->get_prompt_len()) {
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/block_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ TEST(TestBlockManager, general_test) {
EXPECT_EQ(bm.get_block_table(seq_id).size(), 6);
EXPECT_EQ(bm.num_free_blocks(), 0);

bm.free_sequence_partially_single_runnning_sequence(seq_id, 4);
bm.free_sequence_partially(seq_id, 4);
EXPECT_EQ(bm.get_block_table(seq_id).size(), 2);
EXPECT_EQ(bm.num_free_blocks(), 4);

Expand Down
Loading