-
Notifications
You must be signed in to change notification settings - Fork 192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Partial preemption for groups with multiple sequences #574
Changes from all commits
54481f0
8f0c1b7
d7fe87f
2cc5114
59d9cde
a2fd89a
0c60e64
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -110,6 +110,78 @@ class BlockManager { | |
return m_block_table[seq_id]; | ||
} | ||
|
||
// Frees the last ("rightmost") block of every not-finished sequence of the group.
//
// For each such sequence the method asserts that m_block_table has an entry
// for its id and then calls free_last_block() on it.
//
// Returns the number of free_last_block() calls that returned true.
const size_t free_rightest_blocks(SequenceGroup::Ptr sequence_group) {
    size_t blocks_released = 0;
    const auto running_sequences = sequence_group->get_not_finished_sequences();
    for (const auto& sequence : running_sequences) {
        const auto seq_id = sequence->get_id();
        OPENVINO_ASSERT(m_block_table.count(seq_id) > 0, "Invalid sequence group.");
        if (free_last_block(seq_id)) {
            ++blocks_released;
        }
    }
    return blocks_released;
}
|
||
// Frees blocks of a sequence group layer by layer: every loop iteration calls
// free_rightest_blocks(), which frees the last block of each not-finished sequence.
// The loop runs while num_required_blocks > phisical_blocks_released, or until
// get_number_of_blocks_occupied_by_sequence() reports 0 for the group.
//
// Out-params (both reset to 0 on entry):
//   phisical_blocks_released - sum of the counts returned by free_rightest_blocks()
//   logical_blocks_released  - number of loop iterations executed
// Returns true when num_required_blocks <= phisical_blocks_released.
const bool free_group_partially_multiple_runnning_sequence(SequenceGroup::Ptr sequence_group, size_t num_required_blocks, size_t& phisical_blocks_released, size_t& logical_blocks_released) { | ||
phisical_blocks_released = 0; | ||
logical_blocks_released = 0; | ||
while (num_required_blocks > phisical_blocks_released) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if case we need to preempt very long sequence, such loop can be expensive. If it's possible, it would be great to compute a number of preempted blocks based on required number of blocks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Simplified this method using formula |
||
size_t released_count = free_rightest_blocks(sequence_group); | ||
logical_blocks_released += 1; | ||
// NOTE(review): when the group no longer occupies any blocks the loop exits here,
// before released_count of this last iteration is added below - confirm this is intended.
if (get_number_of_blocks_occupied_by_sequence(sequence_group) == 0) { | ||
break; | ||
} | ||
phisical_blocks_released += released_count; | ||
} | ||
// NOTE(review): the next statement is a self-assignment and has no effect.
phisical_blocks_released = phisical_blocks_released; | ||
return num_required_blocks <= phisical_blocks_released; | ||
} | ||
|
||
// Partially frees the blocks of a group that has exactly one not-finished sequence
// (asserted). Delegates the actual release to
// free_sequence_partially_single_runnning_sequence(seq_id, num_required_blocks).
//
// phisical_blocks_released (out) is reset to 0 on entry, so it holds a defined value
// on every return path, and is then set to the growth of num_free_blocks() caused
// by the release.
//
// Returns false when the sequence has no block table; otherwise returns whether
// num_required_blocks <= phisical_blocks_released.
const bool free_group_partially_single_runnning_sequence(SequenceGroup::Ptr sequence_group, size_t num_required_blocks, size_t& phisical_blocks_released) {
    // callers may pass an uninitialized variable; never leave it unset
    phisical_blocks_released = 0;

    const auto sequences = sequence_group->get_not_finished_sequences();
    OPENVINO_ASSERT(sequences.size() == 1);
    const auto seq_id = sequences[0]->get_id();
    if (!has_block_table(seq_id)) {
        // no blocks are allocated for this sequence, so it can't be preempted
        return false;
    }

    const auto prev_blocks_count = num_free_blocks();
    free_sequence_partially_single_runnning_sequence(seq_id, num_required_blocks);

    // calculate the number of released blocks
    phisical_blocks_released = num_free_blocks() - prev_blocks_count;

    return num_required_blocks <= phisical_blocks_released;
}
|
||
// Sums the block-table sizes of the not-finished sequences of the group.
//
// Sequences without an entry in m_block_table, or with an empty block table,
// contribute nothing. Sequences whose last block has an index that was already
// seen on an earlier sequence of the group are skipped, so only the first
// sequence per distinct last block is counted.
const size_t get_number_of_blocks_occupied_by_sequence(SequenceGroup::Ptr sequence_group) {
    const auto running_sequences = sequence_group->get_not_finished_sequences();
    size_t num_blocks = 0;
    std::set<size_t> seen_last_block_indices;
    for (const auto& sequence : running_sequences) {
        // single lookup; does not insert an entry for unknown ids
        const auto table_it = m_block_table.find(sequence->get_id());
        if (table_it == m_block_table.end()) {
            continue;
        }
        // bind by reference: the table is only read here
        const auto& block_table = table_it->second;
        if (block_table.empty()) {
            // back() on an empty table would be undefined behaviour
            continue;
        }
        // insert() reports whether this last-block index is new
        if (seen_last_block_indices.insert(block_table.back()->get_index()).second) {
            num_blocks += block_table.size();
        }
    }
    return num_blocks;
}
|
||
const bool has_block_table(uint64_t seq_id) { | ||
return m_block_table.count(seq_id) > 0; | ||
} | ||
|
@@ -153,11 +225,23 @@ class BlockManager { | |
OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); | ||
} | ||
|
||
void free_sequence_partially(size_t seq_id, size_t block_num) { | ||
// currently this method is applicable only for groups with single sequences | ||
// TODO: support for groups with multiple sequences | ||
// Frees the last block of the block table of seq_id via m_allocator.free() and
// removes it from the table; the table entry itself is erased from m_block_table
// once it becomes empty.
//
// Asserts that seq_id has a non-empty block table.
// Returns is_free() of the released block, queried after m_allocator.free().
bool free_last_block(size_t seq_id) {
    // find() instead of operator[]: an unknown seq_id must not create an empty entry
    const auto table_it = m_block_table.find(seq_id);
    OPENVINO_ASSERT(table_it != m_block_table.end() && table_it->second.size() >= 1);

    auto& block_table = table_it->second;
    // keep only a handle to the last block instead of copying the whole table
    auto last_block = block_table.back();
    m_allocator.free(last_block);
    block_table.pop_back();

    if (block_table.empty()) {
        // block_table must not be used after this point
        m_block_table.erase(table_it);
    }
    return last_block->is_free();
}
|
||
void free_sequence_partially_single_runnning_sequence(size_t seq_id, size_t block_num) { | ||
// this method is applicable only for groups with single sequences | ||
|
||
auto block_table = m_block_table[seq_id]; | ||
OPENVINO_ASSERT(block_table.size() >= block_num); | ||
for (size_t idx = 0; idx < block_num; idx++) { | ||
size_t block_idx = m_block_table[seq_id].size() - idx - 1; | ||
|
@@ -166,7 +250,7 @@ class BlockManager { | |
} | ||
m_block_table[seq_id].resize(m_block_table[seq_id].size() - block_num); | ||
|
||
if (m_block_table.size() == 0) { | ||
if (m_block_table[seq_id].size() == 0) { | ||
OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); | ||
} | ||
} | ||
|
@@ -200,6 +284,7 @@ class BlockManager { | |
if (last_block_ids.find(last_block_id) != last_block_ids.end()) | ||
// this block was already processed | ||
continue; | ||
last_block_ids.insert(last_block_id); | ||
|
||
size_t needed_blocks_per_sequence = seq_group->get_num_logical_blocks() - num_physical_blocks; | ||
|
||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -101,52 +101,45 @@ class Scheduler { | |||
size_t prev_blocks_count = m_block_manager.num_free_blocks(); | ||||
size_t num_running_sequences = sequence_group->num_running_seqs(); | ||||
size_t preempted_tokens = 0; | ||||
size_t num_blocks_occupied_by_sequence = m_block_manager.get_number_of_blocks_occupied_by_sequence(sequence_group); | ||||
|
||||
if (num_running_sequences > 1) { | ||||
for (size_t s = 0; s < sequence_group->num_running_seqs(); ++s) { | ||||
auto seq_id = (*sequence_group)[s]->get_id(); | ||||
if (num_blocks_occupied_by_sequence <= blocks_needed) { | ||||
auto sequences = sequence_group->get_not_finished_sequences(); | ||||
for (size_t s = 0; s < sequences.size(); ++s) { | ||||
auto seq_id = sequences[s]->get_id(); | ||||
m_block_manager.free_sequence(seq_id); | ||||
} | ||||
sequence_group->reset(); | ||||
sequence_group->preempt_tokens(processed_tokens); | ||||
sequence_group->set_waiting(); | ||||
return m_block_manager.num_free_blocks() > prev_blocks_count; | ||||
} | ||||
|
||||
// currently partial preemption is enabled only for single running sequence case | ||||
// TODO: implement partial preemption for case with multiple sequences in group | ||||
for (size_t s = 0; s < num_running_sequences; ++s) { | ||||
auto seq_id = (*sequence_group)[s]->get_id(); | ||||
if (!m_block_manager.has_block_table(seq_id)) { | ||||
// no blocks are allocated for this sequence, so it can't be preempted | ||||
return false; | ||||
} | ||||
auto block_table = m_block_manager.get_block_table(seq_id); | ||||
size_t required_blocks = blocks_needed - total_num_released_blocks; | ||||
if (required_blocks >= block_table.size()) { | ||||
// fully drop a sequence(s) from block_manager | ||||
m_block_manager.free_sequence(seq_id); | ||||
} | ||||
else { | ||||
m_block_manager.free_sequence_partially(seq_id, required_blocks); | ||||
} | ||||
|
||||
// calculate the number of released blocks | ||||
auto released_blocks = m_block_manager.num_free_blocks() - prev_blocks_count; | ||||
total_num_released_blocks += released_blocks; | ||||
prev_blocks_count = m_block_manager.num_free_blocks(); | ||||
|
||||
if (num_running_sequences > 1) { | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to distinguish such cases? It looks like multiple sequences within a group is the more generic case and should cover the single-sequence case as well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is not necessary to distinguish them. But for the single-sequence case we can release blocks more efficiently than in the general case, using resize instead of releasing blocks layer by layer: openvino.genai/src/cpp/src/block_manager.hpp Line 413 in 9d35767
So the cases were distinguished only for efficiency. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Removed the distinction between these cases in the PR, as discussed. |
||||
size_t phisycal_blocks_released; | ||||
size_t logical_blocks_released; | ||||
m_block_manager.free_group_partially_multiple_runnning_sequence(sequence_group, blocks_needed, phisycal_blocks_released, logical_blocks_released); | ||||
|
||||
// calculate the number of preempted tokens | ||||
auto tokens_in_last_block = processed_tokens % block_size; | ||||
if (tokens_in_last_block == 0) { | ||||
tokens_in_last_block = block_size; | ||||
} | ||||
preempted_tokens = tokens_in_last_block + std::max<size_t>((int)logical_blocks_released - 1, 0) * block_size; | ||||
|
||||
preempted_tokens += tokens_in_last_block + std::max<size_t>((int)released_blocks - 1, 0) * block_size; | ||||
if (m_block_manager.num_free_blocks() >= blocks_needed) { | ||||
break; | ||||
} | ||||
else { | ||||
OPENVINO_ASSERT(num_running_sequences == 1); | ||||
size_t phisycal_blocks_released; | ||||
m_block_manager.free_group_partially_single_runnning_sequence(sequence_group, blocks_needed, phisycal_blocks_released); | ||||
|
||||
// calculate the number of preempted tokens | ||||
auto tokens_in_last_block = processed_tokens % block_size; | ||||
if (tokens_in_last_block == 0) { | ||||
tokens_in_last_block = block_size; | ||||
} | ||||
preempted_tokens = tokens_in_last_block + std::max<size_t>((int)phisycal_blocks_released - 1, 0) * block_size; | ||||
} | ||||
|
||||
// case when preemption requires preempt prompt tokens | ||||
if (!m_config.dynamic_split_fuse && processed_tokens - preempted_tokens < sequence_group->get_prompt_len()) { | ||||
// preempt prompt fully to not leave partially generated prompt | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this method (or any other) is not planned to be used as public API, let's move it to the private section.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed this method.