Commit 82b5294
Don't create loops with increment > work_amount
v-Golubev committed Dec 5, 2023
1 parent b2e1d84 commit 82b5294
Showing 8 changed files with 51 additions and 53 deletions.
37 changes: 17 additions & 20 deletions src/common/snippets/src/lowered/loop_manager.cpp
@@ -342,27 +342,23 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
     }
 
     for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
-        if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) {
+        OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
+        const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
+        if (subtensor_value == PortDescriptor::ServiceDimensions::FULL_DIM) {
             continue;
         }
 
         OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
-        const auto work_amount =
-            loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx)
-                                         : 0;
-        const auto work_amount_increment =
-            loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx)
-                                            : (dim_idx == 0 ? vector_size : 1);
-        const auto id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, work_amount_increment, dim_idx, loop_entry_points, loop_exit_points);
+        const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
+        const auto increment = subtensor_value <= work_amount ? subtensor_value : work_amount;
+        const auto id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
         const auto loop_info = get_loop_info(id);
 
-        const auto tail_size = work_amount % work_amount_increment;
+        const auto tail_size = work_amount % increment;
         if (tail_size != 0) {
             loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::DefaultTailLoopHandler>(tail_size);
-            if (work_amount > work_amount_increment) {
-                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ReduceWorkAmount>(tail_size);
-                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ZeroFinalizationOffsets>();
-            }
+            loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ReduceWorkAmount>(tail_size);
+            loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ZeroFinalizationOffsets>();
         }
     }
 }
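The hunk above is the heart of the commit: the increment is clamped by the work amount, so tail_size (work_amount % increment) is computed against an increment that can never exceed the trip count. A minimal standalone sketch of that selection logic (choose_increment is an illustrative name, not the LoopManager API):

#include <cstddef>
#include <cstdio>

// Sketch of the increment selection introduced above: the subtensor value is
// taken as the increment only if it does not exceed the work amount.
static size_t choose_increment(size_t work_amount, size_t subtensor_value) {
    return subtensor_value <= work_amount ? subtensor_value : work_amount;
}

int main() {
    const size_t vector_size = 16;
    // work_amount >= increment: usual case, the remainder goes to LAST_ITER handlers
    size_t wa = 23;
    size_t inc = choose_increment(wa, vector_size);
    std::printf("wa=%zu inc=%zu tail=%zu\n", wa, inc, wa % inc);  // wa=23 inc=16 tail=7
    // work_amount < vector_size: the increment is clamped, so no loop with
    // increment > work_amount is created and tail_size is 0
    wa = 7;
    inc = choose_increment(wa, vector_size);
    std::printf("wa=%zu inc=%zu tail=%zu\n", wa, inc, wa % inc);  // wa=7 inc=7 tail=0
    return 0;
}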
@@ -421,13 +417,14 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target,
     loop_info->set_entry_points(new_entries);
     loop_info->set_exit_points(new_exits);
 
-    // WA: if one of the fused loops is broadcastable (wa = 1), its handlers have less priority.
-    // Need to fix it by avoiding handlers creation for the loops whose work amount less than increment
-    if (loop_info_upper->get_work_amount() > loop_info_lower->get_work_amount()) {
-        loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers);
-    } else {
-        loop_info->handlers = fuse_loop_handlers(loop_info_lower->handlers, loop_info_upper->handlers);
-    }
+    loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers);
+    // Since fusion can be called for broadcastable loops (one of the loops has work_amount = increment = 1),
+    // the maximum of both values is set on the fused loop
+    loop_info->set_work_amount(std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()));
+    loop_info->set_increment(std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()));
+    // If one of the Loops is outer for nested loops that split the same dimension,
+    // the new common Loop keeps this status after fusion
+    loop_info->set_outer_splited_loop(loop_info_upper->get_outer_splited_loop() || loop_info_lower->get_outer_splited_loop());
 
     const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper;
     const auto& to = fuse_into_upper ? loop_id_upper : loop_id_lower;
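Since a broadcastable loop carries work_amount = increment = 1, taking the maximum of both parameters means it never overrides the "real" loop it is fused with. A small compile-time sketch of that merge rule (LoopParams and fuse_params are hypothetical stand-ins for the LoopInfo fields, not the real API):

#include <algorithm>
#include <cstddef>

// Illustrative stand-in for the LoopInfo parameters merged above.
struct LoopParams {
    size_t work_amount;
    size_t increment;
};

// Merge rule after fusion: a broadcastable loop (work_amount = increment = 1)
// never wins, so the fused loop keeps the parameters of the non-trivial loop.
constexpr LoopParams fuse_params(LoopParams upper, LoopParams lower) {
    return {std::max(upper.work_amount, lower.work_amount),
            std::max(upper.increment, lower.increment)};
}

// Fusing Relu_0 (wa=1, inc=1) with Relu_1/Add (wa=128, inc=16) yields a 128/16 loop.
static_assert(fuse_params({1, 1}, {128, 16}).work_amount == 128, "broadcastable loop does not win");
static_assert(fuse_params({1, 1}, {128, 16}).increment == 16, "increment taken from the real loop");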
34 changes: 16 additions & 18 deletions src/common/snippets/src/lowered/pass/fuse_loops.cpp
@@ -44,20 +44,29 @@ bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_m
 }
 
 bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& loop_target) {
-    auto current_work_amount = loop_current->get_work_amount();
-    auto target_work_amount = loop_target->get_work_amount();
-    // Loop fusion is supported only if Loops have equal increments and the equal/broadcastable work amounts.
+    const auto current_work_amount = loop_current->get_work_amount();
+    const auto target_work_amount = loop_target->get_work_amount();
+    const auto current_increment = loop_current->get_increment();
+    const auto target_increment = loop_target->get_increment();
+    // Loop fusion is supported only if Loops have equal/broadcastable increments and work amounts.
     // Note: For example, Broadcastable work amounts are possible in the following case:
     //        Relu_0 [16x1]   Relu_1 [16x128]
     //                 \           /
     //                Add [16x128]
     // Because of expression order in linear IR and work of MarkLoop algorithm, there are 2 Inner Loops:
-    //  - Relu_0 with work amount `1` and increment `vector size`
+    //  - Relu_0 with work amount `1` and increment `1`
     //  - Relu_1 and Add with work amount `128` and increment `vector size`
     // We can fuse them into one Loop with work amount `128` and increment `vector size`
-    const auto supported_work_amount = current_work_amount == target_work_amount || current_work_amount == 1 || target_work_amount == 1;
-    const auto supported_increment = loop_current->get_increment() == loop_target->get_increment();
-    return supported_work_amount && supported_increment;
+
+    // WA: we can't fuse 2 loops if one of them has a first iteration handler but the second hasn't,
+    // because in this case the Main/Tail body handlers of the loop without the first iter handler must be reset with new parameters
+    // (e.g. tail size). This logic is not implemented for now.
+    const bool first_iter_handlers_match = loop_current->handlers[LoopManager::LoopInfo::FIRST_ITER].empty() ==
+                                           loop_target->handlers[LoopManager::LoopInfo::FIRST_ITER].empty();
+    const bool equal_parameters = current_work_amount == target_work_amount && current_increment == target_increment;
+    const bool current_bcastable = current_work_amount == 1 && current_increment == 1;
+    const bool target_bcastable = target_work_amount == 1 && target_increment == 1;
+    return first_iter_handlers_match && (equal_parameters || current_bcastable || target_bcastable);
 }
 
 void FuseLoops::move(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id,
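The fusibility predicate now admits broadcastable loops in addition to exactly matching ones. A self-contained restatement of the check (LoopDesc is a hypothetical struct; handler presence is reduced to a bool standing in for handlers[FIRST_ITER].empty()):

#include <cstddef>

// Illustrative model of the fields can_be_fused() inspects.
struct LoopDesc {
    size_t work_amount;
    size_t increment;
    bool has_first_iter_handler;
};

constexpr bool can_be_fused(const LoopDesc& current, const LoopDesc& target) {
    // WA from the diff: both loops must agree on whether a first-iteration
    // handler exists, since re-parametrizing Main/Tail bodies is not implemented.
    const bool first_iter_match = current.has_first_iter_handler == target.has_first_iter_handler;
    const bool equal_parameters =
        current.work_amount == target.work_amount && current.increment == target.increment;
    const bool current_bcastable = current.work_amount == 1 && current.increment == 1;
    const bool target_bcastable = target.work_amount == 1 && target.increment == 1;
    return first_iter_match && (equal_parameters || current_bcastable || target_bcastable);
}

// Relu_0 (wa=1, inc=1) fuses with Relu_1/Add (wa=128, inc=16)...
static_assert(can_be_fused({1, 1, false}, {128, 16, false}), "broadcastable fusion");
// ...but loops with different non-unit increments do not.
static_assert(!can_be_fused({128, 8, false}, {128, 16, false}), "increment mismatch");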
Expand Down Expand Up @@ -124,12 +133,6 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo
LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos;
loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos);
loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, target_loop_id, current_loop_id, false);
// Update work_amount for Loop (increment is constant because increments must be the identical for fusion):
loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount()));
// If one of the Loops is outer for nested loops that splits the same dimension,
// after fusion new common Loop save this status
loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop());

const auto insertion_place = current_loop_begin_pos;
const auto is_move_needed = target_loop_end_pos != current_loop_begin_pos;
if (is_move_needed)
Expand Down Expand Up @@ -169,11 +172,6 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo
LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos;
loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos);
loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, current_loop_id, target_loop_id);
// Update work_amount for Loop (increment is constant because increments must be the identical for fusion):
loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount()));
// If one of the Loops is outer for nested loops that splits the same dimension,
// after fusion new common Loop save this status
loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop());

const auto insertion_place = current_loop_end_pos;
const auto is_move_needed = insertion_place != target_loop_begin_pos;
10 changes: 5 additions & 5 deletions src/common/snippets/src/lowered/pass/insert_load_store.cpp
@@ -20,13 +20,13 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr;
 InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {}
 
 size_t InsertLoadStore::get_count(const PortDescriptorPtr& port_desc) const {
-    const auto layout = port_desc->get_layout();
-    const auto shape = port_desc->get_shape();
+    const auto& layout = port_desc->get_layout();
+    const auto& shape = port_desc->get_shape();
     // Find last dimension by layout
-    const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1);
+    const auto& last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1);
     OPENVINO_ASSERT(last_dim_idx != layout.end() && *last_dim_idx < shape.size(), "Load/Store expression have incorrect layout");
-    const auto dim = shape[*last_dim_idx];
-    return dim == 1 ? 1 : m_vector_size;
+    const auto& dim = shape[*last_dim_idx];
+    return std::min(dim, m_vector_size);
 }
 
 bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) {
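get_count() now returns min(dim, vector_size) instead of the old 1-or-vector_size choice, so a Load/Store over, say, 7 elements gets count 7 rather than a full vector. A sketch of the same lookup, assuming layout stores the dimension order the way PortDescriptor does (the free function below is an illustrative stand-in):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Sketch of get_count(): find which position of `shape` the layout maps to the
// last (innermost) dimension, then clamp the element count by the vector size.
static size_t get_count(const std::vector<size_t>& layout,
                        const std::vector<size_t>& shape,
                        size_t vector_size) {
    const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1);
    assert(last_dim_idx != layout.end() && *last_dim_idx < shape.size());
    const size_t dim = shape[*last_dim_idx];
    return std::min(dim, vector_size);  // was: dim == 1 ? 1 : vector_size
}

int main() {
    // Planar layout {0, 1}: the innermost dimension is shape[1].
    assert(get_count({0, 1}, {16, 7}, 16) == 7);    // old code returned 16 here
    assert(get_count({0, 1}, {16, 128}, 16) == 16);
    assert(get_count({0, 1}, {16, 1}, 16) == 1);
    return 0;
}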
13 changes: 6 additions & 7 deletions src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -70,6 +70,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
     const auto& reduce_type_info = reduce->get_type_info();
     const auto& input_shape = reduce_expr->get_input_port_descriptor(0)->get_shape();
     const auto work_amount = *(input_shape.rbegin());
+    const auto increment = m_vector_size <= work_amount ? m_vector_size : work_amount;
     const bool is_dynamic = reduce->is_dynamic();
 
     // We need an iterator to the inserted element
@@ -87,26 +88,24 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
     const auto initial_fill = push_node(std::make_shared<op::Fill>(vector_buffer.second, 0, fill_value));
 
     // Reduce loop
-    const auto fill = push_node(std::make_shared<op::Fill>(reduce->get_input_source_output(0), m_vector_size, fill_value));
+    const auto fill = push_node(std::make_shared<op::Fill>(reduce->get_input_source_output(0), increment, fill_value));
     const auto accumulation = push_node(get_accumulation_node(fill.second, initial_fill.second, reduce_type_info));
 
     const auto reduce_loop_id = loop_manager->mark_loop(
         fill.first,
         expr_it,
         work_amount,
-        m_vector_size,
+        increment,
         0,
         std::vector<ExpressionPort>{(*fill.first)->get_input_port(0), (*accumulation.first)->get_input_port(1)},
         std::vector<ExpressionPort>{(*accumulation.first)->get_output_port(0)});
     const auto reduce_loop_info = loop_manager->get_loop_info(reduce_loop_id);
-    const auto tail_size = work_amount % m_vector_size;
+    const auto tail_size = work_amount % increment;
     if (tail_size != 0) {
         reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<DefaultTailLoopHandler>(tail_size);
         reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
-        if (work_amount > m_vector_size) {
-            reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
-            reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
-        }
+        reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
+        reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
     }
     const auto horizon = push_node(get_horizon_node(accumulation.second, reduce_type_info));
 
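For context, the decomposed reduction works on whole chunks and neutralizes the out-of-range lanes of the last chunk with op::Fill (fill_value is the identity of the reduction, 0 for a sum), then collapses the accumulator with a horizon op. A scalar C++ model of that scheme, with hypothetical names (reduce_sum is not the real pass, just an analogue of what the generated loop computes):

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of the decomposed ReduceSum: process `increment` elements per
// iteration; on the tail iteration, lanes past the valid range behave as if
// filled with the identity value (what op::Fill/SetFillOffset arrange in the IR).
static float reduce_sum(const std::vector<float>& data, size_t increment) {
    std::vector<float> acc(increment, 0.0f);  // vector buffer filled with the identity
    for (size_t base = 0; base < data.size(); base += increment) {
        for (size_t lane = 0; lane < increment; ++lane) {
            const size_t i = base + lane;
            acc[lane] += i < data.size() ? data[i] : 0.0f;  // Fill with identity
        }
    }
    float result = 0.0f;  // horizon op: horizontal reduction of the accumulator
    for (float v : acc)
        result += v;
    return result;
}

int main() {
    std::vector<float> data(23, 1.0f);
    const size_t work_amount = data.size();
    const size_t vector_size = 16;
    // The change above: the increment is clamped by the work amount.
    const size_t increment = vector_size <= work_amount ? vector_size : work_amount;
    assert(reduce_sum(data, increment) == 23.0f);
    return 0;
}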
2 changes: 0 additions & 2 deletions src/common/snippets/src/op/subgraph.cpp
@@ -440,7 +440,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
     // TODO: remove SoftmaxDecomposition pass
     pipeline.register_pass<lowered::pass::SoftmaxDecomposition>(vector_size);
     pipeline.register_pass<lowered::pass::ReduceDecomposition>(vector_size);
-    // pipeline.register_pass<lowered::pass::ReduceSumDecomposition>(vector_size);
     pipeline.register_pass<lowered::pass::FuseLoops>();
     pipeline.register_pass<lowered::pass::SplitLoops>();
     pipeline.register_pass<lowered::pass::MoveResultOutOfLoop>();
@@ -465,7 +464,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
 
     pipeline.register_positioned_passes(backend_passes);
     pipeline.run(linear_ir);
-    linear_ir.serialize("/home/vgolubev/models/control_flow.xml", "");
 
     lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
 }
2 changes: 1 addition & 1 deletion src/common/snippets/tests/src/lowered/pass/loop.cpp
@@ -39,7 +39,7 @@ static void init_linear_ir(const std::vector<ov::PartialShape>& in_shapes, Linea
     const auto in_shape0 = in_shapes[0].get_shape();
     const auto in_shape1 = in_shapes[1].get_shape();
     const auto inner_wa = std::max(*in_shape0.rbegin(), *in_shape1.rbegin());
-    const auto inner_inc = vector_size;
+    const auto inner_inc = std::min(vector_size, inner_wa);
     const auto blocked_wa = block_size;
     const auto blocked_inc = 1;
     const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1));
@@ -172,6 +172,11 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
             loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetBrgemmKBlockSize>(tail_size);
             loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetBrgemmBeta>(1.f);
         }
+    } else {
+        loop_info->handlers[LoopInfo::FIRST_ITER].register_pass<SetSingleIterationWithWorkAmount>(block_size_k);
+        loop_info->handlers[LoopInfo::FIRST_ITER].register_pass<ZeroFinalizationOffsets>();
+        loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(block_size_k);
+        loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<SetBrgemmBeta>(1.f);
+    }
     }
 };
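The motivation for splitting out FIRST_ITER here: when accumulating over K blocks, the first brgemm call must overwrite the output (beta = 0) while every later block accumulates into it (beta = 1), which is what MAIN_BODY's SetBrgemmBeta(1.f) expresses. A scalar model of beta-controlled K blocking (gemm_block is an illustrative stand-in; the real brgemm is a JIT kernel):

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar stand-in for one brgemm call over a K block: C = A_blk*B_blk + beta*C.
// beta = 0 on the first block overwrites C, beta = 1 accumulates into it.
static void gemm_block(const std::vector<float>& A, const std::vector<float>& B,
                       std::vector<float>& C, size_t M, size_t N, size_t K,
                       size_t k0, size_t k_block, float beta) {
    for (size_t m = 0; m < M; ++m)
        for (size_t n = 0; n < N; ++n) {
            float sum = 0.0f;
            for (size_t k = k0; k < k0 + k_block; ++k)
                sum += A[m * K + k] * B[k * N + n];
            C[m * N + n] = beta * C[m * N + n] + sum;
        }
}

int main() {
    const size_t M = 2, N = 3, K = 8, block_size_k = 4;
    std::vector<float> A(M * K, 1.0f), B(K * N, 1.0f), C(M * N, -7.0f);  // garbage in C
    for (size_t k0 = 0; k0 < K; k0 += block_size_k) {
        // FIRST_ITER runs with the default beta = 0; MAIN_BODY gets SetBrgemmBeta(1.f)
        const float beta = k0 == 0 ? 0.0f : 1.0f;
        gemm_block(A, B, C, M, N, K, k0, block_size_k, beta);
    }
    for (float c : C)
        assert(c == 8.0f);  // full K reduction despite the garbage initial C
    return 0;
}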
@@ -20,6 +20,7 @@ std::vector<std::vector<ov::PartialShape>> input_shapes{
     {{1, 1, 32, 23}, {1, 1, 23, 68}},
     {{1, 16, 384, 64}, {1, 16, 64, 384}},
     {{1, 1, 100, 700}, {1, 1, 700, 100}},
+    {{1, 1, 100, 1024}, {1, 1, 1024, 100}},
     {{1, 1, 100, 2500}, {1, 1, 2500, 100}},
     {{1, 1, 100, 4500}, {1, 1, 4500, 100}},
 };
