diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp
index a9abc50c28be49..01381c6bbc3d45 100644
--- a/src/common/snippets/src/lowered/loop_manager.cpp
+++ b/src/common/snippets/src/lowered/loop_manager.cpp
@@ -342,27 +342,23 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
     }
 
     for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
-        if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) {
+        OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
+        const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
+        if (subtensor_value == PortDescriptor::ServiceDimensions::FULL_DIM) {
             continue;
         }
 
         OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
-        const auto work_amount =
-            loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx) : 0;
-        const auto work_amount_increment =
-            loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx) : (dim_idx == 0 ? vector_size : 1);
-        const auto id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, work_amount_increment, dim_idx, loop_entry_points, loop_exit_points);
+        const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
+        const auto increment = subtensor_value <= work_amount ? subtensor_value : work_amount;
+        const auto id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
         const auto loop_info = get_loop_info(id);
-        const auto tail_size = work_amount % work_amount_increment;
+        const auto tail_size = work_amount % increment;
         if (tail_size != 0) {
             loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size);
-            if (work_amount > work_amount_increment) {
-                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass(tail_size);
-                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass();
-            }
+            loop_info->handlers[LoopInfo::MAIN_BODY].register_pass(tail_size);
+            loop_info->handlers[LoopInfo::MAIN_BODY].register_pass();
         }
     }
 }
@@ -421,13 +417,14 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target,
     loop_info->set_entry_points(new_entries);
     loop_info->set_exit_points(new_exits);
 
-    // WA: if one of the fused loops is broadcastable (wa = 1), its handlers have less priority.
-    // Need to fix it by avoiding handlers creation for the loops whose work amount less than increment
-    if (loop_info_upper->get_work_amount() > loop_info_lower->get_work_amount()) {
-        loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers);
-    } else {
-        loop_info->handlers = fuse_loop_handlers(loop_info_lower->handlers, loop_info_upper->handlers);
-    }
+    loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers);
+    // Since fusion can be called for broadcastable loops (one of the loops has work_amount = increment = 1),
+    // the maximum values are set for the fused loop
+    loop_info->set_work_amount(std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()));
+    loop_info->set_increment(std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()));
+    // If one of the Loops is an outer Loop for nested loops that split the same dimension,
+    // the new common Loop keeps this status after fusion
+    loop_info->set_outer_splited_loop(loop_info_upper->get_outer_splited_loop() || loop_info_lower->get_outer_splited_loop());
 
     const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper;
     const auto& to = fuse_into_upper ? loop_id_upper : loop_id_lower;
diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp
index 1738d6d8fe9574..ebb5ae40b79559 100644
--- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp
+++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp
@@ -44,20 +44,29 @@ bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_m
 }
 
 bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& loop_target) {
-    auto current_work_amount = loop_current->get_work_amount();
-    auto target_work_amount = loop_target->get_work_amount();
-    // Loop fusion is supported only if Loops have equal increments and the equal/broadcastable work amounts.
+    const auto current_work_amount = loop_current->get_work_amount();
+    const auto target_work_amount = loop_target->get_work_amount();
+    const auto current_increment = loop_current->get_increment();
+    const auto target_increment = loop_target->get_increment();
+    // Loop fusion is supported only if Loops have equal/broadcastable increments and work amounts.
     // Note: For example, Broadcastable work amounts are possible in the following case:
     //         Relu_0 [16x1]   Relu_1 [16x128]
     //                \           /
     //                 Add [16x128]
     // Because of expression order in linear IR and work of MarkLoop algorithm, there are 2 Inner Loops:
-    //  - Relu_0 with work amount `1` and increment `vector size`
+    //  - Relu_0 with work amount `1` and increment `1`
     //  - Relu_1 and Add with work amount `128` and increment `vector size`
     // We can fuse them into one Loop with work amount `128` and increment `vector size`
-    const auto supported_work_amount = current_work_amount == target_work_amount || current_work_amount == 1 || target_work_amount == 1;
-    const auto supported_increment = loop_current->get_increment() == loop_target->get_increment();
-    return supported_work_amount && supported_increment;
+
+    // WA: we can't fuse 2 loops if one of them has a first-iteration handler but the second doesn't,
+    // because in this case the Main/Tail body handlers of the loop without the first-iteration handler
+    // must be reset with new parameters (e.g. tail size). This logic is not implemented yet.
+    const bool first_iter_handlers_match = loop_current->handlers[LoopManager::LoopInfo::FIRST_ITER].empty() ==
+                                           loop_target->handlers[LoopManager::LoopInfo::FIRST_ITER].empty();
+    const bool equal_parameters = current_work_amount == target_work_amount && current_increment == target_increment;
+    const bool current_bcastable = current_work_amount == 1 && current_increment == 1;
+    const bool target_bcastable = target_work_amount == 1 && target_increment == 1;
+    return first_iter_handlers_match && (equal_parameters || current_bcastable || target_bcastable);
 }
 
 void FuseLoops::move(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id,
@@ -124,12 +133,6 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo
     LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos;
     loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos);
     loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, target_loop_id, current_loop_id, false);
-    // Update work_amount for Loop (increment is constant because increments must be the identical for fusion):
-    loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount()));
-    // If one of the Loops is outer for nested loops that splits the same dimension,
-    // after fusion new common Loop save this status
-    loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop());
-
     const auto insertion_place = current_loop_begin_pos;
     const auto is_move_needed = target_loop_end_pos != current_loop_begin_pos;
     if (is_move_needed)
@@ -169,11 +172,6 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo
     LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos;
     loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos);
     loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, current_loop_id, target_loop_id);
-    // Update work_amount for Loop (increment is constant because increments must be the identical for fusion):
-    loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount()));
-    // If one of the Loops is outer for nested loops that splits the same dimension,
-    // after fusion new common Loop save this status
-    loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop());
 
     const auto insertion_place = current_loop_end_pos;
     const auto is_move_needed = insertion_place != target_loop_begin_pos;
diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp
index 75e70c9c553c88..492eb8d17682b1 100644
--- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp
+++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp
@@ -20,13 +20,13 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr;
 
 InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {}
 
 size_t InsertLoadStore::get_count(const PortDescriptorPtr& port_desc) const {
-    const auto layout = port_desc->get_layout();
-    const auto shape = port_desc->get_shape();
+    const auto& layout = port_desc->get_layout();
+    const auto& shape = port_desc->get_shape();
     // Find last dimension by layout
-    const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1);
+    const auto& last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1);
     OPENVINO_ASSERT(last_dim_idx != layout.end() && *last_dim_idx < shape.size(), "Load/Store expression have incorrect layout");
-    const auto dim = shape[*last_dim_idx];
-    return dim == 1 ? 1 : m_vector_size;
+    const auto& dim = shape[*last_dim_idx];
+    return std::min(dim, m_vector_size);
 }
 
 bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) {
diff --git a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
index 82a38ba1f99fa6..f1213f5458dbde 100644
--- a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -70,6 +70,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
         const auto& reduce_type_info = reduce->get_type_info();
         const auto& input_shape = reduce_expr->get_input_port_descriptor(0)->get_shape();
         const auto work_amount = *(input_shape.rbegin());
+        const auto increment = m_vector_size <= work_amount ? m_vector_size : work_amount;
         const bool is_dynamic = reduce->is_dynamic();
 
         // We need an iterator to the inserted element
@@ -87,26 +88,24 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
         const auto initial_fill = push_node(std::make_shared(vector_buffer.second, 0, fill_value));
 
         // Reduce loop
-        const auto fill = push_node(std::make_shared(reduce->get_input_source_output(0), m_vector_size, fill_value));
+        const auto fill = push_node(std::make_shared(reduce->get_input_source_output(0), increment, fill_value));
         const auto accumulation = push_node(get_accumulation_node(fill.second, initial_fill.second, reduce_type_info));
 
         const auto reduce_loop_id = loop_manager->mark_loop(
             fill.first,
             expr_it,
             work_amount,
-            m_vector_size,
+            increment,
             0,
             std::vector{(*fill.first)->get_input_port(0), (*accumulation.first)->get_input_port(1)},
             std::vector{(*accumulation.first)->get_output_port(0)});
         const auto reduce_loop_info = loop_manager->get_loop_info(reduce_loop_id);
-        const auto tail_size = work_amount % m_vector_size;
+        const auto tail_size = work_amount % increment;
         if (tail_size != 0) {
             reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size);
             reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size);
-            if (work_amount > m_vector_size) {
-                reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass(tail_size);
-                reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass();
-            }
+            reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass(tail_size);
+            reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass();
         }
 
         const auto horizon = push_node(get_horizon_node(accumulation.second, reduce_type_info));
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index a8f89dbd6ccb65..ca51104e88cbe2 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -440,7 +440,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
     // TODO: remove SoftmaxDecomposition pass
     pipeline.register_pass(vector_size);
     pipeline.register_pass(vector_size);
-    // pipeline.register_pass(vector_size);
    pipeline.register_pass();
     pipeline.register_pass();
     pipeline.register_pass();
@@ -465,7 +464,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
     pipeline.register_positioned_passes(backend_passes);
     pipeline.run(linear_ir);
 
-    linear_ir.serialize("/home/vgolubev/models/control_flow.xml", "");
     lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
 }
diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp
index 6d46fdebb30d1f..f6f2bc0426409b 100644
--- a/src/common/snippets/tests/src/lowered/pass/loop.cpp
+++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp
@@ -39,7 +39,7 @@ static void init_linear_ir(const std::vector& in_shapes, Linea
     const auto in_shape0 = in_shapes[0].get_shape();
     const auto in_shape1 = in_shapes[1].get_shape();
     const auto inner_wa = std::max(*in_shape0.rbegin(), *in_shape1.rbegin());
-    const auto inner_inc = vector_size;
+    const auto inner_inc = std::min(vector_size, inner_wa);
     const auto blocked_wa = block_size;
     const auto blocked_inc = 1;
     const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1));
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
index 92f9e5d1bae233..65023e37fe7ef3 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
@@ -172,6 +172,11 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
                 loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size);
                 loop_info->handlers[LoopInfo::LAST_ITER].register_pass(1.f);
             }
+        } else {
+            loop_info->handlers[LoopInfo::FIRST_ITER].register_pass(block_size_k);
+            loop_info->handlers[LoopInfo::FIRST_ITER].register_pass();
+            loop_info->handlers[LoopInfo::MAIN_BODY].register_pass(block_size_k);
+            loop_info->handlers[LoopInfo::MAIN_BODY].register_pass(1.f);
         }
     };
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
index f26d7ee894f942..11988c5bd58541 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -20,6 +20,7 @@ std::vector> input_shapes{
         {{1, 1, 32, 23}, {1, 1, 23, 68}},
         {{1, 16, 384, 64}, {1, 16, 64, 384}},
         {{1, 1, 100, 700}, {1, 1, 700, 100}},
+        {{1, 1, 100, 1024}, {1, 1, 1024, 100}},
         {{1, 1, 100, 2500}, {1, 1, 2500, 100}},
         {{1, 1, 100, 4500}, {1, 1, 4500, 100}},
 };
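
The change that recurs across `mark_loop`, `ReduceDecomposition::run`, and the loop tests is one invariant: the increment is clamped to the work amount up front, so `increment <= work_amount` always holds and the old `work_amount > increment` guard around the main-body handlers becomes redundant. The sketch below is a minimal, self-contained C++ model of that invariant plus the fusion predicate from `FuseLoops::can_be_fused`; the type and the names `LoopPlan`, `plan_loop`, and `can_fuse` are hypothetical stand-ins rather than the real snippets `LoopManager` API, and the first-iteration-handler check is omitted for brevity.

#include <algorithm>
#include <cassert>
#include <cstddef>

// Simplified stand-in for LoopManager::LoopInfo (hypothetical, for illustration only).
struct LoopPlan {
    size_t work_amount = 0;
    size_t increment = 0;
    bool needs_tail = false;  // models registering LAST_ITER/MAIN_BODY handlers
};

// Core idea of the patch: clamp the increment to the work amount up front,
// so `increment <= work_amount` always holds and the old
// `work_amount > increment` guard around MAIN_BODY handlers is unnecessary.
LoopPlan plan_loop(size_t work_amount, size_t requested_increment) {
    LoopPlan plan;
    plan.work_amount = work_amount;
    plan.increment = std::min(requested_increment, work_amount);
    plan.needs_tail = plan.increment != 0 && (work_amount % plan.increment != 0);
    return plan;
}

// Fusion predicate mirroring FuseLoops::can_be_fused after the patch:
// loops fuse if parameters match exactly or one side is broadcastable
// (work_amount == increment == 1). First-iteration-handler check omitted.
bool can_fuse(const LoopPlan& a, const LoopPlan& b) {
    const bool equal = a.work_amount == b.work_amount && a.increment == b.increment;
    const bool a_bcast = a.work_amount == 1 && a.increment == 1;
    const bool b_bcast = b.work_amount == 1 && b.increment == 1;
    return equal || a_bcast || b_bcast;
}

int main() {
    // work_amount = 100, vector_size = 16 -> increment 16, tail of 4 iterations
    assert(plan_loop(100, 16).needs_tail);
    // work_amount = 8 < vector_size -> increment clamped to 8, no tail loop
    assert(plan_loop(8, 16).increment == 8 && !plan_loop(8, 16).needs_tail);
    // broadcastable loop (wa = inc = 1) fuses with any loop
    assert(can_fuse(plan_loop(1, 1), plan_loop(128, 16)));
    return 0;
}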