Commit 82b5294
Don't create loops with increment > work_amount
v-Golubev committed Dec 5, 2023
1 parent b2e1d84 commit 82b5294
Showing 8 changed files with 51 additions and 53 deletions.
37 changes: 17 additions & 20 deletions src/common/snippets/src/lowered/loop_manager.cpp
@@ -342,27 +342,23 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
     }
 
     for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
-        if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) {
+        OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
+        const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
+        if (subtensor_value == PortDescriptor::ServiceDimensions::FULL_DIM) {
             continue;
         }
 
         OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
-        const auto work_amount =
-            loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx)
-                                         : 0;
-        const auto work_amount_increment =
-            loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx)
-                                            : (dim_idx == 0 ? vector_size : 1);
-        const auto id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, work_amount_increment, dim_idx, loop_entry_points, loop_exit_points);
+        const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
+        const auto increment = subtensor_value <= work_amount ? subtensor_value : work_amount;
+        const auto id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
         const auto loop_info = get_loop_info(id);
 
-        const auto tail_size = work_amount % work_amount_increment;
+        const auto tail_size = work_amount % increment;
         if (tail_size != 0) {
             loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::DefaultTailLoopHandler>(tail_size);
-            if (work_amount > work_amount_increment) {
-                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ReduceWorkAmount>(tail_size);
-                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ZeroFinalizationOffsets>();
-            }
+            loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ReduceWorkAmount>(tail_size);
+            loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ZeroFinalizationOffsets>();
         }
     }
 }
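The hunk above is the heart of the commit: the increment is clamped by the work amount, so tail_size (work_amount % increment) is computed against an increment that can never exceed the trip count. A minimal standalone sketch of that selection logic (choose_increment is an illustrative name, not the LoopManager API):

#include <cstddef>
#include <cstdio>

// Sketch of the increment selection introduced above: the subtensor value is
// taken as the increment only if it does not exceed the work amount.
static size_t choose_increment(size_t work_amount, size_t subtensor_value) {
    return subtensor_value <= work_amount ? subtensor_value : work_amount;
}

int main() {
    const size_t vector_size = 16;
    // work_amount >= increment: usual case, the remainder goes to LAST_ITER handlers
    size_t wa = 23;
    size_t inc = choose_increment(wa, vector_size);
    std::printf("wa=%zu inc=%zu tail=%zu\n", wa, inc, wa % inc);  // wa=23 inc=16 tail=7
    // work_amount < vector_size: the increment is clamped, so no loop with
    // increment > work_amount is created and tail_size is 0
    wa = 7;
    inc = choose_increment(wa, vector_size);
    std::printf("wa=%zu inc=%zu tail=%zu\n", wa, inc, wa % inc);  // wa=7 inc=7 tail=0
    return 0;
}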
@@ -421,13 +417,14 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target,
     loop_info->set_entry_points(new_entries);
     loop_info->set_exit_points(new_exits);
 
-    // WA: if one of the fused loops is broadcastable (wa = 1), its handlers have less priority.
-    // Need to fix it by avoiding handlers creation for the loops whose work amount less than increment
-    if (loop_info_upper->get_work_amount() > loop_info_lower->get_work_amount()) {
-        loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers);
-    } else {
-        loop_info->handlers = fuse_loop_handlers(loop_info_lower->handlers, loop_info_upper->handlers);
-    }
+    loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers);
+    // Since fusion can be called for broadcastable loops (one of the loops has work_amount = increment = 1),
+    // the maximum of both values is set on the fused loop
+    loop_info->set_work_amount(std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()));
+    loop_info->set_increment(std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()));
+    // If one of the Loops is outer for nested loops that split the same dimension,
+    // the new common Loop keeps this status after fusion
+    loop_info->set_outer_splited_loop(loop_info_upper->get_outer_splited_loop() || loop_info_lower->get_outer_splited_loop());
 
     const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper;
     const auto& to = fuse_into_upper ? loop_id_upper : loop_id_lower;
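Since a broadcastable loop carries work_amount = increment = 1, taking the maximum of both parameters means it never overrides the "real" loop it is fused with. A small compile-time sketch of that merge rule (LoopParams and fuse_params are hypothetical stand-ins for the LoopInfo fields, not the real API):

#include <algorithm>
#include <cstddef>

// Illustrative stand-in for the LoopInfo parameters merged above.
struct LoopParams {
    size_t work_amount;
    size_t increment;
};

// Merge rule after fusion: a broadcastable loop (work_amount = increment = 1)
// never wins, so the fused loop keeps the parameters of the non-trivial loop.
constexpr LoopParams fuse_params(LoopParams upper, LoopParams lower) {
    return {std::max(upper.work_amount, lower.work_amount),
            std::max(upper.increment, lower.increment)};
}

// Fusing Relu_0 (wa=1, inc=1) with Relu_1/Add (wa=128, inc=16) yields a 128/16 loop.
static_assert(fuse_params({1, 1}, {128, 16}).work_amount == 128, "broadcastable loop does not win");
static_assert(fuse_params({1, 1}, {128, 16}).increment == 16, "increment taken from the real loop");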
34 changes: 16 additions & 18 deletions src/common/snippets/src/lowered/pass/fuse_loops.cpp
@@ -44,20 +44,29 @@ bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_m
 }
 
 bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& loop_target) {
-    auto current_work_amount = loop_current->get_work_amount();
-    auto target_work_amount = loop_target->get_work_amount();
-    // Loop fusion is supported only if Loops have equal increments and the equal/broadcastable work amounts.
+    const auto current_work_amount = loop_current->get_work_amount();
+    const auto target_work_amount = loop_target->get_work_amount();
+    const auto current_increment = loop_current->get_increment();
+    const auto target_increment = loop_target->get_increment();
+    // Loop fusion is supported only if Loops have equal/broadcastable increments and work amounts.
     // Note: For example, Broadcastable work amounts are possible in the following case:
     //        Relu_0 [16x1]   Relu_1 [16x128]
     //                 \           /
     //                Add [16x128]
     // Because of expression order in linear IR and work of MarkLoop algorithm, there are 2 Inner Loops:
-    //  - Relu_0 with work amount `1` and increment `vector size`
+    //  - Relu_0 with work amount `1` and increment `1`
     //  - Relu_1 and Add with work amount `128` and increment `vector size`
     // We can fuse them into one Loop with work amount `128` and increment `vector size`
-    const auto supported_work_amount = current_work_amount == target_work_amount || current_work_amount == 1 || target_work_amount == 1;
-    const auto supported_increment = loop_current->get_increment() == loop_target->get_increment();
-    return supported_work_amount && supported_increment;
+
+    // WA: we can't fuse 2 loops if one of them has a first iteration handler but the second hasn't,
+    // because in this case the Main/Tail body handlers of the loop without the first iter handler must be reset with new parameters
+    // (e.g. tail size). This logic is not implemented for now.
+    const bool first_iter_handlers_match = loop_current->handlers[LoopManager::LoopInfo::FIRST_ITER].empty() ==
+                                           loop_target->handlers[LoopManager::LoopInfo::FIRST_ITER].empty();
+    const bool equal_parameters = current_work_amount == target_work_amount && current_increment == target_increment;
+    const bool current_bcastable = current_work_amount == 1 && current_increment == 1;
+    const bool target_bcastable = target_work_amount == 1 && target_increment == 1;
+    return first_iter_handlers_match && (equal_parameters || current_bcastable || target_bcastable);
 }
 
 void FuseLoops::move(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id,
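The fusibility predicate now admits broadcastable loops in addition to exactly matching ones. A self-contained restatement of the check (LoopDesc is a hypothetical struct; handler presence is reduced to a bool standing in for handlers[FIRST_ITER].empty()):

#include <cstddef>

// Illustrative model of the fields can_be_fused() inspects.
struct LoopDesc {
    size_t work_amount;
    size_t increment;
    bool has_first_iter_handler;
};

constexpr bool can_be_fused(const LoopDesc& current, const LoopDesc& target) {
    // WA from the diff: both loops must agree on whether a first-iteration
    // handler exists, since re-parametrizing Main/Tail bodies is not implemented.
    const bool first_iter_match = current.has_first_iter_handler == target.has_first_iter_handler;
    const bool equal_parameters =
        current.work_amount == target.work_amount && current.increment == target.increment;
    const bool current_bcastable = current.work_amount == 1 && current.increment == 1;
    const bool target_bcastable = target.work_amount == 1 && target.increment == 1;
    return first_iter_match && (equal_parameters || current_bcastable || target_bcastable);
}

// Relu_0 (wa=1, inc=1) fuses with Relu_1/Add (wa=128, inc=16)...
static_assert(can_be_fused({1, 1, false}, {128, 16, false}), "broadcastable fusion");
// ...but loops with different non-unit increments do not.
static_assert(!can_be_fused({128, 8, false}, {128, 16, false}), "increment mismatch");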
Expand Down Expand Up @@ -124,12 +133,6 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo
LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos;
loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos);
loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, target_loop_id, current_loop_id, false);
// Update work_amount for Loop (increment is constant because increments must be the identical for fusion):
loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount()));
// If one of the Loops is outer for nested loops that splits the same dimension,
// after fusion new common Loop save this status
loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop());

const auto insertion_place = current_loop_begin_pos;
const auto is_move_needed = target_loop_end_pos != current_loop_begin_pos;
if (is_move_needed)
Expand Down Expand Up @@ -169,11 +172,6 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo
LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos;
loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos);
loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, current_loop_id, target_loop_id);
// Update work_amount for Loop (increment is constant because increments must be the identical for fusion):
loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount()));
// If one of the Loops is outer for nested loops that splits the same dimension,
// after fusion new common Loop save this status
loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop());

const auto insertion_place = current_loop_end_pos;
const auto is_move_needed = insertion_place != target_loop_begin_pos;
10 changes: 5 additions & 5 deletions src/common/snippets/src/lowered/pass/insert_load_store.cpp
@@ -20,13 +20,13 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr;
 InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {}
 
 size_t InsertLoadStore::get_count(const PortDescriptorPtr& port_desc) const {
-    const auto layout = port_desc->get_layout();
-    const auto shape = port_desc->get_shape();
+    const auto& layout = port_desc->get_layout();
+    const auto& shape = port_desc->get_shape();
     // Find last dimension by layout
-    const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1);
+    const auto& last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1);
     OPENVINO_ASSERT(last_dim_idx != layout.end() && *last_dim_idx < shape.size(), "Load/Store expression have incorrect layout");
-    const auto dim = shape[*last_dim_idx];
-    return dim == 1 ? 1 : m_vector_size;
+    const auto& dim = shape[*last_dim_idx];
+    return std::min(dim, m_vector_size);
 }
 
 bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) {
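get_count() now returns min(dim, vector_size) instead of the old 1-or-vector_size choice, so a Load/Store over, say, 7 elements gets count 7 rather than a full vector. A sketch of the same lookup, assuming layout stores the dimension order the way PortDescriptor does (the free function below is an illustrative stand-in):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Sketch of get_count(): find which position of `shape` the layout maps to the
// last (innermost) dimension, then clamp the element count by the vector size.
static size_t get_count(const std::vector<size_t>& layout,
                        const std::vector<size_t>& shape,
                        size_t vector_size) {
    const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1);
    assert(last_dim_idx != layout.end() && *last_dim_idx < shape.size());
    const size_t dim = shape[*last_dim_idx];
    return std::min(dim, vector_size);  // was: dim == 1 ? 1 : vector_size
}

int main() {
    // Planar layout {0, 1}: the innermost dimension is shape[1].
    assert(get_count({0, 1}, {16, 7}, 16) == 7);    // old code returned 16 here
    assert(get_count({0, 1}, {16, 128}, 16) == 16);
    assert(get_count({0, 1}, {16, 1}, 16) == 1);
    return 0;
}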
13 changes: 6 additions & 7 deletions src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -70,6 +70,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
     const auto& reduce_type_info = reduce->get_type_info();
     const auto& input_shape = reduce_expr->get_input_port_descriptor(0)->get_shape();
     const auto work_amount = *(input_shape.rbegin());
+    const auto increment = m_vector_size <= work_amount ? m_vector_size : work_amount;
     const bool is_dynamic = reduce->is_dynamic();
 
     // We need an iterator to the inserted element
@@ -87,26 +88,24 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
     const auto initial_fill = push_node(std::make_shared<op::Fill>(vector_buffer.second, 0, fill_value));
 
     // Reduce loop
-    const auto fill = push_node(std::make_shared<op::Fill>(reduce->get_input_source_output(0), m_vector_size, fill_value));
+    const auto fill = push_node(std::make_shared<op::Fill>(reduce->get_input_source_output(0), increment, fill_value));
     const auto accumulation = push_node(get_accumulation_node(fill.second, initial_fill.second, reduce_type_info));
 
     const auto reduce_loop_id = loop_manager->mark_loop(
         fill.first,
         expr_it,
         work_amount,
-        m_vector_size,
+        increment,
         0,
         std::vector<ExpressionPort>{(*fill.first)->get_input_port(0), (*accumulation.first)->get_input_port(1)},
         std::vector<ExpressionPort>{(*accumulation.first)->get_output_port(0)});
     const auto reduce_loop_info = loop_manager->get_loop_info(reduce_loop_id);
-    const auto tail_size = work_amount % m_vector_size;
+    const auto tail_size = work_amount % increment;
     if (tail_size != 0) {
         reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<DefaultTailLoopHandler>(tail_size);
         reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
-        if (work_amount > m_vector_size) {
-            reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
-            reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
-        }
+        reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
+        reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
     }
     const auto horizon = push_node(get_horizon_node(accumulation.second, reduce_type_info));
 
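For context, the decomposed reduction works on whole chunks and neutralizes the out-of-range lanes of the last chunk with op::Fill (fill_value is the identity of the reduction, 0 for a sum), then collapses the accumulator with a horizon op. A scalar C++ model of that scheme, with hypothetical names (reduce_sum is not the real pass, just an analogue of what the generated loop computes):

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of the decomposed ReduceSum: process `increment` elements per
// iteration; on the tail iteration, lanes past the valid range behave as if
// filled with the identity value (what op::Fill/SetFillOffset arrange in the IR).
static float reduce_sum(const std::vector<float>& data, size_t increment) {
    std::vector<float> acc(increment, 0.0f);  // vector buffer filled with the identity
    for (size_t base = 0; base < data.size(); base += increment) {
        for (size_t lane = 0; lane < increment; ++lane) {
            const size_t i = base + lane;
            acc[lane] += i < data.size() ? data[i] : 0.0f;  // Fill with identity
        }
    }
    float result = 0.0f;  // horizon op: horizontal reduction of the accumulator
    for (float v : acc)
        result += v;
    return result;
}

int main() {
    std::vector<float> data(23, 1.0f);
    const size_t work_amount = data.size();
    const size_t vector_size = 16;
    // The change above: the increment is clamped by the work amount.
    const size_t increment = vector_size <= work_amount ? vector_size : work_amount;
    assert(reduce_sum(data, increment) == 23.0f);
    return 0;
}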
2 changes: 0 additions & 2 deletions src/common/snippets/src/op/subgraph.cpp
@@ -440,7 +440,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
     // TODO: remove SoftmaxDecomposition pass
     pipeline.register_pass<lowered::pass::SoftmaxDecomposition>(vector_size);
     pipeline.register_pass<lowered::pass::ReduceDecomposition>(vector_size);
-    // pipeline.register_pass<lowered::pass::ReduceSumDecomposition>(vector_size);
     pipeline.register_pass<lowered::pass::FuseLoops>();
     pipeline.register_pass<lowered::pass::SplitLoops>();
     pipeline.register_pass<lowered::pass::MoveResultOutOfLoop>();
@@ -465,7 +464,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
 
     pipeline.register_positioned_passes(backend_passes);
     pipeline.run(linear_ir);
-    linear_ir.serialize("/home/vgolubev/models/control_flow.xml", "");
 
     lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
 }
2 changes: 1 addition & 1 deletion src/common/snippets/tests/src/lowered/pass/loop.cpp
@@ -39,7 +39,7 @@ static void init_linear_ir(const std::vector<ov::PartialShape>& in_shapes, Linea
     const auto in_shape0 = in_shapes[0].get_shape();
     const auto in_shape1 = in_shapes[1].get_shape();
     const auto inner_wa = std::max(*in_shape0.rbegin(), *in_shape1.rbegin());
-    const auto inner_inc = vector_size;
+    const auto inner_inc = std::min(vector_size, inner_wa);
     const auto blocked_wa = block_size;
     const auto blocked_inc = 1;
     const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1));
@@ -172,6 +172,11 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
             loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetBrgemmKBlockSize>(tail_size);
             loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetBrgemmBeta>(1.f);
         }
+    } else {
+        loop_info->handlers[LoopInfo::FIRST_ITER].register_pass<SetSingleIterationWithWorkAmount>(block_size_k);
+        loop_info->handlers[LoopInfo::FIRST_ITER].register_pass<ZeroFinalizationOffsets>();
+        loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(block_size_k);
+        loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<SetBrgemmBeta>(1.f);
+    }
     }
 };
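The motivation for splitting out FIRST_ITER here: when accumulating over K blocks, the first brgemm call must overwrite the output (beta = 0) while every later block accumulates into it (beta = 1), which is what MAIN_BODY's SetBrgemmBeta(1.f) expresses. A scalar model of beta-controlled K blocking (gemm_block is an illustrative stand-in; the real brgemm is a JIT kernel):

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar stand-in for one brgemm call over a K block: C = A_blk*B_blk + beta*C.
// beta = 0 on the first block overwrites C, beta = 1 accumulates into it.
static void gemm_block(const std::vector<float>& A, const std::vector<float>& B,
                       std::vector<float>& C, size_t M, size_t N, size_t K,
                       size_t k0, size_t k_block, float beta) {
    for (size_t m = 0; m < M; ++m)
        for (size_t n = 0; n < N; ++n) {
            float sum = 0.0f;
            for (size_t k = k0; k < k0 + k_block; ++k)
                sum += A[m * K + k] * B[k * N + n];
            C[m * N + n] = beta * C[m * N + n] + sum;
        }
}

int main() {
    const size_t M = 2, N = 3, K = 8, block_size_k = 4;
    std::vector<float> A(M * K, 1.0f), B(K * N, 1.0f), C(M * N, -7.0f);  // garbage in C
    for (size_t k0 = 0; k0 < K; k0 += block_size_k) {
        // FIRST_ITER runs with the default beta = 0; MAIN_BODY gets SetBrgemmBeta(1.f)
        const float beta = k0 == 0 ? 0.0f : 1.0f;
        gemm_block(A, B, C, M, N, K, k0, block_size_k, beta);
    }
    for (float c : C)
        assert(c == 8.0f);  // full K reduction despite the garbage initial C
    return 0;
}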
@@ -20,6 +20,7 @@ std::vector<std::vector<ov::PartialShape>> input_shapes{
     {{1, 1, 32, 23}, {1, 1, 23, 68}},
     {{1, 16, 384, 64}, {1, 16, 64, 384}},
     {{1, 1, 100, 700}, {1, 1, 700, 100}},
+    {{1, 1, 100, 1024}, {1, 1, 1024, 100}},
     {{1, 1, 100, 2500}, {1, 1, 2500, 100}},
     {{1, 1, 100, 4500}, {1, 1, 4500, 100}},
 };
