ReduceDecomposition moved to backend-specific transformations
v-Golubev committed Jan 25, 2024
1 parent cd881a2 commit c09c7b8
Showing 8 changed files with 150 additions and 176 deletions.
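The hunks below implement the move: the registration is dropped from the common control-flow pipeline in Subgraph::control_flow_transformations and re-added in the CPU plugin's backend pass table, anchored after FuseLoops. The relevant registration lines, abbreviated from the diffs that follow (everything around them is unchanged):

// Before: common pipeline in src/common/snippets/src/op/subgraph.cpp
pipeline.register_pass<lowered::pass::MarkLoops>(vector_size);
pipeline.register_pass<lowered::pass::ReduceDecomposition>(vector_size);  // removed by this commit
pipeline.register_pass<lowered::pass::FuseLoops>();

// After: backend-specific registration in src/plugins/intel_cpu/src/nodes/subgraph.cpp
const size_t vector_size = snippetAttrs.snippet->get_generator()->get_target_machine()->get_lanes();
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::FuseLoops,
                                ov::intel_cpu::pass::ReduceDecomposition, vector_size);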
2 changes: 1 addition & 1 deletion src/common/snippets/src/lowered/pass/assign_registers.cpp
@@ -101,7 +101,7 @@ bool AssignRegisters::run(LinearIR& linear_ir) {
manually_assigned_gprs[expr->get_output_port_connector(0)] =
static_cast<Reg>(num_results + num_parameters + buffer_id);
} else if (ov::is_type<op::HorizonMax>(op) || ov::is_type<op::HorizonSum>(op)) {
// Only in ReduceDecomposition Reduce ops use HorizonMax/HorizonSum and VectorBuffer.
// Only decomposed Reduce ops use HorizonMax/HorizonSum and VectorBuffer.
// We should manually assign the same vector register to VectorBuffer and the Max/Sum output to simulate an accumulator
// TODO [96351]: We should rewrite accumulator pattern using another way
const auto& input_tensor = expr->get_input_port_connector(0);
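For readers unfamiliar with the pattern, here is a standalone sketch of the accumulator idea that comment describes, in plain C++ rather than the snippets IR (kLanes and the helper are made up for the illustration; kLanes stands in for the target machine's vector width, get_lanes() in the plugin code): a vector-wide accumulator is filled with the identity value, updated element-wise inside the reduce loop, and collapsed by a final horizontal ("horizon") reduction, which is why VectorBuffer and the HorizonMax/HorizonSum output can live in a single vector register.

#include <algorithm>
#include <array>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

constexpr std::size_t kLanes = 8;  // stands in for the vector register width

float reduce_max(const std::vector<float>& data) {
    // VectorBuffer + initial Fill: one vector-sized accumulator holding the identity element
    std::array<float, kLanes> acc;
    acc.fill(std::numeric_limits<float>::lowest());  // same value as the 0xff7fffff fill constant in the pass

    // Reduce loop: element-wise Maximum into the same accumulator
    std::size_t i = 0;
    for (; i + kLanes <= data.size(); i += kLanes)
        for (std::size_t lane = 0; lane < kLanes; ++lane)
            acc[lane] = std::max(acc[lane], data[i + lane]);
    for (std::size_t lane = 0; i + lane < data.size(); ++lane)  // tail iteration
        acc[lane] = std::max(acc[lane], data[i + lane]);

    // HorizonMax: collapse the accumulator lanes into a scalar
    return *std::max_element(acc.begin(), acc.end());
}

int main() {
    const std::vector<float> data{3.f, -1.f, 7.f, 2.f, 5.f, -4.f, 0.f, 9.f, 6.f, 8.f};
    std::cout << reduce_max(data) << "\n";  // prints 9
}

The decomposed IR built further down in this commit has the same shape: VectorBuffer and Fill before the loop, Maximum/Add inside it, HorizonMax/HorizonSum after it.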
2 changes: 0 additions & 2 deletions src/common/snippets/src/op/subgraph.cpp
@@ -43,7 +43,6 @@
#include "snippets/lowered/pass/insert_perf_count.hpp"
#include "snippets/lowered/pass/validate_shapes.hpp"
#include "snippets/lowered/pass/pass_config.hpp"
#include "snippets/lowered/pass/reduce_decomposition.hpp"

#include "transformations/utils/utils.hpp"

@@ -424,7 +423,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,

lowered::pass::PassPipeline pipeline(lowered_pass_config);
pipeline.register_pass<lowered::pass::MarkLoops>(vector_size);
pipeline.register_pass<lowered::pass::ReduceDecomposition>(vector_size);
pipeline.register_pass<lowered::pass::FuseLoops>();
pipeline.register_pass<lowered::pass::SplitLoops>();
pipeline.register_pass<lowered::pass::MoveResultOutOfLoop>();
@@ -53,13 +53,6 @@ class EltwiseBufferAllocationTest : public BufferAllocationTest {
std::shared_ptr<ov::Model> GetModel() const override;
};

class MHABufferAllocationTest : public BufferAllocationTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override;

static void MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor);
};

} // namespace snippets
} // namespace test
} // namespace ov
90 changes: 0 additions & 90 deletions src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp
@@ -16,7 +16,6 @@
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "snippets/lowered/pass/split_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "snippets/lowered/pass/reduce_decomposition.hpp"

#include "common_test_utils/common_utils.hpp"

@@ -69,7 +68,6 @@ void BufferAllocationTest::MarkOp(const std::shared_ptr<ov::Node>& node, const s
void BufferAllocationTest::ApplyTransformations(const std::shared_ptr<ov::snippets::lowered::pass::PassConfig>& pass_config) {
ov::snippets::lowered::pass::PassPipeline pipeline(pass_config);
pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::ReduceDecomposition>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::SplitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertBuffers>(2);
@@ -114,65 +112,9 @@ std::shared_ptr<ov::Model> EltwiseBufferAllocationTest::GetModel() const {
return body;
}

void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor) {
const auto subtensor_full = std::vector<size_t>{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM,
ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->input(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(0), subtensor));
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->input(1), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(1), subtensor_full));
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->output(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->output(0), subtensor));
}

std::shared_ptr<ov::Model> MHABufferAllocationTest::GetModel() const {
const auto subtensor_scalar = std::vector<size_t>{1};
const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
const auto subtensor_brgemm = std::vector<size_t>{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};

const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 128, 12, 64}));
const auto parameter2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));

const auto load_reshape = std::make_shared<ov::snippets::op::LoadReshape>(parameter1, 1, 0, std::vector<size_t>{0, 2, 3, 1});
const auto store = std::make_shared<ov::snippets::op::Store>(load_reshape);
const auto relu0 = std::make_shared<ov::op::v0::Relu>(store);
const auto matmul0 = std::make_shared<ov::snippets::op::Brgemm>(parameter0, relu0);
const auto relu1 = std::make_shared<ov::op::v0::Relu>(matmul0);

// Decomposed Softmax
const auto reduce_max = std::make_shared<ov::snippets::op::ReduceMax>(relu1, 3);
const auto subtract = std::make_shared<ov::op::v1::Subtract>(relu1, reduce_max);
const auto exp = std::make_shared<ov::op::v0::Exp>(subtract);

const auto reduce_sum = std::make_shared<ov::snippets::op::ReduceSum>(exp, 3);
const auto power = std::make_shared<ov::snippets::op::PowerStatic>(reduce_sum, -1.f);
const auto multiply = std::make_shared<ov::op::v1::Multiply>(exp, power);

const auto matmul1 = std::make_shared<ov::snippets::op::Brgemm>(multiply, parameter2);
const auto relu2 = std::make_shared<ov::op::v0::Relu>(matmul1);

const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(relu2), ov::ParameterVector{parameter0, parameter1, parameter2});

MarkOp(load_reshape, subtensor_scalar);
MarkOp(store, subtensor_scalar);
MarkOp(reduce_max, subtensor_softmax);
MarkOp(reduce_sum, subtensor_softmax);
MarkOp(power, subtensor_softmax);

MarkBrgemm(matmul0, subtensor_brgemm);
MarkBrgemm(matmul1, subtensor_brgemm);

return body;
}

TEST_P(EltwiseBufferAllocationTest, BufferAllocation) {
Validate();
}
TEST_P(MHABufferAllocationTest, BufferAllocation) {
Validate();
}

namespace BufferAllocationTest_Instances {

@@ -192,38 +134,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, Eltwi
::testing::Values(1)), // Two Buffers reuse IDs
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(true),
::testing::Values(139264), // Each Buffer has own allocated memory
::testing::Values(7)), // Each Buffer has unique ID
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(true),
::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm)
::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms)
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(false),
::testing::Values(360448), // Each Buffer has own allocated memory
::testing::Values(7)), // Each Buffer has unique ID
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(false),
::testing::Values(98304), // (between brgemms) + (Buffer before brgemm0 and after brgemm1)
::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms)
BufferAllocationTest::getTestCaseName);

} // namespace BufferAllocationTest_Instances
} // namespace snippets
} // namespace test
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -19,11 +19,13 @@
#include "snippets/lowered/pass/optimize_domain.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "transformations/defs.hpp"
#include "transformations/cpu_opset/common/pass/convert_to_swish_cpu.hpp"
#include "transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp"
#include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp"
#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp"
#include "transformations/snippets/x64/pass/lowered/reduce_decomposition.hpp"
#include "transformations/snippets/x64/pass/mul_add_to_fma.hpp"
#include "transformations/snippets/x64/pass/remove_converts.hpp"
#include "transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.hpp"
@@ -353,6 +355,7 @@ void Snippet::initOptimalPrimitiveDescriptor() {
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::pass::MatMulToBrgemm,
pass::EnforcePrecision, element::f32, element::bf16);
}

SNIPPETS_REGISTER_PASS_RELATIVE(Place::Before, ov::snippets::pass::PropagatePrecision,
ov::intel_cpu::pass::BrgemmToBrgemmCPU);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::intel_cpu::pass::BrgemmToBrgemmCPU,
@@ -633,6 +636,9 @@ void Snippet::SnippetJitExecutor::generate(const jit_snippets_compile_args* jcp)
# define SNIPPETS_REGISTER_PASS_RELATIVE(PASS_PLACE, TARGET_PASS, PASS, ...)
#endif // OPENVINO_ARCH_X86_64

const size_t vector_size = snippetAttrs.snippet->get_generator()->get_target_machine()->get_lanes();
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::FuseLoops,
ov::intel_cpu::pass::ReduceDecomposition, vector_size);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::MarkLoops,
ov::intel_cpu::pass::BrgemmBlocking);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::InsertLoops,
@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/reduce_decomposition.hpp"
#include "reduce_decomposition.hpp"

#include "snippets/itt.hpp"
#include "snippets/lowered/linear_ir.hpp"
@@ -11,15 +11,19 @@
#include "snippets/snippets_isa.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace intel_cpu {
namespace pass {

using LinearIR = snippets::lowered::LinearIR;
using HandlerType = LinearIR::LoopManager::LoopInfo::SpecificIterationHandlers::HandlerType;
using namespace ov::snippets::lowered;


namespace {
uint32_t get_initial_value(const ov::DiscreteTypeInfo& type_info) {
static const std::map<ov::DiscreteTypeInfo, uint32_t> reduce_initial_values {
{op::ReduceMax::get_type_info_static(), uint32_t(0xff7fffff)},
{op::ReduceSum::get_type_info_static(), uint32_t(0x00000000)},
{ov::snippets::op::ReduceMax::get_type_info_static(), uint32_t(0xff7fffff)},
{ov::snippets::op::ReduceSum::get_type_info_static(), uint32_t(0x00000000)},
};
OPENVINO_ASSERT(reduce_initial_values.count(type_info), "Unexpected ReduceType");
return reduce_initial_values.at(type_info);
@@ -28,36 +32,33 @@ uint32_t get_initial_value(const ov::DiscreteTypeInfo& type_info) {
std::shared_ptr<ov::Node> get_accumulation_node(const ov::Output<ov::Node>& input0,
const ov::Output<ov::Node>& input1,
const ov::DiscreteTypeInfo& type_info) {
if (type_info == op::ReduceMax::get_type_info_static()) {
if (type_info == ov::snippets::op::ReduceMax::get_type_info_static()) {
return std::make_shared<ov::op::v1::Maximum>(input0, input1);
} else if (type_info == op::ReduceSum::get_type_info_static()) {
} else if (type_info == ov::snippets::op::ReduceSum::get_type_info_static()) {
return std::make_shared<ov::op::v1::Add>(input0, input1);
} else {
OPENVINO_THROW("Unsupported reduce type: ", type_info);
}
}

std::shared_ptr<ov::Node> get_horizon_node(const ov::Output<ov::Node>& input, const ov::DiscreteTypeInfo& type_info) {
if (type_info == op::ReduceMax::get_type_info_static()) {
return std::make_shared<op::HorizonMax>(input);
} else if (type_info == op::ReduceSum::get_type_info_static()) {
return std::make_shared<op::HorizonSum>(input);
if (type_info == ov::snippets::op::ReduceMax::get_type_info_static()) {
return std::make_shared<ov::snippets::op::HorizonMax>(input);
} else if (type_info == ov::snippets::op::ReduceSum::get_type_info_static()) {
return std::make_shared<ov::snippets::op::HorizonSum>(input);
} else {
OPENVINO_THROW("Unsupported reduce type: ", type_info);
}
}
} // namespace

using LoopInfo = LinearIR::LoopManager::LoopInfo;
using HandlerType = LoopInfo::SpecificIterationHandlers::HandlerType;

ReduceDecomposition::ReduceDecomposition(size_t vector_size) : m_vector_size{vector_size} {}

bool ReduceDecomposition::run(LinearIR& linear_ir) {
bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ReduceMaxDecompositionLowered")
const auto& loop_manager = linear_ir.get_loop_manager();
bool modified = false;
for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
for (auto expr_it = begin; expr_it != end; expr_it++) {
const auto& reduce_expr = *expr_it;
const auto& reduce = ov::as_type_ptr<ov::snippets::op::ReduceBase>(reduce_expr->get_node());
if (!reduce)
Expand All @@ -81,11 +82,11 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
const auto fill_value = get_initial_value(reduce_type_info);
// Note: VectorBuffer is a special case, since it should go before the initial Load.
// The buffer must be initialized with fill_value before reduction
const auto vector_buffer = push_node(std::make_shared<op::VectorBuffer>());
const auto initial_fill = push_node(std::make_shared<op::Fill>(vector_buffer.second, 0, fill_value));
const auto vector_buffer = push_node(std::make_shared<ov::snippets::op::VectorBuffer>());
const auto initial_fill = push_node(std::make_shared<ov::snippets::op::Fill>(vector_buffer.second, 0, fill_value));

// Reduce loop
const auto fill = push_node(std::make_shared<op::Fill>(reduce->get_input_source_output(0), increment, fill_value));
const auto fill = push_node(std::make_shared<ov::snippets::op::Fill>(reduce->get_input_source_output(0), increment, fill_value));
const auto accumulation = push_node(get_accumulation_node(fill.second, initial_fill.second, reduce_type_info));

const auto reduce_loop_id = loop_manager->mark_loop(
Expand All @@ -98,7 +99,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
std::vector<ExpressionPort>{(*accumulation.first)->get_output_port(0)});
const auto tail_size = work_amount % increment;
if (tail_size != 0) {
loop_manager->get_loop_info(reduce_loop_id)->register_handler<HandlerType::LAST_ITER, SetFillOffset>(tail_size);
loop_manager->get_loop_info(reduce_loop_id)->register_handler<HandlerType::LAST_ITER, ov::snippets::lowered::pass::SetFillOffset>(tail_size);
}
const auto horizon = push_node(get_horizon_node(accumulation.second, reduce_type_info));

@@ -124,7 +125,6 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
return modified;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
} // namespace pass
} // namespace intel_cpu
} // namespace ov
@@ -4,11 +4,10 @@

#pragma once

#include "pass.hpp"
#include "snippets/lowered/pass/pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace intel_cpu {
namespace pass {

/**
@@ -17,17 +16,18 @@ namespace pass {
* @attention Only Reduce by last dimension is supported
* @ingroup snippets
*/
class ReduceDecomposition : public Pass {
class ReduceDecomposition : public snippets::lowered::pass::RangedPass {
public:
OPENVINO_RTTI("ReduceDecomposition", "Pass")
explicit ReduceDecomposition(size_t vector_size);
bool run(LinearIR& linear_ir) override;
bool run(snippets::lowered::LinearIR& linear_ir,
snippets::lowered::LinearIR::constExprIt begin,
snippets::lowered::LinearIR::constExprIt end) override;

private:
size_t m_vector_size;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
} // namespace pass
} // namespace intel_cpu
} // namespace ov
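Beyond the namespace move (ov::snippets::lowered::pass to ov::intel_cpu::pass), the two hunks above also change the pass interface: the old common pass derived from Pass and always walked the whole LinearIR, while the CPU version derives from snippets::lowered::pass::RangedPass and only visits the [begin, end) range supplied by the caller. The essential difference, abbreviated from the hunks above:

// Before: snippets::lowered::pass::Pass, iterates the entire LinearIR
bool ReduceDecomposition::run(LinearIR& linear_ir) {
    for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { /* ... */ }
}

// After: ov::intel_cpu::pass::ReduceDecomposition derives from snippets::lowered::pass::RangedPass
// and iterates only the requested range
bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
    for (auto expr_it = begin; expr_it != end; expr_it++) { /* ... */ }
}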