Skip to content

Commit

Permalink
[Snippets] Created Analyzer-Pass
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Aug 9, 2024
1 parent 9b059c8 commit 4dc5bbb
Show file tree
Hide file tree
Showing 7 changed files with 174 additions and 101 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cstddef>
#include <map>

#include "openvino/pass/pass.hpp"

namespace ov {
namespace snippets {
namespace pass {

/**
* @interface AnalyzeBroadcastableInputs
* @brief Analyzes body parameters which affects inputs of broadcastable operations(`Broadcast` op should be inserted there).
* Initializes special map `BroadcastableInputsMap = [Index of Parameter -> Index of broadcastable dimension from end]`
* Notes:
* - Must be called after Canonicalization pass
* - Doesn't support `layouts` in PortDescriptors
* @ingroup snippets
*/
class AnalyzeBroadcastableInputs : public ov::pass::ModelPass {
public:
OPENVINO_RTTI("AnalyzeBroadcastableInputs");
using BroadcastableInputsMap = std::map<size_t, size_t>;
AnalyzeBroadcastableInputs(BroadcastableInputsMap& map);

bool run_on_model(const std::shared_ptr<ov::Model>& m) override;

private:
BroadcastableInputsMap& m_broadcastable_inputs;
};

} // namespace pass
} // namespace snippets
} // namespace ov
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/pass/manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ namespace pass {
*/
class Manager : public ov::pass::Manager {
public:
Manager(std::shared_ptr<ov::pass::PassConfig> pass_config = std::make_shared<ov::pass::PassConfig>(),
std::string name = "UnnamedSnippetsManager");
~Manager() override = default;
using PassBase = ov::pass::PassBase;
using Validate = ov::pass::Validate;
Expand Down
14 changes: 9 additions & 5 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,13 +398,17 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::data_flow_transformations")

ov::snippets::pass::Manager manager;
std::shared_ptr<ov::pass::PassConfig> pass_config = std::make_shared<ov::pass::PassConfig>();
// If subgraph has its own specific canonicalization, which is different with common behavior, will skip the this common one.
// for example in GN, scale and bias shape [c] are canonicalized to [1,c,1,1], not [1,1,1,c]. Common canonicalization is disabled in this case.
if (!blocked_input_shapes.empty() && !config.m_has_broadcast_sensitive_ops)
manager.register_pass<snippets::pass::Canonicalization>(blocked_input_shapes);
if (!input_precisions.empty() && !output_precisions.empty())
manager.register_pass<snippets::pass::AlignElementTypes>(input_precisions, output_precisions);
if (blocked_input_shapes.empty() || config.m_has_broadcast_sensitive_ops)
pass_config->disable<snippets::pass::Canonicalization>();
if (input_precisions.empty() || output_precisions.empty())
pass_config->disable<snippets::pass::AlignElementTypes>();

ov::snippets::pass::Manager manager(pass_config, "SnippetsDataFlowManager");
manager.register_pass<snippets::pass::Canonicalization>(blocked_input_shapes);
manager.register_pass<snippets::pass::AlignElementTypes>(input_precisions, output_precisions);

if (config.m_has_domain_sensitive_ops) {
manager.register_pass<snippets::pass::MatMulToBrgemm>();
Expand Down
106 changes: 106 additions & 0 deletions src/common/snippets/src/pass/analyze_broadcastable_inputs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/pass/analyze_broadcastable_inputs.hpp"

#include "snippets/lowered/pass/insert_broadcastmove.hpp"
#include "snippets/utils/utils.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace pass {

// Stores a reference to the caller-owned output map; the map is populated by run_on_model().
AnalyzeBroadcastableInputs::AnalyzeBroadcastableInputs(BroadcastableInputsMap& map) : m_broadcastable_inputs(map) {}

bool pass::AnalyzeBroadcastableInputs::run_on_model(const std::shared_ptr<ov::Model>& body) {
    RUN_ON_MODEL_SCOPE(AnalyzeBroadcastableInputs);
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AnalyzeBroadcastableInputs")
    // Snippets supports tokenization of the following operations:
    // - Unary, Binary and Ternary (Select) Elementwise ops
    // - Softmax, MatMul, Transpose, GroupNorm
    // Binary Elementwise ops (+ Select) require explicit Broadcast op
    // on inputs if broadcasting of the last dimensions is needed.
    // These ops will be start points of DFS - need to go to Parameters and update `broadcastable_inputs_map`.
    // We iterate through all ops in execution order. So if we have already analyzed some op in the input branch - skip this branch.
    // However, there are some ops which can change `processing_dim_idx`:
    // - Transpose has an order which changes `processing_dim_idx`. But Transpose can be only after Parameters and before Results.
    // - MatMul's first input doesn't affect the output's last dimension - skip this branch.
    //   Also MatMul has `transposed_b` which changes `processing_dim_idx`
    m_broadcastable_inputs.clear();
    // Currently Broadcasting can be changed only if there are several Parameters in body
    if (body->get_parameters().size() < 2)
        return false;

    const auto& ops = body->get_ordered_ops();
    // Holds only DFS *start* ops already analyzed (see insertions below); intermediate
    // nodes are not recorded, so shared sub-branches may be traversed more than once.
    std::set<std::shared_ptr<ov::Node>> visited_ops = {};
    for (const auto& op : ops) {
        // Only ops that may need an inserted BroadcastMove seed a DFS.
        if (!ov::snippets::lowered::pass::InsertBroadcastMove::is_broadcasting_supported(op))
            continue;

        // Index of the tracked dimension, counted from the end of the shape.
        size_t processing_dim_idx = 0;

        // We need to propagate `processing_dim_idx` from input of the current node to the parameter.
        // To do it we use DFS
        std::stack<std::shared_ptr<ov::Node>> nodes_to_calculate;
        nodes_to_calculate.push(op);
        while (!nodes_to_calculate.empty()) {
            auto current_node = nodes_to_calculate.top();
            nodes_to_calculate.pop();

            if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(current_node)) {
                const auto consumers = param->get_output_target_inputs(0);
                if (std::any_of(consumers.cbegin(), consumers.cend(),
                                [](const ov::Input<ov::Node>& in) { return ov::is_type<ov::op::v1::Transpose>(in.get_node()); })) {
                    // A Transpose consumer must be the Parameter's only consumer so the order applies unambiguously.
                    OPENVINO_ASSERT(consumers.size() == 1, "Incorrect count of outputs of Parameter!");
                    const auto transpose = consumers.begin()->get_node();
                    std::vector<size_t> order;
                    const auto& constant = ov::as_type_ptr<const opset1::Constant>(transpose->get_input_node_shared_ptr(1));
                    OPENVINO_ASSERT(constant, "Unsupported order node of Transpose");
                    order = constant->cast_vector<size_t>();
                    if (order.empty()) {
                        // Empty order means the default full reversal: [rank-1, ..., 1, 0].
                        order.resize(transpose->get_output_partial_shape(0).size());
                        std::iota(order.rbegin(), order.rend(), 0);
                    }
                    // `processing_dim_idx` starts from the end, so remap it through the
                    // Transpose order and convert back to an index-from-the-end.
                    processing_dim_idx = order.size() - 1 - ov::snippets::utils::get_input_dim_idx(order, processing_dim_idx);
                }
                const auto param_idx = body->get_parameter_index(param);
                if (m_broadcastable_inputs.count(param_idx) == 0) {
                    m_broadcastable_inputs[param_idx] = processing_dim_idx;
                } else {
                    // The same Parameter reached via different branches must agree on the dim index.
                    OPENVINO_ASSERT(m_broadcastable_inputs.at(param_idx) == processing_dim_idx,
                                    "Parameter has been already analyzed and has another processing dim index!");
                }
                // Reset for the remaining DFS branches.
                // NOTE(review): this reset affects every node still on the stack, so a MatMul-adjusted
                // index is assumed not to be needed by sibling branches popped after a Parameter — confirm.
                processing_dim_idx = 0;
                continue;
            } else if (ov::is_type<ov::op::v0::Constant>(current_node)) {
                // NOTE(review): this inserts the DFS start op `op`, not `current_node` — redundant with
                // the insertion after the while-loop; presumably intentional since only start ops are
                // ever looked up in `visited_ops`, but verify.
                visited_ops.insert(op);
                continue;
            }

            ov::OutputVector inputs = current_node->input_values();
            if (const auto mm = ov::as_type_ptr<ov::op::v0::MatMul>(current_node)) {
                // Only MatMul's second input affects the output's last dimension;
                // with transpose_b the tracked dimension moves one position from the end.
                inputs = { current_node->input_value(1) };
                processing_dim_idx = static_cast<size_t>(mm->get_transpose_b());
            }

            // not a leaf - continue to search
            for (const auto& input_value : inputs) {
                const auto& input_node = input_value.get_node()->shared_from_this();
                if (visited_ops.count(input_node) == 0) {
                    nodes_to_calculate.push(input_node);
                }
            }
        }

        // Mark the start op as fully analyzed so later DFS runs skip this branch.
        visited_ops.insert(op);
    }

    return true;
}

} // namespace pass
} // namespace snippets
} // namespace ov
3 changes: 3 additions & 0 deletions src/common/snippets/src/pass/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ namespace ov {
namespace snippets {
namespace pass {

// Delegates to the base ov::pass::Manager, forwarding the (possibly shared) pass config
// and a name used for profiling/debugging; defaults come from the header declaration.
Manager::Manager(std::shared_ptr<ov::pass::PassConfig> pass_config, std::string name)
    : ov::pass::Manager(std::move(pass_config), std::move(name)) {}

std::shared_ptr<Manager::PassBase> Manager::register_pass_instance(const PassPosition& position,
const std::shared_ptr<PassBase>& pass) {
pass->set_pass_config(m_pass_config);
Expand Down
111 changes: 17 additions & 94 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
#include "snippets/pass/matmul_to_brgemm.hpp"
#include "snippets/pass/propagate_precision.hpp"
#include "snippets/pass/positioned_pass.hpp"
#include "snippets/pass/canonicalization.hpp"
#include "snippets/pass/analyze_broadcastable_inputs.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/optimize_domain.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/insert_broadcastmove.hpp"
#include "transformations/defs.hpp"
#include "transformations/cpu_opset/common/pass/convert_to_swish_cpu.hpp"
#include "transformations/snippets/common/pass/mul_add_to_fma.hpp"
Expand Down Expand Up @@ -540,104 +541,12 @@ void Subgraph::createPrimitive() {
initPluginBlockedShapes();
initAttributes();
initStartOffsets();
initBroadcastableInputs();
optimizeIR();
}

Node::createPrimitive();
}

// Fills `broadcastable_inputs` = [input index -> dim index from the end] for inputs that may
// need a Broadcast inserted, then validates the map against the node's actual inputs/shapes.
void Subgraph::initBroadcastableInputs() {
    // Snippets supports tokenization of the following operations:
    // - Unary, Binary and Ternary (Select) Elementwise ops
    // - Softmax, MatMul, Transpose, GroupNorm
    // Binary Elementwise ops (+ Select) require explicit Broadcast op
    // on inputs if broadcasting of the last dimensions is needed.
    // These ops will be start points of DFS - need to go to Parameters and update `broadcastable_inputs_map`.
    // We iterate through all ops in execution order. So if we have already analyzed some op in the input branch - skip this branch.
    // However, there are some ops which can change `processing_dim_idx`:
    // - Transpose has an order which changes `processing_dim_idx`. But Transpose can be only after Parameters
    // - MatMul's first input doesn't affect the output's last dimension - skip this branch.
    //   Also MatMul has `transposed_b` which changes `processing_dim_idx`
    broadcastable_inputs.clear();
    const auto& body = subgraph_attrs->snippet->body_ptr();
    // Currently Broadcasting can be changed only if there are several Parameters in body
    if (body->get_parameters().size() < 2)
        return;

    const auto& ops = body->get_ordered_ops();
    // Holds only DFS start ops already analyzed; intermediate nodes are not recorded.
    std::set<std::shared_ptr<ov::Node>> visited_ops = {};
    for (const auto& op : ops) {
        if (!ov::snippets::lowered::pass::InsertBroadcastMove::is_broadcasting_supported(op))
            continue;

        // Index of the tracked dimension, counted from the end of the shape.
        size_t processing_dim_idx = 0;

        // We need to propagate `processing_dim_idx` from input of the current node to the parameter.
        // To do it we use DFS
        std::stack<std::shared_ptr<ov::Node>> nodes_to_calculate;
        nodes_to_calculate.push(op);
        while (!nodes_to_calculate.empty()) {
            auto current_node = nodes_to_calculate.top();
            nodes_to_calculate.pop();

            if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(current_node)) {
                const auto consumers = param->get_output_target_inputs(0);
                if (std::any_of(consumers.cbegin(), consumers.cend(),
                                [](const ov::Input<ov::Node>& in) { return ov::is_type<ov::op::v1::Transpose>(in.get_node()); })) {
                    // A Transpose consumer must be the Parameter's only consumer so the order applies unambiguously.
                    OPENVINO_ASSERT(consumers.size() == 1, "Incorrect count of outputs of Parameter!");
                    const auto transpose = consumers.begin()->get_node();
                    std::vector<size_t> order;
                    const auto& constant = ov::as_type_ptr<const opset1::Constant>(transpose->get_input_node_shared_ptr(1));
                    OPENVINO_ASSERT(constant, "Unsupported order node of Transpose");
                    order = constant->cast_vector<size_t>();
                    if (order.empty()) {
                        // Empty order means the default full reversal: [rank-1, ..., 1, 0].
                        order.resize(transpose->get_output_partial_shape(0).size());
                        std::iota(order.rbegin(), order.rend(), 0);
                    }
                    // `processing_dim_idx` starts from the end, so remap it through the
                    // Transpose order and convert back to an index-from-the-end.
                    processing_dim_idx = order.size() - 1 - ov::snippets::utils::get_input_dim_idx(order, processing_dim_idx);
                }
                const auto param_idx = body->get_parameter_index(param);
                if (broadcastable_inputs.count(param_idx) == 0) {
                    broadcastable_inputs[param_idx] = processing_dim_idx;
                } else {
                    // The same Parameter reached via different branches must agree on the dim index.
                    OPENVINO_ASSERT(broadcastable_inputs.at(param_idx) == processing_dim_idx,
                                    "Parameter has been already analyzed and has another processing dim index!");
                }
                // Reset for the remaining DFS branches.
                processing_dim_idx = 0;
                continue;
            } else if (ov::is_type<ov::op::v0::Constant>(current_node)) {
                visited_ops.insert(op);
                continue;
            }

            ov::OutputVector inputs = current_node->input_values();
            if (const auto mm = ov::as_type_ptr<ov::op::v0::MatMul>(current_node)) {
                // Only MatMul's second input affects the output's last dimension;
                // with transpose_b the tracked dimension moves one position from the end.
                inputs = { current_node->input_value(1) };
                processing_dim_idx = static_cast<size_t>(mm->get_transpose_b());
            }

            // not a leaf - continue to search
            for (const auto& input_value : inputs) {
                const auto& input_node = input_value.get_node()->shared_from_this();
                if (visited_ops.count(input_node) == 0) {
                    nodes_to_calculate.push(input_node);
                }
            }
        }

        // Mark the start op as fully analyzed so later DFS runs skip this branch.
        visited_ops.insert(op);
    }

    // Fix: guard rbegin() with an emptiness check - dereferencing rbegin() of an empty map is UB
    // (reachable when srcMemPtrs is empty, since 0 < 0 fails the first clause).
    OPENVINO_ASSERT((broadcastable_inputs.size() < srcMemPtrs.size()) ||
                    (!broadcastable_inputs.empty() && broadcastable_inputs.rbegin()->first < srcMemPtrs.size()),
                    "Incorrect indexes of broadcastable inputs of Subgraph");
    // Take entries by const reference to avoid copying each pair.
    for (const auto& broadcastable_input : broadcastable_inputs) {
        OPENVINO_ASSERT(broadcastable_input.second < in_shapes[broadcastable_input.first].size(),
                        "Incorrect processing dimension index of broadcastable index");
    }
}

void Subgraph::initMemoryPtrs() {
srcMemPtrs.resize(input_num);
dstMemPtrs.resize(output_num);
Expand Down Expand Up @@ -711,14 +620,16 @@ void Subgraph::initPluginBlockedShapes() const {
in_shapes[i] = srcMemPtrs[i]->getDescWithType<BlockedMemoryDesc>()->getBlockDims();
}

Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const {
Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() {
DataFlowPasses backend_passes;

using PassPosition = ov::snippets::pass::PassPosition;
using Place = PassPosition::Place;

# define SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(PASS_PLACE, PASS, ...) \
backend_passes.emplace_back(PassPosition(PASS_PLACE), std::make_shared<PASS>(__VA_ARGS__))
# define SNIPPETS_REGISTER_PASS_RELATIVEE_COMMON(PASS_PLACE, TARGET_PASS, PASS, ...) \
backend_passes.emplace_back(PassPosition(PASS_PLACE, TARGET_PASS::get_type_info_static()), std::make_shared<PASS>(__VA_ARGS__))

#if defined(OPENVINO_ARCH_X86_64)
# define SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64(PASS_PLACE, PASS, ...) \
Expand All @@ -731,6 +642,8 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const {
#endif // OPENVINO_ARCH_X86_64

SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(Place::PipelineStart, ConvertToSwishCPU);
SNIPPETS_REGISTER_PASS_RELATIVEE_COMMON(Place::After, ov::snippets::pass::Canonicalization,
ov::snippets::pass::AnalyzeBroadcastableInputs, broadcastable_inputs);
if (context->getConfig().inferencePrecision == ov::element::bf16 && subgraph_attrs->snippet->has_domain_sensitive_ops()) {
// enforce BF16 precisions to supported operations
// MatMul has to be decomposed to Brgemm operations before enforcement
Expand All @@ -754,6 +667,7 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const {
#endif

#undef SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON
#undef SNIPPETS_REGISTER_PASS_RELATIVE_COMMON
#undef SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64
#undef SNIPPETS_REGISTER_PASS_RELATIVE_X86_64

Expand Down Expand Up @@ -808,6 +722,15 @@ void Subgraph::optimizeIR() {
const auto precisions = getIOPrecisions();
subgraph->data_flow_transformations(in_blocked_shapes, precisions.first, precisions.second, getDataFlowPasses());

// DataFlow transformations includes AnalyzeBroadcastableInputs pass - we should verify that the map is aligned with our blocked input shapes
OPENVINO_ASSERT((broadcastable_inputs.size() < in_shapes.size()) ||
(!broadcastable_inputs.empty() && broadcastable_inputs.rbegin()->first < in_shapes.size()),
"Incorrect indexes of broadcastable inputs of Subgraph");
for (const auto broadcastable_input : broadcastable_inputs) {
OPENVINO_ASSERT(broadcastable_input.second < in_shapes[broadcastable_input.first].size(),
"Incorrect processing dimension index of broadcastable index");
}

// TODO: Snippets don't support backend-provided blocking, so we need to reshape body
// using blocked shapes first. This can be removed after [121670]
std::vector<snippets::VectorDimsRef> in_shapes;
Expand Down
3 changes: 1 addition & 2 deletions src/plugins/intel_cpu/src/nodes/subgraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ class Subgraph : public Node {
IShapeInfer::Result shapeInfer() const override;

private:
void initBroadcastableInputs();
void initMemoryPtrs();
void initAttributes();
void initStartOffsets();
Expand All @@ -77,7 +76,7 @@ class Subgraph : public Node {
using DataFlowPasses = std::vector<ov::snippets::pass::Manager::PositionedPassBase>;
using ControlFlowPasses = std::vector<ov::snippets::lowered::pass::PassPipeline::PositionedPassLowered>;

DataFlowPasses getDataFlowPasses() const;
DataFlowPasses getDataFlowPasses();
ControlFlowPasses getControlFlowPasses() const;

// Holds ISA version used is codeGeneration target
Expand Down

0 comments on commit 4dc5bbb

Please sign in to comment.