diff --git a/src/common/snippets/include/snippets/pass/analyze_broadcastable_inputs.hpp b/src/common/snippets/include/snippets/pass/analyze_broadcastable_inputs.hpp new file mode 100644 index 00000000000000..0c41b6ed8d629a --- /dev/null +++ b/src/common/snippets/include/snippets/pass/analyze_broadcastable_inputs.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <map> +#include <memory> + +#include "openvino/pass/pass.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface AnalyzeBroadcastableInputs + * @brief Analyzes body parameters that affect inputs of broadcastable operations (a `Broadcast` op should be inserted there). + * Initializes special map `BroadcastableInputsMap = [Index of Parameter -> Index of broadcastable dimension from end]` + * Notes: + * - Must be called after Canonicalization pass + * - Doesn't support `layouts` in PortDescriptors + * @ingroup snippets + */ +class AnalyzeBroadcastableInputs : public ov::pass::ModelPass { +public: + OPENVINO_RTTI("AnalyzeBroadcastableInputs"); + using BroadcastableInputsMap = std::map; + explicit AnalyzeBroadcastableInputs(BroadcastableInputsMap& map); + + bool run_on_model(const std::shared_ptr& m) override; + +private: + BroadcastableInputsMap& m_broadcastable_inputs; // non-owning: the map passed to the constructor must outlive this pass +}; + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/manager.hpp b/src/common/snippets/include/snippets/pass/manager.hpp index 075d449958557d..244c0d41681fe5 100644 --- a/src/common/snippets/include/snippets/pass/manager.hpp +++ b/src/common/snippets/include/snippets/pass/manager.hpp @@ -20,6 +20,8 @@ namespace pass { */ class Manager : public ov::pass::Manager { public: + Manager(std::shared_ptr pass_config = std::make_shared(), + std::string name = "UnnamedSnippetsManager"); ~Manager() override = default; using PassBase = ov::pass::PassBase; using Validate = ov::pass::Validate; diff --git 
a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index c51abc2848d6ee..cf9f6b3121782e 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -398,13 +398,17 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::data_flow_transformations") - ov::snippets::pass::Manager manager; + std::shared_ptr pass_config = std::make_shared(); // If subgraph has its own specific canonicalization, which is different with common behavior, will skip the this common one. // for example in GN, scale and bias shape [c] are canonicalized to [1,c,1,1], not [1,1,1,c]. Common canonicalization is disabled in this case. - if (!blocked_input_shapes.empty() && !config.m_has_broadcast_sensitive_ops) - manager.register_pass(blocked_input_shapes); - if (!input_precisions.empty() && !output_precisions.empty()) - manager.register_pass(input_precisions, output_precisions); + if (blocked_input_shapes.empty() || config.m_has_broadcast_sensitive_ops) + pass_config->disable(); + if (input_precisions.empty() || output_precisions.empty()) + pass_config->disable(); + + ov::snippets::pass::Manager manager(pass_config, "SnippetsDataFlowManager"); + manager.register_pass(blocked_input_shapes); + manager.register_pass(input_precisions, output_precisions); if (config.m_has_domain_sensitive_ops) { manager.register_pass(); diff --git a/src/common/snippets/src/pass/analyze_broadcastable_inputs.cpp b/src/common/snippets/src/pass/analyze_broadcastable_inputs.cpp new file mode 100644 index 00000000000000..bb5c63efa183de --- /dev/null +++ b/src/common/snippets/src/pass/analyze_broadcastable_inputs.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/analyze_broadcastable_inputs.hpp" + +#include <algorithm> +#include <memory> +#include <numeric> +#include <set> +#include <stack> +#include <vector> + +#include 
"snippets/lowered/pass/insert_broadcastmove.hpp" +#include "snippets/utils/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +AnalyzeBroadcastableInputs::AnalyzeBroadcastableInputs(BroadcastableInputsMap& map) : m_broadcastable_inputs(map) {} + +bool pass::AnalyzeBroadcastableInputs::run_on_model(const std::shared_ptr& body) { + RUN_ON_MODEL_SCOPE(AnalyzeBroadcastableInputs); + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AnalyzeBroadcastableInputs") + // Snippets supports tokenization of the following operations: + // - Unary, Binary and Ternary (Select) Elementwise ops + // - Softmax, MatMul, Transpose, GroupNorm + // Binary Elementwise ops (+ Select) require an explicit Broadcast op + // on inputs if broadcasting of the last dimensions is needed. + // These ops are the start points of the DFS: we walk up to the Parameters and update `m_broadcastable_inputs`. + // We iterate through all ops in execution order, so if some op in the input branch has already been analyzed - skip this branch. + // However, there are some ops which can change `processing_dim_idx`: + // - Transpose has order which changes `processing_dim_idx`. But Transpose can be only after Parameters and before Results. + // - MatMul's first input doesn't affect the last output dimension - skip this branch. + // Also MatMul has `transpose_b` which changes `processing_dim_idx` + m_broadcastable_inputs.clear(); + // Currently Broadcasting can be changed only if there are several Parameters in body + if (body->get_parameters().size() < 2) + return false; + + const auto& ops = body->get_ordered_ops(); + std::set> visited_ops = {}; + for (const auto& op : ops) { + if (!ov::snippets::lowered::pass::InsertBroadcastMove::is_broadcasting_supported(op)) + continue; + + size_t processing_dim_idx = 0; + + // We need to propagate `processing_dim_idx` from input of the current node to the parameter. 
+ // To do it we use DFS + std::stack> nodes_to_calculate; + nodes_to_calculate.push(op); + while (!nodes_to_calculate.empty()) { + auto current_node = nodes_to_calculate.top(); + nodes_to_calculate.pop(); + + if (const auto& param = ov::as_type_ptr(current_node)) { + const auto consumers = param->get_output_target_inputs(0); + if (std::any_of(consumers.cbegin(), consumers.cend(), + [](const ov::Input& in) { return ov::is_type(in.get_node()); })) { + OPENVINO_ASSERT(consumers.size() == 1, "Incorrect count of outputs of Parameter!"); + const auto transpose = consumers.begin()->get_node(); + std::vector order; + const auto& constant = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + OPENVINO_ASSERT(constant, "Unsupported order node of Transpose"); + order = constant->cast_vector(); + if (order.empty()) { + order.resize(transpose->get_output_partial_shape(0).size()); + std::iota(order.rbegin(), order.rend(), 0); + } + // `processing_dim_idx` starts from the end + processing_dim_idx = order.size() - 1 - ov::snippets::utils::get_input_dim_idx(order, processing_dim_idx); + } + const auto param_idx = body->get_parameter_index(param); + if (m_broadcastable_inputs.count(param_idx) == 0) { + m_broadcastable_inputs[param_idx] = processing_dim_idx; + } else { + OPENVINO_ASSERT(m_broadcastable_inputs.at(param_idx) == processing_dim_idx, + "Parameter has been already analyzed and has another processing dim index!"); + } + processing_dim_idx = 0; + continue; + } else if (ov::is_type(current_node)) { + visited_ops.insert(current_node); + continue; + } + + ov::OutputVector inputs = current_node->input_values(); + if (const auto mm = ov::as_type_ptr(current_node)) { + inputs = { current_node->input_value(1) }; + processing_dim_idx = static_cast(mm->get_transpose_b()); + } + + // not a leaf - continue to search + for (const auto& input_value : inputs) { + const auto& input_node = input_value.get_node()->shared_from_this(); + if (visited_ops.count(input_node) == 0) { + 
nodes_to_calculate.push(input_node); + } + } + } + + visited_ops.insert(op); + } + + return true; +} + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/pass/manager.cpp b/src/common/snippets/src/pass/manager.cpp index 81c46a1df50ddd..629bd1a8d05900 100644 --- a/src/common/snippets/src/pass/manager.cpp +++ b/src/common/snippets/src/pass/manager.cpp @@ -9,6 +9,9 @@ namespace ov { namespace snippets { namespace pass { +Manager::Manager(std::shared_ptr pass_config, std::string name) + : ov::pass::Manager(std::move(pass_config), std::move(name)) {} + std::shared_ptr Manager::register_pass_instance(const PassPosition& position, const std::shared_ptr& pass) { pass->set_pass_config(m_pass_config); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 789a78c0c2900d..074458a9ea2b1b 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -15,11 +15,12 @@ #include "snippets/pass/matmul_to_brgemm.hpp" #include "snippets/pass/propagate_precision.hpp" #include "snippets/pass/positioned_pass.hpp" +#include "snippets/pass/canonicalization.hpp" +#include "snippets/pass/analyze_broadcastable_inputs.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/pass/optimize_domain.hpp" #include "snippets/lowered/pass/insert_loops.hpp" #include "snippets/lowered/pass/mark_loops.hpp" -#include "snippets/lowered/pass/insert_broadcastmove.hpp" #include "transformations/defs.hpp" #include "transformations/cpu_opset/common/pass/convert_to_swish_cpu.hpp" #include "transformations/snippets/common/pass/mul_add_to_fma.hpp" @@ -540,104 +541,12 @@ void Subgraph::createPrimitive() { initPluginBlockedShapes(); initAttributes(); initStartOffsets(); - initBroadcastableInputs(); optimizeIR(); } Node::createPrimitive(); } -void Subgraph::initBroadcastableInputs() { - // Snippets supports 
tokenization of the following operations: - // - Unary, Binary and Ternary (Select) Elementwise ops - // - Softmax, MatMul, Transpose, GroupNorm - // Binary Elementwise ops (+ Select) requires explicit Broadcast op - // on inputs if broadcasting of latest dimensions is needed. - // These ops will be start points of DFS - need to go to Parameters and update `broadcastable_inputs_map`. - // We iterates through all ops by execution order. So if we already analyzied some op in the input branch - skip this branch. - // However, there some ops which can change `processing_dim_idx`: - // - Transpose has order which changes `processing_dim_idx`. But Transpose can be only after Parameters - // - MatMul's first input doesn't affect output latest dimension - skip this branch. - // Also MatMul has `transposed_b` which changes `processing_dim_idx` - broadcastable_inputs.clear(); - const auto& body = subgraph_attrs->snippet->body_ptr(); - // Currently Broadcasting can be changed only if there are several Parameters in body - if (body->get_parameters().size() < 2) - return; - - const auto& ops = body->get_ordered_ops(); - std::set> visited_ops = {}; - for (const auto& op : ops) { - if (!ov::snippets::lowered::pass::InsertBroadcastMove::is_broadcasting_supported(op)) - continue; - - size_t processing_dim_idx = 0; - - // We need to propagate `processing_dim_idx` from input of the current node to the parameter. 
- // To do it we use DFS - std::stack> nodes_to_calculate; - nodes_to_calculate.push(op); - while (!nodes_to_calculate.empty()) { - auto current_node = nodes_to_calculate.top(); - nodes_to_calculate.pop(); - - if (const auto& param = ov::as_type_ptr(current_node)) { - const auto consumers = param->get_output_target_inputs(0); - if (std::any_of(consumers.cbegin(), consumers.cend(), - [](const ov::Input& in) { return ov::is_type(in.get_node()); })) { - OPENVINO_ASSERT(consumers.size() == 1, "Incorrect count of outputs of Parameter!"); - const auto transpose = consumers.begin()->get_node(); - std::vector order; - const auto& constant = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); - OPENVINO_ASSERT(constant, "Unsupported order node of Transpose"); - order = constant->cast_vector(); - if (order.empty()) { - order.resize(transpose->get_output_partial_shape(0).size()); - std::iota(order.rbegin(), order.rend(), 0); - } - // `processing_dim_idx` starts from the end - processing_dim_idx = order.size() - 1 - ov::snippets::utils::get_input_dim_idx(order, processing_dim_idx); - } - const auto param_idx = body->get_parameter_index(param); - if (broadcastable_inputs.count(param_idx) == 0) { - broadcastable_inputs[param_idx] = processing_dim_idx; - } else { - OPENVINO_ASSERT(broadcastable_inputs.at(param_idx) == processing_dim_idx, - "Parameter has been already analyzed and has another processing dim index!"); - } - processing_dim_idx = 0; - continue; - } else if (ov::is_type(current_node)) { - visited_ops.insert(op); - continue; - } - - ov::OutputVector inputs = current_node->input_values(); - if (const auto mm = ov::as_type_ptr(current_node)) { - inputs = { current_node->input_value(1) }; - processing_dim_idx = static_cast(mm->get_transpose_b()); - } - - // not a leaf - continue to search - for (const auto& input_value : inputs) { - const auto& input_node = input_value.get_node()->shared_from_this(); - if (visited_ops.count(input_node) == 0) { - 
nodes_to_calculate.push(input_node); - } - } - } - - visited_ops.insert(op); - } - - OPENVINO_ASSERT(broadcastable_inputs.size() < srcMemPtrs.size() || broadcastable_inputs.rbegin()->first < srcMemPtrs.size(), - "Incorrect indexes of broadcastable inputs of Subgraph"); - for (const auto broadcastable_input : broadcastable_inputs) { - OPENVINO_ASSERT(broadcastable_input.second < in_shapes[broadcastable_input.first].size(), - "Incorrect processing dimension index of broadcastable index"); - } -} - void Subgraph::initMemoryPtrs() { srcMemPtrs.resize(input_num); dstMemPtrs.resize(output_num); @@ -711,7 +620,7 @@ void Subgraph::initPluginBlockedShapes() const { in_shapes[i] = srcMemPtrs[i]->getDescWithType()->getBlockDims(); } -Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const { +Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() { DataFlowPasses backend_passes; using PassPosition = ov::snippets::pass::PassPosition; @@ -719,6 +628,8 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const { # define SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(PASS_PLACE, PASS, ...) \ backend_passes.emplace_back(PassPosition(PASS_PLACE), std::make_shared(__VA_ARGS__)) +# define SNIPPETS_REGISTER_PASS_RELATIVE_COMMON(PASS_PLACE, TARGET_PASS, PASS, ...) \ + backend_passes.emplace_back(PassPosition(PASS_PLACE, TARGET_PASS::get_type_info_static()), std::make_shared(__VA_ARGS__)) #if defined(OPENVINO_ARCH_X86_64) # define SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64(PASS_PLACE, PASS, ...) 
\ @@ -731,6 +642,8 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const { #endif // OPENVINO_ARCH_X86_64 SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(Place::PipelineStart, ConvertToSwishCPU); + SNIPPETS_REGISTER_PASS_RELATIVE_COMMON(Place::After, ov::snippets::pass::Canonicalization, + ov::snippets::pass::AnalyzeBroadcastableInputs, broadcastable_inputs); if (context->getConfig().inferencePrecision == ov::element::bf16 && subgraph_attrs->snippet->has_domain_sensitive_ops()) { // enforce BF16 precisions to supported operations // MatMul has to be decomposed to Brgemm operations before enforcement @@ -754,6 +667,7 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const { #endif #undef SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON +#undef SNIPPETS_REGISTER_PASS_RELATIVE_COMMON #undef SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64 #undef SNIPPETS_REGISTER_PASS_RELATIVE_X86_64 @@ -808,6 +722,15 @@ void Subgraph::optimizeIR() { const auto precisions = getIOPrecisions(); subgraph->data_flow_transformations(in_blocked_shapes, precisions.first, precisions.second, getDataFlowPasses()); + // DataFlow transformations include the AnalyzeBroadcastableInputs pass - verify that every map key is a valid index into our blocked input shapes + OPENVINO_ASSERT(broadcastable_inputs.empty() || + broadcastable_inputs.rbegin()->first < in_shapes.size(), + "Incorrect indexes of broadcastable inputs of Subgraph"); + for (const auto& broadcastable_input : broadcastable_inputs) { + OPENVINO_ASSERT(broadcastable_input.second < in_shapes[broadcastable_input.first].size(), + "Incorrect processing dimension index of broadcastable index"); + } + // TODO: Snippets don't support backend-provided blocking, so we need to reshape body // using blocked shapes first. 
This can be removed after [121670] std::vector in_shapes; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 7ace694164068c..3b4e1a14f13328 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -61,7 +61,6 @@ class Subgraph : public Node { IShapeInfer::Result shapeInfer() const override; private: - void initBroadcastableInputs(); void initMemoryPtrs(); void initAttributes(); void initStartOffsets(); @@ -77,7 +76,7 @@ class Subgraph : public Node { using DataFlowPasses = std::vector; using ControlFlowPasses = std::vector; - DataFlowPasses getDataFlowPasses() const; + DataFlowPasses getDataFlowPasses(); ControlFlowPasses getControlFlowPasses() const; // Holds ISA version used is codeGeneration target