Skip to content

Commit

Permalink
[Snippets] Added generic check for broadcasting (openvinotoolkit#25772)
Browse files Browse the repository at this point in the history
### Details:
- *Created the analyzer-pass `AnalyzeBroadcastableInputs` to fill map
with broadcastable inputs*
- *Added tests with scalar dimension processing with domain sensitive
ops*
- *Fixed not-first code generation for the same Subgraph node - kernel
executor table should be reset*

### Tickets:
 - *136158*

### Prerequisites:
- [x] openvinotoolkit#25623
- [x] openvinotoolkit#25638
  • Loading branch information
a-sidorova authored Aug 12, 2024
1 parent d3fe9ff commit 5a7e226
Show file tree
Hide file tree
Showing 17 changed files with 225 additions and 37 deletions.
2 changes: 1 addition & 1 deletion src/common/snippets/include/snippets/generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class Generator {
* @param compile_params compile-time parameters used for code generation
* @return variable to handle the result
*/
LoweringResult generate(lowered::LinearIR& linear_ir, const void* compile_params = nullptr) const;
LoweringResult generate(const lowered::LinearIRPtr& linear_ir, const void* compile_params = nullptr) const;

/**
* @brief gets target machine
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ class InsertBroadcastMove : public RangedPass {
OPENVINO_RTTI("InsertBroadcastMove", "RangedPass")
bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;

private:
static bool is_broadcasting_supported(const std::shared_ptr<ov::Node>& n);
private:
static bool is_broadcasting_needed(const std::shared_ptr<ov::Node>& n);
static std::vector<size_t> get_last_dims(const ExpressionPtr& expr);
static size_t get_max_dim(const std::vector<size_t>& last_dims);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/pass.hpp"

namespace ov {
namespace snippets {
namespace pass {

/**
* @interface AnalyzeBroadcastableInputs
* @brief Analyzes body parameters which affects inputs of broadcastable operations (If needed, `Broadcast` op should be inserted there).s
* Also the pass initializes special map `BroadcastableInputsMap`
* Notes:
* - Must be called after Canonicalization pass
* - Doesn't support `layouts` in PortDescriptors
* @ingroup snippets
*/
class AnalyzeBroadcastableInputs : public ov::pass::ModelPass {
public:
OPENVINO_RTTI("AnalyzeBroadcastableInputs");
// [Index of Parameter -> Index of broadcastable dimension from end]
using BroadcastableInputsMap = std::map<size_t, size_t>;
AnalyzeBroadcastableInputs(BroadcastableInputsMap& map);

bool run_on_model(const std::shared_ptr<ov::Model>& m) override;

private:
BroadcastableInputsMap& m_broadcastable_inputs;
};

} // namespace pass
} // namespace snippets
} // namespace ov
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/pass/manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ namespace pass {
*/
class Manager : public ov::pass::Manager {
public:
Manager(std::shared_ptr<ov::pass::PassConfig> pass_config = std::make_shared<ov::pass::PassConfig>(),
std::string name = "UnnamedSnippetsManager");
~Manager() override = default;
using PassBase = ov::pass::PassBase;
using Validate = ov::pass::Validate;
Expand Down
5 changes: 5 additions & 0 deletions src/common/snippets/include/snippets/runtime_configurator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ class RuntimeConfigurator {
*/
void set_kernel_executor_table(std::shared_ptr<KernelExecutorTable> table) const;

/**
* @brief Reset KernelExecutor table
*/
void reset_kernel_executor_table() const;

protected:
/**
* @brief Update RuntimeConfig based on LinearIR
Expand Down
18 changes: 12 additions & 6 deletions src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,28 @@
namespace ov {
namespace snippets {

LoweringResult Generator::generate(lowered::LinearIR& linear_ir, const void* compile_params) const {
LoweringResult Generator::generate(const lowered::LinearIRPtr& linear_ir, const void* compile_params) const {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate")

// Before code gen we have to reset KernelExecutor Table - it should be empty
target->get_runtime_configurator()->reset_kernel_executor_table();

OV_ITT_TASK_CHAIN(GENERATE, ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::InitEmitters")

OPENVINO_ASSERT(target->is_supported(), "unsupported architecture for code generation");
linear_ir.init_emitters(target);
linear_ir->init_emitters(target);

OV_ITT_TASK_NEXT(GENERATE, "::EmitCode")

const auto kernel_op = op::Kernel::make_kernel(linear_ir);
const auto kernel_op = op::Kernel::make_kernel(*linear_ir);
kernel_op->compile_params = compile_params;
const auto kernel_expr = linear_ir.create_expression(kernel_op, std::vector<lowered::PortConnectorPtr>{});
const auto kernel_expr = linear_ir->create_expression(kernel_op, std::vector<lowered::PortConnectorPtr>{});
const auto kernel = target->get(kernel_expr->get_node()->get_type_info())(kernel_expr);

kernel->emit_code({}, {});

OV_ITT_TASK_NEXT(GENERATE, "::EmitData")
for (auto& l : linear_ir.get_ops()) {
for (auto& l : linear_ir->get_ops()) {
l->get_emitter()->emit_data();
}
OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")
Expand All @@ -41,13 +44,16 @@ LoweringResult Generator::generate(lowered::LinearIR& linear_ir, const void* com
// 1. some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
// 2. perf count node as field of emitter should be alive at runtime.
// 3. Emitters with segfault detector debug capability also need to be accessible at runtime.
for (const auto& expr : linear_ir) {
for (const auto& expr : *linear_ir) {
const auto& emitter = expr->get_emitter();
if (uses_precompiled_kernel(emitter))
result.m_saved_emitters.emplace_back(emitter);
}
result.compiled_snippet = target->get_snippet();
result.kernel_executor_table = target->get_runtime_configurator()->get_kernel_executor_table();
// Some kernel executors might've been registered during code emission.
// We need to update them, so appropriate kernels will be compiled.
result.kernel_executor_table->update_state(linear_ir);

return result;
}
Expand Down
2 changes: 1 addition & 1 deletion src/common/snippets/src/lowered/linear_ir.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,7 @@ double LinearIR::get_inserted_expr_exec_num(constExprIt insertion_pos) const {

// sync point to enumerate expressions
// 10 * eps - is to avoid meaningless result after (right_order + left_order) / 2 below
if ((1 - left_order/right_order) <= 10 * std::numeric_limits<double>::epsilon()) {
if (std::abs(1 - left_order/right_order) <= 10 * std::numeric_limits<double>::epsilon()) {
enumerate_expressions();
left_order = left_pos->get()->get_exec_num();
right_order = right_pos->get()->get_exec_num();
Expand Down
22 changes: 11 additions & 11 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,13 +398,17 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::data_flow_transformations")

ov::snippets::pass::Manager manager;
std::shared_ptr<ov::pass::PassConfig> pass_config = std::make_shared<ov::pass::PassConfig>();
// If the subgraph has its own specific canonicalization that differs from the common behavior, the common one is skipped.
// For example, in GN the scale and bias shapes [c] are canonicalized to [1,c,1,1], not [1,1,1,c], so the common canonicalization is disabled in this case.
if (!blocked_input_shapes.empty() && !config.m_has_broadcast_sensitive_ops)
manager.register_pass<snippets::pass::Canonicalization>(blocked_input_shapes);
if (!input_precisions.empty() && !output_precisions.empty())
manager.register_pass<snippets::pass::AlignElementTypes>(input_precisions, output_precisions);
if (blocked_input_shapes.empty() || config.m_has_broadcast_sensitive_ops)
pass_config->disable<snippets::pass::Canonicalization>();
if (input_precisions.empty() || output_precisions.empty())
pass_config->disable<snippets::pass::AlignElementTypes>();

ov::snippets::pass::Manager manager(pass_config, "SnippetsDataFlowManager");
manager.register_pass<snippets::pass::Canonicalization>(blocked_input_shapes);
manager.register_pass<snippets::pass::AlignElementTypes>(input_precisions, output_precisions);

if (config.m_has_domain_sensitive_ops) {
manager.register_pass<snippets::pass::MatMulToBrgemm>();
Expand Down Expand Up @@ -533,21 +537,17 @@ snippets::Schedule Subgraph::generate(const void* compile_params) const {
// Note: to not corrupt the lowered linear IR for the shape-dependent passes, we have to make a copy
OPENVINO_ASSERT(m_linear_ir, "Attempt to call generate, when linear IR was not initialized");
ov::snippets::lowered::ExpressionMap expression_map;
auto linear_ir = *lowered::LinearIRBuilder().clone(m_linear_ir, expression_map);
const auto linear_ir = lowered::LinearIRBuilder().clone(m_linear_ir, expression_map);

if (is_dynamic()) {
ov::snippets::lowered::pass::PassPipeline shape_dependent_pipeline;
shape_dependent_pipeline.register_pass<ov::snippets::lowered::pass::SetLoadStoreScalar>();
shape_dependent_pipeline.register_pass<ov::snippets::lowered::pass::InsertBroadcastMove>();
shape_dependent_pipeline.register_pass<ov::snippets::lowered::pass::LoadMoveBroadcastToBroadcastLoad>();
shape_dependent_pipeline.run(linear_ir);
shape_dependent_pipeline.run(*linear_ir);
}

auto lowering_result = m_generator->generate(linear_ir, compile_params);
// Some kernel executors might've been registered during code emission.
// We need to update them, so appropriate kernels will be compiled.
const auto& exec_table = get_runtime_configurator()->get_kernel_executor_table();
exec_table->update_state(m_linear_ir);
return {std::move(lowering_result)};
}

Expand Down
106 changes: 106 additions & 0 deletions src/common/snippets/src/pass/analyze_broadcastable_inputs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/pass/analyze_broadcastable_inputs.hpp"

#include "snippets/lowered/pass/insert_broadcastmove.hpp"
#include "snippets/utils/utils.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace pass {

// Binds the pass to the caller-owned map; the map is stored by reference, not copied.
AnalyzeBroadcastableInputs::AnalyzeBroadcastableInputs(BroadcastableInputsMap& map)
    : m_broadcastable_inputs(map) {
}

// Fills `m_broadcastable_inputs` with [Parameter index -> index (counted from the end) of the dimension
// that may need broadcasting] for every body Parameter reachable from an operation that supports broadcasting.
// The map is always cleared first. Returns false when the body has fewer than two Parameters
// (the map stays empty in this case), otherwise returns true.
// Throws (via OPENVINO_ASSERT) if a Parameter feeding a Transpose has several consumers, if the Transpose order
// is not a Constant, or if the same Parameter is reached with two different dimension indexes.
bool pass::AnalyzeBroadcastableInputs::run_on_model(const std::shared_ptr<ov::Model>& body) {
    RUN_ON_MODEL_SCOPE(AnalyzeBroadcastableInputs);
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AnalyzeBroadcastableInputs")
    // Snippets supports tokenization of the following operations:
    //  - Unary, Binary and Ternary (Select) Elementwise ops
    //  - Softmax, MatMul, Transpose, GroupNorm
    // Binary Elementwise ops (+ Select) require an explicit Broadcast op
    // on inputs if broadcasting of the latest dimensions is needed.
    // These ops are the start points of DFS - we go up to Parameters and update `m_broadcastable_inputs`.
    // We iterate through all ops in execution order. So if some op in the input branch has been already analyzed,
    // this branch is skipped.
    // However, there are some ops which can change `processing_dim_idx`:
    //  - Transpose has an order which changes `processing_dim_idx`.
    //    But Transpose can be only after Parameters and before Results.
    //  - MatMul's first input doesn't affect the output latest dimension - skip this branch.
    //    Also MatMul has `transposed_b` which changes `processing_dim_idx`
    m_broadcastable_inputs.clear();
    // Currently Broadcasting can be changed only if there are several Parameters in body
    if (body->get_parameters().size() < 2)
        return false;

    // One pending DFS step. The dimension index is stored together with the node:
    // a value changed in one branch (by MatMul or Transpose) must not leak into sibling branches
    // which are still waiting on the stack.
    struct PendingNode {
        std::shared_ptr<ov::Node> node;
        size_t processing_dim_idx;
    };

    const auto& ops = body->get_ordered_ops();
    std::set<std::shared_ptr<ov::Node>> visited_ops = {};
    for (const auto& op : ops) {
        if (!ov::snippets::lowered::pass::InsertBroadcastMove::is_broadcasting_supported(op))
            continue;

        // We need to propagate `processing_dim_idx` from the input of the current node to the parameter.
        // To do it we use DFS. The start op always processes the latest dimension (index 0 from the end).
        std::stack<PendingNode> nodes_to_calculate;
        nodes_to_calculate.push(PendingNode{op, 0});
        while (!nodes_to_calculate.empty()) {
            const auto current_node = nodes_to_calculate.top().node;
            auto processing_dim_idx = nodes_to_calculate.top().processing_dim_idx;
            nodes_to_calculate.pop();

            if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(current_node)) {
                const auto consumers = param->get_output_target_inputs(0);
                if (std::any_of(consumers.cbegin(), consumers.cend(),
                                [](const ov::Input<ov::Node>& in) { return ov::is_type<ov::op::v1::Transpose>(in.get_node()); })) {
                    OPENVINO_ASSERT(consumers.size() == 1, "Incorrect count of outputs of Parameter!");
                    const auto transpose = consumers.begin()->get_node();
                    const auto& constant = ov::as_type_ptr<const opset1::Constant>(transpose->get_input_node_shared_ptr(1));
                    OPENVINO_ASSERT(constant, "Unsupported order node of Transpose");
                    auto order = constant->cast_vector<size_t>();
                    // Empty order means the default Transpose behavior: reversed order of dimensions
                    if (order.empty()) {
                        order.resize(transpose->get_output_partial_shape(0).size());
                        std::iota(order.rbegin(), order.rend(), 0);
                    }
                    // `processing_dim_idx` starts from the end
                    processing_dim_idx = order.size() - 1 - ov::snippets::utils::get_input_dim_idx(order, processing_dim_idx);
                }
                const auto param_idx = body->get_parameter_index(param);
                if (m_broadcastable_inputs.count(param_idx) == 0) {
                    m_broadcastable_inputs[param_idx] = processing_dim_idx;
                } else {
                    OPENVINO_ASSERT(m_broadcastable_inputs.at(param_idx) == processing_dim_idx,
                                    "Parameter has been already analyzed and has another processing dim index!");
                }
                continue;
            }

            // Constants are leaves which don't contribute to the map
            if (ov::is_type<ov::op::v0::Constant>(current_node))
                continue;

            ov::OutputVector inputs = current_node->input_values();
            if (const auto mm = ov::as_type_ptr<ov::op::v0::MatMul>(current_node)) {
                // Only the second input is followed; `transpose_b` selects which of its dimensions is processed
                inputs = { current_node->input_value(1) };
                processing_dim_idx = static_cast<size_t>(mm->get_transpose_b());
            }

            // not a leaf - continue to search
            for (const auto& input_value : inputs) {
                const auto& input_node = input_value.get_node()->shared_from_this();
                if (visited_ops.count(input_node) == 0) {
                    nodes_to_calculate.push(PendingNode{input_node, processing_dim_idx});
                }
            }
        }

        visited_ops.insert(op);
    }

    return true;
}

} // namespace pass
} // namespace snippets
} // namespace ov
3 changes: 3 additions & 0 deletions src/common/snippets/src/pass/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ namespace ov {
namespace snippets {
namespace pass {

// Hands both the pass configuration and the manager name over to the base ov::pass::Manager.
Manager::Manager(std::shared_ptr<ov::pass::PassConfig> pass_config, std::string name)
    : ov::pass::Manager(std::move(pass_config), std::move(name)) {
}

std::shared_ptr<Manager::PassBase> Manager::register_pass_instance(const PassPosition& position,
const std::shared_ptr<PassBase>& pass) {
pass->set_pass_config(m_pass_config);
Expand Down
4 changes: 4 additions & 0 deletions src/common/snippets/src/runtime_configurator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ RuntimeConfigurator::RuntimeConfigurator(std::shared_ptr<RuntimeConfig> c) :
OPENVINO_ASSERT(m_config, "Runtime config is nullptr!");
}

// Replaces the KernelExecutor table stored in the runtime config with a newly created one.
void RuntimeConfigurator::reset_kernel_executor_table() const {
    const auto fresh_table = std::make_shared<ov::snippets::KernelExecutorTable>();
    m_config->kernel_executor_table = fresh_table;
}

const std::shared_ptr<RuntimeConfig>& RuntimeConfigurator::get_updated_config(const lowered::LinearIRCPtr& linear_ir) {
// First initialization
if (m_io_num == 0)
Expand Down
28 changes: 21 additions & 7 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include "snippets/pass/matmul_to_brgemm.hpp"
#include "snippets/pass/propagate_precision.hpp"
#include "snippets/pass/positioned_pass.hpp"
#include "snippets/pass/canonicalization.hpp"
#include "snippets/pass/analyze_broadcastable_inputs.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/optimize_domain.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
Expand Down Expand Up @@ -618,14 +620,16 @@ void Subgraph::initPluginBlockedShapes() const {
in_shapes[i] = srcMemPtrs[i]->getDescWithType<BlockedMemoryDesc>()->getBlockDims();
}

Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const {
Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() {
DataFlowPasses backend_passes;

using PassPosition = ov::snippets::pass::PassPosition;
using Place = PassPosition::Place;

# define SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(PASS_PLACE, PASS, ...) \
backend_passes.emplace_back(PassPosition(PASS_PLACE), std::make_shared<PASS>(__VA_ARGS__))
# define SNIPPETS_REGISTER_PASS_RELATIVE_COMMON(PASS_PLACE, TARGET_PASS, PASS, ...) \
backend_passes.emplace_back(PassPosition(PASS_PLACE, TARGET_PASS::get_type_info_static()), std::make_shared<PASS>(__VA_ARGS__))

#if defined(OPENVINO_ARCH_X86_64)
# define SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64(PASS_PLACE, PASS, ...) \
Expand All @@ -638,6 +642,8 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const {
#endif // OPENVINO_ARCH_X86_64

SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(Place::PipelineStart, ConvertToSwishCPU);
SNIPPETS_REGISTER_PASS_RELATIVE_COMMON(Place::After, ov::snippets::pass::Canonicalization,
ov::snippets::pass::AnalyzeBroadcastableInputs, broadcastable_inputs);
if (context->getConfig().inferencePrecision == ov::element::bf16 && subgraph_attrs->snippet->has_domain_sensitive_ops()) {
// enforce BF16 precisions to supported operations
// MatMul has to be decomposed to Brgemm operations before enforcement
Expand All @@ -661,6 +667,7 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() const {
#endif

#undef SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON
#undef SNIPPETS_REGISTER_PASS_RELATIVE_COMMON
#undef SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64
#undef SNIPPETS_REGISTER_PASS_RELATIVE_X86_64

Expand Down Expand Up @@ -698,14 +705,11 @@ Subgraph::ControlFlowPasses Subgraph::getControlFlowPasses() const {
}

uint8_t Subgraph::getBroadcastingMask(const std::vector<VectorDims>& input_shapes) {
// TODO: add check for non-eltwise inputs
if (subgraph_attrs->snippet->has_domain_sensitive_ops())
return 0;

uint8_t mask = 0;
for (const auto& shape : input_shapes) {
for (const auto& broadcastable_input : broadcastable_inputs) {
const auto& shape = input_shapes[broadcastable_input.first];
mask = mask << 1;
if (shape.back() == 1)
if (*(shape.rbegin() + broadcastable_input.second) == 1)
mask = mask | 1;
}
return mask;
Expand All @@ -718,6 +722,16 @@ void Subgraph::optimizeIR() {
const auto precisions = getIOPrecisions();
subgraph->data_flow_transformations(in_blocked_shapes, precisions.first, precisions.second, getDataFlowPasses());

// DataFlow transformations includes AnalyzeBroadcastableInputs pass:
// we should verify that the received map is aligned with our blocked input shapes
OPENVINO_ASSERT((broadcastable_inputs.size() < in_shapes.size()) ||
(!broadcastable_inputs.empty() && broadcastable_inputs.rbegin()->first < in_shapes.size()),
"Incorrect indexes of broadcastable inputs of Subgraph");
for (const auto broadcastable_input : broadcastable_inputs) {
OPENVINO_ASSERT(broadcastable_input.second < in_shapes[broadcastable_input.first].size(),
"Incorrect processing dimension index of broadcastable index");
}

// TODO: Snippets don't support backend-provided blocking, so we need to reshape body
// using blocked shapes first. This can be removed after [121670]
std::vector<snippets::VectorDimsRef> in_shapes;
Expand Down
Loading

0 comments on commit 5a7e226

Please sign in to comment.