From 9b6aed47c0169b0936d9c108eacd4b64bd150aa6 Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Wed, 3 Jul 2024 11:35:18 +0400
Subject: [PATCH 1/6] [Snippets][CPU] Supported single evaluation of Brgemm in
 a tail loop with dynamic M

[Snippets][CPU] Supported evaluate-once Loops in the dynamic case
[Snippets][CPU] Fixed FULL_DIM in subtensor and updated BrgemmBlocking
[Snippets] Updated default value in UpdateSubtensors
[Snippets][CPU] Supported Brgemm subtensor update at runtime
---
 .../include/snippets/lowered/loop_info.hpp    | 13 ++++
 src/common/snippets/src/lowered/loop_info.cpp | 15 ++++-
 .../pass/optimize_loop_single_evaluation.cpp  | 44 ++++++------
 .../src/lowered/pass/propagate_subtensors.cpp | 37 +++++++++-
 .../snippets/src/lowered/port_descriptor.cpp  |  3 +-
 .../snippets/src/op/serialization_node.cpp    |  3 +-
 .../snippets/src/runtime_configurator.cpp     | 16 +++--
 .../snippets/tests/include/lir_test_utils.hpp | 30 +++++----
 .../snippets/tests/src/lir_test_utils.cpp     | 32 ++++-----
 .../snippets/cpu_runtime_configurator.cpp     | 59 +++++++++++++++-
 .../snippets/cpu_runtime_configurator.hpp     | 12 +++-
 .../snippets/x64/jit_loop_emitters.cpp        |  9 ++-
 .../snippets/x64/kernel_executors/brgemm.cpp  | 31 +++++++--
 .../snippets/x64/kernel_executors/brgemm.hpp  |  1 +
 .../x64/pass/lowered/brgemm_blocking.cpp      | 47 ++++++++++---
 .../x64/pass/lowered/cpu_iter_handlers.cpp    | 22 ++++++
 .../x64/pass/lowered/cpu_iter_handlers.hpp    | 22 +++++-
 .../snippets/matmul.cpp                       | 32 ++++++++-
 .../x64/lowered/brgemm_blocking.cpp           | 67 +++++++++++++------
 19 files changed, 387 insertions(+), 108 deletions(-)

diff --git a/src/common/snippets/include/snippets/lowered/loop_info.hpp b/src/common/snippets/include/snippets/lowered/loop_info.hpp
index e763f2244d76c6..a38218626dcad1 100644
--- a/src/common/snippets/include/snippets/lowered/loop_info.hpp
+++ b/src/common/snippets/include/snippets/lowered/loop_info.hpp
@@ -474,7 +474,18 @@ class ExpandedLoopInfo : public LoopInfo {
      * @return const ref of `m_data_sizes`
      */
     const std::vector<int64_t>& get_data_sizes() const;
+    /**
+     * @brief Returns true if the current Loop should be executed only once;
+     *        otherwise returns false
+     * @return `m_evaluance_once`
+     */
+    bool is_evaluate_once() const;
+    /**
+     * @brief Sets the value of `m_evaluance_once`
+     * @param value - new value of `m_evaluance_once`
+     */
+    void set_evaluate_once(bool value);
     /**
      * @brief Update `m_ptr_increments` using copy values from `new_values`.
      *        The count of new values must be equal to the count of current increments.
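[Note on the evaluate-once contract introduced above: a loop whose work amount equals its increment, or a dynamic loop explicitly flagged via set_evaluate_once(true), runs its body exactly once, so the per-iteration pointer increments can be folded into the finalization offsets and then zeroed. A minimal standalone sketch of that fold follows; the struct is an illustrative stand-in, not the real ExpandedLoopInfo, and the real pass (OptimizeLoopSingleEvaluation, further down in this patch) uses dynamic-value-safe add/mul helpers instead of plain arithmetic:]

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Illustrative stand-in for ExpandedLoopInfo (sketch only).
    struct LoopParams {
        int64_t increment = 0;                      // equals work_amount when the loop runs once
        std::vector<int64_t> ptr_increments;        // per-port shift applied on every iteration
        std::vector<int64_t> finalization_offsets;  // per-port shift applied once after the loop
        bool evaluate_once = false;
    };

    // Fold one iteration's pointer shifts into the finalization offsets and
    // zero the per-iteration increments: a single iteration needs no in-loop shifts.
    void make_evaluate_once(LoopParams& loop) {
        for (size_t i = 0; i < loop.finalization_offsets.size(); ++i)
            loop.finalization_offsets[i] += loop.ptr_increments[i] * loop.increment;
        loop.ptr_increments.assign(loop.ptr_increments.size(), 0);
        loop.evaluate_once = true;
    }

[This is also why the second patch in this series simplifies RuntimeConfigurator::update_loop_info to refresh only the finalization offsets for evaluate-once loops: the pointer increments are already known to be zero.]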
@@ -517,6 +528,8 @@ class ExpandedLoopInfo : public LoopInfo { const SpecificLoopIterType m_type = {}; std::shared_ptr m_unified_loop_info = {}; + + bool m_evaluance_once = false; }; using ExpandedLoopInfoPtr = std::shared_ptr; diff --git a/src/common/snippets/src/lowered/loop_info.cpp b/src/common/snippets/src/lowered/loop_info.cpp index 6f14a52e750feb..7e32a49307ec4b 100644 --- a/src/common/snippets/src/lowered/loop_info.cpp +++ b/src/common/snippets/src/lowered/loop_info.cpp @@ -391,8 +391,11 @@ std::shared_ptr ExpandedLoopInfo::clone_with_new_expr(const Expression const auto& new_input_ports = clone_loop_ports(expr_map, m_input_ports); const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); - return std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, - m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, m_unified_loop_info, m_is_work_amount_const); + const auto cloned = std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, + m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, + m_unified_loop_info, m_is_work_amount_const); + cloned->m_evaluance_once = m_evaluance_once; + return cloned; } bool ExpandedLoopInfo::is_dynamic() const { @@ -435,6 +438,14 @@ const std::vector& ExpandedLoopInfo::get_data_sizes() const { return m_data_sizes; } +bool ExpandedLoopInfo::is_evaluate_once() const { + return m_evaluance_once; +} + +void ExpandedLoopInfo::set_evaluate_once(bool value) { + m_evaluance_once = value; +} + void ExpandedLoopInfo::update_ptr_increments(const std::vector& new_values) { OPENVINO_ASSERT(new_values.size() == m_ptr_increments.size(), "Failed to update ptr_increments: incompatible counts"); m_ptr_increments.assign(new_values.cbegin(), new_values.end()); diff --git a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp index c19bf7d65a2fef..b9d795393c00aa 100644 --- a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp +++ b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp @@ -4,6 +4,7 @@ #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" +#include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/op/loop.hpp" #include "snippets/utils/utils.hpp" @@ -16,30 +17,35 @@ namespace pass { bool OptimizeLoopSingleEvaluation::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::OptimizeLoopSingleEvaluation") + const auto& loop_manager = linear_ir.get_loop_manager(); + bool is_modified = false; for (auto expr_it = begin; expr_it != end; ++expr_it) { const auto& expr = *expr_it; if (auto loop_end = ov::as_type_ptr(expr->get_node())) { - // *1* solo vector/tail loop + empty outer loop - // => skip increments (both counter & ptr) : set evaluate_once flag - // *2* solo vector/tail loop + non-empty outer loop - // => skip counter increments but perform ptr increments : set evaluate_once, - // and perform pointer increments through finalization offsets - // *3* vector loop(s) + one tail loop - // => vector as usual, tail depends on outer loop, see *1* and *2* - if (loop_end->has_dynamic_params() || loop_end->get_work_amount() >= 2 * loop_end->get_increment()) - continue; - - auto new_finalization_offsets = loop_end->get_finalization_offsets(); - const auto& ptr_increments = 
loop_end->get_ptr_increments(); - const auto work_amount_incr = static_cast(loop_end->get_increment()); - for (size_t i = 0; i < new_finalization_offsets.size(); i++) { - new_finalization_offsets[i] += ptr_increments[i] * work_amount_incr; + const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); + const auto work_amount = loop_end->get_work_amount(); + const auto increment = loop_end->get_increment(); + + if ((!utils::is_dynamic_value(work_amount) && work_amount == increment) || (loop_info->is_evaluate_once())) { + auto new_finalization_offsets = loop_end->get_finalization_offsets(); + const auto& ptr_increments = loop_end->get_ptr_increments(); + const auto work_amount_incr = static_cast(increment); + for (size_t i = 0; i < new_finalization_offsets.size(); i++) { + const auto ptr_shift = utils::dynamic_safe_mul(ptr_increments[i], work_amount_incr); + new_finalization_offsets[i] = utils::dynamic_safe_add(new_finalization_offsets[i], ptr_shift); + } + loop_end->set_finalization_offsets(new_finalization_offsets); + loop_end->set_ptr_increments(std::vector(new_finalization_offsets.size(), 0)); + loop_end->set_evaluate_once(true); + + // Update the corresponding ExpandedLoopInfo + loop_info->update_ptr_increments(loop_end->get_ptr_increments()); + loop_info->update_finalization_offsets(loop_end->get_finalization_offsets()); + loop_info->set_evaluate_once(true); + + is_modified = true; } - loop_end->set_finalization_offsets(new_finalization_offsets); - loop_end->set_ptr_increments(std::vector(new_finalization_offsets.size(), 0)); - loop_end->set_evaluate_once(true); - is_modified = true; } } return is_modified; diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp index b58de6790c23a4..09d8491840d804 100644 --- a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -15,14 +15,43 @@ namespace snippets { namespace lowered { namespace pass { namespace { + +// The algorithm uses the following special values in subtensors/shapes: +// 1. Dynamic value in subtensor/shape : SIZE_MAX +// 2. Full fimension in subtensor : SIZE_MAX - 1 +// 3. Default value of `new_dim_value` : SIZE_MAX - 2 +// 4. `Forced` special dynamic value : SIZE_MAX - 3 +// +// We have to introduce `SPECIAL_DYNAMIC_VALUE` to distinguish `new_dim_value = DYNAMIC` +// from the real dynamic values in subtensors and shapes and force this value in subtensors. +// For example, there is Brgemm with the following info in the tail Loop: +// Input 0: shape [?, ?], existing subtensor [32, FULL_DIM] +// Input 1: shape [?, ?], existing subtensor [FULL_DIM, FULL_DIM] +// Output : shape [?, ?], existing subtensor [32, FULL_DIM] +// If the user wants to force `?` in the place of `32` in subtensors, the steps will be: +// 1. Set `?` to subtensor and shape of Input 0 : +// shape [?, ?] (shape has not been changed!), new subtensor [?, FULL_DIM] +// 2. Make shape inference of Brgemm and get Output: +// shape [?, ?] (shape has not been changed!), existing subtensor [FULL_DIM, FULL_DIM] +// 3. Update subtensor on output using shape: +// new_subtensor[i] = std::min(planar_shape[i], subtensor[i]); // i = 0: std::min(SIZE_MAX(?), 32) +// new subtensor [32, FULL_DIM] - has not been changed! 
But should be [?, FULL_DIM] +// Conculsion: we have to distinguish forced dynamic value with existing dynamic values in shape and subtensor + +constexpr size_t NEW_DEFAULT_VALUE = SIZE_MAX - 2; +constexpr size_t FORCED_DYNAMIC_VALUE = SIZE_MAX - 3; + void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, const LoopInfoPtr& loop_info, LinearIR::container::const_iterator begin, LinearIR::container::const_iterator end, bool most_outer_loop, - const size_t new_dim_value = SIZE_MAX) { - OPENVINO_ASSERT(snippets::utils::implication(most_outer_loop, new_dim_value != SIZE_MAX), + size_t new_dim_value = NEW_DEFAULT_VALUE) { + // Marks the forced dynamic value + new_dim_value = utils::is_dynamic_value(new_dim_value) ? FORCED_DYNAMIC_VALUE : new_dim_value; + OPENVINO_ASSERT(snippets::utils::implication(most_outer_loop, new_dim_value != NEW_DEFAULT_VALUE), "if the updated subtensor propagation was called for the outer loop, new_dim_value must not be equal to default value"); + std::map original_shapes; // First step: set new dim value to the corresponding input_ports' dimensions if (most_outer_loop) { @@ -78,7 +107,9 @@ void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, const size_t subtensor_start = planar_dims.size() - subtensor.size(); VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end()); for (size_t i = 0; i < new_subtensor.size(); ++i) { - new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]); + // If user forces dynamic value to set in subtensor, set real dynamic dimension using `get_dynamic_value()` + new_subtensor[i] = new_subtensor[i] == FORCED_DYNAMIC_VALUE ? utils::get_dynamic_value() + : std::min(new_subtensor[i], subtensor[i]); } desc->set_subtensor(new_subtensor); } diff --git a/src/common/snippets/src/lowered/port_descriptor.cpp b/src/common/snippets/src/lowered/port_descriptor.cpp index 3280be29973b69..612dd0e8b9bf58 100644 --- a/src/common/snippets/src/lowered/port_descriptor.cpp +++ b/src/common/snippets/src/lowered/port_descriptor.cpp @@ -9,7 +9,8 @@ namespace ov { namespace snippets { namespace lowered { -size_t PortDescriptor::ServiceDimensions::FULL_DIM = SIZE_MAX; +// SIZE_MAX - is dynamic value +size_t PortDescriptor::ServiceDimensions::FULL_DIM = SIZE_MAX - 1; PortDescriptor::PortDescriptor(const ov::Input& in, VectorDims subtensor_shape, std::vector layout) : PortDescriptor(ov::Input(in.get_node(), in.get_index()), std::move(subtensor_shape), std::move(layout)) {} diff --git a/src/common/snippets/src/op/serialization_node.cpp b/src/common/snippets/src/op/serialization_node.cpp index cb17e8a57ddf24..1d58cec7aa0ef8 100644 --- a/src/common/snippets/src/op/serialization_node.cpp +++ b/src/common/snippets/src/op/serialization_node.cpp @@ -49,7 +49,8 @@ bool SerializationNode::visit_attributes(AttributeVisitor &visitor) { std::stringstream ss; for (size_t i = 0; i < subtensor.size(); ++i) { const auto& v = subtensor[i]; - const auto v_str = (v == lowered::PortDescriptor::ServiceDimensions::FULL_DIM) ? "FULL_DIM" : std::to_string(v); + const auto v_str = v == lowered::PortDescriptor::ServiceDimensions::FULL_DIM ? "FULL_DIM" : + (utils::is_dynamic_value(v) ? "?" : std::to_string(v)); const auto del = i < subtensor.size() - 1 ? 
", " : ""; ss << v_str << del; } diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index c3db1864bf1135..45a7ec72b734ae 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -185,11 +185,19 @@ void RuntimeConfigurator::update_loop_info(const std::shared_ptrget_work_amount(); - expanded_loop_info->update_ptr_increments(ptr_increments); - if (current_work_amount > 0) { - expanded_loop_info->update_finalization_offsets(std::vector(finalization_offsets.size(), 0)); + if (expanded_loop_info->is_evaluate_once()) { + expanded_loop_info->update_ptr_increments(std::vector(ptr_increments.size(), 0)); + auto updated_finalization_offsets = current_work_amount > 0 ? std::vector(finalization_offsets.size(), 0) : finalization_offsets; + // work_amount is equal to increment in cases with `evaluate_once` + for (size_t i = 0; i < updated_finalization_offsets.size(); ++i) + updated_finalization_offsets[i] += ptr_increments[i] * expanded_loop_info->get_work_amount(); + expanded_loop_info->update_finalization_offsets(updated_finalization_offsets); } else { - expanded_loop_info->update_finalization_offsets(finalization_offsets); + expanded_loop_info->update_ptr_increments(ptr_increments); + if (current_work_amount > 0) + expanded_loop_info->update_finalization_offsets(std::vector(finalization_offsets.size(), 0)); + else + expanded_loop_info->update_finalization_offsets(finalization_offsets); } } } diff --git a/src/common/snippets/tests/include/lir_test_utils.hpp b/src/common/snippets/tests/include/lir_test_utils.hpp index 2f687f6e1412d1..ebe2e63cc2e66f 100644 --- a/src/common/snippets/tests/include/lir_test_utils.hpp +++ b/src/common/snippets/tests/include/lir_test_utils.hpp @@ -51,13 +51,14 @@ void init_expr_descriptors(const ov::snippets::lowered::ExpressionPtr& expr, * @param linear_ir linear_ir in which loop info should be added * @param entries entry points of loop * @param exits exit points of loop + * @return ID of created loop */ -void create_and_add_unified_loop_info(const std::shared_ptr& linear_ir, - size_t work_amount, - size_t increment, - const std::vector& entries, - const std::vector& exits, - bool add_default_handlers = true); +size_t create_and_add_unified_loop_info(const std::shared_ptr& linear_ir, + size_t work_amount, + size_t increment, + const std::vector& entries, + const std::vector& exits, + bool add_default_handlers = true); /** * @brief Creates unified loop info based on provided entry and exit points, and adds it to the linear_ir's loops map. * Meanwhile set loop id to expr range [loop_begin_pos, loop_end_pos). 
@@ -67,15 +68,16 @@ void create_and_add_unified_loop_info(const std::shared_ptr& linear_ir, - ov::snippets::lowered::LinearIR::constExprIt loop_begin_pos, - ov::snippets::lowered::LinearIR::constExprIt loop_end_pos, - size_t work_amount, - size_t increment, - const std::vector& entries, - const std::vector& exits, - bool add_default_handlers = true); +size_t create_and_add_unified_loop_info(const std::shared_ptr& linear_ir, + ov::snippets::lowered::LinearIR::constExprIt loop_begin_pos, + ov::snippets::lowered::LinearIR::constExprIt loop_end_pos, + size_t work_amount, + size_t increment, + const std::vector& entries, + const std::vector& exits, + bool add_default_handlers = true); } // namespace snippets } // namespace test } // namespace ov diff --git a/src/common/snippets/tests/src/lir_test_utils.cpp b/src/common/snippets/tests/src/lir_test_utils.cpp index 274480fcd84c85..91f5e803784dec 100644 --- a/src/common/snippets/tests/src/lir_test_utils.cpp +++ b/src/common/snippets/tests/src/lir_test_utils.cpp @@ -85,26 +85,26 @@ void init_expr_descriptors(const ov::snippets::lowered::ExpressionPtr& expr, } } -void create_and_add_unified_loop_info(const LinearIRPtr& linear_ir, - size_t work_amount, - size_t increment, - const std::vector& entries, - const std::vector& exits, - bool set_default_handlers) { +size_t create_and_add_unified_loop_info(const LinearIRPtr& linear_ir, + size_t work_amount, + size_t increment, + const std::vector& entries, + const std::vector& exits, + bool set_default_handlers) { // Equal begin and end iterators are set to avoid expressions marking with new loop id - create_and_add_unified_loop_info(linear_ir, linear_ir->begin(), linear_ir->begin(), work_amount, increment, entries, exits, set_default_handlers); + return create_and_add_unified_loop_info(linear_ir, linear_ir->begin(), linear_ir->begin(), work_amount, increment, entries, exits, set_default_handlers); } -void create_and_add_unified_loop_info(const LinearIRPtr& linear_ir, - ov::snippets::lowered::LinearIR::constExprIt loop_begin_pos, - ov::snippets::lowered::LinearIR::constExprIt loop_end_pos, - size_t work_amount, - size_t increment, - const std::vector& entries, - const std::vector& exits, - bool set_default_handlers) { +size_t create_and_add_unified_loop_info(const LinearIRPtr& linear_ir, + ov::snippets::lowered::LinearIR::constExprIt loop_begin_pos, + ov::snippets::lowered::LinearIR::constExprIt loop_end_pos, + size_t work_amount, + size_t increment, + const std::vector& entries, + const std::vector& exits, + bool set_default_handlers) { const auto& loop_manager = linear_ir->get_loop_manager(); - loop_manager->mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers); + return loop_manager->mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers); } } // namespace snippets diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 1f6bd487032730..b7b1ef9461ed8a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -4,6 +4,7 @@ #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "snippets/utils/utils.hpp" #include "snippets/lowered/loop_manager.hpp" @@ -18,8 +19,40 @@ void CPURuntimeConfigurator::update(const 
std::shared_ptris_dynamic()) { + const auto& loop_manager = linear_ir->get_loop_manager(); + update_loop_args(loop_manager); + update_brgemms(loop_manager); get_kernel_executor_table()->update_state(); - update_loop_args(linear_ir); + } +} + +void CPURuntimeConfigurator::initialization(const std::shared_ptr& linear_ir) { + RuntimeConfigurator::initialization(linear_ir); + + for (const auto& expr : *linear_ir) { + if (ov::is_type(expr->get_node())) { + const auto& in0_desc = expr->get_input_port_descriptor(0); + const auto& in1_desc = expr->get_input_port_descriptor(1); + const auto& out_desc = expr->get_output_port_descriptor(0); + + const auto& in0_subtensor = in0_desc->get_subtensor(); + const auto& in1_subtensor = in1_desc->get_subtensor(); + const auto& out_subtensor = out_desc->get_subtensor(); + + // TODO [146125]: At the moment only blocking by dynamic M is supported + // So we save Brgemm with only dynamic M + // If there are other dynamic dimensions, throw exception for now + OPENVINO_ASSERT(!snippets::utils::is_dynamic_value(*in0_subtensor.crbegin()) && + !snippets::utils::is_dynamic_value(*in1_subtensor.crbegin()) && + !snippets::utils::is_dynamic_value(*(++in1_subtensor.crbegin())) && + !snippets::utils::is_dynamic_value(*out_subtensor.crbegin()), + "CPURuntimeConfigurator supports only dynamic M in Brgemm subtensors"); + OPENVINO_ASSERT(*(++in0_subtensor.crbegin()) == *(++out_subtensor.crbegin()), + "Incorrect values in subtensors of BrgemmCPU"); + + if (snippets::utils::is_dynamic_value(*(++in0_subtensor.crbegin()))) + m_dynamic_brgemms.push_back(expr); + } } } @@ -27,11 +60,11 @@ void CPURuntimeConfigurator::init_tensor_rank(const std::shared_ptrtensor_rank = std::max(linear_ir->get_master_shape().size(), rank6D); } -void CPURuntimeConfigurator::update_loop_args(const std::shared_ptr& linear_ir) const { +void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const { const auto& cpu_config = ov::as_type_ptr(m_config); OPENVINO_ASSERT(cpu_config, "CPURuntimeConfigurator expects CPURuntimeConfig"); - const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + const auto& loop_map = loop_manager->get_map(); cpu_config->loop_args.resize(loop_map.size()); for (const auto& loop : loop_map) { const auto& idx = loop.first; @@ -50,5 +83,25 @@ void CPURuntimeConfigurator::update_loop_args(const std::shared_ptrget_loop_ids(); + OPENVINO_ASSERT(!loop_ids.empty(), "Dynamic Brgemm must be in loops"); + // TODO [146125]: Loop by M is first one in `loop_ids` + const auto& expanded_loop_info = loop_manager->get_loop_info(loop_ids.front()); + const auto& block_size_m = expanded_loop_info->get_work_amount(); + + const auto& in_desc = brgemm_expr->get_input_port_descriptor(0); + const auto& out_desc = brgemm_expr->get_output_port_descriptor(0); + + auto in_subtensor = in_desc->get_subtensor(); + auto out_subtensor = out_desc->get_subtensor(); + *++in_subtensor.rbegin() = block_size_m; + *++out_subtensor.rbegin() = block_size_m; + in_desc->set_subtensor(in_subtensor); + out_desc->set_subtensor(out_subtensor); + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 6b3a54652097ae..b5ea975bb0c3c0 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -30,6 +30,11 @@ class 
CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { * @param linear_ir LinearIR */ void update(const std::shared_ptr& linear_ir) override; + /** + * @brief Allocate and intialize fields in RuntimeConfig and RuntimeConfigurator + * @param linear_ir LinearIR + */ + void initialization(const std::shared_ptr& linear_ir) override; /** * @brief Initializes tensor rank of config * @param linear_ir LinearIR @@ -39,9 +44,14 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig * @param linear_ir LinearIR */ - void update_loop_args(const std::shared_ptr& linear_ir) const; + void update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const; + /** + * @brief Update subtensors of Brgemms + */ + void update_brgemms(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const; const size_t rank6D = 6; + std::vector m_dynamic_brgemms = {}; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp index 6b99097872db37..cb6dfeb741109a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp @@ -41,7 +41,8 @@ void jit_loop_begin_emitter::validate_arguments(const std::vector &in, c // Note: the only expected output is work amount register (communicated to jit_loop_end_emitter) OV_CPU_JIT_EMITTER_ASSERT(out.size() == 1, "Invalid outputs size: expected 1 got " + std::to_string(out.size())); OV_CPU_JIT_EMITTER_ASSERT(loop_begin_label != nullptr && loop_end_label != nullptr, "has not inited labels!"); - OV_CPU_JIT_EMITTER_ASSERT(implication(is_work_amount_dynamic, !evaluate_once), "with dynamic work_amount cannot evaluate once!"); + OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_value(wa_increment) || evaluate_once, + "loop increment might be dynamic only if loop evaluates once!"); } void jit_loop_begin_emitter::emit_code(const std::vector &in, const std::vector &out, @@ -52,7 +53,8 @@ void jit_loop_begin_emitter::emit_code(const std::vector &in, const std: void jit_loop_begin_emitter::emit_impl(const std::vector& in, const std::vector& out) const { // If the loop evaulate once, we can skip loop begin code emission - if (evaluate_once) + // If work_amount is dynamic, we should get runtime `work_amount` - it might be `zero` and we should skip loop evaluation + if (evaluate_once && !is_work_amount_dynamic) return; Reg64 reg_work_amount = Reg64(static_cast(out.back())); @@ -124,7 +126,8 @@ void jit_loop_end_emitter::validate_arguments(const std::vector &in, con "Invalid finalization_offsets size: expected: ", io_size, " got ", finalization_offsets.size()); OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, "Invalid data_sizes size: expected: ", io_size, " got ", data_sizes.size()); OV_CPU_JIT_EMITTER_ASSERT(loop_end_label != nullptr && loop_begin_label != nullptr, "has not inited labels!"); - OV_CPU_JIT_EMITTER_ASSERT(implication(are_ptr_shifts_dynamic, !evaluate_once), "with dynamic data pointer shifts cannot evaluate once!"); + OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_value(wa_increment) || evaluate_once, + "loop increment might be dynamic only if loop evaluates once!"); } void jit_loop_end_emitter::emit_code(const std::vector &in, const std::vector &out, diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp index 6898fd18b587cd..69466faa8aaa5a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp @@ -18,7 +18,7 @@ using namespace dnnl::impl::cpu::x64; namespace { size_t init_hash(dnnl_data_type_t dt_in0, dnnl_data_type_t dt_in1, float beta, bool is_with_amx, - bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t isa) { + bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t isa) { size_t seed = 0; #define HASH(X) seed = hash_combine(seed, X) HASH(dt_in0); HASH(dt_in1); @@ -41,7 +41,7 @@ BrgemmKernelConfig::BrgemmKernelConfig(const element::Type& in0_dtype, const ele } bool BrgemmKernelConfig::is_completed() const { - return !utils::one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC); + return !utils::one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty(); } bool BrgemmKernelConfig::operator==(const BrgemmKernelConfig& rhs) const { @@ -54,11 +54,22 @@ bool BrgemmKernelConfig::operator==(const BrgemmKernelConfig& rhs) const { } void BrgemmKernelConfig::update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC) { - m_M = M; m_N = N; m_K = K; - m_LDA = LDA; m_LDB = LDB; m_LDC = LDC; + // If M is zero, it means that Brgemm won't be executed (in Loop with work_amount = 0, for example) + // To process this case, we have to make this Config as empty (nullify runtime parameters) + if (M == 0 && !utils::one_of(0, N, K, LDA, LDB, LDC)) { + m_M = 0; m_N = 0; m_K = 0; + m_LDA = 0; m_LDB = 0; m_LDC = 0; + } else { + m_M = M; m_N = N; m_K = K; + m_LDA = LDA; m_LDB = LDB; m_LDC = LDC; + } m_hash = compute_hash(); } +bool BrgemmKernelConfig::is_empty() const { + return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC); +} + BrgemmKernelConfig::operator amx_tile_config_t() const { amx_tile_config_t res; res.M = m_M; res.N = m_N; res.K = m_K; @@ -115,6 +126,14 @@ BrgemmKernelExecutor::BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kern std::shared_ptr BrgemmKernelExecutor::compile_kernel(const BrgemmKernelConfig& config) const { + std::shared_ptr compiled_kernel = std::make_shared(); + + // Brgemm is not executable - nothing to compile + if (config.is_empty()) { + compiled_kernel->compiled_kernel = std::unique_ptr(); + return compiled_kernel; + } + cpu::x64::brgemm_t desc; auto status = brgemm_desc_init(&desc, config.get_isa(), cpu::x64::brgemm_strd, config.get_dt_in0(), config.get_dt_in1(), @@ -122,10 +141,8 @@ std::shared_ptr BrgemmKernelExecutor::compile_kernel(const config.get_beta(), config.get_LDA(), config.get_LDB(), config.get_LDC(), config.get_M(), config.get_N(), config.get_K(), nullptr); - - auto compiled_kernel = std::make_shared(); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot initialize brgemm descriptor due to invalid params"); + if (config.is_with_amx()) { status = brgemm_init_tiles(desc, compiled_kernel->palette); OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot initialize brgemm tiles due to invalid params"); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp index b0dd9c465b66de..c87a7e93f3b3f7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp +++ 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp @@ -24,6 +24,7 @@ struct BrgemmKernelConfig : public snippets::KernelExecutorBase::GenericConfig { return std::unique_ptr( new BrgemmKernelConfig(*this)); } void update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC); + bool is_empty() const; dnnl_data_type_t get_dt_in0() const { return m_static_params->dt_in0; } dnnl_data_type_t get_dt_in1() const { return m_static_params->dt_in1; } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index 044a1f724e78c3..6b74b2e8eff1ba 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -9,6 +9,7 @@ #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/pass/pass.hpp" +#include "snippets/lowered/pass/propagate_subtensors.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/utils/utils.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -107,12 +108,20 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea const auto block_size_n = snippets::utils::is_dynamic_value(n) ? brgemm->get_n_block_size() : std::min(brgemm->get_n_block_size(), n); const auto block_size_k = snippets::utils::is_dynamic_value(k) ? brgemm->get_k_block_size() : std::min(brgemm->get_k_block_size(), k); - *++in_0_subtensor.rbegin() = block_size_m; - *++out_subtensor.rbegin() = block_size_m; - *in_1_subtensor.rbegin() = block_size_n; - *out_subtensor.rbegin() = block_size_n; - *in_0_subtensor.rbegin() = block_size_k; - *++in_1_subtensor.rbegin() = block_size_k; + // If block_size is dynamic, it means that Brgemm will process full tensor: + // subtensor[i] = FULL_DIM as by default + if (!snippets::utils::is_dynamic_value(block_size_m)) { + *++in_0_subtensor.rbegin() = block_size_m; + *++out_subtensor.rbegin() = block_size_m; + } + if (!snippets::utils::is_dynamic_value(block_size_n)) { + *in_1_subtensor.rbegin() = block_size_n; + *out_subtensor.rbegin() = block_size_n; + } + if (!snippets::utils::is_dynamic_value(block_size_k)) { + *in_0_subtensor.rbegin() = block_size_k; + *++in_1_subtensor.rbegin() = block_size_k; + } brgemm_expr->get_input_port_descriptor(0)->set_subtensor(in_0_subtensor); brgemm_expr->get_input_port_descriptor(1)->set_subtensor(in_1_subtensor); @@ -142,6 +151,15 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea } } + auto get_default_handlers = [](size_t work_amount, size_t block_size) { + SpecificIterationHandlers handlers; + const auto tail_size = snippets::utils::is_dynamic_value(work_amount) ? 
snippets::utils::get_dynamic_value() : work_amount % block_size; + if (tail_size != 0) + handlers.register_pass(tail_size); + handlers.register_pass(true); + return handlers; + }; + auto mark_m_blocking = [&](bool include_repacking) { const auto loop_begin_it = get_loop_begin_pos(linear_ir, expr_it, include_repacking); const auto loop_end_it = std::next(expr_it); @@ -154,7 +172,9 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea if (!include_repacking && brgemm_cpu && with_compensations(brgemm_cpu->get_type())) entries.emplace_back(brgemm_expr->get_input_port(2), false); const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; - loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits); + + const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits, false); + loop_manager->get_loop_info(id)->set_handlers(get_default_handlers(m, block_size_m)); }; auto mark_n_blocking = [&]() { @@ -165,7 +185,9 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea LoopPort(brgemm_expr->get_input_port(0), false), LoopPort(need_brgemm_copy_b ? copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1), true)}; const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; - loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits); + + const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits, false); + loop_manager->get_loop_info(id)->set_handlers(get_default_handlers(n, block_size_n)); }; auto mark_k_blocking = [&]() { @@ -176,9 +198,12 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea LoopPort(brgemm_expr->get_input_port(0), true, 0), LoopPort(need_brgemm_copy_b ? 
copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1), true, 1)}; const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), false)}; - const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits); - const auto& loop_info = loop_manager->get_loop_info(id); - loop_info->register_pass_to_handler(0.f); + + auto handlers = get_default_handlers(k, block_size_k); + handlers.register_pass(0.f); + + const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits, false); + loop_manager->get_loop_info(id)->set_handlers(handlers); }; const bool k_blocking = block_size_k != k; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp index d5e96b2a7339ba..97c8c2a5299f6b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp @@ -4,6 +4,7 @@ #include "cpu_iter_handlers.hpp" +#include "snippets/op/loop.hpp" #include "snippets/lowered/loop_manager.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -34,6 +35,27 @@ std::shared_ptr SetBrgemmBeta::merge(const st return nullptr; return merged_pass; } + +SetEvaluanceOnce::SetEvaluanceOnce(bool evaluation) : snippets::lowered::pass::RangedPass(), m_evaluation(evaluation) {} + +bool SetEvaluanceOnce::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + const auto& loop_end = ov::as_type_ptr(end->get()->get_node()); + OPENVINO_ASSERT(loop_end, "SetEvaluanceOnce expected LoopEnd node in iterator `end`."); + const auto& loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); + loop_info->set_evaluate_once(m_evaluation); + return true; +} + +std::shared_ptr SetEvaluanceOnce::merge(const std::shared_ptr& other) { + const auto merged_pass = std::make_shared(m_evaluation); + if (other == nullptr) + return merged_pass; + const auto casted_pass = ov::as_type_ptr(other); + if (!casted_pass || m_evaluation != casted_pass->m_evaluation) + return nullptr; + return merged_pass; +} + } // namespace pass } // namespace intel_cpu } // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp index 5da97e29796f70..7616954bc2cca5 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp @@ -25,7 +25,27 @@ class SetBrgemmBeta : public snippets::lowered::pass::RangedPass { std::shared_ptr merge(const std::shared_ptr& other) override; private: - float m_beta; + float m_beta = 0; +}; + +/** + * @interface SetEvaluanceOnce + * @brief The pass set `evaluate once` only to ExpandedLoopInfo which is mapped on LoopEnd in the passed iterator `end`. 
+ * The pointer arithmetic should be updated in the separate optimization `OptimizeLoopSingleEvaluation` + * @param m_evaluation - value which must be set + * @ingroup snippets + */ +class SetEvaluanceOnce : public snippets::lowered::pass::RangedPass { +public: + SetEvaluanceOnce(bool evaluation); + OPENVINO_RTTI("SetEvaluanceOnce", "RangedPass") + bool run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) override; + std::shared_ptr merge(const std::shared_ptr& other) override; + +private: + bool m_evaluation = false; }; } // namespace pass } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp index 778bcba7a235a0..8e1959224ef6a1 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -66,11 +66,39 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul, std::vector> input_shapes_dynamic{ + // All dimensions are dynamic { {PartialShape{-1, -1, -1, -1}, {{2, 1, 32, 64}, {2, 2, 10, 20}, {2, 2, 100, 80}, - {2, 2, 10, 20}, {2, 1, 32, 64}}}, + {2, 2, 10, 20}, {2, 1, 32, 64}, {2, 3, 64, 55}}}, {PartialShape{-1, -1, -1, -1}, {{1, 3, 64, 128}, {2, 2, 20, 30}, {2, 2, 80, 120}, - {2, 2, 20, 30}, {1, 3, 64, 128}}} + {2, 2, 20, 30}, {1, 3, 64, 128}, {2, 3, 55, 128}}} + }, + // Only M dimension is dynamic + one one loop by M + { + {PartialShape{-1, 2, -1, 64}, {{2, 2, 64, 64}, {2, 2, 64, 64}, {2, 2, 35, 64}, + {2, 2, 120, 64}, {2, 2, 15, 64}, {2, 2, 35, 64}}}, + {PartialShape{-1, 2, 64, 32}, {{2, 2, 64, 32}, {2, 2, 64, 32}, {1, 2, 64, 32}, + {1, 2, 64, 32}, {2, 2, 64, 32}, {1, 2, 64, 32}}} + }, + // Only M dimension is dynamic + all Loops (by M, N, K) + { + {PartialShape{2, 2, -1, 550}, {{2, 2, 64, 550}, {2, 2, 16, 550}, {2, 2, 35, 550}, + {2, 2, 16, 550}, {2, 2, 70, 550}, {2, 2, 64, 550}}}, + {PartialShape{2, 1, 550, 70}, {{2, 1, 550, 70}, {2, 1, 550, 70}, {2, 1, 550, 70}, + {2, 1, 550, 70}, {2, 1, 550, 70}, {2, 1, 550, 70}}} + }, + // Only K dimension is dynamic + { + {PartialShape{2, 2, 70, -1}, {{2, 2, 70, 128}, {2, 2, 70, 10}, {2, 2, 70, 33}, + {2, 2, 70, 35}, {2, 2, 70, 100}}}, + {PartialShape{2, 2, -1, 70}, {{2, 2, 128, 70}, {2, 2, 10, 70}, {2, 2, 33, 70}, + {2, 2, 35, 70}, {2, 2, 100, 70}}} + }, + // Only N dimension is dynamic + { + STATIC_SHAPE(2, 2, 65, 550), + {PartialShape{2, 2, 550, -1}, {{2, 2, 550, 70}, {2, 2, 550, 12}, {2, 2, 550, 70}, + {2, 2, 550, 12}, {2, 2, 550, 10}}} }, }; diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp index ef0ffcd70e6c39..2e6cd996cb9135 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp @@ -7,6 +7,8 @@ #include "lir_test_utils.hpp" #include "openvino/opsets/opset10.hpp" #include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/propagate_subtensors.hpp" +#include "snippets/lowered/pass/serialize_control_flow.hpp" #include "snippets/snippets_isa.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -22,6 +24,15 @@ using 
namespace ov::snippets; using BRGEMM_TYPE = intel_cpu::brgemm_utils::BRGEMM_TYPE; namespace { +SpecificIterationHandlers get_default_handlers(size_t work_amount, size_t block_size) { + SpecificIterationHandlers handlers; + const auto tail_size = snippets::utils::is_dynamic_value(work_amount) ? snippets::utils::get_dynamic_value() : work_amount % block_size; + if (tail_size != 0) + handlers.register_pass(tail_size); + handlers.register_pass(true); + return handlers; +} + void create_brgemm_loop_infos(const LinearIRPtr& linear_ir, const ExpressionPtr& brgemm_expr, size_t m = 0, size_t m_blk = 0, @@ -31,21 +42,29 @@ void create_brgemm_loop_infos(const LinearIRPtr& linear_ir, const bool n_block = k != 0 && k_blk != 0; const bool m_block = m != 0 && m_blk != 0; if (k_block) { - create_and_add_unified_loop_info(linear_ir, k, k_blk, - {LoopPort(brgemm_expr->get_input_port(0)), LoopPort(brgemm_expr->get_input_port(1), true, 1)}, - {LoopPort(brgemm_expr->get_output_port(0), false)}); - const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(0); + const size_t loop_id = create_and_add_unified_loop_info(linear_ir, k, k_blk, + {LoopPort(brgemm_expr->get_input_port(0)), + LoopPort(brgemm_expr->get_input_port(1), true, 1)}, + {LoopPort(brgemm_expr->get_output_port(0), false)}, false); + const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); + loop_info->set_handlers(get_default_handlers(k, k_block)); loop_info->register_pass_to_handler(0.f); } if (n_block) { - create_and_add_unified_loop_info(linear_ir, n, n_blk, - {LoopPort(brgemm_expr->get_input_port(0), false), LoopPort(brgemm_expr->get_input_port(1))}, - {LoopPort(brgemm_expr->get_output_port(0))}); + const size_t loop_id = create_and_add_unified_loop_info(linear_ir, n, n_blk, + {LoopPort(brgemm_expr->get_input_port(0), false), + LoopPort(brgemm_expr->get_input_port(1))}, + {LoopPort(brgemm_expr->get_output_port(0))}, false); + const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); + loop_info->set_handlers(get_default_handlers(n, n_block)); } if (m_block) { - create_and_add_unified_loop_info(linear_ir, m, m_blk, - {LoopPort(brgemm_expr->get_input_port(0), true, 1), LoopPort(brgemm_expr->get_input_port(1), false, 1)}, - {LoopPort(brgemm_expr->get_output_port(0), true, 1)}); + const size_t loop_id = create_and_add_unified_loop_info(linear_ir, m, m_blk, + {LoopPort(brgemm_expr->get_input_port(0), true, 1), + LoopPort(brgemm_expr->get_input_port(1), false, 1)}, + {LoopPort(brgemm_expr->get_output_port(0), true, 1)}, false); + const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); + loop_info->set_handlers(get_default_handlers(m, m_block)); } } @@ -59,22 +78,30 @@ void create_brgemm_with_copy_b_loop_infos(const LinearIRPtr& linear_ir, const bool n_block = k != 0 && k_blk != 0; const bool m_block = m != 0 && m_blk != 0; if (k_block) { - create_and_add_unified_loop_info(linear_ir, k, k_blk, - {LoopPort(brgemm_expr->get_input_port(0)), LoopPort(copy_b_expr->get_input_port(0), true, 1)}, - {LoopPort(brgemm_expr->get_output_port(0), false)}); - const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(0); + const size_t loop_id = create_and_add_unified_loop_info(linear_ir, k, k_blk, + {LoopPort(brgemm_expr->get_input_port(0)), + LoopPort(copy_b_expr->get_input_port(0), true, 1)}, + {LoopPort(brgemm_expr->get_output_port(0), false)}); + const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); + loop_info->set_handlers(get_default_handlers(k, 
k_block)); loop_info->register_pass_to_handler(0.f); } if (n_block) { - create_and_add_unified_loop_info(linear_ir, n, n_blk, - {LoopPort(brgemm_expr->get_input_port(0), false), LoopPort(copy_b_expr->get_input_port(0))}, - {LoopPort(brgemm_expr->get_output_port(0))}); + const size_t loop_id = create_and_add_unified_loop_info(linear_ir, n, n_blk, + {LoopPort(brgemm_expr->get_input_port(0), false), + LoopPort(copy_b_expr->get_input_port(0))}, + {LoopPort(brgemm_expr->get_output_port(0))}); + const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); + loop_info->set_handlers(get_default_handlers(n, n_block)); } if (m_block) { const auto& second_input_port = k_block || n_block ? copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1); - create_and_add_unified_loop_info(linear_ir, m, m_blk, - {LoopPort(brgemm_expr->get_input_port(0), true, 1), LoopPort(second_input_port, false, 1)}, - {LoopPort(brgemm_expr->get_output_port(0), true, 1)}); + const size_t loop_id = create_and_add_unified_loop_info(linear_ir, m, m_blk, + {LoopPort(brgemm_expr->get_input_port(0), true, 1), + LoopPort(second_input_port, false, 1)}, + {LoopPort(brgemm_expr->get_output_port(0), true, 1)}); + const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); + loop_info->set_handlers(get_default_handlers(m, m_block)); } } } // namespace From 6e703b2fe6583aa5e990372e25cf3dae35d87df0 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Fri, 12 Jul 2024 11:53:49 +0400 Subject: [PATCH 2/6] [Snippets] Applied Vladislav comments --- .../snippets/lowered/port_descriptor.hpp | 3 + .../src/lowered/pass/propagate_subtensors.cpp | 11 +- .../snippets/src/lowered/port_descriptor.cpp | 5 + .../snippets/src/runtime_configurator.cpp | 2 +- .../snippets/tests/include/lir_test_utils.hpp | 34 ---- .../snippets/tests/src/lir_test_utils.cpp | 22 --- .../pass/extracted_loop_invariants.cpp | 168 +++++++++--------- .../snippets/cpu_runtime_configurator.cpp | 13 +- .../snippets/cpu_runtime_configurator.hpp | 6 +- .../x64/pass/lowered/brgemm_blocking.cpp | 40 ++--- .../x64/pass/lowered/brgemm_blocking.hpp | 3 + .../x64/pass/lowered/cpu_iter_handlers.cpp | 12 +- .../x64/pass/lowered/cpu_iter_handlers.hpp | 8 +- .../snippets/matmul.cpp | 2 +- .../x64/lowered/brgemm_blocking.cpp | 84 ++++----- 15 files changed, 170 insertions(+), 243 deletions(-) diff --git a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp index 3fc429bec4df1e..926136abf0e8f8 100644 --- a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp +++ b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp @@ -54,6 +54,9 @@ class PortDescriptor { void set_reg_type(RegType type) { m_reg.type = type; } void set_reg_idx(size_t idx) { m_reg.idx = idx; } + // Indexing starts from the end (rbegin() + idx) + void set_subtensor_value(size_t idx, VectorDims::value_type value); + std::string serialize() const; bool empty() const { return m_layout.empty() && m_subtensor_shape.empty();} PortDescriptorPtr clone() const; diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp index 09d8491840d804..25635976350327 100644 --- a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -18,11 +18,11 @@ namespace { // The algorithm uses the following special values in subtensors/shapes: // 1. 
Dynamic value in subtensor/shape : SIZE_MAX -// 2. Full fimension in subtensor : SIZE_MAX - 1 +// 2. Full dimension in subtensor : SIZE_MAX - 1 // 3. Default value of `new_dim_value` : SIZE_MAX - 2 // 4. `Forced` special dynamic value : SIZE_MAX - 3 // -// We have to introduce `SPECIAL_DYNAMIC_VALUE` to distinguish `new_dim_value = DYNAMIC` +// We have to introduce `FORCED_DYNAMIC_VALUE` to distinguish `new_dim_value = DYNAMIC` // from the real dynamic values in subtensors and shapes and force this value in subtensors. // For example, there is Brgemm with the following info in the tail Loop: // Input 0: shape [?, ?], existing subtensor [32, FULL_DIM] @@ -36,7 +36,7 @@ namespace { // 3. Update subtensor on output using shape: // new_subtensor[i] = std::min(planar_shape[i], subtensor[i]); // i = 0: std::min(SIZE_MAX(?), 32) // new subtensor [32, FULL_DIM] - has not been changed! But should be [?, FULL_DIM] -// Conculsion: we have to distinguish forced dynamic value with existing dynamic values in shape and subtensor +// Conclusion: we have to distinguish forced dynamic value with existing dynamic values in shape and subtensor constexpr size_t NEW_DEFAULT_VALUE = SIZE_MAX - 2; constexpr size_t FORCED_DYNAMIC_VALUE = SIZE_MAX - 3; @@ -61,9 +61,8 @@ void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, const auto& expr = port.expr_port->get_expr(); const auto& desc = port.expr_port->get_descriptor_ptr(); auto subtensor = desc->get_subtensor(); - if (port.dim_idx < subtensor.size()) { - *(subtensor.rbegin() + port.dim_idx) = new_dim_value; - desc->set_subtensor(subtensor); + if (port.dim_idx < desc->get_subtensor().size()) { + desc->set_subtensor_value(port.dim_idx, new_dim_value); } const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); diff --git a/src/common/snippets/src/lowered/port_descriptor.cpp b/src/common/snippets/src/lowered/port_descriptor.cpp index 612dd0e8b9bf58..6f2776e4fd8fc2 100644 --- a/src/common/snippets/src/lowered/port_descriptor.cpp +++ b/src/common/snippets/src/lowered/port_descriptor.cpp @@ -54,6 +54,11 @@ void PortDescriptor::set_shape(const VectorDims& tensor) { *m_tensor_shape = tensor; } +void PortDescriptor::set_subtensor_value(size_t idx, VectorDims::value_type value) { + OPENVINO_ASSERT(idx < m_subtensor_shape.size(), "Failed to set subtensor value: idx should be less than size"); + *(m_subtensor_shape.rbegin() + idx) = value; +} + PortDescriptorPtr PortDescriptor::clone() const { auto desc = std::make_shared(*m_tensor_shape, m_subtensor_shape, m_layout); desc->set_reg(m_reg); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 45a7ec72b734ae..be9afb6fa702f5 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -186,7 +186,7 @@ void RuntimeConfigurator::update_loop_info(const std::shared_ptrget_work_amount(); if (expanded_loop_info->is_evaluate_once()) { - expanded_loop_info->update_ptr_increments(std::vector(ptr_increments.size(), 0)); + // Update only `finalization offsets`. `Ptr increments` are always zeroed in this case auto updated_finalization_offsets = current_work_amount > 0 ? 
std::vector(finalization_offsets.size(), 0) : finalization_offsets; // work_amount is equal to increment in cases with `evaluate_once` for (size_t i = 0; i < updated_finalization_offsets.size(); ++i) diff --git a/src/common/snippets/tests/include/lir_test_utils.hpp b/src/common/snippets/tests/include/lir_test_utils.hpp index ebe2e63cc2e66f..b653c86af8ab0b 100644 --- a/src/common/snippets/tests/include/lir_test_utils.hpp +++ b/src/common/snippets/tests/include/lir_test_utils.hpp @@ -44,40 +44,6 @@ void init_expr_descriptors(const ov::snippets::lowered::ExpressionPtr& expr, const std::vector& subtensors = {}, const std::vector& layouts = {}); -/** - * @brief Creates unified loop info based on provided entry and exit points, and adds it to the linear_ir's loops map - * @attention This helper wraps LoopManager::mark_loop method, but only for LoopInfo creation (whereas original - * mark_loop method also marks expressions with the corresponding loop info). - * @param linear_ir linear_ir in which loop info should be added - * @param entries entry points of loop - * @param exits exit points of loop - * @return ID of created loop - */ -size_t create_and_add_unified_loop_info(const std::shared_ptr& linear_ir, - size_t work_amount, - size_t increment, - const std::vector& entries, - const std::vector& exits, - bool add_default_handlers = true); -/** - * @brief Creates unified loop info based on provided entry and exit points, and adds it to the linear_ir's loops map. - * Meanwhile set loop id to expr range [loop_begin_pos, loop_end_pos). - * @attention This helper wraps LoopManager::mark_loop method, which also marks expressions with the corresponding loop info - * @param linear_ir linear_ir in which loop info should be added - * @param loop_begin_pos begin expr postion in this loop - * @param loop_end_pos end expr postion in this loop - * @param entries entry points of loop - * @param exits exit points of loop - * @return ID of created loop - */ -size_t create_and_add_unified_loop_info(const std::shared_ptr& linear_ir, - ov::snippets::lowered::LinearIR::constExprIt loop_begin_pos, - ov::snippets::lowered::LinearIR::constExprIt loop_end_pos, - size_t work_amount, - size_t increment, - const std::vector& entries, - const std::vector& exits, - bool add_default_handlers = true); } // namespace snippets } // namespace test } // namespace ov diff --git a/src/common/snippets/tests/src/lir_test_utils.cpp b/src/common/snippets/tests/src/lir_test_utils.cpp index 91f5e803784dec..eca5be9987118f 100644 --- a/src/common/snippets/tests/src/lir_test_utils.cpp +++ b/src/common/snippets/tests/src/lir_test_utils.cpp @@ -85,28 +85,6 @@ void init_expr_descriptors(const ov::snippets::lowered::ExpressionPtr& expr, } } -size_t create_and_add_unified_loop_info(const LinearIRPtr& linear_ir, - size_t work_amount, - size_t increment, - const std::vector& entries, - const std::vector& exits, - bool set_default_handlers) { - // Equal begin and end iterators are set to avoid expressions marking with new loop id - return create_and_add_unified_loop_info(linear_ir, linear_ir->begin(), linear_ir->begin(), work_amount, increment, entries, exits, set_default_handlers); -} - -size_t create_and_add_unified_loop_info(const LinearIRPtr& linear_ir, - ov::snippets::lowered::LinearIR::constExprIt loop_begin_pos, - ov::snippets::lowered::LinearIR::constExprIt loop_end_pos, - size_t work_amount, - size_t increment, - const std::vector& entries, - const std::vector& exits, - bool set_default_handlers) { - const auto& loop_manager = 
linear_ir->get_loop_manager(); - return loop_manager->mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers); -} - } // namespace snippets } // namespace test } // namespace ov diff --git a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp index c3f4f5ea7f6877..ee762f4bfca746 100644 --- a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp +++ b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp @@ -66,11 +66,11 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsWithParams) { auto result = linear_ir->push_node(sub.second); auto begin = multiply.first; auto end = result.first; - create_and_add_unified_loop_info(linear_ir, begin, end, 512, vector_size, - {LoopPort((*multiply.first)->get_input_port(0)), - LoopPort((*multiply.first)->get_input_port(1)), - LoopPort((*sub.first)->get_input_port(0))}, - {LoopPort((*sub.first)->get_output_port(0))}); + linear_ir->get_loop_manager()->mark_loop(begin, end, 512, vector_size, + std::vector{LoopPort((*multiply.first)->get_input_port(0)), + LoopPort((*multiply.first)->get_input_port(1)), + LoopPort((*sub.first)->get_input_port(0))}, + std::vector{LoopPort((*sub.first)->get_output_port(0))}); linear_ir->set_loop_depth(1); } { @@ -85,10 +85,10 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsWithParams) { auto result = linear_ir_ref->push_node(sub.second); auto begin = sub.first; auto end = result.first; - create_and_add_unified_loop_info(linear_ir_ref, begin, end, 512, vector_size, - {LoopPort((*sub.first)->get_input_port(0)), - LoopPort((*sub.first)->get_input_port(1))}, - {LoopPort((*sub.first)->get_output_port(0))}); + linear_ir_ref->get_loop_manager()->mark_loop(begin, end, 512, vector_size, + std::vector{LoopPort((*sub.first)->get_input_port(0)), + LoopPort((*sub.first)->get_input_port(1))}, + std::vector{LoopPort((*sub.first)->get_output_port(0))}); } } @@ -124,10 +124,10 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsWithScalar) { auto result = linear_ir->push_node(sub.second); auto begin = scalar.first; auto end = result.first; - create_and_add_unified_loop_info(linear_ir, begin, end, 512, vector_size, - {LoopPort((*multiply.first)->get_input_port(0)), - LoopPort((*sub.first)->get_input_port(0))}, - {LoopPort((*sub.first)->get_output_port(0))}); + linear_ir->get_loop_manager()->mark_loop(begin, end, 512, vector_size, + std::vector{LoopPort((*multiply.first)->get_input_port(0)), + LoopPort((*sub.first)->get_input_port(0))}, + std::vector{LoopPort((*sub.first)->get_output_port(0))}); linear_ir->set_loop_depth(1); } { @@ -142,10 +142,10 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsWithScalar) { auto result = linear_ir_ref->push_node(sub.second); auto begin = sub.first; auto end = result.first; - create_and_add_unified_loop_info(linear_ir_ref, begin, end, 512, vector_size, - {LoopPort((*sub.first)->get_input_port(0)), - LoopPort((*sub.first)->get_input_port(1))}, - {LoopPort((*sub.first)->get_output_port(0))}); + linear_ir_ref->get_loop_manager()->mark_loop(begin, end, 512, vector_size, + std::vector{LoopPort((*sub.first)->get_input_port(0)), + LoopPort((*sub.first)->get_input_port(1))}, + std::vector{LoopPort((*sub.first)->get_output_port(0))}); } } @@ -187,20 +187,20 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsOutputLoopUpdateNotNeed auto result1 = linear_ir->push_node(sub.second); auto begin = 
multiply.first; auto end = result1.first; - create_and_add_unified_loop_info(linear_ir, begin, end, 16, vector_size, - {LoopPort((*multiply.first)->get_input_port(0), true, 0), - LoopPort((*multiply.first)->get_input_port(1), true, 0), - LoopPort((*add.first)->get_input_port(0), true, 0), - LoopPort((*sub.first)->get_input_port(0), true, 0)}, - {LoopPort((*add.first)->get_output_port(0), true, 0), - LoopPort((*sub.first)->get_output_port(0), true, 0)}); - create_and_add_unified_loop_info(linear_ir, begin, end, 3, 1, - {LoopPort((*multiply.first)->get_input_port(0), true, 1), - LoopPort((*multiply.first)->get_input_port(1), true, 1), - LoopPort((*add.first)->get_input_port(0), true, 1), - LoopPort((*sub.first)->get_input_port(0), true, 1)}, - {LoopPort((*add.first)->get_output_port(0), true, 1), - LoopPort((*sub.first)->get_output_port(0), true, 1)}); + linear_ir->get_loop_manager()->mark_loop(begin, end, 16, vector_size, + std::vector{LoopPort((*multiply.first)->get_input_port(0), true, 0), + LoopPort((*multiply.first)->get_input_port(1), true, 0), + LoopPort((*add.first)->get_input_port(0), true, 0), + LoopPort((*sub.first)->get_input_port(0), true, 0)}, + std::vector{LoopPort((*add.first)->get_output_port(0), true, 0), + LoopPort((*sub.first)->get_output_port(0), true, 0)}); + linear_ir->get_loop_manager()->mark_loop(begin, end, 3, 1, + std::vector{LoopPort((*multiply.first)->get_input_port(0), true, 1), + LoopPort((*multiply.first)->get_input_port(1), true, 1), + LoopPort((*add.first)->get_input_port(0), true, 1), + LoopPort((*sub.first)->get_input_port(0), true, 1)}, + std::vector{LoopPort((*add.first)->get_output_port(0), true, 1), + LoopPort((*sub.first)->get_output_port(0), true, 1)}); linear_ir->set_loop_depth(2); } { @@ -218,21 +218,21 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsOutputLoopUpdateNotNeed auto result1 = linear_ir_ref->push_node(sub.second); auto begin_inner = add.first; auto end_inner = result1.first; - create_and_add_unified_loop_info(linear_ir_ref, begin_inner, end_inner, 16, vector_size, - {LoopPort((*add.first)->get_input_port(0), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0), - LoopPort((*sub.first)->get_input_port(0), true, 0)}, - {LoopPort((*add.first)->get_output_port(0), true, 0), - LoopPort((*sub.first)->get_output_port(0), true, 0)}); + linear_ir_ref->get_loop_manager()->mark_loop(begin_inner, end_inner, 16, vector_size, + std::vector{LoopPort((*add.first)->get_input_port(0), true, 0), + LoopPort((*add.first)->get_input_port(1), true, 0), + LoopPort((*sub.first)->get_input_port(0), true, 0)}, + std::vector{LoopPort((*add.first)->get_output_port(0), true, 0), + LoopPort((*sub.first)->get_output_port(0), true, 0)}); auto begin_outer = multiply.first; auto end_outer = result1.first; - create_and_add_unified_loop_info(linear_ir_ref, begin_outer, end_outer, 3, 1, - {LoopPort((*multiply.first)->get_input_port(0), true, 1), - LoopPort((*multiply.first)->get_input_port(1), true, 1), - LoopPort((*add.first)->get_input_port(0), true, 1), - LoopPort((*sub.first)->get_input_port(0), true, 1)}, - {LoopPort((*add.first)->get_output_port(0), true, 1), - LoopPort((*sub.first)->get_output_port(0), true, 1)}); + linear_ir_ref->get_loop_manager()->mark_loop(begin_outer, end_outer, 3, 1, + std::vector{LoopPort((*multiply.first)->get_input_port(0), true, 1), + LoopPort((*multiply.first)->get_input_port(1), true, 1), + LoopPort((*add.first)->get_input_port(0), true, 1), + LoopPort((*sub.first)->get_input_port(0), true, 1)}, + 
std::vector{LoopPort((*add.first)->get_output_port(0), true, 1), + LoopPort((*sub.first)->get_output_port(0), true, 1)}); } } @@ -263,14 +263,14 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsFromInnermostToLoopOuts auto add = linear_ir->push_node(param_0.second, broadcastmove.second); init_expr_descriptors(*add.first, {subtensor, subtensor, subtensor}, {layout, layout, layout}); auto result = linear_ir->push_node(add.second); - create_and_add_unified_loop_info(linear_ir, broadcastmove.first, result.first, 3, 1, - {LoopPort((*broadcastmove.first)->get_input_port(0), true, 1), - LoopPort((*add.first)->get_input_port(0), true, 1)}, - {LoopPort((*add.first)->get_output_port(0), true, 1)}); - create_and_add_unified_loop_info(linear_ir, broadcastmove.first, result.first, 512, vector_size, - {LoopPort((*broadcastmove.first)->get_input_port(0), true, 0), - LoopPort((*add.first)->get_input_port(0), true, 0)}, - {LoopPort((*add.first)->get_output_port(0), true, 0)}); + linear_ir->get_loop_manager()->mark_loop(broadcastmove.first, result.first, 3, 1, + std::vector{LoopPort((*broadcastmove.first)->get_input_port(0), true, 1), + LoopPort((*add.first)->get_input_port(0), true, 1)}, + std::vector{LoopPort((*add.first)->get_output_port(0), true, 1)}); + linear_ir->get_loop_manager()->mark_loop(broadcastmove.first, result.first, 512, vector_size, + std::vector{LoopPort((*broadcastmove.first)->get_input_port(0), true, 0), + LoopPort((*add.first)->get_input_port(0), true, 0)}, + std::vector{LoopPort((*add.first)->get_output_port(0), true, 0)}); linear_ir->set_loop_depth(2); } { @@ -281,14 +281,14 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsFromInnermostToLoopOuts auto add = linear_ir_ref->push_node(param_0.second, broadcastmove.second); init_expr_descriptors(*add.first, {subtensor, subtensor, subtensor}, {layout, layout, layout}); auto result = linear_ir_ref->push_node(add.second); - create_and_add_unified_loop_info(linear_ir_ref, add.first, result.first, 3, 1, - {LoopPort((*add.first)->get_input_port(0), true, 1), - LoopPort((*add.first)->get_input_port(1), true, 1)}, - {LoopPort((*add.first)->get_output_port(0), true, 1)}); - create_and_add_unified_loop_info(linear_ir_ref, add.first, result.first, 512, vector_size, - {LoopPort((*add.first)->get_input_port(0), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0)}, - {LoopPort((*add.first)->get_output_port(0), true, 0)}); + linear_ir_ref->get_loop_manager()->mark_loop(add.first, result.first, 3, 1, + std::vector{LoopPort((*add.first)->get_input_port(0), true, 1), + LoopPort((*add.first)->get_input_port(1), true, 1)}, + std::vector{LoopPort((*add.first)->get_output_port(0), true, 1)}); + linear_ir_ref->get_loop_manager()->mark_loop(add.first, result.first, 512, vector_size, + std::vector{LoopPort((*add.first)->get_input_port(0), true, 0), + LoopPort((*add.first)->get_input_port(1), true, 0)}, + std::vector{LoopPort((*add.first)->get_output_port(0), true, 0)}); } } @@ -356,31 +356,31 @@ TEST_F(ExtractLoopInvariantsRemoveLoopsTest, ExtractedLoopInvariantsAllExprsInLo init_expr_descriptors(*multiply.first, {subtensor, subtensor, subtensor}, {layout, layout, layout}); auto result = linear_ir->push_node(multiply.second); // 3 inner loop - create_and_add_unified_loop_info(linear_ir, max.first, hmax.first, 1, vector_size, - {LoopPort((*max.first)->get_input_port(0), true, 0), - LoopPort((*max.first)->get_input_port(1), true, 0)}, - {LoopPort((*max.first)->get_output_port(0), true, 0)}); - 
create_and_add_unified_loop_info(linear_ir, sub.first, hsum.first, 1, vector_size, - {LoopPort((*sub.first)->get_input_port(0), true, 0), - LoopPort((*sub.first)->get_input_port(1), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0)}, - {LoopPort((*exp.first)->get_output_port(0), true, 0), - LoopPort((*add.first)->get_output_port(0), true, 0)}); - create_and_add_unified_loop_info(linear_ir, multiply.first, result.first, 1, vector_size, - {LoopPort((*multiply.first)->get_input_port(0), true, 0), - LoopPort((*multiply.first)->get_input_port(1), true, 0)}, - {LoopPort((*multiply.first)->get_output_port(0), true, 0)}); + linear_ir->get_loop_manager()->mark_loop(max.first, hmax.first, 1, vector_size, + std::vector{LoopPort((*max.first)->get_input_port(0), true, 0), + LoopPort((*max.first)->get_input_port(1), true, 0)}, + std::vector{LoopPort((*max.first)->get_output_port(0), true, 0)}); + linear_ir->get_loop_manager()->mark_loop(sub.first, hsum.first, 1, vector_size, + std::vector{LoopPort((*sub.first)->get_input_port(0), true, 0), + LoopPort((*sub.first)->get_input_port(1), true, 0), + LoopPort((*add.first)->get_input_port(1), true, 0)}, + std::vector{LoopPort((*exp.first)->get_output_port(0), true, 0), + LoopPort((*add.first)->get_output_port(0), true, 0)}); + linear_ir->get_loop_manager()->mark_loop(multiply.first, result.first, 1, vector_size, + std::vector{LoopPort((*multiply.first)->get_input_port(0), true, 0), + LoopPort((*multiply.first)->get_input_port(1), true, 0)}, + std::vector{LoopPort((*multiply.first)->get_output_port(0), true, 0)}); // outer loop info const auto loop_begin = std::make_shared(); auto loop_begin_expr = linear_ir->insert_node(loop_begin, std::vector{}, {}, false, max.first); const auto loop_end = std::make_shared(); std::vector loop_end_inputs{(*loop_begin_expr)->get_output_port_connector(0)}; auto loop_end_expr = linear_ir->insert_node(loop_end, loop_end_inputs, {}, false, result.first); - create_and_add_unified_loop_info(linear_ir, loop_begin_expr, result.first, 10, 1, - {LoopPort((*max.first)->get_input_port(0), true, 1), - LoopPort((*max.first)->get_input_port(1), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0)}, - {LoopPort((*multiply.first)->get_output_port(0), true, 1)}); + linear_ir->get_loop_manager()->mark_loop(loop_begin_expr, result.first, 10, 1, + std::vector{LoopPort((*max.first)->get_input_port(0), true, 1), + LoopPort((*max.first)->get_input_port(1), true, 0), + LoopPort((*add.first)->get_input_port(1), true, 0)}, + std::vector{LoopPort((*multiply.first)->get_output_port(0), true, 1)}); loop_end->set_id((*loop_end_expr)->get_loop_ids().back()); linear_ir->set_loop_depth(2); } @@ -409,11 +409,11 @@ TEST_F(ExtractLoopInvariantsRemoveLoopsTest, ExtractedLoopInvariantsAllExprsInLo const auto loop_end = std::make_shared(); std::vector loop_end_inputs{(*loop_begin_expr)->get_output_port_connector(0)}; auto loop_end_expr = linear_ir_ref->insert_node(loop_end, loop_end_inputs, {}, false, result.first); - create_and_add_unified_loop_info(linear_ir_ref, loop_begin_expr, result.first, 10, 1, - {LoopPort((*max.first)->get_input_port(0), true, 1), - LoopPort((*max.first)->get_input_port(1), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0)}, - {LoopPort((*multiply.first)->get_output_port(0), true, 1)}); + linear_ir_ref->get_loop_manager()->mark_loop(loop_begin_expr, result.first, 10, 1, + std::vector{LoopPort((*max.first)->get_input_port(0), true, 1), + LoopPort((*max.first)->get_input_port(1), true, 0), + 
LoopPort((*add.first)->get_input_port(1), true, 0)}, + std::vector{LoopPort((*multiply.first)->get_output_port(0), true, 1)}); loop_end->set_id((*loop_end_expr)->get_loop_ids().back()); } } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index b7b1ef9461ed8a..b8b6f590bf41bb 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -51,7 +51,7 @@ void CPURuntimeConfigurator::initialization(const std::shared_ptrget_loop_info(loop_ids.front()); const auto& block_size_m = expanded_loop_info->get_work_amount(); - const auto& in_desc = brgemm_expr->get_input_port_descriptor(0); - const auto& out_desc = brgemm_expr->get_output_port_descriptor(0); - - auto in_subtensor = in_desc->get_subtensor(); - auto out_subtensor = out_desc->get_subtensor(); - *++in_subtensor.rbegin() = block_size_m; - *++out_subtensor.rbegin() = block_size_m; - in_desc->set_subtensor(in_subtensor); - out_desc->set_subtensor(out_subtensor); + brgemm_expr->get_input_port_descriptor(0)->set_subtensor_value(1, block_size_m); + brgemm_expr->get_output_port_descriptor(0)->set_subtensor_value(1, block_size_m); } } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index b5ea975bb0c3c0..39ab1977f878d1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -42,16 +42,18 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { void init_tensor_rank(const std::shared_ptr& linear_ir) const override; /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig - * @param linear_ir LinearIR + * @param loop_manager Loop Manager */ void update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const; /** * @brief Update subtensors of Brgemms + * @param loop_manager Loop Manager */ void update_brgemms(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const; const size_t rank6D = 6; - std::vector m_dynamic_brgemms = {}; + // Brgemm expressions with subtensors with dynamic values + std::unordered_set m_dynamic_brgemms = {}; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index 6b74b2e8eff1ba..d31f205a932116 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -57,6 +57,15 @@ LinearIR::constExprIt BrgemmBlocking::get_loop_begin_pos(LinearIR& linear_ir, co return loop_begin_it; } +snippets::lowered::SpecificIterationHandlers BrgemmBlocking::get_default_blocking_loop_handlers(size_t work_amount, size_t block_size) { + SpecificIterationHandlers handlers; + const auto tail_size = snippets::utils::is_dynamic_value(work_amount) ? 
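// Editor's note (condensed sketch of the CPURuntimeConfigurator::update_brgemms
// hunk above; names follow the diff, the loop body is abbreviated): for every
// Brgemm collected in m_dynamic_brgemms, the actual M block is read from the
// innermost ExpandedLoopInfo and written back into the subtensors at runtime:
//
//   for (const auto& brgemm_expr : m_dynamic_brgemms) {
//       const auto& loop_ids = brgemm_expr->get_loop_ids();
//       const auto m = loop_manager->get_loop_info(loop_ids.front())->get_work_amount();
//       brgemm_expr->get_input_port_descriptor(0)->set_subtensor_value(1, m);   // M is dim 1 from the end
//       brgemm_expr->get_output_port_descriptor(0)->set_subtensor_value(1, m);
//   }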
snippets::utils::get_dynamic_value() : work_amount % block_size; + if (tail_size != 0) + handlers.register_pass(tail_size); + handlers.register_pass(); + return handlers; +} + bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmBlocking") const auto& loop_manager = linear_ir.get_loop_manager(); @@ -111,22 +120,18 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea // If block_size is dynamic, it means that Brgemm will process full tensor: // subtensor[i] = FULL_DIM as by default if (!snippets::utils::is_dynamic_value(block_size_m)) { - *++in_0_subtensor.rbegin() = block_size_m; - *++out_subtensor.rbegin() = block_size_m; + brgemm_expr->get_input_port_descriptor(0)->set_subtensor_value(1, block_size_m); + brgemm_expr->get_output_port_descriptor(0)->set_subtensor_value(1, block_size_m); } if (!snippets::utils::is_dynamic_value(block_size_n)) { - *in_1_subtensor.rbegin() = block_size_n; - *out_subtensor.rbegin() = block_size_n; + brgemm_expr->get_input_port_descriptor(1)->set_subtensor_value(0, block_size_n); + brgemm_expr->get_output_port_descriptor(0)->set_subtensor_value(0, block_size_n); } if (!snippets::utils::is_dynamic_value(block_size_k)) { - *in_0_subtensor.rbegin() = block_size_k; - *++in_1_subtensor.rbegin() = block_size_k; + brgemm_expr->get_input_port_descriptor(0)->set_subtensor_value(0, block_size_k); + brgemm_expr->get_input_port_descriptor(1)->set_subtensor_value(1, block_size_k); } - brgemm_expr->get_input_port_descriptor(0)->set_subtensor(in_0_subtensor); - brgemm_expr->get_input_port_descriptor(1)->set_subtensor(in_1_subtensor); - brgemm_expr->get_output_port_descriptor(0)->set_subtensor(out_subtensor); - const bool need_brgemm_copy_b = brgemm_cpu && with_repacking(brgemm_cpu->get_type()); ov::snippets::lowered::ExpressionPtr copy_b_expr = nullptr; if (need_brgemm_copy_b) { @@ -151,15 +156,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea } } - auto get_default_handlers = [](size_t work_amount, size_t block_size) { - SpecificIterationHandlers handlers; - const auto tail_size = snippets::utils::is_dynamic_value(work_amount) ? 
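// Editor's note (worked example for the helper above, assuming a static shape):
// for work_amount = 37 and block_size = 16 the tail handler is registered for
// the remaining 37 % 16 = 5 rows; for a dynamic work_amount the tail size is
// kept as the dynamic sentinel and resolved at runtime. The blocking loops
// below then attach the handlers like this:
//
//   const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits, false);
//   loop_manager->get_loop_info(id)->set_handlers(get_default_blocking_loop_handlers(m, block_size_m));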
snippets::utils::get_dynamic_value() : work_amount % block_size; - if (tail_size != 0) - handlers.register_pass(tail_size); - handlers.register_pass(true); - return handlers; - }; - auto mark_m_blocking = [&](bool include_repacking) { const auto loop_begin_it = get_loop_begin_pos(linear_ir, expr_it, include_repacking); const auto loop_end_it = std::next(expr_it); @@ -174,7 +170,7 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits, false); - loop_manager->get_loop_info(id)->set_handlers(get_default_handlers(m, block_size_m)); + loop_manager->get_loop_info(id)->set_handlers(get_default_blocking_loop_handlers(m, block_size_m)); }; auto mark_n_blocking = [&]() { @@ -187,7 +183,7 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits, false); - loop_manager->get_loop_info(id)->set_handlers(get_default_handlers(n, block_size_n)); + loop_manager->get_loop_info(id)->set_handlers(get_default_blocking_loop_handlers(n, block_size_n)); }; auto mark_k_blocking = [&]() { @@ -199,7 +195,7 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea LoopPort(need_brgemm_copy_b ? copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1), true, 1)}; const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), false)}; - auto handlers = get_default_handlers(k, block_size_k); + auto handlers = get_default_blocking_loop_handlers(k, block_size_k); handlers.register_pass(0.f); const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits, false); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp index cdc2d05cffd1e5..4d29267f034fc9 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp @@ -5,6 +5,7 @@ #pragma once #include "snippets/lowered/pass/pass.hpp" +#include "snippets/lowered/specific_loop_iter_handlers.hpp" namespace ov { namespace intel_cpu { @@ -24,6 +25,8 @@ class BrgemmBlocking : public snippets::lowered::pass::RangedPass { snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override; + static snippets::lowered::SpecificIterationHandlers get_default_blocking_loop_handlers(size_t work_amount, size_t block_size); + private: static snippets::lowered::LinearIR::constExprIt move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp index 97c8c2a5299f6b..6acf68fe245467 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp @@ -36,24 +36,16 @@ std::shared_ptr SetBrgemmBeta::merge(const st return merged_pass; } 
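// Editor's note on the simplified SetEvaluanceOnce::merge below (sketch;
// `PassBase` stands in for the exact smart-pointer element type, which is
// abbreviated in this diff): once the pass carries no state, two instances are
// always mergeable, so the whole body collapses to one conditional:
//
//   std::shared_ptr<PassBase> SetEvaluanceOnce::merge(const std::shared_ptr<PassBase>& other) {
//       // mergeable with an empty slot or with another instance of the same pass
//       return !other || ov::is_type<SetEvaluanceOnce>(other) ? std::make_shared<SetEvaluanceOnce>()
//                                                             : nullptr;
//   }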
-SetEvaluanceOnce::SetEvaluanceOnce(bool evaluation) : snippets::lowered::pass::RangedPass(), m_evaluation(evaluation) {} - bool SetEvaluanceOnce::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { const auto& loop_end = ov::as_type_ptr(end->get()->get_node()); OPENVINO_ASSERT(loop_end, "SetEvaluanceOnce expected LoopEnd node in iterator `end`."); const auto& loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); - loop_info->set_evaluate_once(m_evaluation); + loop_info->set_evaluate_once(true); return true; } std::shared_ptr SetEvaluanceOnce::merge(const std::shared_ptr& other) { - const auto merged_pass = std::make_shared(m_evaluation); - if (other == nullptr) - return merged_pass; - const auto casted_pass = ov::as_type_ptr(other); - if (!casted_pass || m_evaluation != casted_pass->m_evaluation) - return nullptr; - return merged_pass; + return !other || ov::is_type(other) ? std::make_shared() : nullptr; } } // namespace pass diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp index 7616954bc2cca5..2a9492713e0c46 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp @@ -30,22 +30,18 @@ class SetBrgemmBeta : public snippets::lowered::pass::RangedPass { /** * @interface SetEvaluanceOnce - * @brief The pass sets `evaluate once` only for the ExpandedLoopInfo that is mapped to the LoopEnd in the passed iterator `end`. + * @brief The pass sets `evaluate once = true` only for the ExpandedLoopInfo that is mapped to the LoopEnd in the passed iterator `end`.
* The pointer arithmetic should be updated in the separate optimization `OptimizeLoopSingleEvaluation` - * @param m_evaluation - value which must be set * @ingroup snippets */ class SetEvaluanceOnce : public snippets::lowered::pass::RangedPass { public: - SetEvaluanceOnce(bool evaluation); + SetEvaluanceOnce() = default; OPENVINO_RTTI("SetEvaluanceOnce", "RangedPass") bool run(snippets::lowered::LinearIR& linear_ir, snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override; std::shared_ptr merge(const std::shared_ptr& other) override; - -private: - bool m_evaluation = false; }; } // namespace pass } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp index 8e1959224ef6a1..1089bdc3faffaa 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -96,7 +96,7 @@ std::vector> input_shapes_dynamic{ }, // Only N dimension is dynamic { - STATIC_SHAPE(2, 2, 65, 550), + {PartialShape{}, {{2, 2, 65, 550}}}, {PartialShape{2, 2, 550, -1}, {{2, 2, 550, 70}, {2, 2, 550, 12}, {2, 2, 550, 70}, {2, 2, 550, 12}, {2, 2, 550, 10}}} }, diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp index 2e6cd996cb9135..df7790692c1dd8 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp @@ -7,8 +7,8 @@ #include "lir_test_utils.hpp" #include "openvino/opsets/opset10.hpp" #include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_info.hpp" #include "snippets/lowered/pass/propagate_subtensors.hpp" -#include "snippets/lowered/pass/serialize_control_flow.hpp" #include "snippets/snippets_isa.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -24,14 +24,6 @@ using namespace ov::snippets; using BRGEMM_TYPE = intel_cpu::brgemm_utils::BRGEMM_TYPE; namespace { -SpecificIterationHandlers get_default_handlers(size_t work_amount, size_t block_size) { - SpecificIterationHandlers handlers; - const auto tail_size = snippets::utils::is_dynamic_value(work_amount) ? 
snippets::utils::get_dynamic_value() : work_amount % block_size; - if (tail_size != 0) - handlers.register_pass(tail_size); - handlers.register_pass(true); - return handlers; -} void create_brgemm_loop_infos(const LinearIRPtr& linear_ir, const ExpressionPtr& brgemm_expr, @@ -42,29 +34,30 @@ void create_brgemm_loop_infos(const LinearIRPtr& linear_ir, const bool n_block = k != 0 && k_blk != 0; const bool m_block = m != 0 && m_blk != 0; if (k_block) { - const size_t loop_id = create_and_add_unified_loop_info(linear_ir, k, k_blk, - {LoopPort(brgemm_expr->get_input_port(0)), - LoopPort(brgemm_expr->get_input_port(1), true, 1)}, - {LoopPort(brgemm_expr->get_output_port(0), false)}, false); - const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); - loop_info->set_handlers(get_default_handlers(k, k_block)); + const auto loop_info = + std::make_shared(k, k_blk, + std::vector{LoopPort(brgemm_expr->get_input_port(0)), + LoopPort(brgemm_expr->get_input_port(1), true, 1)}, + std::vector{LoopPort(brgemm_expr->get_output_port(0), false)}, + ov::intel_cpu::pass::BrgemmBlocking::get_default_blocking_loop_handlers(k, k_block)); loop_info->register_pass_to_handler(0.f); + linear_ir->get_loop_manager()->add_loop_info(loop_info); } if (n_block) { - const size_t loop_id = create_and_add_unified_loop_info(linear_ir, n, n_blk, - {LoopPort(brgemm_expr->get_input_port(0), false), - LoopPort(brgemm_expr->get_input_port(1))}, - {LoopPort(brgemm_expr->get_output_port(0))}, false); - const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); - loop_info->set_handlers(get_default_handlers(n, n_block)); + linear_ir->get_loop_manager()->add_loop_info( + std::make_shared(n, n_blk, + std::vector{LoopPort(brgemm_expr->get_input_port(0), false), + LoopPort(brgemm_expr->get_input_port(1))}, + std::vector{LoopPort(brgemm_expr->get_output_port(0))}, + ov::intel_cpu::pass::BrgemmBlocking::get_default_blocking_loop_handlers(n, n_block))); } if (m_block) { - const size_t loop_id = create_and_add_unified_loop_info(linear_ir, m, m_blk, - {LoopPort(brgemm_expr->get_input_port(0), true, 1), - LoopPort(brgemm_expr->get_input_port(1), false, 1)}, - {LoopPort(brgemm_expr->get_output_port(0), true, 1)}, false); - const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); - loop_info->set_handlers(get_default_handlers(m, m_block)); + linear_ir->get_loop_manager()->add_loop_info( + std::make_shared(m, m_blk, + std::vector{LoopPort(brgemm_expr->get_input_port(0), true, 1), + LoopPort(brgemm_expr->get_input_port(1), false, 1)}, + std::vector{LoopPort(brgemm_expr->get_output_port(0), true, 1)}, + ov::intel_cpu::pass::BrgemmBlocking::get_default_blocking_loop_handlers(m, m_block))); } } @@ -78,30 +71,31 @@ void create_brgemm_with_copy_b_loop_infos(const LinearIRPtr& linear_ir, const bool n_block = k != 0 && k_blk != 0; const bool m_block = m != 0 && m_blk != 0; if (k_block) { - const size_t loop_id = create_and_add_unified_loop_info(linear_ir, k, k_blk, - {LoopPort(brgemm_expr->get_input_port(0)), - LoopPort(copy_b_expr->get_input_port(0), true, 1)}, - {LoopPort(brgemm_expr->get_output_port(0), false)}); - const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); - loop_info->set_handlers(get_default_handlers(k, k_block)); + const auto loop_info = + std::make_shared(k, k_blk, + std::vector{LoopPort(brgemm_expr->get_input_port(0)), + LoopPort(copy_b_expr->get_input_port(0), true, 1)}, + std::vector{LoopPort(brgemm_expr->get_output_port(0), false)}, + 
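// Editor's note (condensed sketch of the new test pattern above; the concrete
// loop-info type behind make_shared and the template arguments of
// register_pass_to_handler are assumptions, since template arguments are
// abbreviated in this diff): loop infos are now constructed directly with
// their handlers and registered in one step:
//
//   const auto loop_info = std::make_shared<UnifiedLoopInfo>(
//       k, k_blk,
//       std::vector<LoopPort>{LoopPort(brgemm_expr->get_input_port(0)),
//                             LoopPort(brgemm_expr->get_input_port(1), true, 1)},
//       std::vector<LoopPort>{LoopPort(brgemm_expr->get_output_port(0), false)},
//       BrgemmBlocking::get_default_blocking_loop_handlers(k, k_blk));
//   loop_info->register_pass_to_handler<SetBrgemmBeta>(0.f);  // beta = 0, as in the diff
//   linear_ir->get_loop_manager()->add_loop_info(loop_info);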
ov::intel_cpu::pass::BrgemmBlocking::get_default_blocking_loop_handlers(k, k_block)); loop_info->register_pass_to_handler(0.f); + linear_ir->get_loop_manager()->add_loop_info(loop_info); } if (n_block) { - const size_t loop_id = create_and_add_unified_loop_info(linear_ir, n, n_blk, - {LoopPort(brgemm_expr->get_input_port(0), false), - LoopPort(copy_b_expr->get_input_port(0))}, - {LoopPort(brgemm_expr->get_output_port(0))}); - const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); - loop_info->set_handlers(get_default_handlers(n, n_block)); + linear_ir->get_loop_manager()->add_loop_info( + std::make_shared(n, n_blk, + std::vector{LoopPort(brgemm_expr->get_input_port(0), false), + LoopPort(copy_b_expr->get_input_port(0))}, + std::vector{LoopPort(brgemm_expr->get_output_port(0))}, + ov::intel_cpu::pass::BrgemmBlocking::get_default_blocking_loop_handlers(n, n_block))); } if (m_block) { const auto& second_input_port = k_block || n_block ? copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1); - const size_t loop_id = create_and_add_unified_loop_info(linear_ir, m, m_blk, - {LoopPort(brgemm_expr->get_input_port(0), true, 1), - LoopPort(second_input_port, false, 1)}, - {LoopPort(brgemm_expr->get_output_port(0), true, 1)}); - const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_id); - loop_info->set_handlers(get_default_handlers(m, m_block)); + linear_ir->get_loop_manager()->add_loop_info( + std::make_shared(m, m_blk, + std::vector{LoopPort(brgemm_expr->get_input_port(0), true, 1), + LoopPort(second_input_port, false, 1)}, + std::vector{LoopPort(brgemm_expr->get_output_port(0), true, 1)}, + ov::intel_cpu::pass::BrgemmBlocking::get_default_blocking_loop_handlers(m, m_block))); } } } // namespace From 520680ba7614db9d7e1d827c2b5c4d36087ed73a Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 24 Jul 2024 09:05:07 +0400 Subject: [PATCH 3/6] [Snippets] Applied Ivan comments --- .../include/snippets/lowered/loop_info.hpp | 5 ++-- .../snippets/lowered/port_descriptor.hpp | 21 ++----------- .../snippets/include/snippets/utils/utils.hpp | 30 ++++++++++++------- src/common/snippets/src/lowered/loop_info.cpp | 16 +++++----- .../snippets/src/lowered/loop_manager.cpp | 11 ++++--- .../pass/insert_specific_iterations.cpp | 3 +- .../pass/optimize_loop_single_evaluation.cpp | 8 ++--- .../src/lowered/pass/propagate_subtensors.cpp | 2 +- .../snippets/src/lowered/port_descriptor.cpp | 25 +++++++++++++--- src/common/snippets/src/op/reduce.cpp | 3 +- .../snippets/src/op/serialization_node.cpp | 4 +-- .../snippets/src/pass/matmul_to_brgemm.cpp | 6 +--- .../src/pass/softmax_decomposition.cpp | 2 +- .../snippets/src/runtime_configurator.cpp | 10 ++----- .../snippets/tests/src/lir_test_utils.cpp | 4 +-- .../src/lowered/pass/buffer_allocation.cpp | 3 +- .../snippets/cpu_runtime_configurator.cpp | 4 +-- .../snippets/x64/kernel_executors/brgemm.cpp | 6 ++-- .../src/emitters/tpp/x64/jit_tpp_emitter.cpp | 2 +- .../x64/pass/brgemm_to_brgemm_cpu.cpp | 2 +- .../x64/pass/lowered/brgemm_blocking.cpp | 14 ++++----- .../x64/pass/lowered/cpu_iter_handlers.cpp | 8 ++--- .../x64/pass/lowered/cpu_iter_handlers.hpp | 8 ++--- .../tpp/x64/pass/eltwise_to_eltwise_tpp.cpp | 9 +++--- .../x64/pass/lowered/set_tpp_leading_dim.cpp | 2 +- .../tpp/x64/pass/scalar_to_scalar_tpp.cpp | 4 +-- .../x64/lowered/brgemm_blocking.cpp | 1 - .../x64/lowered/buffer_allocation.cpp | 8 ++--- 28 files changed, 107 insertions(+), 114 deletions(-) diff --git 
a/src/common/snippets/include/snippets/lowered/loop_info.hpp b/src/common/snippets/include/snippets/lowered/loop_info.hpp index a38218626dcad1..6be47f49d17ae1 100644 --- a/src/common/snippets/include/snippets/lowered/loop_info.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_info.hpp @@ -430,7 +430,8 @@ class ExpandedLoopInfo : public LoopInfo { ExpandedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, std::vector ptr_increments, std::vector final_offsets, std::vector data_sizes, - SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const = false); + SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const = false, + bool evaluate_once = false); /** * @brief Clone LoopInfo with new expressions * @param expr_map map of new and old expressions @@ -529,7 +530,7 @@ class ExpandedLoopInfo : public LoopInfo { const SpecificLoopIterType m_type = {}; std::shared_ptr m_unified_loop_info = {}; - bool m_evaluance_once = false; + bool m_evaluate_once = false; }; using ExpandedLoopInfoPtr = std::shared_ptr; diff --git a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp index 926136abf0e8f8..2d5c72c06ef983 100644 --- a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp +++ b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp @@ -20,12 +20,6 @@ using PortDescriptorPtr = std::shared_ptr; class PortDescriptor { friend class LinearIRBuilder; public: - // The structure with service values for scheduling parameters - struct ServiceDimensions { - // The value for the subtensor that means that scheduling should be by full dimension - static size_t FULL_DIM; - }; - explicit PortDescriptor(const ov::Input& node, VectorDims subtensor_shape = {}, std::vector layout = {}); @@ -55,7 +49,7 @@ class PortDescriptor { void set_reg_idx(size_t idx) { m_reg.idx = idx; } // Indexing starts from the end (rbegin() + idx) - void set_subtensor_value(size_t idx, VectorDims::value_type value); + void set_subtensor_dim(size_t idx, VectorDims::value_type value); std::string serialize() const; bool empty() const { return m_layout.empty() && m_subtensor_shape.empty();} @@ -90,6 +84,8 @@ class PortDescriptorUtils { public: static void set_port_descriptor_ptr(const ov::Input& n, const PortDescriptorPtr& desc); static void set_port_descriptor_ptr(const ov::Output& n, const PortDescriptorPtr& desc); + static void set_port_descriptor(const ov::Input& n, std::vector subtensor, std::vector layout = {}); + static void set_port_descriptor(const ov::Output& n, std::vector subtensor, std::vector layout = {}); static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input& in); static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input& out); @@ -119,17 +115,6 @@ class PortDescriptorVectorAttribute : public ov::RuntimeAttribute { std::vector outputs{}; }; -template -void set_port_desc(const T& port, std::vector subtensor) { - const auto& shape = port.get_shape(); - for (size_t i = 1; i <= std::min(subtensor.size(), shape.size()); i++) { - auto& dim = subtensor[subtensor.size() - i]; - if (dim != PortDescriptor::ServiceDimensions::FULL_DIM) - dim = std::min(dim, shape[shape.size() - i]); - } - PortDescriptorUtils::set_port_descriptor_ptr(port, std::make_shared(shape, subtensor)); -} - } // namespace lowered } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/utils/utils.hpp 
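Editor's note (illustrative): `set_subtensor_dim`, the rename of `set_subtensor_value` above, indexes the subtensor from the end, writing to `*(m_subtensor_shape.rbegin() + idx)` per the inline comment. For a Brgemm subtensor ordered {M, N} this gives:

    desc->set_subtensor_dim(0, block_size_n);  // last dimension -> N
    desc->set_subtensor_dim(1, block_size_m);  // second from the end -> M
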
b/src/common/snippets/include/snippets/utils/utils.hpp index 33eebcffedf68b..869956b5274c60 100644 --- a/src/common/snippets/include/snippets/utils/utils.hpp +++ b/src/common/snippets/include/snippets/utils/utils.hpp @@ -21,6 +21,26 @@ namespace ov { namespace snippets { namespace utils { +/* --- Special values --- */ +template::value || std::is_same::value), bool>::type> +constexpr inline T get_dynamic_value() { + return std::numeric_limits::max(); +} +template::value || std::is_same::value), bool>::type> +constexpr inline bool is_dynamic_value(T value) { + return value == get_dynamic_value(); +} + +// This value means full dimension +// For example, for the subtensor it means that scheduling should be by full dimension +constexpr inline size_t get_full_dim_value() { + return get_dynamic_value() - 1; +} +constexpr inline bool is_full_dim_value(size_t value) { + return value == get_full_dim_value(); +} +/* ---------------------- */ + // Get non-scalar Constant count that will be created after FakeQuantize decomposition. // This count is needed to know exact count of non-scalar Constants during tokenization. auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t; @@ -59,16 +79,6 @@ inline T div_up(const T a, const U b) { return static_cast((a + b - 1) / b); } -template::value || std::is_same::value), bool>::type> -constexpr inline T get_dynamic_value() { - return std::numeric_limits::max(); -} - -template::value || std::is_same::value), bool>::type> -constexpr inline bool is_dynamic_value(T value) { - return value == get_dynamic_value(); -} - inline bool is_dynamic_vdims(const VectorDims& shape) { return std::any_of(shape.cbegin(), shape.cend(), [](size_t v){ return is_dynamic_value(v); }); } diff --git a/src/common/snippets/src/lowered/loop_info.cpp b/src/common/snippets/src/lowered/loop_info.cpp index 7e32a49307ec4b..d99788fad12946 100644 --- a/src/common/snippets/src/lowered/loop_info.cpp +++ b/src/common/snippets/src/lowered/loop_info.cpp @@ -373,10 +373,10 @@ void UnifiedLoopInfo::add_loop_ports(const std::vector& ports) { ExpandedLoopInfo::ExpandedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, std::vector ptr_increments, std::vector final_offsets, std::vector data_sizes, - SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const) + SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const, bool evaluate_once) : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)), - m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)) { + m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)), m_evaluate_once(evaluate_once) { validate(); } @@ -391,11 +391,9 @@ std::shared_ptr ExpandedLoopInfo::clone_with_new_expr(const Expression const auto& new_input_ports = clone_loop_ports(expr_map, m_input_ports); const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); - const auto cloned = std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, - m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, - m_unified_loop_info, m_is_work_amount_const); - cloned->m_evaluance_once = m_evaluance_once; - return cloned; + return std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, + m_ptr_increments, m_finalization_offsets, 
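// Editor's note (compile-time sanity check, not part of the patch): the two
// sentinels introduced in utils.hpp above occupy the top two values of the
// size_t range, so they cannot collide with each other or with a real dimension:
//
//   static_assert(ov::snippets::utils::get_dynamic_value<size_t>() == std::numeric_limits<size_t>::max(), "dynamic sentinel");
//   static_assert(ov::snippets::utils::get_full_dim_value() == std::numeric_limits<size_t>::max() - 1, "full-dim sentinel");
//   static_assert(!ov::snippets::utils::is_full_dim_value(ov::snippets::utils::get_dynamic_value<size_t>()), "sentinels are distinct");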
m_data_sizes, m_type, + m_unified_loop_info, m_is_work_amount_const, m_evaluate_once); } bool ExpandedLoopInfo::is_dynamic() const { @@ -439,11 +437,11 @@ const std::vector& ExpandedLoopInfo::get_data_sizes() const { } bool ExpandedLoopInfo::is_evaluate_once() const { - return m_evaluance_once; + return m_evaluate_once; } void ExpandedLoopInfo::set_evaluate_once(bool value) { - m_evaluance_once = value; + m_evaluate_once = value; } void ExpandedLoopInfo::update_ptr_increments(const std::vector& new_values) { diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index 3e07ec850927ab..09f8ccb94b9660 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -160,7 +160,6 @@ void LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos, void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t loop_depth, size_t vector_size) { - const auto FULL_DIM = PortDescriptor::ServiceDimensions::FULL_DIM; std::vector loop_input_ports, loop_output_ports; LoopManager::get_io_loop_ports(loop_begin_pos, loop_end_pos, loop_input_ports, loop_output_ports); @@ -178,8 +177,8 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, "Failed to broadcast work amount in marking loop"); }; - auto is_outside_loop = [&FULL_DIM](const std::vector& subtensor) { - return std::all_of(subtensor.begin(), subtensor.end(), [&FULL_DIM](size_t lhs) { return lhs == FULL_DIM; }); + auto is_outside_loop = [](const std::vector& subtensor) { + return std::all_of(subtensor.begin(), subtensor.end(), utils::is_full_dim_value); }; std::vector loop_subtensor; @@ -192,7 +191,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, subtensor[subtensor.size() - 1] = vector_size; } - const size_t resizing_value = is_outside_loop(subtensor) ? FULL_DIM : 1; + const size_t resizing_value = is_outside_loop(subtensor) ? 
utils::get_full_dim_value() : 1; while (subtensor.size() < loop_depth) subtensor.insert(subtensor.begin(), resizing_value); if (loop_subtensor.empty()) @@ -202,7 +201,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, "Incorrect scheduling parameters for loop"); for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) { - if (*(subtensor.rbegin() + dim_idx) != FULL_DIM) { + if (!utils::is_full_dim_value(*(subtensor.rbegin() + dim_idx))) { broadcast(loop_tensor, shape, dim_idx); } } @@ -211,7 +210,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) { OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup"); const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx); - if (subtensor_value == FULL_DIM) { + if (utils::is_full_dim_value(subtensor_value)) { continue; } diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp index 2ef872ba4ad262..dcff90015d28f2 100644 --- a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -167,6 +167,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp if (is_decomposed_loop_needed(unified_loop_info, iter_type, remaining_work_amount)) { const auto work_amount = get_decomposed_loop_work_amount(unified_loop_info, iter_type, remaining_work_amount); const auto increment = get_decomposed_loop_increment(unified_loop_info, iter_type, remaining_work_amount); + const auto evaluate_once = !utils::is_dynamic_value(work_amount) && work_amount == increment; // Update remaining Loop work amount // Note: if work_amount is unknown and increment = 1, it means that a loop will iterate by whole work_amount if (!is_wa_dynamic || increment == 1) { @@ -199,7 +200,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp const auto decomposed_loop_info = std::make_shared(work_amount, increment, decomposed_loop_entry_ports, decomposed_loop_exit_ports, decomposed_ptr_increments, decomposed_finalization_offsets, - decomposed_data_sizes, iter_type, unified_loop_info); + decomposed_data_sizes, iter_type, unified_loop_info, false, evaluate_once); init_decomposed_loop(linear_ir, decomposed_loop_begin_it, decomposed_loop_end_it, decomposed_loop_info, loop_id, decomposed_loop_end); decomposed = true; diff --git a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp index b9d795393c00aa..c6255d90106e77 100644 --- a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp +++ b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp @@ -24,13 +24,10 @@ bool OptimizeLoopSingleEvaluation::run(lowered::LinearIR& linear_ir, lowered::Li const auto& expr = *expr_it; if (auto loop_end = ov::as_type_ptr(expr->get_node())) { const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); - const auto work_amount = loop_end->get_work_amount(); - const auto increment = loop_end->get_increment(); - - if ((!utils::is_dynamic_value(work_amount) && work_amount == increment) || (loop_info->is_evaluate_once())) { + if (loop_info->is_evaluate_once()) { auto new_finalization_offsets = loop_end->get_finalization_offsets(); const auto& ptr_increments = loop_end->get_ptr_increments(); - const auto 
work_amount_incr = static_cast(increment); + const auto work_amount_incr = static_cast(loop_end->get_increment()); for (size_t i = 0; i < new_finalization_offsets.size(); i++) { const auto ptr_shift = utils::dynamic_safe_mul(ptr_increments[i], work_amount_incr); new_finalization_offsets[i] = utils::dynamic_safe_add(new_finalization_offsets[i], ptr_shift); @@ -42,7 +39,6 @@ bool OptimizeLoopSingleEvaluation::run(lowered::LinearIR& linear_ir, lowered::Li // Update the corresponding ExpandedLoopInfo loop_info->update_ptr_increments(loop_end->get_ptr_increments()); loop_info->update_finalization_offsets(loop_end->get_finalization_offsets()); - loop_info->set_evaluate_once(true); is_modified = true; } diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp index 25635976350327..e6753198053393 100644 --- a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -62,7 +62,7 @@ void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, const auto& desc = port.expr_port->get_descriptor_ptr(); auto subtensor = desc->get_subtensor(); if (port.dim_idx < desc->get_subtensor().size()) { - desc->set_subtensor_value(port.dim_idx, new_dim_value); + desc->set_subtensor_dim(port.dim_idx, new_dim_value); } const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); diff --git a/src/common/snippets/src/lowered/port_descriptor.cpp b/src/common/snippets/src/lowered/port_descriptor.cpp index 6f2776e4fd8fc2..7ea861dc190c26 100644 --- a/src/common/snippets/src/lowered/port_descriptor.cpp +++ b/src/common/snippets/src/lowered/port_descriptor.cpp @@ -9,9 +9,6 @@ namespace ov { namespace snippets { namespace lowered { -// SIZE_MAX - is dynamic value -size_t PortDescriptor::ServiceDimensions::FULL_DIM = SIZE_MAX - 1; - PortDescriptor::PortDescriptor(const ov::Input& in, VectorDims subtensor_shape, std::vector layout) : PortDescriptor(ov::Input(in.get_node(), in.get_index()), std::move(subtensor_shape), std::move(layout)) {} @@ -54,7 +51,7 @@ void PortDescriptor::set_shape(const VectorDims& tensor) { *m_tensor_shape = tensor; } -void PortDescriptor::set_subtensor_value(size_t idx, VectorDims::value_type value) { +void PortDescriptor::set_subtensor_dim(size_t idx, VectorDims::value_type value) { OPENVINO_ASSERT(idx < m_subtensor_shape.size(), "Failed to set subtensor value: idx should be less than size"); *(m_subtensor_shape.rbegin() + idx) = value; } @@ -136,6 +133,26 @@ void PortDescriptorUtils::set_port_descriptor_ptr(const ov::Output& ou } } +namespace { +template +void set_port_descriptor(const T& port, std::vector subtensor, std::vector layout) { + const auto& shape = port.get_shape(); + for (size_t i = 1; i <= std::min(subtensor.size(), shape.size()); i++) { + auto& dim = subtensor[subtensor.size() - i]; + if (!utils::is_full_dim_value(dim)) + dim = std::min(dim, shape[shape.size() - i]); + } + PortDescriptorUtils::set_port_descriptor_ptr(port, std::make_shared(shape, subtensor, layout)); +} +} // namespace + +void PortDescriptorUtils::set_port_descriptor(const ov::Input& in, std::vector subtensor, std::vector layout) { + set_port_descriptor(in, subtensor, layout); +} +void PortDescriptorUtils::set_port_descriptor(const ov::Output& in, std::vector subtensor, std::vector layout) { + set_port_descriptor(in, subtensor, layout); +} + PortDescriptorPtr 
PortDescriptorUtils::get_port_descriptor_ptr(const ov::Input& in) { return get_port_descriptor_ptr(ov::Input(in.get_node(), in.get_index())); } diff --git a/src/common/snippets/src/op/reduce.cpp b/src/common/snippets/src/op/reduce.cpp index 5717bfe1255300..dd93f082c1356c 100644 --- a/src/common/snippets/src/op/reduce.cpp +++ b/src/common/snippets/src/op/reduce.cpp @@ -5,6 +5,7 @@ #include "snippets/op/reduce.hpp" #include "snippets/itt.hpp" +#include "snippets/utils.hpp" #include "snippets/lowered/port_descriptor.hpp" namespace ov { @@ -33,7 +34,7 @@ void ReduceBase::compute_and_set_reduce_subtensors(const std::shared_ptr subtensor(reduce_rank, 1); for (size_t i = axis; i < reduce_rank; ++i) - subtensor[i] = lowered::PortDescriptor::ServiceDimensions::FULL_DIM; + subtensor[i] = utils::get_full_dim_value(); lowered::PortDescriptorUtils::set_port_descriptor_ptr(reduce->input(0), std::make_shared(reduce->input(0), subtensor)); lowered::PortDescriptorUtils::set_port_descriptor_ptr(reduce->output(0), std::make_shared(reduce->output(0), subtensor)); } diff --git a/src/common/snippets/src/op/serialization_node.cpp b/src/common/snippets/src/op/serialization_node.cpp index 1d58cec7aa0ef8..9864a1a12f94a5 100644 --- a/src/common/snippets/src/op/serialization_node.cpp +++ b/src/common/snippets/src/op/serialization_node.cpp @@ -49,8 +49,8 @@ bool SerializationNode::visit_attributes(AttributeVisitor &visitor) { std::stringstream ss; for (size_t i = 0; i < subtensor.size(); ++i) { const auto& v = subtensor[i]; - const auto v_str = v == lowered::PortDescriptor::ServiceDimensions::FULL_DIM ? "FULL_DIM" : - (utils::is_dynamic_value(v) ? "?" : std::to_string(v)); + const auto v_str = utils::is_full_dim_value(v) ? "FULL_DIM" : + utils::is_dynamic_value(v) ? "?" : std::to_string(v); const auto del = i < subtensor.size() - 1 ? 
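// Editor's note (worked example for the serialization change above): with the
// new helpers, a subtensor {get_full_dim_value(), 32, get_dynamic_value<size_t>()}
// is printed as "FULL_DIM, 32, ?" -- sentinel values become symbolic markers,
// ordinary dimensions stay numeric.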
", " : ""; ss << v_str << del; } diff --git a/src/common/snippets/src/pass/matmul_to_brgemm.cpp b/src/common/snippets/src/pass/matmul_to_brgemm.cpp index 6eaf8424ff5a78..7268d4a7cc6a67 100644 --- a/src/common/snippets/src/pass/matmul_to_brgemm.cpp +++ b/src/common/snippets/src/pass/matmul_to_brgemm.cpp @@ -18,16 +18,12 @@ namespace snippets { namespace pass { void MatMulToBrgemm::init_ports(const std::shared_ptr& brgemm) const { - auto get_subtensor = []() { - return std::vector{ lowered::PortDescriptor::ServiceDimensions::FULL_DIM, lowered::PortDescriptor::ServiceDimensions::FULL_DIM }; - }; + const auto subtensor = std::vector(2, utils::get_full_dim_value()); for (const auto& input : brgemm->inputs()) { const auto& tensor = utils::pshape_to_vdims(input.get_partial_shape()); - const auto& subtensor = get_subtensor(); lowered::PortDescriptorUtils::set_port_descriptor_ptr(input, std::make_shared(tensor, subtensor)); } const auto& tensor = utils::pshape_to_vdims(brgemm->get_output_partial_shape(0)); - const auto& subtensor = get_subtensor(); lowered::PortDescriptorUtils::set_port_descriptor_ptr(brgemm->output(0), std::make_shared(tensor, subtensor)); } diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp index 269d06c958dd39..34dc1c19c5d9d0 100644 --- a/src/common/snippets/src/pass/softmax_decomposition.cpp +++ b/src/common/snippets/src/pass/softmax_decomposition.cpp @@ -55,7 +55,7 @@ SoftmaxDecomposition::SoftmaxDecomposition() { OPENVINO_ASSERT(axis < rank, "Softmax has incorrect axis"); std::vector subtensor(rank, 1); for (size_t i = axis; i < rank; ++i) - subtensor[i] = PortDescriptor::ServiceDimensions::FULL_DIM; + subtensor[i] = utils::get_full_dim_value(); PortDescriptorUtils::set_port_descriptor_ptr(power->input(0), std::make_shared(power->input(0), subtensor)); PortDescriptorUtils::set_port_descriptor_ptr(power->output(0), std::make_shared(power->output(0), subtensor)); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index be9afb6fa702f5..8a1eb1bfa65f78 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -185,20 +185,16 @@ void RuntimeConfigurator::update_loop_info(const std::shared_ptrget_work_amount(); + // Update only `finalization offsets`. `Ptr increments` are always zeroed in this case + auto updated_finalization_offsets = current_work_amount > 0 ? std::vector(finalization_offsets.size(), 0) : finalization_offsets; if (expanded_loop_info->is_evaluate_once()) { - // Update only `finalization offsets`. `Ptr increments` are always zeroed in this case - auto updated_finalization_offsets = current_work_amount > 0 ? 
std::vector(finalization_offsets.size(), 0) : finalization_offsets; // work_amount is equal to increment in cases with `evaluate_once` for (size_t i = 0; i < updated_finalization_offsets.size(); ++i) updated_finalization_offsets[i] += ptr_increments[i] * expanded_loop_info->get_work_amount(); - expanded_loop_info->update_finalization_offsets(updated_finalization_offsets); } else { expanded_loop_info->update_ptr_increments(ptr_increments); - if (current_work_amount > 0) - expanded_loop_info->update_finalization_offsets(std::vector(finalization_offsets.size(), 0)); - else - expanded_loop_info->update_finalization_offsets(finalization_offsets); } + expanded_loop_info->update_finalization_offsets(updated_finalization_offsets); } } diff --git a/src/common/snippets/tests/src/lir_test_utils.cpp b/src/common/snippets/tests/src/lir_test_utils.cpp index eca5be9987118f..c4f5047011cd08 100644 --- a/src/common/snippets/tests/src/lir_test_utils.cpp +++ b/src/common/snippets/tests/src/lir_test_utils.cpp @@ -39,9 +39,7 @@ void LoweredPassTestsF::TearDown() { } ov::snippets::VectorDims get_default_subtensor() { - static const VectorDims default_subtensor{PortDescriptor::ServiceDimensions::FULL_DIM, - PortDescriptor::ServiceDimensions::FULL_DIM}; - return default_subtensor; + return VectorDims(2, ov::snippets::utils::get_full_dim_value()); } void init_expr_descriptors(const ov::snippets::lowered::ExpressionPtr& expr, diff --git a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp index e56a31a8e92a4c..4dc3f2dae7e867 100644 --- a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp +++ b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp @@ -95,8 +95,7 @@ void BufferAllocationTest::Validate() { std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { const auto subtensor_eltwise = std::vector{1, m_vector_size}; - const auto subtensor_buffer = std::vector{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM, - ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; + const auto subtensor_buffer = std::vector(2, ov::snippets::utils::get_full_dim_value()); const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index b8b6f590bf41bb..5b517a68ae11a9 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -91,8 +91,8 @@ void CPURuntimeConfigurator::update_brgemms(const ov::snippets::lowered::LoopMan const auto& expanded_loop_info = loop_manager->get_loop_info(loop_ids.front()); const auto& block_size_m = expanded_loop_info->get_work_amount(); - brgemm_expr->get_input_port_descriptor(0)->set_subtensor_value(1, block_size_m); - brgemm_expr->get_output_port_descriptor(0)->set_subtensor_value(1, block_size_m); + brgemm_expr->get_input_port_descriptor(0)->set_subtensor_dim(1, block_size_m); + brgemm_expr->get_output_port_descriptor(0)->set_subtensor_dim(1, block_size_m); } } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp index 69466faa8aaa5a..fb15ada10c504f 
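// Editor's note (worked example for the evaluate-once branch in
// update_loop_info above): when a loop runs exactly once, per-iteration pointer
// increments are skipped, so the advance the single iteration would have
// produced is folded into the finalization offsets instead. With illustrative
// numbers:
//
//   // ptr_increment = 64 elements, work_amount (== increment) = 16
//   // folded into the offset: finalization_offset += 64 * 16 = 1024
//
// In OptimizeLoopSingleEvaluation the same folding goes through
// utils::dynamic_safe_mul / dynamic_safe_add, which propagate the dynamic
// sentinel instead of computing a bogus value.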
100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp @@ -56,7 +56,7 @@ bool BrgemmKernelConfig::operator==(const BrgemmKernelConfig& rhs) const { void BrgemmKernelConfig::update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC) { // If M is zero, it means that Brgemm won't be executed (in Loop with work_amount = 0, for example) // To process this case, we have to make this Config as empty (nullify runtime parameters) - if (M == 0 && !utils::one_of(0, N, K, LDA, LDB, LDC)) { + if (utils::one_of(0, M, N, K)) { m_M = 0; m_N = 0; m_K = 0; m_LDA = 0; m_LDB = 0; m_LDC = 0; } else { @@ -129,10 +129,8 @@ std::shared_ptr BrgemmKernelExecutor::compile_kernel(const std::shared_ptr compiled_kernel = std::make_shared(); // Brgemm is not executable - nothing to compile - if (config.is_empty()) { - compiled_kernel->compiled_kernel = std::unique_ptr(); + if (config.is_empty()) return compiled_kernel; - } cpu::x64::brgemm_t desc; auto status = brgemm_desc_init(&desc, config.get_isa(), cpu::x64::brgemm_strd, diff --git a/src/plugins/intel_cpu/src/emitters/tpp/x64/jit_tpp_emitter.cpp b/src/plugins/intel_cpu/src/emitters/tpp/x64/jit_tpp_emitter.cpp index 91c95f0a478d3c..70ddbb3d79ee21 100644 --- a/src/plugins/intel_cpu/src/emitters/tpp/x64/jit_tpp_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/tpp/x64/jit_tpp_emitter.cpp @@ -48,7 +48,7 @@ TppEmitter::TppEmitter(dnnl::impl::cpu::x64::jit_generator* h, io_port_descriptors.resize(num_kernel_args); // Note: this is needed mostly for Reduce operations, since they allow the last subternsor dim to be FULL_DIM; auto replace_full_dim = [](size_t dim, size_t replace_dim) { - if (dim == snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM) + if (ov::snippets::utils::is_full_dim_value(dim)) return replace_dim; return dim; }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp index 3c9bfcc5ea064b..d71faef96923d0 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp @@ -31,7 +31,7 @@ using namespace snippets::lowered; namespace { std::vector make_subtensor(const ov::Shape& tensor) { - return std::vector(std::min(tensor.size(), size_t(2)), PortDescriptor::ServiceDimensions::FULL_DIM); + return std::vector(std::min(tensor.size(), size_t(2)), ov::snippets::utils::get_full_dim_value()); } template void set_full_port_desc(const T& port) { diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index d31f205a932116..f533c56bfabce5 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -62,7 +62,7 @@ snippets::lowered::SpecificIterationHandlers BrgemmBlocking::get_default_blockin const auto tail_size = snippets::utils::is_dynamic_value(work_amount) ? 
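// Editor's note on the BrgemmKernelConfig::update change above (behavioural
// summary in code form, not new logic): a zero in any of M, N, K now marks the
// config as empty -- previously only M == 0 did, and only when no other
// parameter was zero:
//
//   if (utils::one_of(0, M, N, K)) {
//       // Brgemm is not executed; nullify all runtime parameters so that
//       // compile_kernel() returns an empty kernel early
//       m_M = 0; m_N = 0; m_K = 0; m_LDA = 0; m_LDB = 0; m_LDC = 0;
//   }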
snippets::utils::get_dynamic_value() : work_amount % block_size; if (tail_size != 0) handlers.register_pass(tail_size); - handlers.register_pass(); + handlers.register_pass(); return handlers; } @@ -120,16 +120,16 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea // If block_size is dynamic, it means that Brgemm will process full tensor: // subtensor[i] = FULL_DIM as by default if (!snippets::utils::is_dynamic_value(block_size_m)) { - brgemm_expr->get_input_port_descriptor(0)->set_subtensor_value(1, block_size_m); - brgemm_expr->get_output_port_descriptor(0)->set_subtensor_value(1, block_size_m); + brgemm_expr->get_input_port_descriptor(0)->set_subtensor_dim(1, block_size_m); + brgemm_expr->get_output_port_descriptor(0)->set_subtensor_dim(1, block_size_m); } if (!snippets::utils::is_dynamic_value(block_size_n)) { - brgemm_expr->get_input_port_descriptor(1)->set_subtensor_value(0, block_size_n); - brgemm_expr->get_output_port_descriptor(0)->set_subtensor_value(0, block_size_n); + brgemm_expr->get_input_port_descriptor(1)->set_subtensor_dim(0, block_size_n); + brgemm_expr->get_output_port_descriptor(0)->set_subtensor_dim(0, block_size_n); } if (!snippets::utils::is_dynamic_value(block_size_k)) { - brgemm_expr->get_input_port_descriptor(0)->set_subtensor_value(0, block_size_k); - brgemm_expr->get_input_port_descriptor(1)->set_subtensor_value(1, block_size_k); + brgemm_expr->get_input_port_descriptor(0)->set_subtensor_dim(0, block_size_k); + brgemm_expr->get_input_port_descriptor(1)->set_subtensor_dim(1, block_size_k); } const bool need_brgemm_copy_b = brgemm_cpu && with_repacking(brgemm_cpu->get_type()); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp index 6acf68fe245467..a8281ad1d02da6 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp @@ -36,16 +36,16 @@ std::shared_ptr SetBrgemmBeta::merge(const st return merged_pass; } -bool SetEvaluanceOnce::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { +bool SetEvaluateOnce::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { const auto& loop_end = ov::as_type_ptr(end->get()->get_node()); - OPENVINO_ASSERT(loop_end, "SetEvaluanceOnce expected LoopEnd node in iterator `end`."); + OPENVINO_ASSERT(loop_end, "SetEvaluateOnce expected LoopEnd node in iterator `end`."); const auto& loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); loop_info->set_evaluate_once(true); return true; } -std::shared_ptr SetEvaluanceOnce::merge(const std::shared_ptr& other) { - return !other || ov::is_type(other) ? std::make_shared() : nullptr; +std::shared_ptr SetEvaluateOnce::merge(const std::shared_ptr& other) { + return !other || ov::is_type(other) ? 
std::make_shared() : nullptr; } } // namespace pass diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp index 2a9492713e0c46..24697c2f50f6a6 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp @@ -29,15 +29,15 @@ class SetBrgemmBeta : public snippets::lowered::pass::RangedPass { }; /** - * @interface SetEvaluanceOnce + * @interface SetEvaluateOnce * @brief The pass sets `evaluate once = true` only on the ExpandedLoopInfo that is mapped to the LoopEnd in the passed iterator `end`. * The pointer arithmetic should be updated in the separate optimization `OptimizeLoopSingleEvaluation` * @ingroup snippets */ -class SetEvaluanceOnce : public snippets::lowered::pass::RangedPass { +class SetEvaluateOnce : public snippets::lowered::pass::RangedPass { public: - SetEvaluanceOnce() = default; - OPENVINO_RTTI("SetEvaluanceOnce", "RangedPass") + SetEvaluateOnce() = default; + OPENVINO_RTTI("SetEvaluateOnce", "RangedPass") bool run(snippets::lowered::LinearIR& linear_ir, snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override; diff --git a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp index b3c04fb7833db9..15066968d88155 100644 --- a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp +++ b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp @@ -3,6 +3,7 @@ // #include "snippets/itt.hpp" +#include "snippets/utils.hpp" #include "eltwise_to_eltwise_tpp.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" @@ -40,14 +41,12 @@ EltwiseToEltwiseTPP::EltwiseToEltwiseTPP() { OPENVINO_ASSERT(tpp_eltwise, "Failed to create TPP node"); const size_t M_block = 32; - const size_t N_block = ov::is_type(node) ? - snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM : - 64; + const size_t N_block = ov::is_type(node) ?
ov::snippets::utils::get_full_dim_value() : 64; ov::replace_node_update_name(node, tpp_eltwise); for (size_t i = 0; i < node->get_input_size(); i++) - snippets::lowered::set_port_desc(tpp_eltwise->input(i), {M_block, N_block}); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(tpp_eltwise->input(i), {M_block, N_block}); - snippets::lowered::set_port_desc(tpp_eltwise->output(0), {M_block, N_block}); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(tpp_eltwise->output(0), {M_block, N_block}); return true; }; diff --git a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/lowered/set_tpp_leading_dim.cpp b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/lowered/set_tpp_leading_dim.cpp index a420ed2cbfea22..4f38eddc2bde0f 100644 --- a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/lowered/set_tpp_leading_dim.cpp +++ b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/lowered/set_tpp_leading_dim.cpp @@ -74,7 +74,7 @@ size_t get_leading_dim(ExpressionPort port, const snippets::lowered::LoopManager bool full_dim_substituted = false; for (size_t i = 1; i <= subtensor.size(); i++) { const auto idx = subtensor.size() - i; - if (subtensor[idx] == snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM) { + if (ov::snippets::utils::is_full_dim_value(subtensor[idx])) { // the reason that we don't support FULL_DIM substitution for an arbitrary layout is that // the layout and subtensor can (and usually do) have different ranks full_dim_substituted = true; diff --git a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/scalar_to_scalar_tpp.cpp b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/scalar_to_scalar_tpp.cpp index 5ea5b135ba595a..0b9f41d47aa0da 100644 --- a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/scalar_to_scalar_tpp.cpp +++ b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/scalar_to_scalar_tpp.cpp @@ -42,9 +42,9 @@ ScalarToScalarTPP::ScalarToScalarTPP() { tpp_scalar->set_friendly_name(node->get_friendly_name()); ov::replace_node_update_name(node, tpp_scalar); const auto& out = tpp_scalar->output(0); - snippets::lowered::set_port_desc(out, {1}); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(out, {1}); for (const auto& in : out.get_target_inputs()) - snippets::lowered::set_port_desc(in, {1}); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(in, {1}); return true; }; diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp index df7790692c1dd8..986aa993f471c3 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp @@ -8,7 +8,6 @@ #include "openvino/opsets/opset10.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_info.hpp" -#include "snippets/lowered/pass/propagate_subtensors.hpp" #include "snippets/snippets_isa.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index c618c9e0d86fb5..2abfde0b3bb431 100644 ---
a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -138,8 +138,8 @@ class MHAFP32BufferAllocationTest : public BufferAllocationCPUTest { const size_t k_blk = 16; const size_t n_blk = 64; const auto subtensor_scalar = std::vector{1}; - const auto subtensor_power = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; - const auto subtensor_full = std::vector(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM); + const auto subtensor_power = std::vector{1, ov::snippets::utils::get_full_dim_value()}; + const auto subtensor_full = std::vector(2, ov::snippets::utils::get_full_dim_value()); const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 128, 12, 64})); @@ -196,8 +196,8 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const size_t k_blk = 16; const size_t n_blk = 64; const auto subtensor_scalar = std::vector{1}; - const auto subtensor_power = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; - const auto subtensor_full = std::vector(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM); + const auto subtensor_power = std::vector{1, ov::snippets::utils::get_full_dim_value()}; + const auto subtensor_full = std::vector(2, ov::snippets::utils::get_full_dim_value()); const auto parameter0 = std::make_shared(ov::element::bf16, ov::PartialShape({1, 12, 128, 64})); const auto parameter1 = std::make_shared(ov::element::bf16, ov::PartialShape({1, 128, 12, 64})); From 6a6df1880dc663af7899e9eba79d80ba9cc79136 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 24 Jul 2024 11:02:41 +0400 Subject: [PATCH 4/6] [Snippets] Some fixes which were found during validation --- .../lowered/pass/compute_buffer_allocation_size.cpp | 2 +- src/common/snippets/src/op/reduce.cpp | 2 +- .../emitters/snippets/cpu_runtime_configurator.cpp | 11 +++++++++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp index e4664800995db1..028cdde1088e60 100644 --- a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp +++ b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp @@ -60,7 +60,7 @@ size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& lo const auto processing_rank = !processed_dim_idxs.empty() ? 
std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size(); for (size_t i = 0; i < std::min(processing_rank, rank); ++i) { if (processed_dim_idxs.count(i) == 0) { - if (i < subtensor.size()) + if (i < subtensor.size() && !utils::is_full_dim_value(*(subtensor.rbegin() + i))) allocation_size = utils::dynamic_safe_mul(allocation_size, std::min(*(planar_shape.rbegin() + i), *(subtensor.rbegin() + i))); else allocation_size = utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i)); diff --git a/src/common/snippets/src/op/reduce.cpp b/src/common/snippets/src/op/reduce.cpp index dd93f082c1356c..b0b69e0bd7e84c 100644 --- a/src/common/snippets/src/op/reduce.cpp +++ b/src/common/snippets/src/op/reduce.cpp @@ -5,7 +5,7 @@ #include "snippets/op/reduce.hpp" #include "snippets/itt.hpp" -#include "snippets/utils.hpp" +#include "snippets/utils/utils.hpp" #include "snippets/lowered/port_descriptor.hpp" namespace ov { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 5b517a68ae11a9..14d21652010a5e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -16,14 +16,21 @@ CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigur } void CPURuntimeConfigurator::update(const std::shared_ptr& linear_ir) { - RuntimeConfigurator::update(linear_ir); - if (linear_ir->is_dynamic()) { const auto& loop_manager = linear_ir->get_loop_manager(); + update_loop_info(linear_ir); update_loop_args(loop_manager); + // Update Brgemm should be before `update_buffer_scratchpad_size` + // because `ComputeAllocationSize` depends on subtensors which are updated in `update_brgemms` update_brgemms(loop_manager); + update_buffer_scratchpad_size(linear_ir); get_kernel_executor_table()->update_state(); } + + m_config->master_shape = linear_ir->get_master_shape(); + + update_data_offsets(); + update_latest_shapes(); } void CPURuntimeConfigurator::initialization(const std::shared_ptr& linear_ir) { From 6b2257789a795adc7886ba83ea504e9c7b03a12f Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 24 Jul 2024 11:14:53 +0400 Subject: [PATCH 5/6] [Snippets] Fix build --- src/common/snippets/src/lowered/port_descriptor.cpp | 6 +++--- .../transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/snippets/src/lowered/port_descriptor.cpp b/src/common/snippets/src/lowered/port_descriptor.cpp index 7ea861dc190c26..e5fd3638e831c8 100644 --- a/src/common/snippets/src/lowered/port_descriptor.cpp +++ b/src/common/snippets/src/lowered/port_descriptor.cpp @@ -135,7 +135,7 @@ void PortDescriptorUtils::set_port_descriptor_ptr(const ov::Output& ou namespace { template -void set_port_descriptor(const T& port, std::vector subtensor, std::vector layout) { +void set_port_desc(const T& port, std::vector subtensor, std::vector layout) { const auto& shape = port.get_shape(); for (size_t i = 1; i <= std::min(subtensor.size(), shape.size()); i++) { auto& dim = subtensor[subtensor.size() - i]; @@ -147,10 +147,10 @@ void set_port_descriptor(const T& port, std::vector subtensor, std::vect } // namespace void PortDescriptorUtils::set_port_descriptor(const ov::Input& in, std::vector subtensor, std::vector layout) { - set_port_descriptor(in, subtensor, layout); + set_port_desc(in, subtensor, layout); } void 
PortDescriptorUtils::set_port_descriptor(const ov::Output& in, std::vector subtensor, std::vector layout) { - set_port_descriptor(in, subtensor, layout); + set_port_desc(in, subtensor, layout); } PortDescriptorPtr PortDescriptorUtils::get_port_descriptor_ptr(const ov::Input& in) { diff --git a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp index 15066968d88155..da83038f5455f8 100644 --- a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp +++ b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/eltwise_to_eltwise_tpp.cpp @@ -3,7 +3,7 @@ // #include "snippets/itt.hpp" -#include "snippets/utils.hpp" +#include "snippets/utils/utils.hpp" #include "eltwise_to_eltwise_tpp.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" From 18eaf4a4419587e46063e81e989d0b32017fa0f6 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 25 Jul 2024 08:38:10 +0400 Subject: [PATCH 6/6] [Snippets] Moved UpdateBrgemms to KernelExecutor::update_config --- .../snippets/kernel_executor_table.hpp | 12 ++-- .../include/snippets/runtime_configurator.hpp | 16 ++--- .../src/lowered/pass/propagate_subtensors.cpp | 4 +- src/common/snippets/src/op/subgraph.cpp | 2 +- .../snippets/src/runtime_configurator.cpp | 26 ++++---- .../snippets/cpu_runtime_configurator.cpp | 62 +++-------------- .../snippets/cpu_runtime_configurator.hpp | 20 ++---- .../snippets/x64/kernel_executors/brgemm.cpp | 66 ++++++++++++------- .../snippets/x64/kernel_executors/brgemm.hpp | 4 +- .../x64/pass/lowered/brgemm_blocking.cpp | 13 ++-- .../x64/lowered/brgemm_blocking.cpp | 6 +- 11 files changed, 99 insertions(+), 132 deletions(-) diff --git a/src/common/snippets/include/snippets/kernel_executor_table.hpp b/src/common/snippets/include/snippets/kernel_executor_table.hpp index 46f9cd04b923ba..af797e4c80422a 100644 --- a/src/common/snippets/include/snippets/kernel_executor_table.hpp +++ b/src/common/snippets/include/snippets/kernel_executor_table.hpp @@ -43,7 +43,7 @@ class KernelExecutorBase { * @brief Update current kernel config in accordance with the passed expression. Corresponding kernel is recompiled if necessary. * This method should be called to update KernelExecutor based on runtime info (e.g. shapes) available through expression ptr */ - virtual void update_by_expression(const lowered::ExpressionPtr& expr) = 0; + virtual void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir) = 0; /** * @brief Replace current kernel config with the provided value. Corresponding kernel is recompiled if necessary. * This method should be called to restore a saved state of the executor that was configured using update_by_expression().
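For illustration, a custom executor under the updated interface derives from KernelExecutor and overrides `update_config` with the extra `linear_ir` argument. A minimal sketch follows; only the interface and the LoopManager accessors come from this patch, while `MyConfig`, `MyKernel`, and `set_block_size` are hypothetical names:

    class MyKernelExecutor : public KernelExecutor<MyConfig, MyKernel> {
    protected:
        void update_config(const lowered::ExpressionPtr& expr,
                           const lowered::LinearIRPtr& linear_ir,
                           MyConfig& config) const override {
            // The LinearIR gives access to the LoopManager, so runtime loop
            // parameters (work amount, increment) can feed the kernel config.
            const auto& loop_ids = expr->get_loop_ids();
            OPENVINO_ASSERT(!loop_ids.empty(), "Expected the expression to be inside a loop");
            const auto& loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_ids.front());
            config.set_block_size(loop_info->get_increment());  // hypothetical config setter
        }
        // compile_kernel()/update_kernel() are unchanged by this patch and omitted here.
    };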
@@ -70,8 +70,8 @@ class KernelExecutor : public KernelExecutorBase { explicit KernelExecutor(Conf c) : KernelExecutorBase(), m_config{std::move(c)} {} // Note: override when final is redundant, but needed to avoid warnings on some compilers - void update_by_expression(const lowered::ExpressionPtr& expr) override final { // NOLINT - update_config(expr, m_config); + void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir) override final { // NOLINT + update_config(expr, linear_ir, m_config); OPENVINO_ASSERT(m_config.is_completed(), "Failed to update kernel config in update_by_expression"); update_kernel(m_config, m_kernel); OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor"); @@ -103,7 +103,7 @@ class KernelExecutor : public KernelExecutorBase { protected: /*** Updates stored kernel config based on runtime info from expression (e.g. new input shapes). */ - virtual void update_config(const lowered::ExpressionPtr& expr, Conf& config) const = 0; + virtual void update_config(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir, Conf& config) const = 0; /*** Updates stored kernel in accordance with the passed config. Recompilation of the kernel is * performed if necessary. */ virtual void update_kernel(const Conf& c, std::shared_ptr& kernel) const = 0; @@ -130,9 +130,9 @@ class KernelExecutorTable { return m_table.at(expr); } /*** Updates every registered KernelExecutor in accordance with the corresponding expression */ - void update_state() const { + void update_state(const lowered::LinearIRPtr& linear_ir) const { for (const auto& record : m_table) - record.second->update_by_expression(record.first); + record.second->update_by_expression(record.first, linear_ir); } /*** Returns lambda function that contains current state of the table, and restores this state when called */ diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 059771d961df82..058eca59716d1b 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -61,7 +61,7 @@ class RuntimeConfigurator { * @param linear_ir LinearIR * @return updated config */ - const std::shared_ptr& get_updated_config(const std::shared_ptr& linear_ir); + const std::shared_ptr& get_updated_config(const lowered::LinearIRPtr& linear_ir); /*** Returns pointer to KernelExecutorTable owned by the config */ const std::shared_ptr& get_kernel_executor_table() const { return m_config->kernel_executor_table; } @@ -70,19 +70,19 @@ class RuntimeConfigurator { * @brief Update RuntimeConfig based on LinearIR * @param linear_ir LinearIR */ - virtual void update(const std::shared_ptr& linear_ir); + virtual void update(const lowered::LinearIRPtr& linear_ir); /** * @brief Allocate and initialize fields in RuntimeConfig and RuntimeConfigurator * @param linear_ir LinearIR */ - virtual void initialization(const std::shared_ptr& linear_ir); + virtual void initialization(const lowered::LinearIRPtr& linear_ir); /** * @brief Initializes input and data information of LinearIR: * descriptors (that contain shapes and layouts) and data_sizes * @param linear_ir LinearIR */ - void init_data_info(const std::shared_ptr& linear_ir); + void init_data_info(const lowered::LinearIRPtr& linear_ir); /** * @brief Initializes information of buffers: * - static buffer_scratchpad_size * - clusters with
dynamic buffers (`m_dynamic_buffer_clusters`) for quick access in `update()` * @param linear_ir LinearIR */ - void init_buffer_info(const std::shared_ptr& linear_ir); + void init_buffer_info(const lowered::LinearIRPtr& linear_ir); /** * @brief Initializes tensor rank of config * @param linear_ir LinearIR */ - virtual void init_tensor_rank(const std::shared_ptr& linear_ir) const; + virtual void init_tensor_rank(const lowered::LinearIRPtr& linear_ir) const; /** * @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo * @param linear_ir LinearIR */ - void update_loop_info(const std::shared_ptr& linear_ir) const; + void update_loop_info(const lowered::LinearIRPtr& linear_ir) const; /** * @brief Update Buffer scratchpad size and offsets if needed * Note: `update_loop_info` must be called beforehand * @param linear_ir LinearIR */ - void update_buffer_scratchpad_size(const std::shared_ptr& linear_ir) const; + void update_buffer_scratchpad_size(const lowered::LinearIRPtr& linear_ir) const; /** * @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig */ diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp index e6753198053393..c89274a728c4c9 100644 --- a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -107,8 +107,8 @@ void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end()); for (size_t i = 0; i < new_subtensor.size(); ++i) { // If the user forces a dynamic value in the subtensor, set a real dynamic dimension using `get_dynamic_value()` - new_subtensor[i] = new_subtensor[i] == FORCED_DYNAMIC_VALUE ? utils::get_dynamic_value() - : std::min(new_subtensor[i], subtensor[i]); + new_subtensor[i] = new_subtensor[i] == FORCED_DYNAMIC_VALUE ? utils::get_dynamic_value() : + utils::is_full_dim_value(subtensor[i]) ? subtensor[i] : std::min(new_subtensor[i], subtensor[i]); } desc->set_subtensor(new_subtensor); } diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index a33d478ee3929d..4ede0b58a66cf0 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -552,7 +552,7 @@ snippets::Schedule Subgraph::generate(const void* compile_params) const { exec_table->replace_key_expression(expression_map.at(expr.get()), expr); // Some kernel executors might've been registered during code emission. // We need to update them, so appropriate kernels will be compiled.
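// Note: update_state() now takes the LinearIR so that each executor's update_config() can query the LoopManager for runtime loop parameters.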
- exec_table->update_state(); + exec_table->update_state(m_linear_ir); return {std::move(lowering_result)}; } diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 8a1eb1bfa65f78..6f8945649c2b94 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -35,7 +35,7 @@ RuntimeConfigurator::RuntimeConfigurator(std::shared_ptr c) : OPENVINO_ASSERT(m_config, "Runtime config is nullptr!"); } -const std::shared_ptr& RuntimeConfigurator::get_updated_config(const std::shared_ptr& linear_ir) { +const std::shared_ptr& RuntimeConfigurator::get_updated_config(const lowered::LinearIRPtr& linear_ir) { // First initialization if (m_io_num == 0) initialization(linear_ir); @@ -44,7 +44,7 @@ const std::shared_ptr& RuntimeConfigurator::get_updated_config(co return m_config; } -void RuntimeConfigurator::initialization(const std::shared_ptr& linear_ir) { +void RuntimeConfigurator::initialization(const lowered::LinearIRPtr& linear_ir) { init_data_info(linear_ir); init_tensor_rank(linear_ir); init_buffer_info(linear_ir); @@ -55,7 +55,7 @@ void RuntimeConfigurator::initialization(const std::shared_ptrtile_rank = linear_ir->get_config().m_loop_depth; } -void RuntimeConfigurator::update(const std::shared_ptr& linear_ir) { +void RuntimeConfigurator::update(const lowered::LinearIRPtr& linear_ir) { if (linear_ir->is_dynamic()) { update_loop_info(linear_ir); update_buffer_scratchpad_size(linear_ir); @@ -67,11 +67,11 @@ void RuntimeConfigurator::update(const std::shared_ptr& linea update_latest_shapes(); } -void RuntimeConfigurator::init_tensor_rank(const std::shared_ptr& linear_ir) const { +void RuntimeConfigurator::init_tensor_rank(const lowered::LinearIRPtr& linear_ir) const { m_config->tensor_rank = linear_ir->get_master_shape().size(); } -void RuntimeConfigurator::init_data_info(const std::shared_ptr& linear_ir) { +void RuntimeConfigurator::init_data_info(const lowered::LinearIRPtr& linear_ir) { const auto& parameters = linear_ir->get_parameters(); const auto& results = linear_ir->get_results(); m_in_num = parameters.size(); @@ -113,7 +113,7 @@ void RuntimeConfigurator::init_data_info(const std::shared_ptr& linear_ir) { +void RuntimeConfigurator::init_buffer_info(const lowered::LinearIRPtr& linear_ir) { std::map> dynamic_buffer_clusters, static_buffer_clusters; // All needed checks are in Validate pass @@ -143,7 +143,7 @@ void RuntimeConfigurator::init_buffer_info(const std::shared_ptr& linear_ir) const { +void RuntimeConfigurator::update_loop_info(const lowered::LinearIRPtr& linear_ir) const { // Initialized UnifiedLoopInfo struct CurrentUnifiedLoopInfo { size_t current_work_amount = 0; @@ -180,17 +180,19 @@ void RuntimeConfigurator::update_loop_info(const std::shared_ptrset_work_amount( - lowered::pass::InsertSpecificIterations::get_decomposed_loop_work_amount(current_unified_loop_info, decomposed_loop_type, current_work_amount)); + const auto work_amount = + lowered::pass::InsertSpecificIterations::get_decomposed_loop_work_amount(current_unified_loop_info, decomposed_loop_type, current_work_amount); + expanded_loop_info->set_work_amount(work_amount); // Update remaining Loop work amount - current_work_amount -= expanded_loop_info->get_work_amount(); + current_work_amount -= work_amount; // Update only `finalization offsets`. `Ptr increments` are always zeroed in this case auto updated_finalization_offsets = current_work_amount > 0 ? 
std::vector(finalization_offsets.size(), 0) : finalization_offsets; if (expanded_loop_info->is_evaluate_once()) { + expanded_loop_info->set_increment(work_amount); // work_amount is equal to increment in cases with `evaluate_once` for (size_t i = 0; i < updated_finalization_offsets.size(); ++i) - updated_finalization_offsets[i] += ptr_increments[i] * expanded_loop_info->get_work_amount(); + updated_finalization_offsets[i] += ptr_increments[i] * work_amount; } else { expanded_loop_info->update_ptr_increments(ptr_increments); } @@ -198,7 +200,7 @@ void RuntimeConfigurator::update_loop_info(const std::shared_ptr& linear_ir) const { +void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRPtr& linear_ir) const { const auto& loop_manager = linear_ir->get_loop_manager(); m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size(); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 14d21652010a5e..925a6d28697d41 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -4,7 +4,6 @@ #include "emitters/snippets/cpu_runtime_configurator.hpp" -#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "snippets/utils/utils.hpp" #include "snippets/lowered/loop_manager.hpp" @@ -15,16 +14,14 @@ namespace intel_cpu { CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigurator(std::make_shared()) { } -void CPURuntimeConfigurator::update(const std::shared_ptr& linear_ir) { +void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRPtr& linear_ir) { if (linear_ir->is_dynamic()) { - const auto& loop_manager = linear_ir->get_loop_manager(); update_loop_info(linear_ir); - update_loop_args(loop_manager); - // Update Brgemm should be before `update_buffer_scratchpad_size` - // because `ComputeAllocationSize` depends on subtensors which are updated in `update_brgemms` - update_brgemms(loop_manager); + update_loop_args(linear_ir); + // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` + // because `ComputeAllocationSize` depends on subtensors which are updated in the table + get_kernel_executor_table()->update_state(linear_ir); update_buffer_scratchpad_size(linear_ir); - get_kernel_executor_table()->update_state(); } m_config->master_shape = linear_ir->get_master_shape(); @@ -33,45 +30,15 @@ void CPURuntimeConfigurator::update(const std::shared_ptr& linear_ir) { - RuntimeConfigurator::initialization(linear_ir); - - for (const auto& expr : *linear_ir) { - if (ov::is_type(expr->get_node())) { - const auto& in0_desc = expr->get_input_port_descriptor(0); - const auto& in1_desc = expr->get_input_port_descriptor(1); - const auto& out_desc = expr->get_output_port_descriptor(0); - - const auto& in0_subtensor = in0_desc->get_subtensor(); - const auto& in1_subtensor = in1_desc->get_subtensor(); - const auto& out_subtensor = out_desc->get_subtensor(); - - // TODO [146125]: At the moment only blocking by dynamic M is supported - // So we save Brgemm with only dynamic M - // If there are other dynamic dimensions, throw exception for now - OPENVINO_ASSERT(!snippets::utils::is_dynamic_value(*in0_subtensor.crbegin()) && - !snippets::utils::is_dynamic_value(*in1_subtensor.crbegin()) && - !snippets::utils::is_dynamic_value(*(++in1_subtensor.crbegin())) && - 
!snippets::utils::is_dynamic_value(*out_subtensor.crbegin()), - "CPURuntimeConfigurator supports only dynamic M in Brgemm subtensors"); - OPENVINO_ASSERT(*(++in0_subtensor.crbegin()) == *(++out_subtensor.crbegin()), - "Incorrect values in subtensors of BrgemmCPU"); - - if (snippets::utils::is_dynamic_value(*(++in0_subtensor.crbegin()))) - m_dynamic_brgemms.insert(expr); - } - } -} - -void CPURuntimeConfigurator::init_tensor_rank(const std::shared_ptr& linear_ir) const { +void CPURuntimeConfigurator::init_tensor_rank(const ov::snippets::lowered::LinearIRPtr& linear_ir) const { m_config->tensor_rank = std::max(linear_ir->get_master_shape().size(), rank6D); } -void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const { +void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LinearIRPtr& linear_ir) const { const auto& cpu_config = ov::as_type_ptr(m_config); OPENVINO_ASSERT(cpu_config, "CPURuntimeConfigurator expects CPURuntimeConfig"); - const auto& loop_map = loop_manager->get_map(); + const auto& loop_map = linear_ir->get_loop_manager()->get_map(); cpu_config->loop_args.resize(loop_map.size()); for (const auto& loop : loop_map) { const auto& idx = loop.first; @@ -90,18 +57,5 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LoopM } } -void CPURuntimeConfigurator::update_brgemms(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const { - for (const auto& brgemm_expr : m_dynamic_brgemms) { - const auto& loop_ids = brgemm_expr->get_loop_ids(); - OPENVINO_ASSERT(!loop_ids.empty(), "Dynamic Brgemm must be in loops"); - // TODO [146125]: Loop by M is first one in `loop_ids` - const auto& expanded_loop_info = loop_manager->get_loop_info(loop_ids.front()); - const auto& block_size_m = expanded_loop_info->get_work_amount(); - - brgemm_expr->get_input_port_descriptor(0)->set_subtensor_dim(1, block_size_m); - brgemm_expr->get_output_port_descriptor(0)->set_subtensor_dim(1, block_size_m); - } -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 39ab1977f878d1..f1a21e5982aa1c 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -29,31 +29,19 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { * @brief Update RuntimeConfig based on LinearIR * @param linear_ir LinearIR */ - void update(const std::shared_ptr& linear_ir) override; - /** - * @brief Allocate and intialize fields in RuntimeConfig and RuntimeConfigurator - * @param linear_ir LinearIR - */ - void initialization(const std::shared_ptr& linear_ir) override; + void update(const ov::snippets::lowered::LinearIRPtr& linear_ir) override; /** * @brief Initializes tensor rank of config * @param linear_ir LinearIR */ - void init_tensor_rank(const std::shared_ptr& linear_ir) const override; + void init_tensor_rank(const ov::snippets::lowered::LinearIRPtr& linear_ir) const override; /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig - * @param loop_manager Loop Manager - */ - void update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const; - /** - * @brief Update subtensors of Brgemms - * @param loop_manager Loop Manager + * @param linear_ir LinearIR */ - void update_brgemms(const 
ov::snippets::lowered::LoopManagerPtr& loop_manager) const; + void update_loop_args(const ov::snippets::lowered::LinearIRPtr& linear_ir) const; const size_t rank6D = 6; - // Brgemm expressions with subtensors with dynamic values - std::unordered_set m_dynamic_brgemms = {}; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp index fb15ada10c504f..e538c3baef28bb 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp @@ -4,6 +4,8 @@ #include "brgemm.hpp" +#include "snippets/lowered/loop_manager.hpp" + #include #include "common/utils.hpp" #include "dnnl_extension_utils.h" @@ -153,31 +155,49 @@ std::shared_ptr BrgemmKernelExecutor::compile_kernel(const return compiled_kernel; } -void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, BrgemmKernelConfig& config) const { - auto get_projected_input_subtensor = [](const snippets::lowered::PortDescriptorPtr& desc) { - // Note: for output shape you will need get_preordered_vdims() - auto shape = snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout()); - auto subtensor = desc->get_subtensor(); - OV_CPU_JIT_EMITTER_ASSERT(subtensor.size() <= shape.size() && subtensor.size() == 2, - "Invalid subtensor + shape combination"); - auto shape_it = shape.rbegin(); - for (auto sub_it = subtensor.rbegin(); sub_it != subtensor.rend(); sub_it++, shape_it++) { - *sub_it = std::min(*sub_it, *shape_it); - } - return subtensor; - }; +void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRPtr& linear_ir, + BrgemmKernelConfig& config) const { const auto& input_pds = expr->get_input_port_descriptors(); const auto& output_pds = expr->get_output_port_descriptors(); OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1, "Invalid number of in/out port descriptors"); - // Update runtime-defined config fields: - // Matrix A (first input) + + const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout()); + const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout()); + auto in0_subtensor = input_pds[0]->get_subtensor(); + auto in1_subtensor = input_pds[1]->get_subtensor(); + + auto M = *++in0_subtensor.rbegin(); + auto K = *in0_subtensor.rbegin(); + auto N = *in1_subtensor.rbegin(); + + if (ov::snippets::utils::is_full_dim_value(M)) { + M = *++in0_shape.rbegin(); + } else { + const auto& loop_ids = expr->get_loop_ids(); + OPENVINO_ASSERT(!loop_ids.empty(), "Loop by dimension M is missed"); + // TODO [146125]: Loop by M is first one in `loop_ids` + const auto& expanded_loop_info = linear_ir->get_loop_manager()->get_loop_info(loop_ids.front()); + M = expanded_loop_info->get_increment(); + input_pds[0]->set_subtensor_dim(1, M); + output_pds[0]->set_subtensor_dim(1, M); + } + + if (ov::snippets::utils::is_full_dim_value(K)) { + K = *in0_shape.rbegin(); + } else if (ov::snippets::utils::is_dynamic_value(K)) { + OPENVINO_THROW("Dynamic K is not supported"); + } + + if (ov::snippets::utils::is_full_dim_value(N)) { + N = *in1_shape.rbegin(); + } else if (ov::snippets::utils::is_dynamic_value(N)) { + OPENVINO_THROW("Dynamic N is not supported"); + } + const auto 
LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); - const auto& in0_subtensor = get_projected_input_subtensor(input_pds[0]); - const auto K = DIM_CAST(*in0_subtensor.rbegin()); - const auto M = DIM_CAST(*++in0_subtensor.rbegin()); - // Matrix B (second input) - // Non float input 1 => with data repacking + const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); auto LDB = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(1))); const auto& brgemm_node = as_type_ptr(expr->get_node()); @@ -187,10 +207,8 @@ void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::Expression OV_CPU_JIT_EMITTER_ASSERT(!repacking_buffer_shape.empty(), "Repacking buffer shape mustn't be empty"); LDB = DIM_CAST(repacking_buffer_shape.back()); } - const auto N = DIM_CAST(*get_projected_input_subtensor(input_pds[1]).rbegin()); - // Matrix C (output) - const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); - config.update(M, N, K, LDA, LDB, LDC); + + config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC); } void BrgemmKernelExecutor::execute(const BrgemmKernelExecutor* executor, call_args* args) { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp index c87a7e93f3b3f7..4dd52e21ca2dfd 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp @@ -96,7 +96,9 @@ class BrgemmKernelExecutor : public CPUKernelExecutor compile_kernel(const BrgemmKernelConfig& c) const override; - void update_config(const ov::snippets::lowered::ExpressionPtr& expr, BrgemmKernelConfig& config) const override; + void update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRPtr& linear_ir, + BrgemmKernelConfig& config) const override; }; #define GET_OFF_BRGEMM_ARGS(field) offsetof(BrgemmKernelExecutor::call_args, field) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index f533c56bfabce5..3c8e4caf00c9b0 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -117,17 +117,21 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea const auto block_size_n = snippets::utils::is_dynamic_value(n) ? brgemm->get_n_block_size() : std::min(brgemm->get_n_block_size(), n); const auto block_size_k = snippets::utils::is_dynamic_value(k) ? 
brgemm->get_k_block_size() : std::min(brgemm->get_k_block_size(), k); + const bool m_blocking = block_size_m != m; + const bool n_blocking = block_size_n != n; + const bool k_blocking = block_size_k != k; + // If block_size is dynamic, it means that Brgemm will process full tensor: // subtensor[i] = FULL_DIM as by default - if (!snippets::utils::is_dynamic_value(block_size_m)) { + if (!snippets::utils::is_dynamic_value(block_size_m) && m_blocking) { brgemm_expr->get_input_port_descriptor(0)->set_subtensor_dim(1, block_size_m); brgemm_expr->get_output_port_descriptor(0)->set_subtensor_dim(1, block_size_m); } - if (!snippets::utils::is_dynamic_value(block_size_n)) { + if (!snippets::utils::is_dynamic_value(block_size_n) && n_blocking) { brgemm_expr->get_input_port_descriptor(1)->set_subtensor_dim(0, block_size_n); brgemm_expr->get_output_port_descriptor(0)->set_subtensor_dim(0, block_size_n); } - if (!snippets::utils::is_dynamic_value(block_size_k)) { + if (!snippets::utils::is_dynamic_value(block_size_k) && k_blocking) { brgemm_expr->get_input_port_descriptor(0)->set_subtensor_dim(0, block_size_k); brgemm_expr->get_input_port_descriptor(1)->set_subtensor_dim(1, block_size_k); } @@ -202,9 +206,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, Linea loop_manager->get_loop_info(id)->set_handlers(handlers); }; - const bool k_blocking = block_size_k != k; - const bool n_blocking = block_size_n != n; - const bool m_blocking = block_size_m != m; // It is not necessary to include copyB in loop by M if there are no blocking by KN const bool include_repacking_in_loop = k_blocking || n_blocking; diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp index 986aa993f471c3..82cbcdfa2c21f3 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp @@ -168,7 +168,8 @@ TEST_F(BrgemmBlockingTest, BlockingIsNotNeeded) { auto brgemm = linear_ir_ref->push_node(data_a.second, data_b.second, BRGEMM_TYPE::STAND_ALONE, 0, 0, 0, layout, layout, layout, m, k, n); brgemm.second->set_beta(0.f); - init_expr_descriptors(*brgemm.first, {{m, k}, {k, n}, {m, n}}); + const auto full_subtensor = VectorDims(2, ov::snippets::utils::get_full_dim_value()); + init_expr_descriptors(*brgemm.first, std::vector(3, full_subtensor)); auto result = linear_ir_ref->push_node(brgemm.second); } } @@ -221,6 +222,7 @@ TEST_F(BrgemmBlockingTest, WithDataRepackingOnlyByM) { const ov::PartialShape input_shape_b{1, 16, 64, 384}; const auto precision_a = ov::element::u8; const auto precision_b = ov::element::i8; + const auto full = ov::snippets::utils::get_full_dim_value(); { auto data_a = linear_ir->push_node(precision_a, input_shape_a); @@ -246,7 +248,7 @@ TEST_F(BrgemmBlockingTest, WithDataRepackingOnlyByM) { auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b.second, BRGEMM_TYPE::REPACKING_ONLY, 0, 0, 0, VectorDims{}, VectorDims{}, VectorDims{}, m_blk, k, n, 0.f); const auto& brgemm_expr = *brgemm.first; - init_expr_descriptors(brgemm_expr, {{m_blk, k}, {k, n}, {m_blk, n}}); + init_expr_descriptors(brgemm_expr, {{m_blk, full}, {full, full}, {m_blk, full}}); create_brgemm_with_copy_b_loop_infos(linear_ir_ref, brgemm_expr, copy_b_expr, 384, m_blk); brgemm_expr->set_loop_ids({0}); auto result = linear_ir_ref->push_node(brgemm.second);
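For reference, the evaluate-once handling introduced by this series reduces to a single update step at runtime. The following is a simplified sketch of the corresponding logic in `RuntimeConfigurator::update_loop_info` (local names shortened; `expanded` stands for the ExpandedLoopInfo being refreshed), not a verbatim excerpt:

    // Outer loops still active => zero the finalization offsets; otherwise keep them.
    std::vector<int64_t> offsets = current_work_amount > 0
        ? std::vector<int64_t>(finalization_offsets.size(), 0)
        : finalization_offsets;
    if (expanded->is_evaluate_once()) {
        // The loop body runs exactly once, so the increment must cover the whole
        // work amount of this decomposed loop...
        expanded->set_increment(work_amount);
        // ...and the pointer shifts that the skipped per-iteration increments
        // would have applied are folded into the finalization offsets instead.
        for (size_t i = 0; i < offsets.size(); ++i)
            offsets[i] += ptr_increments[i] * work_amount;
    } else {
        expanded->update_ptr_increments(ptr_increments);
    }
    expanded->update_finalization_offsets(offsets);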