From 0453a1c864024515ab4cb388ce1223bae050596a Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Thu, 4 Jul 2024 13:44:23 +0400
Subject: [PATCH] [Snippets][CPU] Supported Brgemm subtensor update in runtime

---
 .../snippets/kernel_executor_table.hpp        | 16 ++++--
 .../src/lowered/pass/propagate_subtensors.cpp | 37 ++++++++++--
 .../snippets/src/op/serialization_node.cpp    |  3 +-
 .../snippets/cpu_runtime_configurator.cpp     | 56 ++++++++++++++++++-
 .../snippets/cpu_runtime_configurator.hpp     | 12 +++-
 .../snippets/x64/jit_loop_emitters.cpp        |  2 +-
 .../snippets/matmul.cpp                       | 32 ++++++++++-
 7 files changed, 139 insertions(+), 19 deletions(-)

diff --git a/src/common/snippets/include/snippets/kernel_executor_table.hpp b/src/common/snippets/include/snippets/kernel_executor_table.hpp
index bfff0d9d4f778d..8f093b1bd4775c 100644
--- a/src/common/snippets/include/snippets/kernel_executor_table.hpp
+++ b/src/common/snippets/include/snippets/kernel_executor_table.hpp
@@ -75,17 +75,21 @@ class KernelExecutor : public snippets::KernelExecutorBase {
     void update_by_expression(const ov::snippets::lowered::ExpressionPtr& expr) override final { // NOLINT
         m_config = std::static_pointer_cast<Conf>(m_config->clone());
         update_config(expr, m_config);
-        OPENVINO_ASSERT(m_config && m_config->is_completed(), "Failed to update kernel config in update_by_expression");
-        update_kernel(m_config, m_kernel);
-        OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+        OPENVINO_ASSERT(m_config, "Failed to update kernel config in update_by_expression");
+        if (m_config->is_completed()) {
+            update_kernel(m_config, m_kernel);
+            OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+        }
     }
     void update_by_config(const std::shared_ptr<const GenericConfig>& new_config) override final { // NOLINT
         if (*m_config == *new_config)
             return;
         m_config = std::static_pointer_cast<Conf>(std::const_pointer_cast<GenericConfig>(new_config));
-        OPENVINO_ASSERT(m_config && m_config->is_completed(), "Failed to update kernel config in get_config");
-        update_kernel(m_config, m_kernel);
-        OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+        OPENVINO_ASSERT(m_config, "Failed to update kernel config in update_by_config");
+        if (m_config->is_completed()) {
+            update_kernel(m_config, m_kernel);
+            OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+        }
     }
     std::shared_ptr<const GenericConfig> get_config() const override { return m_config; }
     std::shared_ptr<const KernelType> get_kernel() const { return m_kernel; }
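Note on the kernel_executor_table.hpp change above: kernel compilation is now deferred until the config is complete, so the update paths no longer assert on configs that still contain dynamic values. A minimal standalone sketch of this guard pattern; Config, Kernel, and compile() are hypothetical stand-ins, not the OpenVINO types:

#include <cassert>
#include <cstdint>
#include <memory>

struct Config {
    size_t M = SIZE_MAX;  // SIZE_MAX models a still-dynamic dimension
    bool is_completed() const { return M != SIZE_MAX; }
};

struct Kernel {};

std::shared_ptr<Kernel> compile(const Config&) { return std::make_shared<Kernel>(); }

int main() {
    Config config;                      // M is still dynamic here
    std::shared_ptr<Kernel> kernel;

    // Before the patch: unconditional compilation would assert on an
    // incomplete config. After the patch: compilation is deferred.
    if (config.is_completed()) {
        kernel = compile(config);
        assert(kernel);
    }

    config.M = 32;                      // runtime provides the real dimension
    if (config.is_completed()) {
        kernel = compile(config);       // now it is safe to compile
        assert(kernel);
    }
}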
diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
index 5e407db8074db1..17c6961217c0b7 100644
--- a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
+++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
@@ -16,17 +16,42 @@ namespace lowered {
 namespace pass {
 namespace {
-// SIZE_MAX - dynamic value
-constexpr size_t DEFAULT_VALUE = SIZE_MAX - 1;
+// The algorithm uses the following special values in subtensors/shapes:
+// 1. Dynamic value in subtensor/shape : SIZE_MAX
+// 2. Full dimension in subtensor      : SIZE_MAX - 1
+// 3. Default value of `new_dim_value` : SIZE_MAX - 2
+// 4. `Forced` special dynamic value   : SIZE_MAX - 3
+//
+// We have to introduce `FORCED_DYNAMIC_VALUE` to distinguish `new_dim_value = DYNAMIC`
+// from the real dynamic values in subtensors and shapes and to force this value in subtensors.
+// For example, there is a Brgemm with the following info in the tail Loop:
+//   Input 0: shape [?, ?], existing subtensor [32, FULL_DIM]
+//   Input 1: shape [?, ?], existing subtensor [FULL_DIM, FULL_DIM]
+//   Output : shape [?, ?], existing subtensor [32, FULL_DIM]
+// If the user wants to force `?` in place of `32` in the subtensors, the steps will be:
+// 1. Set `?` to the subtensor and shape of Input 0:
+//    shape [?, ?] (the shape has not been changed!), new subtensor [?, FULL_DIM]
+// 2. Make shape inference of Brgemm and get the Output:
+//    shape [?, ?] (the shape has not been changed!), existing subtensor [FULL_DIM, FULL_DIM]
+// 3. Update the subtensor on the output using the shape:
+//    new_subtensor[i] = std::min(planar_shape[i], subtensor[i]);  // i = 0: std::min(SIZE_MAX (?), 32)
+//    new subtensor [32, FULL_DIM] - has not been changed! But it should be [?, FULL_DIM]
+// Conclusion: we have to distinguish the forced dynamic value from the existing dynamic values in shapes and subtensors
+
+constexpr size_t NEW_DEFAULT_VALUE = SIZE_MAX - 2;
+constexpr size_t FORCED_DYNAMIC_VALUE = SIZE_MAX - 3;
 
 void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
                                               const LoopInfoPtr& loop_info,
                                               LinearIR::container::const_iterator begin,
                                               LinearIR::container::const_iterator end,
                                               bool most_outer_loop,
-                                              const size_t new_dim_value = DEFAULT_VALUE) {
-    OPENVINO_ASSERT(snippets::utils::implication(most_outer_loop, new_dim_value != DEFAULT_VALUE),
+                                              size_t new_dim_value = NEW_DEFAULT_VALUE) {
+    // Marks the forced dynamic value
+    new_dim_value = utils::is_dynamic_value(new_dim_value) ? FORCED_DYNAMIC_VALUE : new_dim_value;
+    OPENVINO_ASSERT(snippets::utils::implication(most_outer_loop, new_dim_value != NEW_DEFAULT_VALUE),
                     "if the updated subtensor propagation was called for the outer loop, new_dim_value must not be equal to default value");
+
     std::map<PortDescriptorPtr, VectorDims> original_shapes;
     // First step: set new dim value to the corresponding input_ports' dimensions
     if (most_outer_loop) {
@@ -82,7 +107,9 @@ void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
             const size_t subtensor_start = planar_dims.size() - subtensor.size();
             VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end());
             for (size_t i = 0; i < new_subtensor.size(); ++i) {
-                new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]);
+                // If the user forced a dynamic value in the subtensor, set the real dynamic dimension using `get_dynamic_value()`
+                new_subtensor[i] = new_subtensor[i] == FORCED_DYNAMIC_VALUE ? utils::get_dynamic_value<size_t>()
+                                                                            : std::min(new_subtensor[i], subtensor[i]);
             }
             desc->set_subtensor(new_subtensor);
         }
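The comment block above is the heart of the patch. A self-contained sketch of the min-update rule and why the extra sentinel is needed; the plain constants and update_dim() below stand in for the snippets::utils helpers:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr size_t DYNAMIC = SIZE_MAX;                    // dynamic dimension in shapes/subtensors
constexpr size_t FULL_DIM = SIZE_MAX - 1;               // "full dimension" marker in subtensors
constexpr size_t FORCED_DYNAMIC_VALUE = SIZE_MAX - 3;   // the new sentinel from this patch

// The subtensor update rule from step 3 of the comment above, extended with
// the sentinel check added by this patch.
size_t update_dim(size_t planar_dim, size_t subtensor_dim) {
    return planar_dim == FORCED_DYNAMIC_VALUE ? DYNAMIC : std::min(planar_dim, subtensor_dim);
}

int main() {
    std::cout << std::boolalpha;
    // A plain dynamic shape value loses to the smaller static subtensor value,
    // so the subtensor would stay 32 - the wrong result the comment describes:
    std::cout << (update_dim(DYNAMIC, 32) == 32) << "\n";                    // true
    // The forced sentinel survives the min-clamping and is turned back into
    // a real dynamic value:
    std::cout << (update_dim(FORCED_DYNAMIC_VALUE, 32) == DYNAMIC) << "\n";  // true
    // A static shape value is still clamped against FULL_DIM as before:
    std::cout << (update_dim(47, FULL_DIM) == 47) << "\n";                   // true
}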
", " : ""; ss << v_str << del; } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index b92d70136ab4d5..627dec28e65d05 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -4,6 +4,7 @@ #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "snippets/utils.hpp" #include "snippets/lowered/loop_manager.hpp" @@ -18,8 +19,38 @@ void CPURuntimeConfigurator::update(const std::shared_ptris_dynamic()) { + const auto& loop_manager = linear_ir->get_loop_manager(); + update_loop_args(loop_manager); + update_brgemms(loop_manager); get_kernel_executor_table()->update_state(); - update_loop_args(linear_ir); + } +} + +void CPURuntimeConfigurator::initialization(const std::shared_ptr& linear_ir) { + RuntimeConfigurator::initialization(linear_ir); + + for (const auto& expr : *linear_ir) { + // At the moment only blocking by dynamic M is supported + if (ov::is_type(expr->get_node())) { + const auto& in0_desc = expr->get_input_port_descriptor(0); + const auto& in1_desc = expr->get_input_port_descriptor(1); + const auto& out_desc = expr->get_output_port_descriptor(0); + + const auto& in0_subtensor = in0_desc->get_subtensor(); + const auto& in1_subtensor = in1_desc->get_subtensor(); + const auto& out_subtensor = out_desc->get_subtensor(); + + OPENVINO_ASSERT(!snippets::utils::is_dynamic_value(*in0_subtensor.crbegin()) && + !snippets::utils::is_dynamic_value(*in1_subtensor.crbegin()) && + !snippets::utils::is_dynamic_value(*(++in1_subtensor.crbegin())) && + !snippets::utils::is_dynamic_value(*out_subtensor.crbegin()), + "CPURuntimeConfigurator supports only dynamic M in Brgemm subtensors"); + OPENVINO_ASSERT(*(++in0_subtensor.crbegin()) == *(++out_subtensor.crbegin()), + "Incorrect values in subtensors of BrgemmCPU"); + + if (snippets::utils::is_dynamic_value(*(++in0_subtensor.crbegin()))) + m_dynamic_brgemms.push_back(expr); + } } } @@ -27,11 +58,11 @@ void CPURuntimeConfigurator::init_tensor_rank(const std::shared_ptrtensor_rank = std::max(linear_ir->get_master_shape().size(), rank6D); } -void CPURuntimeConfigurator::update_loop_args(const std::shared_ptr& linear_ir) const { +void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const { const auto& cpu_config = ov::as_type_ptr(m_config); OPENVINO_ASSERT(cpu_config, "CPURuntimeConfigurator expects CPURuntimeConfig"); - const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + const auto& loop_map = loop_manager->get_map(); cpu_config->loop_args.resize(loop_map.size()); for (const auto& loop : loop_map) { const auto& idx = loop.first; @@ -50,5 +81,24 @@ void CPURuntimeConfigurator::update_loop_args(const std::shared_ptrget_loop_ids(); + OPENVINO_ASSERT(!loop_ids.empty(), "Dynamic Brgemm must be in loops"); + const auto& expanded_loop_info = loop_manager->get_loop_info(loop_ids.front()); + const auto& block_size_m = expanded_loop_info->get_work_amount(); + + const auto& in_desc = brgemm_expr->get_input_port_descriptor(0); + const auto& out_desc = brgemm_expr->get_output_port_descriptor(0); + + auto in_subtensor = in_desc->get_subtensor(); + auto out_subtensor = out_desc->get_subtensor(); + *++in_subtensor.rbegin() = block_size_m; + *++out_subtensor.rbegin() = block_size_m; + in_desc->set_subtensor(in_subtensor); 
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp
index 6b3a54652097ae..aaa30cc4961266 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp
@@ -30,6 +30,11 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
      * @param linear_ir LinearIR
      */
     void update(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) override;
+    /**
+     * @brief Allocates and initializes fields in RuntimeConfig and RuntimeConfigurator
+     * @param linear_ir LinearIR
+     */
+    void initialization(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) override;
     /**
      * @brief Initializes tensor rank of config
      * @param linear_ir LinearIR
@@ -39,9 +44,14 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
      * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig
-     * @param linear_ir LinearIR
+     * @param loop_manager Loop manager of LinearIR
      */
-    void update_loop_args(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) const;
+    void update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const;
+    /**
+     * @brief Updates subtensors of Brgemm expressions with dynamic M dimension using the current Loop work amounts
+     * @param loop_manager Loop manager of LinearIR
+     */
+    void update_brgemms(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const;
 
     const size_t rank6D = 6;
+    std::vector<ov::snippets::lowered::ExpressionPtr> m_dynamic_brgemms = {};
 };
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp
index 8bce82a3e7091f..2c41fdff64f586 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp
@@ -53,7 +53,7 @@ void jit_loop_begin_emitter::emit_code(const std::vector<size_t> &in, const std::vector<size_t> &out)
 
 void jit_loop_begin_emitter::emit_impl(const std::vector<size_t>& in, const std::vector<size_t>& out) const {
-    // If the loop evaulate once, we can skip loop begin code emission
-    if (evaluate_once)
+    // If the loop evaluates once and the work amount is static, we can skip loop begin code emission
+    if (evaluate_once && !is_work_amount_dynamic)
         return;
 
     Reg64 reg_work_amount = Reg64(static_cast<int>(out.back()));
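The one-line jit_loop_emitters.cpp change above matters for dynamic loops: even a loop expected to run exactly once must emit its prologue when the work amount is only known at runtime. A simplified sketch of the guard; this struct is a stand-in, not the real emitter interface:

#include <iostream>

// Simplified stand-in for jit_loop_begin_emitter.
struct LoopBeginEmitter {
    bool evaluate_once;
    bool is_work_amount_dynamic;

    void emit_impl() const {
        // Mirrors the patched condition: skipping is only safe when the trip
        // count is statically known to be one.
        if (evaluate_once && !is_work_amount_dynamic)
            return;
        std::cout << "emit loop prologue (init work-amount register, label)\n";
    }
};

int main() {
    LoopBeginEmitter static_once{true, false};
    LoopBeginEmitter dynamic_once{true, true};
    static_once.emit_impl();   // emits nothing
    dynamic_once.emit_impl();  // must still emit the prologue
}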
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
index 96733959205ca7..7385e996ab818d 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -67,11 +67,39 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul,
 
 std::vector<std::vector<InputShape>> input_shapes_dynamic{
+    // All dimensions are dynamic
     {
         {PartialShape{-1, -1, -1, -1}, {{2, 1, 32, 64}, {2, 2, 10, 20}, {2, 2, 100, 80},
-                                        {2, 2, 10, 20}, {2, 1, 32, 64}}},
+                                        {2, 2, 10, 20}, {2, 1, 32, 64}, {2, 3, 64, 55}}},
         {PartialShape{-1, -1, -1, -1}, {{1, 3, 64, 128}, {2, 2, 20, 30}, {2, 2, 80, 120},
-                                        {2, 2, 20, 30}, {1, 3, 64, 128}}}
+                                        {2, 2, 20, 30}, {1, 3, 64, 128}, {2, 3, 55, 128}}}
     },
+    // Only M dimension is dynamic + one Loop by M
+    {
+        {PartialShape{-1, 2, -1, 64}, {{2, 2, 64, 64}, {2, 2, 64, 64}, {2, 2, 35, 64},
+                                       {2, 2, 120, 64}, {2, 2, 15, 64}, {2, 2, 35, 64}}},
+        {PartialShape{-1, 2, 64, 32}, {{2, 2, 64, 32}, {2, 2, 64, 32}, {1, 2, 64, 32},
+                                       {1, 2, 64, 32}, {2, 2, 64, 32}, {1, 2, 64, 32}}}
+    },
+    // Only M dimension is dynamic + all Loops (by M, N, K)
+    {
+        {PartialShape{2, 2, -1, 550}, {{2, 2, 64, 550}, {2, 2, 16, 550}, {2, 2, 35, 550},
+                                       {2, 2, 16, 550}, {2, 2, 50, 550}, {2, 2, 64, 550}}},
+        {PartialShape{2, 1, 550, 70}, {{2, 1, 550, 70}, {2, 1, 550, 70}, {2, 1, 550, 70},
+                                       {2, 1, 550, 70}, {2, 1, 550, 70}, {2, 1, 550, 70}}}
+    },
+    // Only K dimension is dynamic
+    {
+        {PartialShape{2, 2, 35, -1}, {{2, 2, 35, 128}, {2, 2, 35, 10}, {2, 2, 35, 33},
+                                      {2, 2, 35, 35}, {2, 2, 35, 100}}},
+        {PartialShape{2, 2, -1, 70}, {{2, 2, 128, 70}, {2, 2, 10, 70}, {2, 2, 33, 70},
+                                      {2, 2, 35, 70}, {2, 2, 100, 70}}}
+    },
+    // Only N dimension is dynamic
+    {
+        STATIC_SHAPE(2, 2, 35, 550),
+        {PartialShape{2, 2, 550, -1}, {{2, 2, 550, 70}, {2, 2, 550, 12}, {2, 2, 550, 70},
+                                       {2, 2, 550, 12}, {2, 2, 550, 10}}}
+    },
 };
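Each entry in input_shapes_dynamic above pairs a dynamic PartialShape with a list of static shapes that are fed, one after another, to the same compiled model; that re-inference loop is what exercises the runtime Brgemm subtensor update. A simplified illustration; the struct and shape strings are stand-ins for the ov::test machinery:

#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for ov::test::InputShape: a dynamic signature plus the
// static shapes fed at inference time.
struct InputShape {
    std::string partial_shape;
    std::vector<std::string> static_shapes;
};

int main() {
    InputShape in0{"{-1, 2, -1, 64}", {"{2, 2, 64, 64}", "{2, 2, 35, 64}", "{2, 2, 120, 64}"}};
    // One compiled model, several inference shapes: each reshape with a new M
    // retriggers the runtime subtensor update added by this patch.
    for (const auto& s : in0.static_shapes)
        std::cout << "infer " << in0.partial_shape << " with " << s << "\n";
}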