From 0453a1c864024515ab4cb388ce1223bae050596a Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Thu, 4 Jul 2024 13:44:23 +0400
Subject: [PATCH] [Snippets][CPU] Supported Brgemm subtensor update in runtime

---
 .../snippets/kernel_executor_table.hpp        | 16 ++++--
 .../src/lowered/pass/propagate_subtensors.cpp | 37 ++++++++++--
 .../snippets/src/op/serialization_node.cpp    |  3 +-
 .../snippets/cpu_runtime_configurator.cpp     | 56 ++++++++++++++++++-
 .../snippets/cpu_runtime_configurator.hpp     | 12 +++-
 .../snippets/x64/jit_loop_emitters.cpp        |  2 +-
 .../snippets/matmul.cpp                       | 32 ++++++++++-
 7 files changed, 139 insertions(+), 19 deletions(-)

diff --git a/src/common/snippets/include/snippets/kernel_executor_table.hpp b/src/common/snippets/include/snippets/kernel_executor_table.hpp
index bfff0d9d4f778d..8f093b1bd4775c 100644
--- a/src/common/snippets/include/snippets/kernel_executor_table.hpp
+++ b/src/common/snippets/include/snippets/kernel_executor_table.hpp
@@ -75,17 +75,21 @@ class KernelExecutor : public snippets::KernelExecutorBase {
     void update_by_expression(const ov::snippets::lowered::ExpressionPtr& expr) override final { // NOLINT
         m_config = std::static_pointer_cast<Conf>(m_config->clone());
         update_config(expr, m_config);
-        OPENVINO_ASSERT(m_config && m_config->is_completed(), "Failed to update kernel config in update_by_expression");
-        update_kernel(m_config, m_kernel);
-        OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+        OPENVINO_ASSERT(m_config, "Failed to update kernel config in update_by_expression");
+        if (m_config->is_completed()) {
+            update_kernel(m_config, m_kernel);
+            OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+        }
     }
     void update_by_config(const std::shared_ptr<const GenericConfig>& new_config) override final { // NOLINT
         if (*m_config == *new_config)
             return;
         m_config = std::static_pointer_cast<Conf>(std::const_pointer_cast<GenericConfig>(new_config));
-        OPENVINO_ASSERT(m_config && m_config->is_completed(), "Failed to update kernel config in get_config");
-        update_kernel(m_config, m_kernel);
-        OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+        OPENVINO_ASSERT(m_config, "Failed to update kernel config in update_by_config");
+        if (m_config->is_completed()) {
+            update_kernel(m_config, m_kernel);
+            OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+        }
     }
     std::shared_ptr<const GenericConfig> get_config() const override { return m_config; }
     std::shared_ptr<const KernelType> get_kernel() const { return m_kernel; }
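Note on the kernel_executor_table.hpp change above: kernel compilation is now deferred until the config is complete, so the update paths no longer assert on configs that still contain dynamic values. A minimal standalone sketch of this guard pattern; Config, Kernel, and compile() are hypothetical stand-ins, not the OpenVINO types:

#include <cassert>
#include <cstdint>
#include <memory>

struct Config {
    size_t M = SIZE_MAX;  // SIZE_MAX models a still-dynamic dimension
    bool is_completed() const { return M != SIZE_MAX; }
};

struct Kernel {};

std::shared_ptr<Kernel> compile(const Config&) { return std::make_shared<Kernel>(); }

int main() {
    Config config;                      // M is still dynamic here
    std::shared_ptr<Kernel> kernel;

    // Before the patch: unconditional compilation would assert on an
    // incomplete config. After the patch: compilation is deferred.
    if (config.is_completed()) {
        kernel = compile(config);
        assert(kernel);
    }

    config.M = 32;                      // runtime provides the real dimension
    if (config.is_completed()) {
        kernel = compile(config);       // now it is safe to compile
        assert(kernel);
    }
}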
diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
index 5e407db8074db1..17c6961217c0b7 100644
--- a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
+++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
@@ -16,17 +16,42 @@ namespace lowered {
 namespace pass {
 namespace {
-// SIZE_MAX - dynamic value
-constexpr size_t DEFAULT_VALUE = SIZE_MAX - 1;
+// The algorithm uses the following special values in subtensors/shapes:
+// 1. Dynamic value in subtensor/shape : SIZE_MAX
+// 2. Full dimension in subtensor      : SIZE_MAX - 1
+// 3. Default value of `new_dim_value` : SIZE_MAX - 2
+// 4. `Forced` special dynamic value   : SIZE_MAX - 3
+//
+// We have to introduce `FORCED_DYNAMIC_VALUE` to distinguish `new_dim_value = DYNAMIC`
+// from the real dynamic values in subtensors and shapes and to force this value in subtensors.
+// For example, there is a Brgemm with the following info in the tail Loop:
+//   Input 0: shape [?, ?], existing subtensor [32, FULL_DIM]
+//   Input 1: shape [?, ?], existing subtensor [FULL_DIM, FULL_DIM]
+//   Output : shape [?, ?], existing subtensor [32, FULL_DIM]
+// If the user wants to force `?` in place of `32` in the subtensors, the steps will be:
+// 1. Set `?` to the subtensor and shape of Input 0:
+//    shape [?, ?] (the shape has not been changed!), new subtensor [?, FULL_DIM]
+// 2. Make shape inference of Brgemm and get the Output:
+//    shape [?, ?] (the shape has not been changed!), existing subtensor [FULL_DIM, FULL_DIM]
+// 3. Update the subtensor on the output using the shape:
+//    new_subtensor[i] = std::min(planar_shape[i], subtensor[i]);  // i = 0: std::min(SIZE_MAX (?), 32)
+//    new subtensor [32, FULL_DIM] - has not been changed! But it should be [?, FULL_DIM]
+// Conclusion: we have to distinguish the forced dynamic value from the existing dynamic values in shapes and subtensors
+
+constexpr size_t NEW_DEFAULT_VALUE = SIZE_MAX - 2;
+constexpr size_t FORCED_DYNAMIC_VALUE = SIZE_MAX - 3;
 
 void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
                                               const LoopInfoPtr& loop_info,
                                               LinearIR::container::const_iterator begin,
                                               LinearIR::container::const_iterator end,
                                               bool most_outer_loop,
-                                              const size_t new_dim_value = DEFAULT_VALUE) {
-    OPENVINO_ASSERT(snippets::utils::implication(most_outer_loop, new_dim_value != DEFAULT_VALUE),
+                                              size_t new_dim_value = NEW_DEFAULT_VALUE) {
+    // Marks the forced dynamic value
+    new_dim_value = utils::is_dynamic_value(new_dim_value) ? FORCED_DYNAMIC_VALUE : new_dim_value;
+    OPENVINO_ASSERT(snippets::utils::implication(most_outer_loop, new_dim_value != NEW_DEFAULT_VALUE),
                     "if the updated subtensor propagation was called for the outer loop, new_dim_value must not be equal to default value");
+
     std::map<PortDescriptorPtr, VectorDims> original_shapes;
     // First step: set new dim value to the corresponding input_ports' dimensions
     if (most_outer_loop) {
@@ -82,7 +107,9 @@ void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
             const size_t subtensor_start = planar_dims.size() - subtensor.size();
             VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end());
             for (size_t i = 0; i < new_subtensor.size(); ++i) {
-                new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]);
+                // If the user forced a dynamic value in the subtensor, set the real dynamic dimension using `get_dynamic_value()`
+                new_subtensor[i] = new_subtensor[i] == FORCED_DYNAMIC_VALUE ? utils::get_dynamic_value<size_t>()
+                                                                            : std::min(new_subtensor[i], subtensor[i]);
             }
             desc->set_subtensor(new_subtensor);
         }
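The comment block above is the heart of the patch. A self-contained sketch of the min-update rule and why the extra sentinel is needed; the plain constants and update_dim() below stand in for the snippets::utils helpers:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr size_t DYNAMIC = SIZE_MAX;                    // dynamic dimension in shapes/subtensors
constexpr size_t FULL_DIM = SIZE_MAX - 1;               // "full dimension" marker in subtensors
constexpr size_t FORCED_DYNAMIC_VALUE = SIZE_MAX - 3;   // the new sentinel from this patch

// The subtensor update rule from step 3 of the comment above, extended with
// the sentinel check added by this patch.
size_t update_dim(size_t planar_dim, size_t subtensor_dim) {
    return planar_dim == FORCED_DYNAMIC_VALUE ? DYNAMIC : std::min(planar_dim, subtensor_dim);
}

int main() {
    std::cout << std::boolalpha;
    // A plain dynamic shape value loses to the smaller static subtensor value,
    // so the subtensor would stay 32 - the wrong result the comment describes:
    std::cout << (update_dim(DYNAMIC, 32) == 32) << "\n";                    // true
    // The forced sentinel survives the min-clamping and is turned back into
    // a real dynamic value:
    std::cout << (update_dim(FORCED_DYNAMIC_VALUE, 32) == DYNAMIC) << "\n";  // true
    // A static shape value is still clamped against FULL_DIM as before:
    std::cout << (update_dim(47, FULL_DIM) == 47) << "\n";                   // true
}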
", " : ""; ss << v_str << del; } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index b92d70136ab4d5..627dec28e65d05 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -4,6 +4,7 @@ #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "snippets/utils.hpp" #include "snippets/lowered/loop_manager.hpp" @@ -18,8 +19,38 @@ void CPURuntimeConfigurator::update(const std::shared_ptris_dynamic()) { + const auto& loop_manager = linear_ir->get_loop_manager(); + update_loop_args(loop_manager); + update_brgemms(loop_manager); get_kernel_executor_table()->update_state(); - update_loop_args(linear_ir); + } +} + +void CPURuntimeConfigurator::initialization(const std::shared_ptr& linear_ir) { + RuntimeConfigurator::initialization(linear_ir); + + for (const auto& expr : *linear_ir) { + // At the moment only blocking by dynamic M is supported + if (ov::is_type(expr->get_node())) { + const auto& in0_desc = expr->get_input_port_descriptor(0); + const auto& in1_desc = expr->get_input_port_descriptor(1); + const auto& out_desc = expr->get_output_port_descriptor(0); + + const auto& in0_subtensor = in0_desc->get_subtensor(); + const auto& in1_subtensor = in1_desc->get_subtensor(); + const auto& out_subtensor = out_desc->get_subtensor(); + + OPENVINO_ASSERT(!snippets::utils::is_dynamic_value(*in0_subtensor.crbegin()) && + !snippets::utils::is_dynamic_value(*in1_subtensor.crbegin()) && + !snippets::utils::is_dynamic_value(*(++in1_subtensor.crbegin())) && + !snippets::utils::is_dynamic_value(*out_subtensor.crbegin()), + "CPURuntimeConfigurator supports only dynamic M in Brgemm subtensors"); + OPENVINO_ASSERT(*(++in0_subtensor.crbegin()) == *(++out_subtensor.crbegin()), + "Incorrect values in subtensors of BrgemmCPU"); + + if (snippets::utils::is_dynamic_value(*(++in0_subtensor.crbegin()))) + m_dynamic_brgemms.push_back(expr); + } } } @@ -27,11 +58,11 @@ void CPURuntimeConfigurator::init_tensor_rank(const std::shared_ptrtensor_rank = std::max(linear_ir->get_master_shape().size(), rank6D); } -void CPURuntimeConfigurator::update_loop_args(const std::shared_ptr& linear_ir) const { +void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const { const auto& cpu_config = ov::as_type_ptr(m_config); OPENVINO_ASSERT(cpu_config, "CPURuntimeConfigurator expects CPURuntimeConfig"); - const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + const auto& loop_map = loop_manager->get_map(); cpu_config->loop_args.resize(loop_map.size()); for (const auto& loop : loop_map) { const auto& idx = loop.first; @@ -50,5 +81,24 @@ void CPURuntimeConfigurator::update_loop_args(const std::shared_ptrget_loop_ids(); + OPENVINO_ASSERT(!loop_ids.empty(), "Dynamic Brgemm must be in loops"); + const auto& expanded_loop_info = loop_manager->get_loop_info(loop_ids.front()); + const auto& block_size_m = expanded_loop_info->get_work_amount(); + + const auto& in_desc = brgemm_expr->get_input_port_descriptor(0); + const auto& out_desc = brgemm_expr->get_output_port_descriptor(0); + + auto in_subtensor = in_desc->get_subtensor(); + auto out_subtensor = out_desc->get_subtensor(); + *++in_subtensor.rbegin() = block_size_m; + *++out_subtensor.rbegin() = block_size_m; + in_desc->set_subtensor(in_subtensor); 
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp
index 6b3a54652097ae..aaa30cc4961266 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp
@@ -30,6 +30,11 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
      * @param linear_ir LinearIR
      */
     void update(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) override;
+    /**
+     * @brief Allocates and initializes fields in RuntimeConfig and RuntimeConfigurator
+     * @param linear_ir LinearIR
+     */
+    void initialization(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) override;
     /**
      * @brief Initializes tensor rank of config
      * @param linear_ir LinearIR
@@ -39,9 +44,14 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
      * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig
-     * @param linear_ir LinearIR
+     * @param loop_manager Loop manager of LinearIR
      */
-    void update_loop_args(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) const;
+    void update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const;
+    /**
+     * @brief Updates subtensors of Brgemm expressions with dynamic M dimension using the current Loop work amounts
+     * @param loop_manager Loop manager of LinearIR
+     */
+    void update_brgemms(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const;
 
     const size_t rank6D = 6;
+    std::vector<ov::snippets::lowered::ExpressionPtr> m_dynamic_brgemms = {};
 };
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp
index 8bce82a3e7091f..2c41fdff64f586 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp
@@ -53,7 +53,7 @@ void jit_loop_begin_emitter::emit_code(const std::vector<size_t> &in, const std::vector<size_t> &out)
 
 void jit_loop_begin_emitter::emit_impl(const std::vector<size_t>& in, const std::vector<size_t>& out) const {
-    // If the loop evaulate once, we can skip loop begin code emission
-    if (evaluate_once)
+    // If the loop evaluates once and the work amount is static, we can skip loop begin code emission
+    if (evaluate_once && !is_work_amount_dynamic)
         return;
 
     Reg64 reg_work_amount = Reg64(static_cast<int>(out.back()));
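The one-line jit_loop_emitters.cpp change above matters for dynamic loops: even a loop expected to run exactly once must emit its prologue when the work amount is only known at runtime. A simplified sketch of the guard; this struct is a stand-in, not the real emitter interface:

#include <iostream>

// Simplified stand-in for jit_loop_begin_emitter.
struct LoopBeginEmitter {
    bool evaluate_once;
    bool is_work_amount_dynamic;

    void emit_impl() const {
        // Mirrors the patched condition: skipping is only safe when the trip
        // count is statically known to be one.
        if (evaluate_once && !is_work_amount_dynamic)
            return;
        std::cout << "emit loop prologue (init work-amount register, label)\n";
    }
};

int main() {
    LoopBeginEmitter static_once{true, false};
    LoopBeginEmitter dynamic_once{true, true};
    static_once.emit_impl();   // emits nothing
    dynamic_once.emit_impl();  // must still emit the prologue
}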
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
index 96733959205ca7..7385e996ab818d 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -67,11 +67,39 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul,
 
 std::vector<std::vector<InputShape>> input_shapes_dynamic{
+    // All dimensions are dynamic
     {
         {PartialShape{-1, -1, -1, -1}, {{2, 1, 32, 64}, {2, 2, 10, 20}, {2, 2, 100, 80},
-                                        {2, 2, 10, 20}, {2, 1, 32, 64}}},
+                                        {2, 2, 10, 20}, {2, 1, 32, 64}, {2, 3, 64, 55}}},
         {PartialShape{-1, -1, -1, -1}, {{1, 3, 64, 128}, {2, 2, 20, 30}, {2, 2, 80, 120},
-                                        {2, 2, 20, 30}, {1, 3, 64, 128}}}
+                                        {2, 2, 20, 30}, {1, 3, 64, 128}, {2, 3, 55, 128}}}
     },
+    // Only M dimension is dynamic + one Loop by M
+    {
+        {PartialShape{-1, 2, -1, 64}, {{2, 2, 64, 64}, {2, 2, 64, 64}, {2, 2, 35, 64},
+                                       {2, 2, 120, 64}, {2, 2, 15, 64}, {2, 2, 35, 64}}},
+        {PartialShape{-1, 2, 64, 32}, {{2, 2, 64, 32}, {2, 2, 64, 32}, {1, 2, 64, 32},
+                                       {1, 2, 64, 32}, {2, 2, 64, 32}, {1, 2, 64, 32}}}
+    },
+    // Only M dimension is dynamic + all Loops (by M, N, K)
+    {
+        {PartialShape{2, 2, -1, 550}, {{2, 2, 64, 550}, {2, 2, 16, 550}, {2, 2, 35, 550},
+                                       {2, 2, 16, 550}, {2, 2, 50, 550}, {2, 2, 64, 550}}},
+        {PartialShape{2, 1, 550, 70}, {{2, 1, 550, 70}, {2, 1, 550, 70}, {2, 1, 550, 70},
+                                       {2, 1, 550, 70}, {2, 1, 550, 70}, {2, 1, 550, 70}}}
+    },
+    // Only K dimension is dynamic
+    {
+        {PartialShape{2, 2, 35, -1}, {{2, 2, 35, 128}, {2, 2, 35, 10}, {2, 2, 35, 33},
+                                      {2, 2, 35, 35}, {2, 2, 35, 100}}},
+        {PartialShape{2, 2, -1, 70}, {{2, 2, 128, 70}, {2, 2, 10, 70}, {2, 2, 33, 70},
+                                      {2, 2, 35, 70}, {2, 2, 100, 70}}}
+    },
+    // Only N dimension is dynamic
+    {
+        STATIC_SHAPE(2, 2, 35, 550),
+        {PartialShape{2, 2, 550, -1}, {{2, 2, 550, 70}, {2, 2, 550, 12}, {2, 2, 550, 70},
+                                       {2, 2, 550, 12}, {2, 2, 550, 10}}}
+    },
 };
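Each entry in input_shapes_dynamic above pairs a dynamic PartialShape with a list of static shapes that are fed, one after another, to the same compiled model; that re-inference loop is what exercises the runtime Brgemm subtensor update. A simplified illustration; the struct and shape strings are stand-ins for the ov::test machinery:

#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for ov::test::InputShape: a dynamic signature plus the
// static shapes fed at inference time.
struct InputShape {
    std::string partial_shape;
    std::vector<std::string> static_shapes;
};

int main() {
    InputShape in0{"{-1, 2, -1, 64}", {"{2, 2, 64, 64}", "{2, 2, 35, 64}", "{2, 2, 120, 64}"}};
    // One compiled model, several inference shapes: each reshape with a new M
    // retriggers the runtime subtensor update added by this patch.
    for (const auto& s : in0.static_shapes)
        std::cout << "infer " << in0.partial_shape << " with " << s << "\n";
}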