Integrate recompilation infrastructure into RuntimeConfigurator (openvinotoolkit#24955)

### Details:
- *Integrate dynamic executors recompilation infrastructure into RuntimeConfigurator*
- *Allow RuntimeConfigurator to recompile dynamic kernel executors at runtime*
- *Employ this approach to enable dynamic MatMul tests (fp32)*

### Tickets:
- *143257*
evkotov · Jun 21, 2024 · b660da8 · b660da8
1 parent 080f22e
commit b660da8
Show file tree

Hide file tree

Showing 31 changed files with 517 additions and 320 deletions.
diff --git a/src/common/snippets/include/snippets/kernel_executor_table.hpp b/src/common/snippets/include/snippets/kernel_executor_table.hpp
@@ -4,8 +4,10 @@
 
 #pragma once
 
-#include "snippets/lowered/expression.hpp"
-
+#include "snippets/lowered/linear_ir.hpp"
+#if defined(SNIPPETS_DEBUG_CAPS) && !defined(_WIN32)
+#include <cxxabi.h>
+#endif
 namespace ov {
 namespace snippets {
 
@@ -23,8 +25,38 @@ class KernelExecutorBase {
          * while dynamic kernels will be completed only in runtime, when all the shapes are known.
         */
         virtual bool is_completed() const = 0;
+
+        /*** Return deep copy of the config */
+        virtual std::shared_ptr<GenericConfig> clone() const = 0;
+
+        /*** Compute hash for fast comparison operations or caching support.
+         * Note: operator== and operator!= below compare configs solely by this value,
+         * so two configs that produce the same hash are treated as equal. */
+        virtual size_t hash() const = 0;
+
+        bool operator==(const GenericConfig& rhs) const { return hash() == rhs.hash(); }
+        bool operator!=(const GenericConfig& rhs) const { return hash() != rhs.hash(); }
+
         virtual ~GenericConfig() = default;
+        /** serialize config for debug purposes */
+#ifdef SNIPPETS_DEBUG_CAPS
+        virtual std::string to_string() const = 0;
+#endif
     };
+    /**
+    * @brief Update current kernel config in accordance with the passed expression. Corresponding kernel is recompiled if necessary.
+     * This method should be called to update KernelExecutor based on runtime info (e.g. shapes) available through expression ptr
+    */
+    virtual void update_by_expression(const ov::snippets::lowered::ExpressionPtr& expr) = 0;
+    /**
+    * @brief Replace current kernel config with the provided value. Corresponding kernel is recompiled if necessary.
+     * This method should be called to restore a saved state of the executor, that was configured using update_by_expression().
+    */
+    virtual void update_by_config(const std::shared_ptr<const GenericConfig>& new_config) = 0;
+
+    virtual std::shared_ptr<const GenericConfig> get_config() const = 0;
+    /** serialize for debug purposes */
+#ifdef SNIPPETS_DEBUG_CAPS
+    virtual std::string to_string() const = 0;
+#endif
     virtual ~KernelExecutorBase() = default;
 
 private:
@@ -38,17 +70,47 @@ template<typename Conf, typename KernelType,
 class KernelExecutor : public snippets::KernelExecutorBase {
 public:
     explicit KernelExecutor(std::shared_ptr<Conf> c) : KernelExecutorBase(), m_config{std::move(c)} {}
-    /**
-    * @brief check current config and recompile kernel if necessary. Use kernel caching to avoid redundant recompilations.
-     * This method must be called only for complete configs. It's the user responsibility to check is_completed() before calling.
-    */
-    virtual void update_kernel()  = 0;
+
+    /* Refreshes the stored config from the passed expression and then updates the kernel:
+     * the current config is cloned, update_config() is called on the clone, and update_kernel() is called with the result.
+     * Asserts that the resulting config is non-null and completed, and that a kernel is available afterwards.
+     * Note: override when final is redundant, but needed to avoid warnings on some compilers */
+    void update_by_expression(const ov::snippets::lowered::ExpressionPtr& expr) override final { // NOLINT
+        m_config = std::static_pointer_cast<Conf>(m_config->clone()); // update_config() below receives this clone, not the previously stored object
+        update_config(expr, m_config);
+        OPENVINO_ASSERT(m_config && m_config->is_completed(), "Failed to update kernel config in update_by_expression");
+        update_kernel(m_config, m_kernel);
+        OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+    }
+    /**
+     * Replaces the stored config with new_config and updates the kernel accordingly.
+     * Nothing is done if new_config compares equal to the current config (the comparison is hash-based).
+     * The passed pointer is stored as is, no copy of the config is made.
+     * Asserts that new_config is non-null and completed, and that a kernel is available afterwards.
+     */
+    void update_by_config(const std::shared_ptr<const GenericConfig>& new_config) override final { // NOLINT
+        OPENVINO_ASSERT(new_config, "Null config was passed to update_by_config");
+        // Skip the comparison (and always update) if there is no current config to compare with
+        if (m_config && *m_config == *new_config)
+            return;
+        m_config = std::static_pointer_cast<Conf>(std::const_pointer_cast<GenericConfig>(new_config));
+        OPENVINO_ASSERT(m_config && m_config->is_completed(), "Failed to update kernel config in update_by_config");
+        update_kernel(m_config, m_kernel);
+        OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
+    }
+    std::shared_ptr<const GenericConfig> get_config() const override { return m_config; }
+    std::shared_ptr<const KernelType> get_kernel() const { return m_kernel; }
+#ifdef SNIPPETS_DEBUG_CAPS
+    /*** Serializes the executor for debug purposes: kernel type name followed by the serialized config.
+     * On non-Windows builds the type name is demangled; the mangled name is used if demangling fails. */
+    std::string to_string() const override {
+        std::string type_name = typeid(KernelType).name();
+#ifndef _WIN32
+        int status = 0;
+        std::unique_ptr<char, void (*)(void*)> demangled_name(
+                abi::__cxa_demangle(type_name.c_str(), nullptr, nullptr, &status),
+                std::free);
+        // __cxa_demangle returns nullptr on failure, and constructing std::string from a null pointer
+        // is undefined behavior, so the mangled name is kept in that case
+        if (status == 0 && demangled_name)
+            type_name = demangled_name.get();
+#endif
+        return  "KernelExecutorType: " + type_name + " KernelConfig: " + m_config->to_string();
+    }
+#endif
+
 protected:
-    /**
-    * @brief Takes shared_ptr to compilation config, returns shared_ptr to compiled kernel.
-     * Should be called only if actual compilation is required. Kernel caching must be implemented in update_kernel().
-    */
-    virtual std::shared_ptr<KernelType> compile_kernel(const std::shared_ptr<Conf>& c) const = 0;
+    /*** Updates stored kernel config based on runtime info from expression (e.g. new input shapes). */
+    virtual void update_config(const ov::snippets::lowered::ExpressionPtr& expr, std::shared_ptr<Conf>& config) const = 0;
+    /*** Updates stored kernel in accordance with the passed config. Recompilation of the kernel is
+     * performed only if necessary, otherwise an appropriate kernel is retrieved from cache. */
+    virtual void update_kernel(const std::shared_ptr<const Conf>& c, std::shared_ptr<KernelType>& kernel) const = 0;
+
+private:
     /** Contains all the necessary information to compile a desired kernel*/
     std::shared_ptr<Conf> m_config = nullptr;
     /** Stores pointer to compiled kernel since the last update_kernel() call */
@@ -57,6 +119,7 @@ class KernelExecutor : public snippets::KernelExecutorBase {
 
 class KernelExecutorTable {
 public:
+    /*** Register KernelExecutor in the KernelExecutorTable so it can be later updated in runtime. */
     template<typename T, class ...C,
             typename std::enable_if<std::is_base_of<KernelExecutorBase, T>::value, bool>::type = true>
     std::shared_ptr<T> register_kernel(const snippets::lowered::ExpressionPtr& expr, C... args) {
@@ -69,10 +132,37 @@ class KernelExecutorTable {
         OPENVINO_ASSERT(m_table.count(expr), "This expression doesn't have a registered kernel executor");
         return m_table.at(expr);
     }
+    /*** Updates every registered KernelExecutor in accordance with the corresponding expression */
+    void update_state() const {
+        for (const auto& record : m_table)
+            record.second->update_by_expression(record.first);
+    }
+
+    /*** Returns lambda function that contains current state of the table, and restores this state when called.
+     * The lambda stores a pointer to this table, so it must not be invoked after the table is destroyed. */
+    std::function<void()> get_state_reset() {
+        auto current_state = get_state();
+        // `this` is captured explicitly: implicit capture of `this` through [=] is deprecated since C++20
+        return [this, current_state]() { reset_state(current_state); };
+    }
+
+    /**
+    * @brief Replace originally registered ExpressionPtr with a new value.
+     * Note that code emission is performed on a copy of LIR, so all expression pointers visible from emitters won't
+     * be accessible from RuntimeConfigurator. In order to replace these cloned ExpressionPtrs with the original ones,
+     * we need to call this method.
+    */
+    void replace_key_expression(const snippets::lowered::ExpressionPtr& from, const snippets::lowered::ExpressionPtr& to);
+
     virtual ~KernelExecutorTable() = default;
 
 protected:
     std::unordered_map<snippets::lowered::ExpressionPtr, std::shared_ptr<KernelExecutorBase>> m_table{};
+    typedef std::vector<std::pair<snippets::lowered::ExpressionPtr, std::shared_ptr<const KernelExecutorBase::GenericConfig>>> ExecTableState;
+
+    /*** Restore the table state previously obtained by get_state() */
+    void reset_state(const ExecTableState& state);
+
+    /*** Return cumulative state of all the executors in the table. The returned ExecTableState object can be passed to reset_state */
+    ExecTableState get_state() const;
 };
 
 using KernelExecutorTablePtr = std::shared_ptr<KernelExecutorTable>;

diff --git a/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp b/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp
@@ -29,9 +29,14 @@ class LinearIRBuilder {
     /**
      * @brief Make a full copy of LinearIR by rules described in `m_config`
      * @param linear_ir Linear IR
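+     * @param expression_map map handed to the cloning routine; presumably it holds the original -> cloned expression correspondence after the call (TODO confirm)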
      * @return clone of `linear_ir`
      */
-    std::shared_ptr<LinearIR> clone(const std::shared_ptr<LinearIR>& linear_ir) const;
+    std::shared_ptr<LinearIR> clone(const std::shared_ptr<LinearIR>& linear_ir,  ExpressionMap& expression_map) const;
+    inline std::shared_ptr<LinearIR> clone(const std::shared_ptr<LinearIR>& linear_ir) const {
+        ExpressionMap expression_map;
+        return clone(linear_ir, expression_map);
+    }
     /**
      * @brief Make a copy of LinearIR range by rules described in `m_config`
      * @param begin begin iterator of the target range of LinearIR

diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp
@@ -55,17 +55,16 @@ class Brgemm : virtual public modifier::MemoryAccess, public ov::op::Op {
 protected:
     ov::element::Type get_output_type() const;
     std::vector<ov::PartialShape> get_planar_input_shapes(const std::vector<ov::Input<ov::Node>>& inputs) const;
-    ov::PartialShape get_output_partial_shape(const std::vector<ov::PartialShape>& input_shapes) const;
+    ov::PartialShape infer_output_partial_shape(const std::vector<ov::PartialShape>& input_shapes) const;
     ov::PartialShape get_planar_output_shape(const ov::PartialShape& output_shape) const;
-    void compute_block_size_values(size_t blk_size_m, size_t blk_size_k, size_t blk_size_n);
+    void set_block_size_values(size_t blk_size_m, size_t blk_size_k, size_t blk_size_n);
     size_t m_M_blk = 0;
     size_t m_K_blk = 0;
     size_t m_N_blk = 0;
     float m_beta = 0.f;
 
 private:
     void custom_constructor_validate_and_infer_types(std::vector<size_t> layout_a, std::vector<size_t> layout_b, std::vector<size_t> layout_c);
-    void validate_inputs() const;
 };
 
 } // namespace op

diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "snippets/lowered/linear_ir.hpp"
+#include "snippets/kernel_executor_table.hpp"
 #include "snippets/lowered/pass/pass.hpp"
 
 namespace ov {
@@ -42,7 +43,8 @@ class RuntimeConfig {
     ov::snippets::VectorDims master_shape = {};
 
     size_t buffer_scratchpad_size = 0;
-    std::vector<size_t> buffer_cluster_offsets;
+    std::vector<size_t> buffer_cluster_offsets {};
+    KernelExecutorTablePtr kernel_executor_table = std::make_shared<ov::snippets::KernelExecutorTable>();
 };
 
 /**
@@ -60,6 +62,8 @@ class RuntimeConfigurator {
      * @return updated config
      */
     const std::shared_ptr<RuntimeConfig>& get_updated_config(const std::shared_ptr<lowered::LinearIR>& linear_ir);
+    /*** Returns pointer to KernelExecutorTable owned by the config */
+    const std::shared_ptr<KernelExecutorTable>& get_kernel_executor_table() const { return m_config->kernel_executor_table; }
 
 protected:
     /**

diff --git a/src/common/snippets/include/snippets/target_machine.hpp b/src/common/snippets/include/snippets/target_machine.hpp
@@ -10,7 +10,6 @@
 
 #include "emitter.hpp"
 #include "snippets/lowered/expression.hpp"
-#include "kernel_executor_table.hpp"
 
 namespace ov {
 namespace snippets {
@@ -94,7 +93,6 @@ class TargetMachine {
 
 protected:
     std::map<const ov::DiscreteTypeInfo, jitters_value> jitters;
-    std::shared_ptr<KernelExecutorTable> kernel_executor_table;
     std::shared_ptr<RuntimeConfigurator> configurator;
 };
 

diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp
@@ -243,6 +243,44 @@ std::shared_ptr<ov::Node> get_leaf_node_of_first_child_shape_infer_seq(const std
  */
 std::shared_ptr<ov::Node> get_leaf_node_of_first_parent_shape_infer_seq(const std::shared_ptr<ov::Node>& start_node);
 
+/**
+ * @brief Calculate leading dimension of the shape that should be read according to the layout.
+ *        The PortDescriptorPtr overload forwards the descriptor's shape and layout to the (shape, layout) overload.
+ * @param shape original (not reordered) input shape
+ * @param layout specifies the order in which the dimensions of the input shape should be read
+ * @return stride of the dimension idx = layout[layout.size() - 2] in the original shape
+   Example:
+         Original shape (shape) = [1, 49, 2, 23]
+         Layout (transpose order) = [2, 0, 1, 3]
+
+         dim_idx = layout.size() - 2 = 2
+         // Since layout specifies the order of dimensions in which the shape should be read
+         dim = layout[dim_idx] = 1
+         stride(shape[1]) = shape[2] * shape[3] = 2 * 23
+ */
+size_t get_in_leading_dim(const VectorDims& shape, const std::vector<size_t>& layout);
+inline size_t get_in_leading_dim(const lowered::PortDescriptorPtr& pd) {
+    return get_in_leading_dim(pd->get_shape(), pd->get_layout());
+}
+/**
+ * @brief Calculate leading dimension of the shape that should be written according to the layout.
+ *        The PortDescriptorPtr overload forwards the descriptor's shape and layout to the (shape, layout) overload.
+ * @param shape reordered input shape that is stored according to the layout
+ * @param layout specifies the order in which the dimensions of the input shape are stored
+ * @return stride of the dimension `dim` in `shape`, where `dim` is the position of the value
+ *         `layout.size() - 2` inside `layout` (see the example below)
+     Output shape is already transposed, we need to correctly write the data with original shape by the order
+     Example:
+          Original transposed shape (shape) = [49, 2, 7, 39]
+          Layout (transpose order) = [2, 0, 1, 3]
+
+          dim_idx = layout.size() - 2 = 2
+          // Since the shape dimensions are already reordered according to the layout
+          dim = /find dim_idx index in layout/ = 0
+          stride(shape[0]) = shape[1] * shape[2] * shape[3] = 2 * 7 * 39
+ */
+size_t get_out_leading_dim(const VectorDims& shape, const std::vector<size_t>& layout);
+inline size_t get_out_leading_dim(const lowered::PortDescriptorPtr& pd) {
+    return get_out_leading_dim(pd->get_shape(), pd->get_layout());
+}
+
 } // namespace utils
 } // namespace snippets
 } // namespace ov
diff --git a/src/common/snippets/src/kernel_executor_table.cpp b/src/common/snippets/src/kernel_executor_table.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/kernel_executor_table.hpp"
+
+namespace ov {
+namespace snippets {
+
+/**
+ * Re-registers the executor stored under the key `from` with the key `to`.
+ * Does nothing if `from` is not registered. Asserts that `to` is not registered yet.
+ */
+void KernelExecutorTable::replace_key_expression(const snippets::lowered::ExpressionPtr& from, const snippets::lowered::ExpressionPtr& to) {
+    const auto found = m_table.find(from);
+    if (found == m_table.end())
+        return;
+    OPENVINO_ASSERT(m_table.count(to) == 0, "Attempt to replace a value that is already in the KernelExecutorTable");
+    // Erase before inserting: an insertion may trigger rehashing, which invalidates `found`,
+    // so erasing through it afterwards would be undefined behavior
+    auto executor = std::move(found->second);
+    m_table.erase(found);
+    m_table.emplace(to, std::move(executor));
+}
+
+/**
+ * Restores the configs of the registered executors from `state` (as returned by get_state()).
+ * Asserts that `state` has as many records as the table and that every expression in `state` is registered.
+ */
+void KernelExecutorTable::reset_state(const ExecTableState& state) {
+    OPENVINO_ASSERT(state.size() == m_table.size(), "Invalid state in reset_state: size mismatch");
+    // Every record is looked up by its expression instead of walking m_table in parallel with `state`:
+    // the iteration order of unordered_map is not guaranteed to stay the same after rehashing
+    for (const auto& state_record : state) {
+        const auto table_record = m_table.find(state_record.first);
+        OPENVINO_ASSERT(table_record != m_table.end(), "Invalid state in reset_state: expressions mismatch");
+        table_record->second->update_by_config(state_record.second);
+    }
+}
+
+/** Collects an (expression, cloned config) record for every executor registered in the table. */
+KernelExecutorTable::ExecTableState KernelExecutorTable::get_state() const {
+    ExecTableState result;
+    // The final size is known in advance, so allocate once
+    result.reserve(m_table.size());
+    // Note: we need to clone configs when saving the state, since the configs still stored in the table can
+    // be modified e.g. by calling update_by_expression();
+    for (const auto& record : m_table)
+        result.emplace_back(record.first, record.second->get_config()->clone());
+    return result;
+}
+
+}// namespace snippets
+}// namespace ov
diff --git a/src/common/snippets/src/lowered/linear_ir_builder.cpp b/src/common/snippets/src/lowered/linear_ir_builder.cpp
@@ -65,11 +65,10 @@ std::vector<std::shared_ptr<ov::Node>> clone_nodes(const std::vector<std::shared
 }
 }  // namespace
 
-std::shared_ptr<LinearIR> LinearIRBuilder::clone(const std::shared_ptr<LinearIR>& linear_ir) const {
+std::shared_ptr<LinearIR> LinearIRBuilder::clone(const std::shared_ptr<LinearIR>& linear_ir, ExpressionMap& expression_map) const {
     auto cloned = std::make_shared<LinearIR>();
     cloned->m_config = linear_ir->m_config;
 
-    ExpressionMap expression_map;
     cloned->m_expressions = clone_range(linear_ir->m_expressions.cbegin(), linear_ir->m_expressions.cend(), expression_map);
     for (const auto& expr : cloned->m_expressions) {
         cloned->register_expression(expr, true);