[Snippets] Added single evaluation of Brgemm in Tail Loop by dynamic M #25378

Merged
12 changes: 6 additions & 6 deletions src/common/snippets/include/snippets/kernel_executor_table.hpp
@@ -43,7 +43,7 @@ class KernelExecutorBase {
* @brief Update current kernel config in accordance with the passed expression. Corresponding kernel is recompiled if necessary.
* This method should be called to update KernelExecutor based on runtime info (e.g. shapes) available through expression ptr
*/
virtual void update_by_expression(const lowered::ExpressionPtr& expr) = 0;
virtual void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir) = 0;
/**
* @brief Replace current kernel config with the provided value. Corresponding kernel is recompiled if necessary.
* This method should be called to restore a saved state of the executor, that was configured using update_by_expression().
@@ -70,8 +70,8 @@ class KernelExecutor : public KernelExecutorBase {
explicit KernelExecutor(Conf c) : KernelExecutorBase(), m_config{std::move(c)} {}

// Note: override when final is redundant, but needed to avoid warnings on some compilers
void update_by_expression(const lowered::ExpressionPtr& expr) override final { // NOLINT
update_config(expr, m_config);
void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir) override final { // NOLINT
update_config(expr, linear_ir, m_config);
OPENVINO_ASSERT(m_config.is_completed(), "Failed to update kernel config in update_by_expression");
update_kernel(m_config, m_kernel);
OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
@@ -103,7 +103,7 @@ class KernelExecutor : public KernelExecutorBase {

protected:
/*** Updates stored kernel config based on runtime info from expression (e.g. new input shapes). */
virtual void update_config(const lowered::ExpressionPtr& expr, Conf& config) const = 0;
virtual void update_config(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir, Conf& config) const = 0;
/*** Updates stored kernel in accordance with the passed config. Recompilation of the kernel is
* performed if necessary. */
virtual void update_kernel(const Conf& c, std::shared_ptr<KernelType>& kernel) const = 0;
@@ -130,9 +130,9 @@ class KernelExecutorTable {
return m_table.at(expr);
}
/*** Updates every registered KernelExecutor in accordance with the corresponding expression */
void update_state() const {
void update_state(const lowered::LinearIRPtr& linear_ir) const {
for (const auto& record : m_table)
record.second->update_by_expression(record.first);
record.second->update_by_expression(record.first, linear_ir);
}

/*** Returns lambda function that contains current state of the table, and restores this state when called */
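The signature change above threads the owning LinearIR into every config update. A minimal sketch of a derived executor that uses the new argument is shown below; `MyConfig`, `MyKernel`, `get_runtime_m()` and `compile_my_kernel()` are hypothetical names introduced only for illustration and are not part of this PR.

```cpp
// Sketch only: a derived executor built on the updated KernelExecutor interface.
class MyKernelExecutor : public KernelExecutor<MyConfig, MyKernel> {
public:
    explicit MyKernelExecutor(MyConfig c) : KernelExecutor(std::move(c)) {}

protected:
    void update_config(const lowered::ExpressionPtr& expr,
                       const lowered::LinearIRPtr& linear_ir,
                       MyConfig& config) const override {
        // Per-expression runtime info (e.g. shapes) is still taken from expr ...
        config.set_m(get_runtime_m(expr));
        // ... while linear_ir now exposes graph-level state (e.g. loop info),
        // which a Brgemm executor needs to detect a single-iteration tail loop.
    }

    void update_kernel(const MyConfig& c, std::shared_ptr<MyKernel>& kernel) const override {
        kernel = compile_my_kernel(c);  // hypothetical compilation helper
    }
};
```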
16 changes: 15 additions & 1 deletion src/common/snippets/include/snippets/lowered/loop_info.hpp
@@ -430,7 +430,8 @@ class ExpandedLoopInfo : public LoopInfo {
ExpandedLoopInfo(size_t work_amount, size_t increment,
const std::vector<LoopPort>& entries, const std::vector<LoopPort>& exits,
std::vector<int64_t> ptr_increments, std::vector<int64_t> final_offsets, std::vector<int64_t> data_sizes,
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const = false);
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const = false,
bool evaluate_once = false);
/**
* @brief Clone LoopInfo with new expressions
* @param expr_map map of new and old expressions
@@ -474,7 +475,18 @@
* @return const ref of `m_data_sizes`
*/
const std::vector<int64_t>& get_data_sizes() const;
/**
* @brief Returns true if the current Loop should be evaluated only once; otherwise returns false
* @return `m_evaluate_once`
*/
bool is_evaluate_once() const;

/**
* @brief Set value of `m_evaluate_once`
* @param value - new value of `m_evaluate_once`
*/
void set_evaluate_once(bool value);
/**
* @brief Update `m_ptr_increments` using copy values from `new_values`.
* The count of new values must be equal to the count of current increments.
@@ -517,6 +529,8 @@

const SpecificLoopIterType m_type = {};
std::shared_ptr<UnifiedLoopInfo> m_unified_loop_info = {};

bool m_evaluate_once = false;
};
using ExpandedLoopInfoPtr = std::shared_ptr<ExpandedLoopInfo>;
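For illustration, a small hedged snippet showing how the new flag could be driven and queried; the helper function below is hypothetical, only `set_evaluate_once()`/`is_evaluate_once()` come from this PR.

```cpp
// Sketch: mark a decomposed loop as "evaluate once" when its work amount is static
// and equal to the increment, then branch on the flag when emitting the loop.
void mark_single_iteration(const ov::snippets::lowered::ExpandedLoopInfoPtr& loop) {
    const auto work_amount = loop->get_work_amount();
    const auto increment = loop->get_increment();
    if (!ov::snippets::utils::is_dynamic_value(work_amount) && work_amount == increment)
        loop->set_evaluate_once(true);

    if (loop->is_evaluate_once()) {
        // The loop body is executed exactly once, so per-iteration pointer
        // increments can be dropped and only finalization offsets are applied.
    }
}
```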

@@ -20,12 +20,6 @@ using PortDescriptorPtr = std::shared_ptr<PortDescriptor>;
class PortDescriptor {
friend class LinearIRBuilder;
public:
// The structure with service values for scheduling parameters
struct ServiceDimensions {
// The value for the subtensor that means that scheduling should be by full dimension
static size_t FULL_DIM;
};

explicit PortDescriptor(const ov::Input<ov::Node>& node,
VectorDims subtensor_shape = {},
std::vector<size_t> layout = {});
@@ -54,6 +48,9 @@ class PortDescriptor {
void set_reg_type(RegType type) { m_reg.type = type; }
void set_reg_idx(size_t idx) { m_reg.idx = idx; }

// Indexing starts from the end (rbegin() + idx)
void set_subtensor_dim(size_t idx, VectorDims::value_type value);

std::string serialize() const;
bool empty() const { return m_layout.empty() && m_subtensor_shape.empty();}
PortDescriptorPtr clone() const;
@@ -87,6 +84,8 @@ class PortDescriptorUtils {
public:
static void set_port_descriptor_ptr(const ov::Input<ov::Node>& n, const PortDescriptorPtr& desc);
static void set_port_descriptor_ptr(const ov::Output<ov::Node>& n, const PortDescriptorPtr& desc);
static void set_port_descriptor(const ov::Input<ov::Node>& n, std::vector<size_t> subtensor, std::vector<size_t> layout = {});
static void set_port_descriptor(const ov::Output<ov::Node>& n, std::vector<size_t> subtensor, std::vector<size_t> layout = {});

static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input<ov::Node>& in);
static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input<const ov::Node>& out);
@@ -116,17 +115,6 @@ class PortDescriptorVectorAttribute : public ov::RuntimeAttribute {
std::vector<PortDescriptorPtr> outputs{};
};

template<typename T>
void set_port_desc(const T& port, std::vector<size_t> subtensor) {
const auto& shape = port.get_shape();
for (size_t i = 1; i <= std::min(subtensor.size(), shape.size()); i++) {
auto& dim = subtensor[subtensor.size() - i];
if (dim != PortDescriptor::ServiceDimensions::FULL_DIM)
dim = std::min(dim, shape[shape.size() - i]);
}
PortDescriptorUtils::set_port_descriptor_ptr(port, std::make_shared<PortDescriptor>(shape, subtensor));
}

} // namespace lowered
} // namespace snippets
} // namespace ov
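The removed free-standing `set_port_desc()` template is effectively replaced by `PortDescriptorUtils::set_port_descriptor()`, while `set_subtensor_dim()` adjusts a single dimension counted from the end. A hedged usage sketch follows; the `matmul` node and the block sizes are hypothetical.

```cpp
// Sketch only: configure blocked subtensors on a MatMul-like node.
void configure_ports(const std::shared_ptr<ov::Node>& matmul, size_t m_block, size_t n_block) {
    using ov::snippets::lowered::PortDescriptorUtils;
    const auto FULL_DIM = ov::snippets::utils::get_full_dim_value();

    // Block over M on the first input, keep the reduction dimension whole.
    PortDescriptorUtils::set_port_descriptor(matmul->input(0), {m_block, FULL_DIM});
    PortDescriptorUtils::set_port_descriptor(matmul->output(0), {m_block, n_block});

    // Later passes can adjust a single dimension in place; indexing starts from
    // the end, so idx = 1 addresses the second-to-last (M) dimension here.
    const auto& desc = PortDescriptorUtils::get_port_descriptor_ptr(matmul->input(0));
    desc->set_subtensor_dim(1, m_block);
}
```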
16 changes: 8 additions & 8 deletions src/common/snippets/include/snippets/runtime_configurator.hpp
@@ -61,7 +61,7 @@ class RuntimeConfigurator {
* @param linear_ir LinearIR
* @return updated config
*/
const std::shared_ptr<RuntimeConfig>& get_updated_config(const std::shared_ptr<lowered::LinearIR>& linear_ir);
const std::shared_ptr<RuntimeConfig>& get_updated_config(const lowered::LinearIRPtr& linear_ir);
/*** Returns pointer to KernelExecutorTable owned by the config */
const std::shared_ptr<KernelExecutorTable>& get_kernel_executor_table() const { return m_config->kernel_executor_table; }

@@ -70,43 +70,43 @@
* @brief Update RuntimeConfig based on LinearIR
* @param linear_ir LinearIR
*/
virtual void update(const std::shared_ptr<lowered::LinearIR>& linear_ir);
virtual void update(const lowered::LinearIRPtr& linear_ir);
/**
* @brief Allocate and initialize fields in RuntimeConfig and RuntimeConfigurator
* @param linear_ir LinearIR
*/
virtual void initialization(const std::shared_ptr<lowered::LinearIR>& linear_ir);
virtual void initialization(const lowered::LinearIRPtr& linear_ir);

/**
* @brief Initializes input and data information of LinearIR:
* descriptors (that contains shapes and layouts) and data_sizes
* @param linear_ir LinearIR
*/
void init_data_info(const std::shared_ptr<lowered::LinearIR>& linear_ir);
void init_data_info(const lowered::LinearIRPtr& linear_ir);
/**
* @brief Initializes information of buffers:
* - static buffer_scratchpad_size
* - offsets of static clusters (with static buffers)
* - clusters with dynamic buffers (`m_dynamic_buffer_clusters`) for the quick access in `update()`
* @param linear_ir LinearIR
*/
void init_buffer_info(const std::shared_ptr<lowered::LinearIR>& linear_ir);
void init_buffer_info(const lowered::LinearIRPtr& linear_ir);
/**
* @brief Initializes tensor rank of config
* @param linear_ir LinearIR
*/
virtual void init_tensor_rank(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
virtual void init_tensor_rank(const lowered::LinearIRPtr& linear_ir) const;
/**
* @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo
* @param linear_ir LinearIR
*/
void update_loop_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
void update_loop_info(const lowered::LinearIRPtr& linear_ir) const;
/**
* @brief Update Buffer scratchpad size and offsets if needed
* Note: `update_loop_info` must be called before
* @param linear_ir LinearIR
*/
void update_buffer_scratchpad_size(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
void update_buffer_scratchpad_size(const lowered::LinearIRPtr& linear_ir) const;
/**
* @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig
*/
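The RuntimeConfigurator changes are purely mechanical: every `std::shared_ptr<lowered::LinearIR>` in these signatures is replaced with the shorter alias. For context, the alias is assumed to be defined roughly as in the sketch below (not quoted from the PR).

```cpp
namespace ov {
namespace snippets {
namespace lowered {
// Assumed definition of the alias used in the updated signatures above.
using LinearIRPtr = std::shared_ptr<LinearIR>;
}  // namespace lowered
}  // namespace snippets
}  // namespace ov
```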
30 changes: 20 additions & 10 deletions src/common/snippets/include/snippets/utils/utils.hpp
@@ -21,6 +21,26 @@ namespace ov {
namespace snippets {
namespace utils {

/* --- Special values --- */
template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline T get_dynamic_value() {
return std::numeric_limits<T>::max();
}
template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline bool is_dynamic_value(T value) {
return value == get_dynamic_value<T>();
}

// This value means full dimension
// For example, for the subtensor it means that scheduling should be by full dimension
constexpr inline size_t get_full_dim_value() {
return get_dynamic_value<size_t>() - 1;
}
constexpr inline bool is_full_dim_value(size_t value) {
return value == get_full_dim_value();
}
/* ---------------------- */

// Get non-scalar Constant count that will be created after FakeQuantize decomposition.
// This count is needed to know exact count of non-scalar Constants during tokenization.
auto get_non_scalar_constant_count_for_fq(const std::shared_ptr<ov::op::v0::FakeQuantize>& fq) -> size_t;
@@ -59,16 +79,6 @@ inline T div_up(const T a, const U b) {
return static_cast<T>((a + b - 1) / b);
}

template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline T get_dynamic_value() {
return std::numeric_limits<T>::max();
}

template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline bool is_dynamic_value(T value) {
return value == get_dynamic_value<T>();
}

inline bool is_dynamic_vdims(const VectorDims& shape) {
return std::any_of(shape.cbegin(), shape.cend(), [](size_t v){ return is_dynamic_value(v); });
}
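A short usage sketch of the special-value helpers; the concrete values follow directly from the definitions above (max() for the dynamic marker, max() - 1 for the full-dimension marker).

```cpp
#include <cassert>
#include <cstddef>

void special_values_example() {
    namespace utils = ov::snippets::utils;

    const size_t dyn  = utils::get_dynamic_value<size_t>();  // std::numeric_limits<size_t>::max()
    const size_t full = utils::get_full_dim_value();         // max() - 1

    // The two markers never collide: a dynamic dimension is not a full dimension and vice versa.
    assert(utils::is_dynamic_value(dyn) && !utils::is_full_dim_value(dyn));
    assert(utils::is_full_dim_value(full) && !utils::is_dynamic_value(full));
}
```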
15 changes: 12 additions & 3 deletions src/common/snippets/src/lowered/loop_info.cpp
@@ -373,10 +373,10 @@ void UnifiedLoopInfo::add_loop_ports(const std::vector<ExpressionPort>& ports) {
ExpandedLoopInfo::ExpandedLoopInfo(size_t work_amount, size_t increment,
const std::vector<LoopPort>& entries, const std::vector<LoopPort>& exits,
std::vector<int64_t> ptr_increments, std::vector<int64_t> final_offsets, std::vector<int64_t> data_sizes,
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const)
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const, bool evaluate_once)
: LoopInfo(work_amount, increment, entries, exits, is_wa_const),
m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)),
m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)) {
m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)), m_evaluate_once(evaluate_once) {
validate();
}

@@ -392,7 +392,8 @@ std::shared_ptr<LoopInfo> ExpandedLoopInfo::clone_with_new_expr(const Expression
const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports);

return std::make_shared<ExpandedLoopInfo>(m_work_amount, m_increment, new_input_ports, new_output_ports,
m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, m_unified_loop_info, m_is_work_amount_const);
m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type,
m_unified_loop_info, m_is_work_amount_const, m_evaluate_once);
}

bool ExpandedLoopInfo::is_dynamic() const {
@@ -435,6 +436,14 @@ const std::vector<int64_t>& ExpandedLoopInfo::get_data_sizes() const {
return m_data_sizes;
}

bool ExpandedLoopInfo::is_evaluate_once() const {
return m_evaluate_once;
}

void ExpandedLoopInfo::set_evaluate_once(bool value) {
m_evaluate_once = value;
}

void ExpandedLoopInfo::update_ptr_increments(const std::vector<int64_t>& new_values) {
OPENVINO_ASSERT(new_values.size() == m_ptr_increments.size(), "Failed to update ptr_increments: incompatible counts");
m_ptr_increments.assign(new_values.cbegin(), new_values.end());
11 changes: 5 additions & 6 deletions src/common/snippets/src/lowered/loop_manager.cpp
@@ -160,7 +160,6 @@ void LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
size_t loop_depth, size_t vector_size) {
const auto FULL_DIM = PortDescriptor::ServiceDimensions::FULL_DIM;
std::vector<ExpressionPort> loop_input_ports, loop_output_ports;
LoopManager::get_io_loop_ports(loop_begin_pos, loop_end_pos, loop_input_ports, loop_output_ports);

@@ -178,8 +177,8 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
"Failed to broadcast work amount in marking loop");
};

auto is_outside_loop = [&FULL_DIM](const std::vector<size_t>& subtensor) {
return std::all_of(subtensor.begin(), subtensor.end(), [&FULL_DIM](size_t lhs) { return lhs == FULL_DIM; });
auto is_outside_loop = [](const std::vector<size_t>& subtensor) {
return std::all_of(subtensor.begin(), subtensor.end(), utils::is_full_dim_value);
};

std::vector<size_t> loop_subtensor;
@@ -192,7 +191,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
subtensor[subtensor.size() - 1] = vector_size;
}

const size_t resizing_value = is_outside_loop(subtensor) ? FULL_DIM : 1;
const size_t resizing_value = is_outside_loop(subtensor) ? utils::get_full_dim_value() : 1;
while (subtensor.size() < loop_depth)
subtensor.insert(subtensor.begin(), resizing_value);
if (loop_subtensor.empty())
@@ -202,7 +201,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
"Incorrect scheduling parameters for loop");

for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
if (*(subtensor.rbegin() + dim_idx) != FULL_DIM) {
if (!utils::is_full_dim_value(*(subtensor.rbegin() + dim_idx))) {
broadcast(loop_tensor, shape, dim_idx);
}
}
@@ -211,7 +210,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
if (subtensor_value == FULL_DIM) {
if (utils::is_full_dim_value(subtensor_value)) {
continue;
}

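To make the padding rule in this hunk concrete, here is a standalone mirror of the logic (a hypothetical helper, shown only for illustration): ports whose subtensor consists solely of full-dimension markers are treated as outside the loop and are padded with FULL_DIM; all other ports are padded with 1 up to loop_depth.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical mirror of mark_loop()'s subtensor padding (not part of the PR).
std::vector<size_t> pad_subtensor(std::vector<size_t> subtensor, size_t loop_depth) {
    namespace utils = ov::snippets::utils;
    const bool outside_loop =
        std::all_of(subtensor.begin(), subtensor.end(), utils::is_full_dim_value);
    const size_t pad = outside_loop ? utils::get_full_dim_value() : size_t(1);
    while (subtensor.size() < loop_depth)
        subtensor.insert(subtensor.begin(), pad);
    return subtensor;
}
// pad_subtensor({32}, 2)        -> {1, 32}
// pad_subtensor({FULL_DIM}, 2)  -> {FULL_DIM, FULL_DIM}
```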
@@ -60,7 +60,7 @@ size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& lo
const auto processing_rank = !processed_dim_idxs.empty() ? std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size();
for (size_t i = 0; i < std::min(processing_rank, rank); ++i) {
if (processed_dim_idxs.count(i) == 0) {
if (i < subtensor.size())
if (i < subtensor.size() && !utils::is_full_dim_value(*(subtensor.rbegin() + i)))
allocation_size = utils::dynamic_safe_mul(allocation_size, std::min(*(planar_shape.rbegin() + i), *(subtensor.rbegin() + i)));
else
allocation_size = utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i));
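In other words, a subtensor dimension now contributes to the allocation size only when it carries a concrete value; a full-dimension marker (or a missing subtensor entry) falls back to the planar shape. A small hedged restatement of the per-dimension rule:

```cpp
#include <algorithm>
#include <cstddef>

// Sketch (not the PR code): contribution of one non-processed dimension
// to the Buffer allocation size.
size_t allocation_dim(size_t planar_dim, size_t subtensor_dim, bool has_subtensor_dim) {
    namespace utils = ov::snippets::utils;
    if (has_subtensor_dim && !utils::is_full_dim_value(subtensor_dim))
        return std::min(planar_dim, subtensor_dim);  // bounded by the scheduled block
    return planar_dim;                               // full dimension is allocated
}
```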
@@ -167,6 +167,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp
if (is_decomposed_loop_needed(unified_loop_info, iter_type, remaining_work_amount)) {
const auto work_amount = get_decomposed_loop_work_amount(unified_loop_info, iter_type, remaining_work_amount);
const auto increment = get_decomposed_loop_increment(unified_loop_info, iter_type, remaining_work_amount);
const auto evaluate_once = !utils::is_dynamic_value(work_amount) && work_amount == increment;
// Update remaining Loop work amount
// Note: if work_amount is unknown and increment = 1, it means that a loop will iterate by whole work_amount
if (!is_wa_dynamic || increment == 1) {
@@ -199,7 +200,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp
const auto decomposed_loop_info = std::make_shared<ExpandedLoopInfo>(work_amount, increment,
decomposed_loop_entry_ports, decomposed_loop_exit_ports,
decomposed_ptr_increments, decomposed_finalization_offsets,
decomposed_data_sizes, iter_type, unified_loop_info);
decomposed_data_sizes, iter_type, unified_loop_info, false, evaluate_once);
init_decomposed_loop(linear_ir, decomposed_loop_begin_it, decomposed_loop_end_it, decomposed_loop_info, loop_id, decomposed_loop_end);

decomposed = true;
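A brief worked check of the new condition (numbers are illustrative): with work_amount = 32 and increment = 32 the tail loop body runs exactly once, so evaluate_once is true; with a dynamic work_amount the flag stays false because the equality cannot be proven statically.

```cpp
// Standalone restatement of the condition computed above (sketch only).
bool evaluate_once(size_t work_amount, size_t increment) {
    return !ov::snippets::utils::is_dynamic_value(work_amount) && work_amount == increment;
}
// evaluate_once(32, 32)                                               -> true
// evaluate_once(ov::snippets::utils::get_dynamic_value<size_t>(), 32) -> false
```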