diff --git a/src/common/snippets/include/snippets/lowered/loop_info.hpp b/src/common/snippets/include/snippets/lowered/loop_info.hpp index 5563dc240c2818..cc66f5f6ffcc95 100644 --- a/src/common/snippets/include/snippets/lowered/loop_info.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_info.hpp @@ -12,28 +12,40 @@ namespace ov { namespace snippets { namespace lowered { +class LoopInfo; +using LoopInfoMap = std::unordered_map>; +using LoopInfoSet = std::unordered_set; +using LoopInfoPtr = std::shared_ptr; + /** * @interface LoopInfo * @brief The base class that contains the common information about a Loop in Linear Intermediate Representation (Linear IR): * work amount of the Loop, step of loop counter increment, input and output ports of the Loop. * @ingroup snippets */ -class LoopInfo { +class LoopInfo : public std::enable_shared_from_this { public: enum {UNDEFINED_DIM_IDX = std::numeric_limits::max()}; LoopInfo() = default; - LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, bool is_wa_const = false); - LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - bool is_wa_const = false); + LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits); + LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits); virtual ~LoopInfo() = default; /** - * @brief Clone LoopInfo with new expressions + * @brief Clone LoopInfo with new Expressions * @param expr_map map of new and old expressions + * @param loop_map map of new and old LoopInfo. * @return the copy */ - virtual std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map) const = 0; + virtual std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map, LoopInfoMap& loop_map) const = 0; + + /** + * @brief Apply the passed function to the current LoopInfo + * @param func function for applying + * @param applied_loops set of already updated loops + */ + virtual void apply(const std::function& func, LoopInfoSet& applied_loops) = 0; /** * @brief Check if some parameters of Loop are dynamic (undefined) @@ -62,7 +74,7 @@ class LoopInfo { * @brief Returns work amount of the Loop. * @return m_work_amount */ - size_t get_work_amount() const; + virtual size_t get_work_amount() const; /** * @brief Returns step of loop counter increment. * @return m_increment @@ -83,17 +95,12 @@ class LoopInfo { * @return m_output_ports */ const std::vector& get_output_ports() const; - /** - * @brief Returns True if `work_amount` cannot be rewritten/updated by passes. - * @return m_is_work_amount_const - */ - bool is_work_amount_const() const; /** * @brief Set m_work_amount value * @param work_amount - work amount of the loop */ - void set_work_amount(size_t work_amount); + virtual void set_work_amount(size_t work_amount); /** * @brief Set m_increment value * @param increment - step of loop counter increment @@ -104,13 +111,8 @@ class LoopInfo { * @param dim_idx - index */ void set_dim_idx(size_t dim_idx); - /** - * @brief Sets `value` to `m_is_work_amount_const` - * @param value - value of the attribute - */ - void set_work_amount_const(bool value); -/** + /** * @brief Replace the current LoopPort `actual_port` with new `target_ports` * @param actual_port actual port * @param target_ports new ports. The ports order is important. Can contain `actual_port` @@ -188,12 +190,7 @@ class LoopInfo { // Note: Scalars aren't input expressions but can be before first input expr in Linear IR std::vector m_input_ports = {}; std::vector m_output_ports = {}; - - // TODO [143394] : All static values in compilation stage should be `is_const=True` (not only `work_amount`) - // If True, no one pass can rewrite the value of `m_work_amount` - bool m_is_work_amount_const = false; }; -using LoopInfoPtr = std::shared_ptr; /** * @interface UnifiedLoopInfo @@ -227,20 +224,28 @@ class UnifiedLoopInfo : public LoopInfo { UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, const std::vector& in_descs, const std::vector& out_descs, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); /** - * @brief Clone LoopInfo with new expressions + * @brief Clone LoopInfo with new Expressions * @param expr_map map of new and old expressions + * @param loop_map map of new and old LoopInfo. * @return the copy */ - std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map) const override; + std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map, LoopInfoMap& loop_map) const override; + + /** + * @brief Apply the passed function on the current LoopInfo. + * @param func function for applying + * @param applied_loops set of already updated loops + */ + void apply(const std::function& func, LoopInfoSet& applied_loops) override; /** * @brief Check if some parameters of Loop are dynamic (undefined) @@ -362,7 +367,7 @@ class UnifiedLoopInfo : public LoopInfo { caller(m_output_ports[i], m_output_port_descs[i]); } -private: +protected: /** * @brief Clone LoopPortDesc[actual_port_idx] `new_count` times and insert on the place of current desc * @param actual_port_idx index of the current descriptor/port @@ -377,6 +382,65 @@ class UnifiedLoopInfo : public LoopInfo { }; using UnifiedLoopInfoPtr = std::shared_ptr; +/** + * @interface InnerSplittedUnifiedLoopInfo + * @brief The structure describes inner splitted Loop after `SplitLoops`. + * Contains pointer to outer splitted loop info. WorkAmount is equal to increment of outer splitted loop info. + * @ingroup snippets + */ +class InnerSplittedUnifiedLoopInfo : public UnifiedLoopInfo { +public: + OPENVINO_RTTI("InnerSplittedUnifiedLoopInfo", "0", UnifiedLoopInfo) + + InnerSplittedUnifiedLoopInfo() = default; + InnerSplittedUnifiedLoopInfo(size_t increment, const std::vector& entries, const std::vector& exits, + const std::vector& in_descs, const std::vector& out_descs, + const SpecificIterationHandlers& handlers, LoopInfoPtr outer_splitted_loop_info); + + /** + * @brief Clone LoopInfo with new Expressions + * @param expr_map map of new and old expressions + * @param loop_map map of new and old LoopInfo. + * If `loop_map` contains cloned outer splitted loop -info, we take it from there. + * Otherwise we manually clone it and add to this map. + * @return the copy + */ + std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map, LoopInfoMap& loop_map) const override; + + /** + * @brief Apply the passed function on OuterSplittedLoopInfo and then on the current LoopInfo. + * @param func function for applying + * @param applied_loops set of already updated loops + */ + void apply(const std::function& func, LoopInfoSet& applied_loops) override; + + /** + * @brief Returns work amount of the Loop. + * @return m_work_amount + */ + size_t get_work_amount() const override; + /** + * @brief Returns OuterSplittedLoopInfo + * @return m_outer_splitted_loop_info + */ + LoopInfoPtr get_outer_splitted_loop_info() const; + + /** + * @brief Set m_work_amount value + * @param work_amount - work amount of the loop + */ + void set_work_amount(size_t work_amount) override; + /** + * @brief Set m_outer_splitted_loop_info value + * @param outer - OuterSplittedLoopInfo + */ + void set_outer_splitted_loop_info(LoopInfoPtr outer); + +private: + LoopInfoPtr m_outer_splitted_loop_info = nullptr; +}; +using InnerSplittedUnifiedLoopInfoPtr = std::shared_ptr; + /** * @interface ExpandedLoopInfo * @brief The structure describes expanded Loop (specific iterations) after unified loop decomposition into specific loop iterations. @@ -390,14 +454,23 @@ class ExpandedLoopInfo : public LoopInfo { ExpandedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, std::vector ptr_increments, std::vector final_offsets, std::vector data_sizes, - SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const = false, - bool evaluate_once = false); + SpecificLoopIterType type, UnifiedLoopInfoPtr unified_loop_info, bool evaluate_once = false); /** - * @brief Clone LoopInfo with new expressions + * @brief Clone LoopInfo with new Expressions * @param expr_map map of new and old expressions + * @param loop_map map of new and old LoopInfo. + * If `loop_map` contains cloned unified loop -info, we take it from there. + * Otherwise we manually clone it and add to this map. * @return the copy */ - std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map) const override; + std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map, LoopInfoMap& loop_map) const override; + + /** + * @brief Apply the passed function on UnifiedLoopInfo and then on the current LoopInfo. + * @param func function for applying + * @param applied_loops set of already updated loops + */ + void apply(const std::function& func, LoopInfoSet& applied_loops) override; /** * @brief Check if some parameters of Loop are dynamic (undefined) diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp index ab4db81a8e2e6b..099d7ff634924d 100644 --- a/src/common/snippets/include/snippets/lowered/loop_manager.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp @@ -98,13 +98,12 @@ class LoopManager { size_t increment, const std::vector& entries, const std::vector& exits, - bool set_default_handlers = true, - bool is_work_amount_const = false) { + bool set_default_handlers = true) { const auto normalized_increment = utils::is_dynamic_value(work_amount) || work_amount == 0 ? increment : std::min(increment, work_amount); - const auto& handlers = set_default_handlers - ? SpecificIterationHandlers(work_amount, normalized_increment) - : SpecificIterationHandlers(); - const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers, is_work_amount_const); + const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits); + if (set_default_handlers) + loop_info->set_handlers(SpecificIterationHandlers(work_amount, normalized_increment, loop_info->get_dim_idx())); + const auto loop_id = this->add_loop_info(loop_info); for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); @@ -131,9 +130,8 @@ class LoopManager { size_t dim_idx, const std::vector& entries, const std::vector& exits, - bool set_default_handlers = true, - bool is_work_amount_const = false) { - const auto loop_id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers, is_work_amount_const); + bool set_default_handlers = true) { + const auto loop_id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers); const auto loop_info = get_loop_info(loop_id); loop_info->set_dim_idx(dim_idx); return loop_id; diff --git a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp index b7eb4e7176f3c1..2587ffbd546dfa 100644 --- a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp @@ -48,32 +48,14 @@ class SetFillOffset : public pass::RangedPass { }; /** - * @interface TransformInnerSplitLoop - * @brief The pass updates finalization offsets, work amount and increment of inner Loop basing on tail_size of the current Loop - * @param m_tail_size - tail_size of the current Loop + * @interface SetLoopIncrementOne + * @brief The pass set `increment = 1` to ExpandedLoopInfo which is mapped on LoopEnd in the passed iterator `end` and to this LoopEnd. * @ingroup snippets */ -class TransformInnerSplitLoop : public pass::RangedPass { +class SetLoopIncrementOne : public snippets::lowered::pass::RangedPass { public: - TransformInnerSplitLoop(size_t tail_size); - OPENVINO_RTTI("TransformInnerSplitLoop", "RangedPass") - bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; - std::shared_ptr merge(const std::shared_ptr& other) override; - -private: - size_t m_tail_size; -}; - -/** - * @interface SetEvaluateOnce - * @brief The pass set `evaluate once = true` only to ExpandedLoopInfo which is mapped on LoopEnd in the passed iterator `end`. - * The pointer arithmetic should be updated in the separate optimization `OptimizeLoopSingleEvaluation` - * @ingroup snippets - */ -class SetEvaluateOnce : public snippets::lowered::pass::RangedPass { -public: - SetEvaluateOnce() = default; - OPENVINO_RTTI("SetEvaluateOnce", "RangedPass") + SetLoopIncrementOne() = default; + OPENVINO_RTTI("SetLoopIncrementOne", "RangedPass") bool run(snippets::lowered::LinearIR& linear_ir, snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override; diff --git a/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp index c186e132d455e3..953f20bbe56c3d 100644 --- a/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp @@ -32,11 +32,27 @@ namespace pass { class SplitLoops : public RangedPass { public: OPENVINO_RTTI("SplitLoops", "RangedPass") - SplitLoops(); + SplitLoops() = default; bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; private: static bool can_be_split(const UnifiedLoopInfoPtr& current, const UnifiedLoopInfoPtr& target); + + static void split(LinearIR& linear_ir, size_t loop_to_split_id, size_t outer_increment); + + /** + * @interface TransformInnerSplitLoop + * @brief The pass replace existing inner splitted LoopInfo with new InnerSplittedUnifiedLoopInfo and + * update the corresponding LoopInfo + * @ingroup snippets + */ + class TransformInnerSplitLoop : public pass::RangedPass { + public: + TransformInnerSplitLoop() = default; + OPENVINO_RTTI("TransformInnerSplitLoop", "RangedPass") + bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + std::shared_ptr merge(const std::shared_ptr& other) override; + }; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/specific_loop_iter_handlers.hpp b/src/common/snippets/include/snippets/lowered/specific_loop_iter_handlers.hpp index c94ac6561bce4d..f91595bf86bd26 100644 --- a/src/common/snippets/include/snippets/lowered/specific_loop_iter_handlers.hpp +++ b/src/common/snippets/include/snippets/lowered/specific_loop_iter_handlers.hpp @@ -15,7 +15,7 @@ namespace lowered { class SpecificIterationHandlers { public: SpecificIterationHandlers() = default; - SpecificIterationHandlers(size_t loop_work_amount, size_t loop_increment); + SpecificIterationHandlers(size_t loop_work_amount, size_t loop_increment, size_t processing_dim_idx); SpecificIterationHandlers(pass::PassPipeline first_iter_handlers, pass::PassPipeline main_body_handlers, pass::PassPipeline last_iter_handlers); diff --git a/src/common/snippets/include/snippets/pass/analyze_broadcastable_inputs.hpp b/src/common/snippets/include/snippets/pass/analyze_broadcastable_inputs.hpp index 1cb8122685bde6..4367567c9df1c7 100644 --- a/src/common/snippets/include/snippets/pass/analyze_broadcastable_inputs.hpp +++ b/src/common/snippets/include/snippets/pass/analyze_broadcastable_inputs.hpp @@ -12,7 +12,7 @@ namespace pass { /** * @interface AnalyzeBroadcastableInputs - * @brief Analyzes body parameters which affects inputs of broadcastable operations (If needed, `Broadcast` op should be inserted there).s + * @brief Analyzes body parameters which affects inputs of broadcastable operations (If needed, `Broadcast` op should be inserted there). * Also the pass initializes special map `BroadcastableInputsMap` * Notes: * - Must be called after Canonicalization pass diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index af01757cb0194f..169d63ee4baa92 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -37,6 +37,10 @@ class RuntimeConfig { return get_type_info().name; } +#ifdef SNIPPETS_DEBUG_CAPS + virtual std::string to_string() const; +#endif + size_t tensor_rank = 0; size_t tile_rank = 0; @@ -214,7 +218,8 @@ class RuntimeConfigurator { std::vector m_io_descs = {}; std::vector m_io_data_sizes = {}; // [cluster_id -> buffer expressions ] - std::map> m_dynamic_buffer_clusters; + std::map> m_dynamic_buffer_clusters = {}; + std::vector m_ordered_loop_ids = {}; std::vector m_latest_shapes = {}; }; diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 3629245cc978bd..7ba5e830fd3362 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -51,9 +51,11 @@ LoweringResult Generator::generate(const lowered::LinearIRPtr& linear_ir, const } result.compiled_snippet = target->get_snippet(); result.kernel_executor_table = target->get_runtime_configurator()->get_kernel_executor_table(); - // Some kernel executors might've been registered during code emission. + // In static case some kernel executors might've been registered during code emission. // We need to update them, so appropriate kernels will be compiled. - result.kernel_executor_table->update_state(linear_ir); + // In dynamic case it should be handled by RuntimeConfigurator + if (!linear_ir->is_dynamic()) + result.kernel_executor_table->update_state(linear_ir); return result; } diff --git a/src/common/snippets/src/lowered/loop_info.cpp b/src/common/snippets/src/lowered/loop_info.cpp index 092d222bde86ec..534159020e4b4f 100644 --- a/src/common/snippets/src/lowered/loop_info.cpp +++ b/src/common/snippets/src/lowered/loop_info.cpp @@ -11,12 +11,11 @@ namespace ov { namespace snippets { namespace lowered { -LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, bool is_wa_const) - : m_work_amount(work_amount), m_increment(increment), m_input_ports(entries), m_output_ports(exits), m_is_work_amount_const(is_wa_const) {} +LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits) + : m_work_amount(work_amount), m_increment(increment), m_input_ports(entries), m_output_ports(exits) {} -LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - bool is_wa_const) - : m_work_amount(work_amount), m_increment(increment), m_is_work_amount_const(is_wa_const) { +LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits) + : m_work_amount(work_amount), m_increment(increment) { m_input_ports.reserve(entries.size()); m_output_ports.reserve(exits.size()); for (const auto& port : entries) @@ -73,10 +72,6 @@ const std::vector& LoopInfo::get_output_ports() const { return m_output_ports; } -bool LoopInfo::is_work_amount_const() const { - return m_is_work_amount_const; -} - void LoopInfo::set_work_amount(size_t work_amount) { m_work_amount = work_amount; } @@ -91,10 +86,6 @@ void LoopInfo::set_dim_idx(size_t dim_idx) { std::for_each(m_output_ports.begin(), m_output_ports.end(), setter); } -void LoopInfo::set_work_amount_const(bool value) { - m_is_work_amount_const = value; -} - template<> std::vector::iterator LoopInfo::find_loop_port(const LoopPort& loop_port) { auto& ports = loop_port.expr_port->get_type() == ExpressionPort::Input ? m_input_ports : m_output_ports; @@ -187,16 +178,16 @@ bool UnifiedLoopInfo::LoopPortDesc::is_dynamic() const { UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers, bool is_wa_const) - : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), + const SpecificIterationHandlers& handlers) + : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), m_input_port_descs(std::vector(entries.size())), m_output_port_descs(std::vector(exits.size())) { sort_ports(); } UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers, bool is_wa_const) - : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), + const SpecificIterationHandlers& handlers) + : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), m_input_port_descs(std::vector(entries.size())), m_output_port_descs(std::vector(exits.size())) { sort_ports(); } @@ -204,17 +195,27 @@ UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, const std::vector& in_shifts, const std::vector& out_shifts, - const SpecificIterationHandlers& handlers, bool is_wa_const) - : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), m_input_port_descs(in_shifts), m_output_port_descs(out_shifts) { + const SpecificIterationHandlers& handlers) + : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), m_input_port_descs(in_shifts), m_output_port_descs(out_shifts) { sort_ports(); } -std::shared_ptr UnifiedLoopInfo::clone_with_new_expr(const ExpressionMap& expr_map) const { - const auto& new_input_ports = clone_loop_ports(expr_map, m_input_ports); - const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); +std::shared_ptr UnifiedLoopInfo::clone_with_new_expr(const ExpressionMap& expr_map, LoopInfoMap& loop_map) const { + if (loop_map.count(this) == 0) { + const auto& new_input_ports = clone_loop_ports(expr_map, m_input_ports); + const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); - return std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, - m_input_port_descs, m_output_port_descs, m_handlers, m_is_work_amount_const); + loop_map[this] = std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, + m_input_port_descs, m_output_port_descs, m_handlers); + } + return loop_map.at(this); +} + +void UnifiedLoopInfo::apply(const std::function& func, LoopInfoSet& applied_loops) { + if (applied_loops.count(this) == 0) { + func(shared_from_this()); + applied_loops.insert(this); + } } bool UnifiedLoopInfo::is_dynamic() const { @@ -364,24 +365,83 @@ void UnifiedLoopInfo::replace_with_new_ports(const ExpressionPort& actual_port, sort_ports(); } +InnerSplittedUnifiedLoopInfo::InnerSplittedUnifiedLoopInfo(size_t increment, const std::vector& entries, const std::vector& exits, + const std::vector& in_descs, const std::vector& out_descs, + const SpecificIterationHandlers& handlers, LoopInfoPtr outer_splitted_loop_info) + : UnifiedLoopInfo(utils::get_dynamic_value(), increment, entries, exits, in_descs, out_descs, handlers), + m_outer_splitted_loop_info(std::move(outer_splitted_loop_info)) { + OPENVINO_ASSERT(m_outer_splitted_loop_info != nullptr, "Outer Splitted Loop Info is missed!"); +} + +std::shared_ptr InnerSplittedUnifiedLoopInfo::clone_with_new_expr(const ExpressionMap& expr_map, LoopInfoMap& loop_map) const { + if (loop_map.count(this) == 0) { + auto cloned_outer_splitted_loop_info = m_outer_splitted_loop_info->clone_with_new_expr(expr_map, loop_map); + const auto& new_input_ports = clone_loop_ports(expr_map, m_input_ports); + const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); + + loop_map[this] = std::make_shared(m_increment, new_input_ports, new_output_ports, + m_input_port_descs, m_output_port_descs, m_handlers, + std::move(cloned_outer_splitted_loop_info)); + } + return loop_map.at(this); +} + +void InnerSplittedUnifiedLoopInfo::apply(const std::function& func, LoopInfoSet& applied_loops) { + if (applied_loops.count(this) == 0) { + m_outer_splitted_loop_info->apply(func, applied_loops); + func(shared_from_this()); + applied_loops.insert(this); + } +} + +size_t InnerSplittedUnifiedLoopInfo::get_work_amount() const { + return get_outer_splitted_loop_info()->get_increment(); +} + +LoopInfoPtr InnerSplittedUnifiedLoopInfo::get_outer_splitted_loop_info() const { + OPENVINO_ASSERT(m_outer_splitted_loop_info, "Outer Splitted loop info is nullptr!"); + return m_outer_splitted_loop_info; +} + +void InnerSplittedUnifiedLoopInfo::set_work_amount(size_t work_amount) { + OPENVINO_THROW("InnerSplittedUnifiedLoopInfo doesn't support `set_work_amount`"); +} + +void InnerSplittedUnifiedLoopInfo::set_outer_splitted_loop_info(LoopInfoPtr outer) { + OPENVINO_ASSERT(outer, "Outer Splitted loop info cannot be nullptr!"); + m_outer_splitted_loop_info = std::move(outer); +} + ExpandedLoopInfo::ExpandedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, std::vector ptr_increments, std::vector final_offsets, std::vector data_sizes, - SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const, bool evaluate_once) - : LoopInfo(work_amount, increment, entries, exits, is_wa_const), + SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool evaluate_once) + : LoopInfo(work_amount, increment, entries, exits), m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)), m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)), m_evaluate_once(evaluate_once) { OPENVINO_ASSERT(m_unified_loop_info, "Failed to create ExpandedLoopInfo: unified loop info is nullptr!"); sort_ports(); } -std::shared_ptr ExpandedLoopInfo::clone_with_new_expr(const ExpressionMap& expr_map) const { - const auto& new_input_ports = clone_loop_ports(expr_map, m_input_ports); - const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); +std::shared_ptr ExpandedLoopInfo::clone_with_new_expr(const ExpressionMap& expr_map, LoopInfoMap& loop_map) const { + if (loop_map.count(this) == 0) { + auto cloned_unified_loop_info = ov::as_type_ptr(m_unified_loop_info->clone_with_new_expr(expr_map, loop_map)); + const auto& new_input_ports = clone_loop_ports(expr_map, m_input_ports); + const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); - return std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, - m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, - m_unified_loop_info, m_is_work_amount_const, m_evaluate_once); + loop_map[this] = std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, + m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, + std::move(cloned_unified_loop_info), m_evaluate_once); + } + return loop_map.at(this); +} + +void ExpandedLoopInfo::apply(const std::function& func, LoopInfoSet& applied_loops) { + if (applied_loops.count(this) == 0) { + m_unified_loop_info->apply(func, applied_loops); + func(shared_from_this()); + applied_loops.insert(this); + } } bool ExpandedLoopInfo::is_dynamic() const { diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index 15634edb2c34c7..21f4ecc83c57b0 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -20,8 +20,12 @@ namespace lowered { std::shared_ptr LoopManager::clone_with_new_expr(const ExpressionMap& expr_map) const { auto new_loop_manager = std::make_shared(); + // To fully cloned all LoopInfo we have to create this map [old LoopInfo -> cloned LoopInfo], + // because some LoopInfo types contains pointer to another LoopInfo + // so we should recurrently make a cloning of LoopInfos' + LoopInfoMap loop_info_map; // [ old - > cloned ] for (const auto& id_info : m_map) - new_loop_manager->m_map.insert({id_info.first, id_info.second->clone_with_new_expr(expr_map)}); + new_loop_manager->m_map.insert({id_info.first, id_info.second->clone_with_new_expr(expr_map, loop_info_map)}); new_loop_manager->next_id = next_id; return new_loop_manager; } @@ -273,14 +277,22 @@ void LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, LinearIR:: const auto work_amount = std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()); const auto increment = std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()); const auto handlers = SpecificIterationHandlers::merge_handlers(loop_info_upper->get_handlers(), loop_info_lower->get_handlers()); - const auto is_work_amount_const = loop_info_upper->is_work_amount_const() || loop_info_lower->is_work_amount_const(); auto new_entries = std::move(input_ports_upper); new_entries.insert(new_entries.end(), input_ports_lower.begin(), input_ports_lower.end()); auto new_exits = std::move(output_ports_upper); new_exits.insert(new_exits.end(), output_ports_lower.begin(), output_ports_lower.end()); - m_map[to] = std::make_shared(work_amount, increment, new_entries, new_exits, handlers, is_work_amount_const); + m_map[to] = std::make_shared(work_amount, increment, new_entries, new_exits, handlers); + + // Need to handle InnerSplittedLoopInfo - update outer splitted loop info if it was fused + for (const auto& p : m_map) { + if (const auto inner_splitted_loop_info = ov::as_type_ptr(p.second)) { + const auto outer = inner_splitted_loop_info->get_outer_splitted_loop_info(); + if (utils::one_of(outer, loop_info_upper, loop_info_lower)) + inner_splitted_loop_info->set_outer_splitted_loop_info(m_map[to]); + } + } for (auto it = loop_begin_target; it != loop_end_target; ++it) { const auto& expr = *it; diff --git a/src/common/snippets/src/lowered/pass/brgemm_blocking.cpp b/src/common/snippets/src/lowered/pass/brgemm_blocking.cpp index a7336c14454319..d689b183456bc1 100644 --- a/src/common/snippets/src/lowered/pass/brgemm_blocking.cpp +++ b/src/common/snippets/src/lowered/pass/brgemm_blocking.cpp @@ -26,7 +26,6 @@ snippets::lowered::SpecificIterationHandlers BrgemmBlockingBase::get_default_blo const auto tail_size = snippets::utils::is_dynamic_value(work_amount) ? snippets::utils::get_dynamic_value() : work_amount % block_size; if (tail_size != 0) handlers.register_pass(tail_size); - handlers.register_pass(); return handlers; } diff --git a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp index 3abb254092268a..85bbed324a9865 100644 --- a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp +++ b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp @@ -27,9 +27,13 @@ std::vector get_parent_inner_loops(const std::vector& parent_loo // Ticket: 113744 // TODO: This logic covers only several specific cases so it should be generalized. size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank) { + const auto& current_buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(current_buffer, "`get_allocation_size` expected Buffer"); + // Note: Buffer expressions can have more than one parent after the loops splitting transformation, but only the last parent // can be used to access valid loop ports. More info in the ticket: 146646 - const auto& parent_port = buffer_expr->get_input_port_connector(buffer_expr->get_input_count() - 1)->get_source(); + const auto buffer_in_idx = buffer_expr->get_input_count() - 1; + const auto& parent_port = buffer_expr->get_input_port_connector(buffer_in_idx)->get_source(); const auto& parent_loop_ids = get_parent_inner_loops(parent_port.get_expr()->get_loop_ids(), buffer_expr->get_loop_ids()); const auto planar_shape = utils::get_preordered_vdims(parent_port); @@ -38,13 +42,38 @@ size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& lo const auto& subtensor = ov::snippets::utils::get_projected_subtensor(parent_port); + auto hard_equal = [&parent_port](const LoopPort& port) { + return *port.expr_port == parent_port; + }; + auto soft_equal = [&](const LoopPort& loop_port) { + const auto& port = *loop_port.expr_port; + // Check semantic of LoopPort + if (parent_port.get_index() != port.get_index() || + port.get_expr()->get_node()->get_type_info() != parent_port.get_expr()->get_node()->get_type_info()) + return false; + // Check that this LoopPort is connected to the same by semantic Buffer + const auto consumers = port.get_connected_ports(); + for (const auto& consumer : consumers) { + if (const auto buffer_consumer = ov::as_type_ptr(consumer.get_expr()->get_node())) { + if (buffer_consumer->get_cluster_id() == current_buffer->get_cluster_id() && consumer.get_index() == buffer_in_idx) + return true; + } + } + return false; + }; + size_t allocation_size = 1; std::set processed_dim_idxs; for (const auto& parent_loop : parent_loop_ids) { const auto loop_info = loop_manager->get_loop_info(parent_loop); const auto& output_ports = loop_info->get_output_ports(); - auto it = std::find_if(output_ports.begin(), output_ports.end(), [&parent_port](const LoopPort& port) { return *port.expr_port == parent_port; }); - OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); + auto it = std::find_if(output_ports.begin(), output_ports.end(), hard_equal); + // [149219] : Try to find original loop port if this LoopInfo is cloned after InsertSpecificIterations + // and ports are not mapped on the original ExpressionPorts + if (it == output_ports.end()) { + it = std::find_if(output_ports.begin(), output_ports.end(), soft_equal); + OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); + } const auto& loop_port = *it; const auto& dim_idx = loop_port.dim_idx; if (loop_port.is_incremented && dim_idx < rank) { diff --git a/src/common/snippets/src/lowered/pass/extract_loop_invariants.cpp b/src/common/snippets/src/lowered/pass/extract_loop_invariants.cpp index 6053451be9fafb..f20ace893df463 100644 --- a/src/common/snippets/src/lowered/pass/extract_loop_invariants.cpp +++ b/src/common/snippets/src/lowered/pass/extract_loop_invariants.cpp @@ -14,6 +14,34 @@ namespace snippets { namespace lowered { namespace pass { namespace { + +// Sort Loop IDs by execution order of these Loops +std::vector get_reordered_loop_ids(const LoopManagerPtr& loop_manager) { + const auto& loop_map = loop_manager->get_map(); + std::vector loop_ids_need_extract; + loop_ids_need_extract.reserve(loop_map.size()); + for (const auto& p : loop_map) + loop_ids_need_extract.push_back(p.first); + + auto sorter = [&](size_t lhs, size_t rhs) { + const auto lhs_last_expr = loop_manager->get_loop_info(lhs)->get_output_ports().back().expr_port->get_expr(); + const auto rhs_last_expr = loop_manager->get_loop_info(rhs)->get_output_ports().back().expr_port->get_expr(); + // If last output loop ports are the same expressions - first executive Loop has inner ID in expression loop IDs. + if (lhs_last_expr == rhs_last_expr) { + for (const auto& id : lhs_last_expr->get_loop_ids()) { + if (id == lhs) return false; + if (id == rhs) return true; + } + OPENVINO_THROW("Incorrect Loop IDs"); + } else { + return lhs_last_expr->get_exec_num() < rhs_last_expr->get_exec_num(); + } + }; + + std::sort(loop_ids_need_extract.begin(), loop_ids_need_extract.end(), sorter); + return loop_ids_need_extract; +} + void remove_last_loop_id(const std::shared_ptr& expr) { auto loop_ids = expr->get_loop_ids(); OPENVINO_ASSERT(!loop_ids.empty(), "Expr loop_ids should not be empty when remove last loop id."); @@ -179,22 +207,10 @@ bool ExtractLoopInvariants::run(LinearIR& linear_ir, lowered::LinearIR::constExp OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractLoopInvariants") bool modified = false; - const auto& loop_depth = linear_ir.get_config().m_loop_depth; - std::vector> loop_ids_need_extract(loop_depth); - const auto& loop_map = linear_ir.get_loop_manager()->get_map(); - for (const auto& loop : loop_map) { - const auto& loop_dim = loop.second->get_dim_idx(); - if (loop_dim != LoopInfo::UNDEFINED_DIM_IDX) { - OPENVINO_ASSERT(loop_dim < loop_depth, "dim_idx of loop should be smaller than loop_depth"); - loop_ids_need_extract[loop_dim].insert(loop.first); - } - } // move invariant expr to top(outside) of current loop - for (size_t d = 0; d < loop_depth; d++) { - const auto& loops_in_this_depth = loop_ids_need_extract[d]; - for (const auto& loop_id : loops_in_this_depth) { - modified |= extract_from_loop(loop_id, linear_ir); - } + const auto loop_ids_need_extract = get_reordered_loop_ids(linear_ir.get_loop_manager()); + for (const auto& loop_id : loop_ids_need_extract) { + modified |= extract_from_loop(loop_id, linear_ir); } return modified; diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index 1673e4ddf712c0..3708896f5abf39 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -65,13 +65,18 @@ bool FuseLoops::can_be_fused(const UnifiedLoopInfoPtr& loop_upper, const Unified (work_amount_upper == work_amount_lower) && increment_upper == increment_lower; const bool bcastable_upper = work_amount_upper == 1 && increment_upper == 1; const bool bcastable_lower = work_amount_lower == 1 && increment_lower == 1; - const auto is_const_wa_equal = loop_upper->is_work_amount_const() == loop_lower->is_work_amount_const(); // WA: we can't fuse 2 loops if one of them has first iteration handler but second hasn't, // because in this case Main/Tail body handlers of the loop wo first iter handler must be reset with new parameters // (e.g. tail size). This logic is not implemented for now, so fusion for such loops is skipped. const bool first_iter_handlers_match = loop_upper->get_handlers().get_passes().empty() == loop_lower->get_handlers().get_passes().empty(); - return first_iter_handlers_match && is_const_wa_equal && (is_dynamic_case || equal_parameters || bcastable_upper || bcastable_lower); + // InnerSplittedUnifiedLoops can be fused only if they point to the same outer loop + const auto& ispl_loop_upper = ov::as_type_ptr(loop_upper); + const auto& ispl_loop_lower = ov::as_type_ptr(loop_lower); + const auto& inner_splitted_loop_compatible = + (!ispl_loop_upper && !ispl_loop_lower) || + (ispl_loop_upper && ispl_loop_lower && ispl_loop_upper->get_outer_splitted_loop_info() == ispl_loop_lower->get_outer_splitted_loop_info()); + return first_iter_handlers_match && inner_splitted_loop_compatible && (is_dynamic_case || equal_parameters || bcastable_upper || bcastable_lower); } void FuseLoops::move(LinearIR& linear_ir, const LoopManagerPtr& loop_manager, size_t loop_id, diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index 18575131aad306..8e9b62d8fab825 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -88,8 +88,11 @@ inline int64_t get_ptr_increment(const LoopPort& loop_port, size_t work_amount, } inline int64_t get_finalization_offset(size_t work_amount, int64_t ptr_increment) { - return utils::is_dynamic_value(work_amount) || utils::is_dynamic_value(ptr_increment) ? utils::get_dynamic_value() - : -1 * ptr_increment * work_amount; + if (ptr_increment == 0 || work_amount == 0) + return 0; + if (utils::is_dynamic_value(work_amount) || utils::is_dynamic_value(ptr_increment)) + return utils::get_dynamic_value(); + return -1 * ptr_increment * work_amount; } inline int64_t get_data_size(const LoopPort& loop_port) { @@ -145,7 +148,7 @@ void InitLoops::update_data_pointer_shifts(const UnifiedLoopInfoPtr& loop_info) void InitLoops::update_runtime_parameters(const UnifiedLoopInfoPtr& loop_info) { OPENVINO_ASSERT(loop_info != nullptr, "UnifiedLoopInfo is nullptr, nothing to update"); - if (!loop_info->is_work_amount_const()) + if (!ov::is_type(loop_info)) init_work_amount(loop_info); update_data_pointer_shifts(loop_info); } diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp index dcff90015d28f2..badf4b0477759c 100644 --- a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -91,14 +91,13 @@ size_t InsertSpecificIterations::get_decomposed_loop_increment(const UnifiedLoop size_t remaining_work_amount) { OPENVINO_ASSERT(unified_loop_info, "UnifiedLoopInfo is missed!"); const auto increment = unified_loop_info->get_increment(); - const auto is_dynamic = utils::is_dynamic_value(remaining_work_amount); switch (type) { case (SpecificLoopIterType::FIRST_ITER): case (SpecificLoopIterType::MAIN_BODY): return increment; case(SpecificLoopIterType::LAST_ITER): - return is_dynamic ? 1 : remaining_work_amount; + return remaining_work_amount; default: OPENVINO_THROW("Unknown SpecificLoopIterType!"); } @@ -167,10 +166,9 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp if (is_decomposed_loop_needed(unified_loop_info, iter_type, remaining_work_amount)) { const auto work_amount = get_decomposed_loop_work_amount(unified_loop_info, iter_type, remaining_work_amount); const auto increment = get_decomposed_loop_increment(unified_loop_info, iter_type, remaining_work_amount); - const auto evaluate_once = !utils::is_dynamic_value(work_amount) && work_amount == increment; // Update remaining Loop work amount // Note: if work_amount is unknown and increment = 1, it means that a loop will iterate by whole work_amount - if (!is_wa_dynamic || increment == 1) { + if (!is_wa_dynamic || increment == 1 || iter_type == SpecificLoopIterType::LAST_ITER) { remaining_work_amount -= work_amount; } @@ -200,7 +198,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp const auto decomposed_loop_info = std::make_shared(work_amount, increment, decomposed_loop_entry_ports, decomposed_loop_exit_ports, decomposed_ptr_increments, decomposed_finalization_offsets, - decomposed_data_sizes, iter_type, unified_loop_info, false, evaluate_once); + decomposed_data_sizes, iter_type, unified_loop_info); init_decomposed_loop(linear_ir, decomposed_loop_begin_it, decomposed_loop_end_it, decomposed_loop_info, loop_id, decomposed_loop_end); decomposed = true; diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp index a3ee577338a691..3e035628df476f 100644 --- a/src/common/snippets/src/lowered/pass/iter_handler.cpp +++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp @@ -80,78 +80,17 @@ std::shared_ptr SetFillOffset::merge(const std::shared_ptrget_node(); - const auto loop_end = ov::as_type_ptr(node); - OPENVINO_ASSERT(loop_end, "the last operation in range must be LoopEnd"); - - const auto& loop_manager = linear_ir.get_loop_manager(); - const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); - const auto current_dim_idx = loop_info->get_dim_idx(); - OPENVINO_ASSERT(current_dim_idx != LoopInfo::UNDEFINED_DIM_IDX, - "Outer splitted loop unexpectedly iterates by several dimension indices"); - - bool modified = false; - for (auto it = begin; it != end; ++it) { - const auto& expr = *it; - const auto inner_loop_end = ov::as_type_ptr(expr->get_node()); - if (!inner_loop_end) - continue; - // There is already ExpandedLoopInfo - const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id()); - const auto inner_dim_idx = inner_loop_info->get_dim_idx(); - if (inner_dim_idx != current_dim_idx) - continue; - // TODO [141735] : At the moment Splitted loops are not supported in dynamic case - OPENVINO_ASSERT(!inner_loop_end->has_dynamic_params(), "inner loop must be static in TransformInnerSplitLoop"); - const auto inner_loop_begin = inner_loop_end->get_loop_begin(); - const auto inner_loop_work_amount = static_cast(inner_loop_end->get_work_amount()); - const auto inner_loop_increment = inner_loop_end->get_increment(); - auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets(); - for (auto& offset : inner_finalization_offsets) { - offset = offset / inner_loop_work_amount * static_cast(m_tail_size); - } - inner_loop_end->set_work_amount(m_tail_size); - // Since the loop has work amount equal to increment of outer loop, not broadcasted dimension, - // we should set `work_amount_const = true` to avoid rewriting in common loop intiialization passes (for example, `InitLoops`) - inner_loop_info->set_work_amount_const(true); - // TODO: if m_tail_size more than inner loop increment, - // handlers of the inner loop must be reset with new tail size - inner_loop_end->set_increment(std::min(inner_loop_increment, m_tail_size)); - inner_loop_end->set_finalization_offsets(inner_finalization_offsets); - const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin)); - const auto inner_loop_end_it = std::next(it); - OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!"); - const auto& last_iter_handlers = inner_loop_info->get_unified_loop_info()->get_handlers().get_passes(); - last_iter_handlers.run(linear_ir, std::next(inner_loop_begin_it), inner_loop_end_it); - modified = true; - } - return modified; -} - -std::shared_ptr TransformInnerSplitLoop::merge(const std::shared_ptr& other) { - const auto merged_pass = std::make_shared(m_tail_size); - if (other == nullptr) - return merged_pass; - const auto casted_pass = ov::as_type_ptr(other); - if (!casted_pass || m_tail_size != casted_pass->m_tail_size) - return nullptr; - return merged_pass; -} - -bool SetEvaluateOnce::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { +bool SetLoopIncrementOne::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { const auto& loop_end = ov::as_type_ptr(end->get()->get_node()); - OPENVINO_ASSERT(loop_end, "SetEvaluateOnce expected LoopEnd node in iterator `end`."); + OPENVINO_ASSERT(loop_end, "SetLoopIncrementOne expected LoopEnd node in iterator `end`."); const auto& loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); - loop_info->set_evaluate_once(true); + loop_info->set_increment(1); + loop_end->set_increment(1); return true; } -std::shared_ptr SetEvaluateOnce::merge(const std::shared_ptr& other) { - return !other || ov::is_type(other) ? std::make_shared() : nullptr; +std::shared_ptr SetLoopIncrementOne::merge(const std::shared_ptr& other) { + return !other || ov::is_type(other) ? std::make_shared() : nullptr; } } // namespace pass diff --git a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp index c6255d90106e77..24ba946c92382b 100644 --- a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp +++ b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp @@ -24,7 +24,7 @@ bool OptimizeLoopSingleEvaluation::run(lowered::LinearIR& linear_ir, lowered::Li const auto& expr = *expr_it; if (auto loop_end = ov::as_type_ptr(expr->get_node())) { const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); - if (loop_info->is_evaluate_once()) { + if (loop_info->get_work_amount() == loop_info->get_increment()) { auto new_finalization_offsets = loop_end->get_finalization_offsets(); const auto& ptr_increments = loop_end->get_ptr_increments(); const auto work_amount_incr = static_cast(loop_end->get_increment()); @@ -39,6 +39,7 @@ bool OptimizeLoopSingleEvaluation::run(lowered::LinearIR& linear_ir, lowered::Li // Update the corresponding ExpandedLoopInfo loop_info->update_ptr_increments(loop_end->get_ptr_increments()); loop_info->update_finalization_offsets(loop_end->get_finalization_offsets()); + loop_info->set_evaluate_once(true); is_modified = true; } diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp index 348347c35dd975..e6c59d502d4804 100644 --- a/src/common/snippets/src/lowered/pass/split_loops.cpp +++ b/src/common/snippets/src/lowered/pass/split_loops.cpp @@ -5,6 +5,7 @@ #include "snippets/lowered/pass/split_loops.hpp" #include "snippets/lowered/pass/fuse_loops.hpp" +#include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/pass/iter_handler.hpp" @@ -16,8 +17,6 @@ namespace snippets { namespace lowered { namespace pass { -SplitLoops::SplitLoops() : RangedPass() {} - bool SplitLoops::can_be_split(const UnifiedLoopInfoPtr& loop_to_split, const UnifiedLoopInfoPtr& loop_to_fuse) { OPENVINO_ASSERT(loop_to_split != nullptr && loop_to_fuse != nullptr, "LoopInfo is nullptr!"); const auto current_dim_idx = loop_to_split->get_dim_idx(); @@ -26,9 +25,7 @@ bool SplitLoops::can_be_split(const UnifiedLoopInfoPtr& loop_to_split, const Uni const bool equal_dim_idxes = current_dim_idx != LoopInfo::UNDEFINED_DIM_IDX && current_dim_idx == parent_dim_idx; const bool only_main_body = handlers.get_passes().empty() && handlers.get_passes().empty(); - // TODO [141735] : At the moment Splitted loops are not supported in dynamic case - const auto are_static = !loop_to_split->is_dynamic() && !loop_to_fuse->is_dynamic(); - return are_static && loop_to_split->get_work_amount() == loop_to_fuse->get_work_amount() && + return loop_to_split->get_work_amount() == loop_to_fuse->get_work_amount() && loop_to_split->get_increment() != loop_to_fuse->get_increment() && equal_dim_idxes && only_main_body; } @@ -70,32 +67,8 @@ bool SplitLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, const auto& loop_to_fuse = !split_parent ? parent_loop : loop; // We don't split loop which are not compatible with parent loop because such loops will not be fused if (FuseLoops::can_be_fused(upper_loop, lower_loop) && can_be_split(loop_to_split, loop_to_fuse)) { + split(linear_ir, split_parent ? parent_loop_id : loop_id, loop_to_fuse->get_increment()); loop_was_split = true; - loop_to_split->set_work_amount(loop_to_fuse->get_increment()); - // Since the loop has work amount equal to increment of outer loop, not broadcasted dimension, - // we should set `work_amount_const = true` to avoid rewriting in common loop intiialization passes (for example, `InitLoops`) - loop_to_split->set_work_amount_const(true); - - const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; - const auto loop_bounds = LoopManager::get_loop_bounds(linear_ir, loop_to_split_id, - loop_to_split->get_input_ports(), - loop_to_split->get_output_ports()); - const auto split_loop_id = loop_manager->mark_loop(loop_bounds.first, - loop_bounds.second, - loop_to_fuse->get_work_amount(), - loop_to_fuse->get_increment(), - loop_to_split->get_dim_idx(), - loop_to_split->get_input_ports(), - loop_to_split->get_output_ports()); - const auto& new_loop_info = loop_manager->get_loop_info(split_loop_id); - const auto work_amount = loop_to_fuse->get_work_amount(); - const auto increment = loop_to_fuse->get_increment(); - const auto tail_size = work_amount % increment; - auto new_handlers = loop_to_split->get_handlers(); - if (tail_size != 0) { - new_handlers.register_pass(tail_size); - } - new_loop_info->set_handlers(new_handlers); break; } } @@ -107,6 +80,108 @@ bool SplitLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, FuseLoops().run(linear_ir, begin, end); return loop_was_split; } + +void SplitLoops::split(LinearIR& linear_ir, size_t loop_to_split_id, size_t outer_increment) { + const auto& loop_manager = linear_ir.get_loop_manager(); + + const auto& inner_loop_info = loop_manager->get_loop_info(loop_to_split_id); + const auto loop_bounds = LoopManager::get_loop_bounds(linear_ir, loop_to_split_id, + inner_loop_info->get_input_ports(), + inner_loop_info->get_output_ports()); + const auto outer_loop_id = loop_manager->mark_loop(loop_bounds.first, loop_bounds.second, inner_loop_info->get_work_amount(), + outer_increment, inner_loop_info->get_dim_idx(), + inner_loop_info->get_input_ports(), inner_loop_info->get_output_ports(), false); + const auto& outer_loop_info = loop_manager->get_loop_info(outer_loop_id); + + const auto& inner_splitted_loop_info = + std::make_shared(inner_loop_info->get_increment(), inner_loop_info->get_input_ports(), + inner_loop_info->get_output_ports(), inner_loop_info->get_input_port_descs(), + inner_loop_info->get_output_port_descs(), inner_loop_info->get_handlers(), + outer_loop_info); + loop_manager->replace_with_new_loop(linear_ir, loop_bounds.first, loop_bounds.second, inner_splitted_loop_info, loop_to_split_id); + + if (!outer_loop_info->get_handlers().get_passes().empty()) { + outer_loop_info->register_pass_to_handler(); + } + outer_loop_info->register_pass_to_handler(); + outer_loop_info->register_pass_to_handler(); +} + +namespace { +InnerSplittedUnifiedLoopInfoPtr make_own_inner_splitted_unified_loop_info(const ExpandedLoopInfoPtr& inner_expanded, + const ExpandedLoopInfoPtr& outer_expanded, + const InnerSplittedUnifiedLoopInfoPtr& existing_inner_unified) { + const auto loop_info = + std::make_shared(inner_expanded->get_increment(), inner_expanded->get_input_ports(), + inner_expanded->get_output_ports(), existing_inner_unified->get_input_port_descs(), + existing_inner_unified->get_output_port_descs(), existing_inner_unified->get_handlers(), + outer_expanded); + InitLoops::update_runtime_parameters(loop_info); + return loop_info; +} +ExpandedLoopInfoPtr make_own_inner_splitted_expanded_loop_info(const ExpandedLoopInfoPtr& inner_expanded, + const InnerSplittedUnifiedLoopInfoPtr& inner_unified) { + return std::make_shared(inner_unified->get_work_amount(), inner_unified->get_increment(), + inner_unified->get_input_ports(), inner_unified->get_output_ports(), + inner_unified->get_ptr_increments(), + inner_unified->get_finalization_offsets(), + inner_unified->get_data_sizes(), inner_expanded->get_type(), + inner_unified, inner_expanded->is_evaluate_once()); +} +} // namespace + +bool SplitLoops::TransformInnerSplitLoop::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + OPENVINO_ASSERT(end != linear_ir.cend(), "Incorrect LinearIR range for processing"); + const auto& expr = *end; + const auto node = expr->get_node(); + const auto loop_end = ov::as_type_ptr(node); + OPENVINO_ASSERT(loop_end, "the last operation in range must be LoopEnd"); + + const auto& loop_manager = linear_ir.get_loop_manager(); + const auto& outer_loop_info = loop_manager->get_loop_info(loop_end->get_id()); + const auto current_dim_idx = outer_loop_info->get_dim_idx(); + OPENVINO_ASSERT(current_dim_idx != LoopInfo::UNDEFINED_DIM_IDX, + "Outer splitted loop unexpectedly iterates by several dimension indices"); + + bool modified = false; + for (auto it = begin; it != end; ++it) { + const auto& expr = *it; + const auto inner_loop_end = ov::as_type_ptr(expr->get_node()); + if (!inner_loop_end) + continue; + + // There is already ExpandedLoopInfo + const auto& inner_expanded_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id()); + const auto inner_unified_loop_info = ov::as_type_ptr(inner_expanded_loop_info->get_unified_loop_info()); + if (!inner_unified_loop_info || inner_unified_loop_info->get_outer_splitted_loop_info() != outer_loop_info->get_unified_loop_info()) + continue; + + OPENVINO_ASSERT(current_dim_idx == inner_unified_loop_info->get_dim_idx(), "Incorrect processing dim index of splitted loops"); + OPENVINO_ASSERT(inner_expanded_loop_info->get_type() == SpecificLoopIterType::MAIN_BODY, "InnerSplittedLoop must be Main Body of loop"); + + // We have to make a new UnifiedLoopInfo to distinguish it from other unified loops in other specific iterations of outer loop. + const auto inner_splitted_unified_loop_info = make_own_inner_splitted_unified_loop_info(inner_expanded_loop_info, outer_loop_info, + inner_unified_loop_info); + + // We have to replace existing ExpandedLoopInfo with new one to have the own InnerSplittedUnifiedLoopInfo and + // distinguish it from other expanded loops in other specific iterations of outer loop. + const auto new_expanded_inner_loop_info = make_own_inner_splitted_expanded_loop_info(inner_expanded_loop_info, inner_splitted_unified_loop_info); + const auto inner_begin = linear_ir.find_before(it, linear_ir.get_expr_by_node(inner_loop_end->get_loop_begin())); + const auto new_id = loop_manager->replace_with_new_loop(linear_ir, inner_begin, std::next(it), new_expanded_inner_loop_info, inner_loop_end->get_id()); + + // [147894] : Update inner LoopEnd expression + inner_loop_end->set_id(new_id); + inner_loop_end->set_work_amount(new_expanded_inner_loop_info->get_work_amount()); + inner_loop_end->set_increment(new_expanded_inner_loop_info->get_increment()); + inner_loop_end->set_finalization_offsets(new_expanded_inner_loop_info->get_finalization_offsets()); + } + return modified; +} + +std::shared_ptr SplitLoops::TransformInnerSplitLoop::merge(const std::shared_ptr& other) { + return !other || ov::is_type(other) ? std::make_shared() : nullptr; +} + } // namespace pass } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp b/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp index cc598b4e768cd4..ec77d75fe3ff43 100644 --- a/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp +++ b/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp @@ -43,49 +43,51 @@ void ValidateExpandedLoops::validate_loop_information(const LinearIR& linear_ir) const auto& loop_manager = linear_ir.get_loop_manager(); const auto& loop_map = loop_manager->get_map(); - UnifiedLoopInfoPtr current_unified_loop_info = nullptr; - std::vector total_finalization_offsets; - size_t current_work_amount = 0; - size_t num_ports = 0; + // Initialized UnifiedLoopInfo + struct CurrentUnifiedLoopInfo { + size_t work_amount = 0; + size_t num_ports = 0; + size_t id = 0; + std::vector finalization_offsets; + }; + std::unordered_map initializated_info_map; for (const auto& p : loop_map) { const auto& expanded_loop_info = ov::as_type_ptr(p.second); INFORMATIVE_ASSERT(expanded_loop_info, "expects only ExpandedLoopInfo in LoopManager"); - const auto& unified_loop_info = expanded_loop_info->get_unified_loop_info(); - INFORMATIVE_ASSERT(unified_loop_info, "expects non nullptr UnifiedLoopInfo in ExpandedLoopInfo"); + const auto& current_unified_loop_info = expanded_loop_info->get_unified_loop_info(); + INFORMATIVE_ASSERT(current_unified_loop_info, "expects non nullptr UnifiedLoopInfo in ExpandedLoopInfo"); - if (unified_loop_info != current_unified_loop_info) { - // If there is `current_unified_loop_info` - the previos loop is finished and need to validate total information - if (current_unified_loop_info) { - INFORMATIVE_ASSERT(current_work_amount == current_unified_loop_info->get_work_amount(), - "total work amount of expanded loops is not equal to work amount of undefined loop"); - INFORMATIVE_ASSERT(total_finalization_offsets == current_unified_loop_info->get_finalization_offsets(), - "total finalization offsets are not equal to finalization offsets of undefined loop"); - } - - current_unified_loop_info = unified_loop_info; - - INFORMATIVE_ASSERT(current_unified_loop_info->get_input_count() == expanded_loop_info->get_input_count() && - current_unified_loop_info->get_output_count() == expanded_loop_info->get_output_count(), - "incompatible loop ports with UnifiedLoopInfo"); - - current_work_amount = 0; - num_ports = expanded_loop_info->get_input_count() + expanded_loop_info->get_output_count(); - total_finalization_offsets.clear(); - total_finalization_offsets.resize(num_ports, 0); + auto& current_info = initializated_info_map[current_unified_loop_info]; + if (current_info.num_ports == 0) { // the info was just default constructed + current_info.num_ports = current_unified_loop_info->get_input_count() + current_unified_loop_info->get_output_count(); + current_info.finalization_offsets.resize(current_info.num_ports, 0); } - current_work_amount = utils::dynamic_safe_add(current_work_amount, expanded_loop_info->get_work_amount()); - INFORMATIVE_ASSERT(current_unified_loop_info, "expects non nullptr current UnifiedLoopInfo"); + INFORMATIVE_ASSERT(current_unified_loop_info->get_input_count() == expanded_loop_info->get_input_count() && + current_unified_loop_info->get_output_count() == expanded_loop_info->get_output_count(), + "incompatible loop ports with UnifiedLoopInfo"); + + current_info.work_amount = utils::dynamic_safe_add(current_info.work_amount, expanded_loop_info->get_work_amount()); INFORMATIVE_ASSERT(current_unified_loop_info->get_ptr_increments() == expanded_loop_info->get_ptr_increments(), "incompatible pointer increments with UnifiedLoopInfo"); const auto& finalization_offsets = expanded_loop_info->get_finalization_offsets(); - INFORMATIVE_ASSERT(finalization_offsets.size() == total_finalization_offsets.size(), + INFORMATIVE_ASSERT(finalization_offsets.size() == current_info.finalization_offsets.size(), "incompatible finalization offset count"); - for (size_t i = 0; i < num_ports; ++i) - total_finalization_offsets[i] = utils::dynamic_safe_add(total_finalization_offsets[i], finalization_offsets[i]); + for (size_t i = 0; i < current_info.num_ports; ++i) + current_info.finalization_offsets[i] = utils::dynamic_safe_add(current_info.finalization_offsets[i], finalization_offsets[i]); + } + + // Validation of total information + for (const auto& p : initializated_info_map) { + const auto loop_info = p.first; + const auto total_info = p.second; + INFORMATIVE_ASSERT(total_info.work_amount == loop_info->get_work_amount(), + "total work amount of expanded loops is not equal to work amount of undefined loop with ID: " + std::to_string(total_info.id)); + INFORMATIVE_ASSERT(total_info.finalization_offsets == loop_info->get_finalization_offsets(), + "total finalization offsets are not equal to finalization offsets of undefined loop with ID: " + std::to_string(total_info.id)); } } diff --git a/src/common/snippets/src/lowered/specific_loop_iter_handlers.cpp b/src/common/snippets/src/lowered/specific_loop_iter_handlers.cpp index ffc00cdd08e50a..c1302084aa68f7 100644 --- a/src/common/snippets/src/lowered/specific_loop_iter_handlers.cpp +++ b/src/common/snippets/src/lowered/specific_loop_iter_handlers.cpp @@ -13,11 +13,23 @@ namespace ov { namespace snippets { namespace lowered { -SpecificIterationHandlers::SpecificIterationHandlers(size_t loop_work_amount, size_t loop_increment) { - const auto tail_size = utils::is_dynamic_value(loop_work_amount) ? 1lu : loop_work_amount % loop_increment; - if (tail_size != 0) { - m_last_iter_handlers.register_pass(tail_size); - m_last_iter_handlers.register_pass(tail_size); +SpecificIterationHandlers::SpecificIterationHandlers(size_t loop_work_amount, size_t loop_increment, size_t processing_dim_idx) { + // The following handlers are set only for Last Iter processing + if (loop_increment > 1) { + size_t last_iter_increment = utils::get_dynamic_value(); + if (!utils::is_dynamic_value(loop_work_amount)) { + last_iter_increment = loop_work_amount % loop_increment; + } else if (utils::is_dynamic_value(loop_work_amount) && processing_dim_idx == 0) { + // [149935] : Last Iterations of Loop processed last dimensions with Eltwise nodes inside should have increment = 1 + last_iter_increment = 1; + } + if (last_iter_increment != 0) { + m_last_iter_handlers.register_pass(last_iter_increment); + m_last_iter_handlers.register_pass(last_iter_increment); + // Last Iterations of Loop processed last dimensions with Eltwise nodes inside should have increment = 1 + if (last_iter_increment == 1) + m_last_iter_handlers.register_pass(); + } } } diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index e22e4b3bee57fb..552455b89f5529 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -17,6 +17,24 @@ namespace snippets { using namespace ov::snippets::pass; using namespace ov::snippets::lowered; +#ifdef SNIPPETS_DEBUG_CAPS +std::string RuntimeConfig::to_string() const { + std::stringstream out; + out << " ========== RuntimeConfig state ==========\n" << + "tensor_rank: " << tensor_rank << "\n" << + "tile_rank: " << tile_rank << "\n" << + "master_shape: " << ov::Shape(master_shape) << "\n"; + out << "io_data_offsets: " << "\n"; + for (size_t i = 0; i < io_data_offsets.size(); ++i) + out << "\t[" << i << "]" << ov::Shape(io_data_offsets[i]) << "\n"; + out << "buffer_scratchpad_size: " << buffer_scratchpad_size << "\n"; + out << "buffer_cluster_offsets: " << "\n"; + for (size_t i = 0; i < buffer_cluster_offsets.size(); ++i) + out << "\t[" << i << "]" << buffer_cluster_offsets[i] << "\n"; + return out.str(); +} +#endif + RuntimeConfigurator::RuntimeConfigurator(std::shared_ptr c) : m_config(std::move(c)) { OPENVINO_ASSERT(m_config, "Runtime config is nullptr!"); @@ -45,6 +63,21 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRCPtr& linear_ir) m_config->io_data_offsets.resize(m_io_num); m_config->tile_rank = linear_ir->get_config().m_loop_depth; m_optimizer.init(linear_ir, m_io_descs, m_in_num); + + // InnerSplittedLoops should be inited after OuterSplittedLoops + const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + m_ordered_loop_ids.clear(); + m_ordered_loop_ids.reserve(loop_map.size()); + std::vector loops_must_be_last; + for (const auto& p : loop_map) { + const auto loop_id = p.first; + const auto& expanded_loop_info = ov::as_type_ptr(p.second); + OPENVINO_ASSERT(expanded_loop_info, "UpdateLoopInfo expects ExpandedLoopInfo in LoopManager"); + const auto& unified_loop_info = expanded_loop_info->get_unified_loop_info(); + auto& collection = ov::is_type(unified_loop_info) ? loops_must_be_last : m_ordered_loop_ids; + collection.push_back(loop_id); + } + m_ordered_loop_ids.insert(m_ordered_loop_ids.end(), loops_must_be_last.cbegin(), loops_must_be_last.cend()); } void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { @@ -152,18 +185,17 @@ void RuntimeConfigurator::init_buffer_info(const lowered::LinearIRCPtr& linear_i void RuntimeConfigurator::update_loop_info(const lowered::LinearIRCPtr& linear_ir, LoopInfoRuntimeParamsMap& initializated_info_map) const { - const auto& loop_map = linear_ir->get_loop_manager()->get_map(); - for (const auto& p : loop_map) { - const auto& expanded_loop_info = ov::as_type_ptr(p.second); - OPENVINO_ASSERT(expanded_loop_info, "UpdateLoopInfo expects ExpandedLoopInfo in LoopManager"); + auto update_unified_loop_info = [&](const lowered::UnifiedLoopInfoPtr& unified_loop_info) { + if (initializated_info_map.count(unified_loop_info) == 0) { + lowered::pass::InitLoops::update_runtime_parameters(unified_loop_info); + initializated_info_map[unified_loop_info] = compute_runtime_params(unified_loop_info); + } + }; - // First visiting of unified (whole) loop + auto update_expanded_loop_info = [&](const lowered::ExpandedLoopInfoPtr& expanded_loop_info) { const auto& current_unified_loop_info = expanded_loop_info->get_unified_loop_info(); - if (initializated_info_map.count(current_unified_loop_info) == 0) { - lowered::pass::InitLoops::update_runtime_parameters(current_unified_loop_info); - initializated_info_map[current_unified_loop_info] = compute_runtime_params(current_unified_loop_info); - } + OPENVINO_ASSERT(initializated_info_map.count(current_unified_loop_info) > 0, "UnifiedLoopInfo must be updated before ExpandedLoopInfo"); auto& initializated_info = initializated_info_map.at(current_unified_loop_info); auto& current_work_amount = initializated_info.work_amount; const auto& ptr_increments = initializated_info.ptr_increments; @@ -176,7 +208,7 @@ void RuntimeConfigurator::update_loop_info(const lowered::LinearIRCPtr& linear_i expanded_loop_info->set_work_amount(0); if (expanded_loop_info->is_evaluate_once()) expanded_loop_info->set_increment(0); - continue; + return; } const auto work_amount = @@ -196,6 +228,22 @@ void RuntimeConfigurator::update_loop_info(const lowered::LinearIRCPtr& linear_i expanded_loop_info->update_ptr_increments(ptr_increments); } expanded_loop_info->update_finalization_offsets(updated_finalization_offsets); + }; + + auto update_loop_info = [&](const lowered::LoopInfoPtr& loop_info) { + if (const auto unified_loop_info = ov::as_type_ptr(loop_info)) { + update_unified_loop_info(unified_loop_info); + } else if (const auto expanded_loop_info = ov::as_type_ptr(loop_info)) { + update_expanded_loop_info(expanded_loop_info); + } else { + OPENVINO_THROW("Failed to update loop info: unknown type!"); + } + }; + + lowered::LoopInfoSet updated_loops; + const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + for (const auto& p : loop_map) { + p.second->apply(update_loop_info, updated_loops); } } @@ -203,6 +251,12 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRC const auto& loop_manager = linear_ir->get_loop_manager(); m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size(); + auto is_not_executed = [&loop_manager](const lowered::ExpressionPtr& buffer_expr) { + const auto& loop_ids = buffer_expr->get_loop_ids(); + return std::any_of(loop_ids.cbegin(), loop_ids.cend(), + [&loop_manager](size_t loop_id) { return loop_manager->get_loop_info(loop_id)->get_work_amount() == 0; }); + }; + for (const auto& p : m_dynamic_buffer_clusters) { const auto& cluster_id = p.first; const auto& cluster = p.second; @@ -212,13 +266,16 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRC size_t additional_size = 0; for (const auto& buffer_expr : cluster) { + // No need to calculate allocation size of Buffers which are in Loops with `work_amount = 0` - they won't be executed + if (is_not_executed(buffer_expr)) + continue; const auto& allocation_size = lowered::pass::ComputeBufferAllocationSize::get_allocation_size(loop_manager, buffer_expr, m_config->tile_rank); + OPENVINO_ASSERT(!utils::is_dynamic_value(allocation_size), "Buffer scratchpad size must be defined!"); additional_size = std::max(allocation_size * buffer_expr->get_node()->get_element_type().size(), additional_size); } cluster_offset = m_config->buffer_scratchpad_size; OPENVINO_ASSERT(!utils::is_dynamic_value(cluster_offset), "Offset of the cluster must be defined!"); - OPENVINO_ASSERT(!utils::is_dynamic_value(additional_size), "Buffer scratchpad size must be defined!"); m_config->buffer_scratchpad_size += additional_size; } diff --git a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp index db58805a8f023b..3e148d3c1cf329 100644 --- a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp +++ b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp @@ -7,12 +7,8 @@ #include "openvino/opsets/opset10.hpp" #include "snippets/lowered/pass/extract_loop_invariants.hpp" #include "snippets/lowered/pass/normalize_loop_ids.hpp" -#include "snippets/op/broadcastmove.hpp" -#include "snippets/op/scalar.hpp" -#include "snippets/op/vector_buffer.hpp" -#include "snippets/op/horizon_max.hpp" -#include "snippets/op/horizon_sum.hpp" -#include "snippets/op/powerstatic.hpp" +#include "snippets/lowered/pass/split_loops.hpp" +#include "snippets/snippets_isa.hpp" namespace ov { namespace test { @@ -292,6 +288,76 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsFromInnermostToLoopOuts } } +TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsSplitLoops) { + size_t vector_size = 16; + size_t block_size = 32; + const auto input_precision = ov::element::f32; + const ov::Shape input_shape_0{128, 512}; + const ov::Shape input_shape_1{512, 64}; + const ov::Shape input_shape_2{1, 1}; + const ov::snippets::VectorDims layout{0, 1}; + const ov::snippets::VectorDims subtensor{1, vector_size}; + /* + * Params Param2(1,1) + * \ / + * MatMul Broadcast + * \ / + * Add + * | + * Result + */ + { + const auto param0 = linear_ir->push_node(input_precision, input_shape_0); + const auto param1 = linear_ir->push_node(input_precision, input_shape_1); + const auto param2 = linear_ir->push_node(input_precision, input_shape_2); + const auto matmul = linear_ir->push_node(param0.second, param1.second); + const auto broadcastmove = linear_ir->push_node(param2.second, input_shape_1.back()); + init_expr_descriptors(*broadcastmove.first, {{1, 1}, subtensor}, {layout, layout}); + const auto add = linear_ir->push_node(matmul.second, broadcastmove.second); + init_expr_descriptors(*add.first, {subtensor, subtensor, subtensor}, {layout, layout, layout}); + const auto result = linear_ir->push_node(add.second); + const auto& loop_manager = linear_ir->get_loop_manager(); + loop_manager->mark_loop(matmul.first, broadcastmove.first, 128, block_size, 1, + std::vector{LoopPort((*matmul.first)->get_input_port(0)), + LoopPort((*matmul.first)->get_input_port(1), false)}, + std::vector{LoopPort((*matmul.first)->get_output_port(0))}); + loop_manager->mark_loop(broadcastmove.first, result.first, 64, vector_size, 0, + std::vector{LoopPort((*broadcastmove.first)->get_input_port(0)), + LoopPort((*add.first)->get_input_port(0))}, + std::vector{LoopPort((*add.first)->get_output_port(0))}); + loop_manager->mark_loop(broadcastmove.first, result.first, 128, 1, 1, + std::vector{LoopPort((*broadcastmove.first)->get_input_port(0)), + LoopPort((*add.first)->get_input_port(0))}, + std::vector{LoopPort((*add.first)->get_output_port(0))}); + ov::snippets::lowered::pass::SplitLoops().run(*linear_ir, linear_ir->begin(), linear_ir->end()); + } + { + const auto param0 = linear_ir_ref->push_node(input_precision, input_shape_0); + const auto param1 = linear_ir_ref->push_node(input_precision, input_shape_1); + const auto param2 = linear_ir_ref->push_node(input_precision, input_shape_2); + auto broadcastmove = linear_ir_ref->push_node(param2.second, input_shape_1.back()); + init_expr_descriptors(*broadcastmove.first, {{1, 1}, subtensor}, {layout, layout}); + const auto matmul = linear_ir_ref->push_node(param0.second, param1.second); + const auto add = linear_ir_ref->push_node(matmul.second, broadcastmove.second); + init_expr_descriptors(*add.first, {subtensor, subtensor, subtensor}, {layout, layout, layout}); + auto result = linear_ir_ref->push_node(add.second); + const auto& loop_manager = linear_ir_ref->get_loop_manager(); + loop_manager->mark_loop(matmul.first, add.first, 128, block_size, 1, + std::vector{LoopPort((*matmul.first)->get_input_port(0)), + LoopPort((*matmul.first)->get_input_port(1), false)}, + std::vector{LoopPort((*matmul.first)->get_output_port(0))}); + loop_manager->mark_loop(add.first, result.first, 64, vector_size, 0, + std::vector{LoopPort((*add.first)->get_input_port(0)), + LoopPort((*add.first)->get_input_port(1))}, + std::vector{LoopPort((*add.first)->get_output_port(0))}); + loop_manager->mark_loop(add.first, result.first, 128, 1, 1, + std::vector{LoopPort((*add.first)->get_input_port(0)), + LoopPort((*add.first)->get_input_port(1))}, + std::vector{LoopPort((*add.first)->get_output_port(0))}); + ov::snippets::lowered::pass::SplitLoops().run(*linear_ir_ref, linear_ir_ref->begin(), linear_ir_ref->end()); + } +} + class ExtractLoopInvariantsRemoveLoopsTest : public LoweredPassTestsF { public: ExtractLoopInvariantsRemoveLoopsTest() : LoweredPassTestsF() { diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp index a7214c755f5ed6..610b98e558760f 100644 --- a/src/common/snippets/tests/src/lowered/pass/loop.cpp +++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp @@ -2,23 +2,23 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/op/loop.hpp" - #include +#include "openvino/opsets/opset10.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/pass/cleanup_loop_offsets.hpp" #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_load_store.hpp" #include "snippets/lowered/pass/insert_loops.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" -#include "snippets/lowered/pass/iter_handler.hpp" +#include "snippets/lowered/pass/split_loops.hpp" +#include "snippets/lowered/pass/insert_buffers.hpp" #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" #include "snippets/lowered/pass/validate_unified_loops.hpp" #include "snippets/lowered/pass/validate_expanded_loops.hpp" #include "snippets/lowered/pass/normalize_loop_ids.hpp" #include "snippets/shape_inference/shape_inference.hpp" -#include "subgraph_simple.hpp" using Snippets_TailProcessingTransformation = ::testing::Test; // [Inserted Loop number, [ptr_increments, final_offsets] @@ -27,34 +27,32 @@ using namespace ov::snippets::lowered; constexpr static size_t vector_size = 16; -static void init_linear_ir(const std::vector& in_shapes, LinearIR& linear_ir, size_t block_size) { - auto body = ov::test::snippets::AddFunction(in_shapes).getOriginal(); - auto shape_infer_factory = std::make_shared(); - linear_ir = LinearIR(body, shape_infer_factory); - auto expr_it = std::find_if(linear_ir.cbegin(), linear_ir.cend(), - [](const ExpressionPtr& expr) { return ov::is_type(expr->get_node()); }); - ASSERT_TRUE(expr_it != linear_ir.cend()); - const auto add = *expr_it; - const auto loop_input_ports = std::vector{add->get_input_port(0), add->get_input_port(1)}; - const auto loop_output_ports = std::vector{add->get_output_port(0)}; +static void init_linear_ir(const std::vector& in_shapes, LinearIR& linear_ir, size_t block_size) { + Config lir_config; + lir_config.m_manual_build_support = true; + linear_ir = LinearIR(lir_config, std::make_shared()); + + const ov::element::Type input_precision = ov::element::f32; + const auto param0 = linear_ir.push_node(input_precision, in_shapes[0]); + const auto param1 = linear_ir.push_node(input_precision, in_shapes[1]); + const auto param2 = linear_ir.push_node(input_precision, in_shapes[2]); + const auto matmul = linear_ir.push_node(param0.second, param1.second); + const auto add = linear_ir.push_node(matmul.second, param2.second); + const auto result = linear_ir.push_node(add.second); + const auto loop_manager = linear_ir.get_loop_manager(); - const auto in_shape0 = in_shapes[0].get_shape(); - const auto in_shape1 = in_shapes[1].get_shape(); - const auto inner_wa = std::max(*in_shape0.rbegin(), *in_shape1.rbegin()); - const auto inner_inc = std::min(vector_size, inner_wa); - const auto blocked_wa = block_size; - const auto blocked_inc = 1; - const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1)); - const auto outer_inc = blocked_wa; - loop_manager->mark_loop(expr_it, std::next(expr_it), inner_wa, inner_inc, 0, loop_input_ports, loop_output_ports); - loop_manager->mark_loop(expr_it, std::next(expr_it), blocked_wa, blocked_inc, 1, loop_input_ports, loop_output_ports, true, true); - const auto loop_id = loop_manager->mark_loop(expr_it, std::next(expr_it), outer_wa, outer_inc, 1, loop_input_ports, loop_output_ports); - const auto& outer_loop_info = loop_manager->get_loop_info(loop_id); - const auto outer_tail_size = outer_wa % outer_inc; - if (outer_tail_size != 0) { - outer_loop_info->register_pass_to_handler( - outer_tail_size); - } + linear_ir.get_loop_manager()->mark_loop(matmul.first, add.first, in_shapes[0].front(), block_size, 1, + std::vector{LoopPort((*matmul.first)->get_input_port(0)), + LoopPort((*matmul.first)->get_input_port(1), false)}, + std::vector{LoopPort((*matmul.first)->get_output_port(0))}); + linear_ir.get_loop_manager()->mark_loop(add.first, result.first, in_shapes[2].back(), vector_size, 0, + std::vector{LoopPort((*add.first)->get_input_port(0)), + LoopPort((*add.first)->get_input_port(1))}, + std::vector{LoopPort((*add.first)->get_output_port(0))}); + linear_ir.get_loop_manager()->mark_loop(add.first, result.first, in_shapes[2].front(), 1, 1, + std::vector{LoopPort((*add.first)->get_input_port(0)), + LoopPort((*add.first)->get_input_port(1))}, + std::vector{LoopPort((*add.first)->get_output_port(0))}); } static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr& config) { @@ -64,6 +62,8 @@ static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr(); + pipeline.register_pass(); pipeline.register_pass(vector_size); pipeline.register_pass(); pipeline.register_pass(); @@ -77,25 +77,27 @@ static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr loops; for (const auto& expr : linear_ir) { const auto& node = expr->get_node(); const auto loop_end = ov::as_type_ptr(node); if (!loop_end) continue; + const auto loop_num = loop_end->get_id(); ASSERT_GT(reference.count(loop_num), 0); + loops.insert(loop_num); ASSERT_TRUE(loop_end->get_ptr_increments() == reference.at(loop_num).first); ASSERT_TRUE(loop_end->get_finalization_offsets() == reference.at(loop_num).second); - loop_num++; } - ASSERT_EQ(loop_num, reference.size()); + ASSERT_EQ(loops.size(), reference.size()); } TEST(Snippets_TailProcessingTransformation, BlockedWOTail_OriginalPtrShifts) { LinearIR linear_ir; - ov::Shape inputShape0 = {1, 2, 16, 20}; - ov::Shape inputShape1 = {1, 2, 16, 20}; - init_linear_ir({inputShape0, inputShape1}, linear_ir, 4); + ov::Shape inputShape0 = {12, 16}; + ov::Shape inputShape1 = {16, 20}; + ov::Shape inputShape2 = {12, 20}; + init_linear_ir({inputShape0, inputShape1, inputShape2}, linear_ir, 4); auto config = std::make_shared(); config->disable(); @@ -107,16 +109,17 @@ TEST(Snippets_TailProcessingTransformation, BlockedWOTail_OriginalPtrShifts) { std::map, std::vector>> reference; reference[0] = { std::vector(3, 1), std::vector(3, -20)}; reference[1] = { std::vector(3, 20), std::vector(3, -80)}; - reference[2] = { std::vector(3, 20), std::vector(3, -320)}; + reference[2] = { {16, 0, 20, 20}, {-192, 0, -240, -240}}; validate(linear_ir, reference); } TEST(Snippets_TailProcessingTransformation, BlockedWOTail_CleanUpPtrShifts) { LinearIR linear_ir; - ov::Shape inputShape0 = {1, 2, 16, 20}; - ov::Shape inputShape1 = {1, 2, 16, 20}; - init_linear_ir({inputShape0, inputShape1}, linear_ir, 4); + ov::Shape inputShape0 = {12, 16}; + ov::Shape inputShape1 = {16, 20}; + ov::Shape inputShape2 = {12, 20}; + init_linear_ir({inputShape0, inputShape1, inputShape2}, linear_ir, 4); auto config = std::make_shared(); config->disable(); @@ -126,17 +129,18 @@ TEST(Snippets_TailProcessingTransformation, BlockedWOTail_CleanUpPtrShifts) { // [Inserted Loop number, [ptr_increments, final_offsets] std::map, std::vector>> reference; reference[0] = { std::vector(3, 1), std::vector(3, 0)}; - reference[1] = { std::vector(3, 0), std::vector(3, 0)}; - reference[2] = { std::vector(3, 0), std::vector(3, 0)}; + reference[1] = { std::vector(3, 0), {0, -80, 0}}; // -80 - finalization offset for Buffer ptr + reference[2] = { {16, 0, 0, 0}, std::vector(4, 0)}; validate(linear_ir, reference); } TEST(Snippets_TailProcessingTransformation, BlockedTail_OriginalPtrShifts) { LinearIR linear_ir; - ov::Shape inputShape0 = {1, 2, 18, 20}; - ov::Shape inputShape1 = {1, 2, 18, 20}; - init_linear_ir({inputShape0, inputShape1}, linear_ir, 4); + ov::Shape inputShape0 = {14, 16}; + ov::Shape inputShape1 = {16, 20}; + ov::Shape inputShape2 = {14, 20}; + init_linear_ir({inputShape0, inputShape1, inputShape2}, linear_ir, 4); auto config = std::make_shared(); config->disable(); @@ -144,38 +148,37 @@ TEST(Snippets_TailProcessingTransformation, BlockedTail_OriginalPtrShifts) { // [Inserted Loop number, [ptr_increments, final_offsets] std::map, std::vector>> reference; - reference[0] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner - reference[1] = { std::vector(3, 0), std::vector(3, -16)}; // Blocked Inner - reference[2] = { std::vector(3, 20), std::vector(3, -80)}; // Vector Blocked - reference[3] = { std::vector(3, 20), std::vector(3, 0)}; // Vector Outer + reference[0] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner + reference[1] = { std::vector(3, 0), std::vector(3, -16)}; // Tail Inner - reference[4] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner - reference[5] = { std::vector(3, 0), std::vector(3, -16)}; // Blocked Inner - reference[6] = { std::vector(3, 20), std::vector(3, -40)}; // Tail Blocked - reference[7] = { std::vector(3, 0), std::vector(3, -320)}; // Tail Blocked + reference[2] = { std::vector(3, 20), std::vector(3, -80)}; // Inner Vector Blocked + reference[3] = { {16, 0, 20, 20}, std::vector(4, 0)}; // Outer Vector Blocked + + reference[4] = { std::vector(3, 20), std::vector(3, -40)}; // Inner Tail Blocked + reference[5] = { std::vector(4, 0), {-192, 0, -240, -240}}; // Outer Tail Blocked validate(linear_ir, reference); } TEST(Snippets_TailProcessingTransformation, BlockedTail_CleanUpPtrShifts) { LinearIR linear_ir; - ov::Shape inputShape0 = {1, 2, 18, 20}; - ov::Shape inputShape1 = {1, 2, 18, 20}; - init_linear_ir({inputShape0, inputShape1}, linear_ir, 4); + ov::Shape inputShape0 = {14, 16}; + ov::Shape inputShape1 = {16, 20}; + ov::Shape inputShape2 = {14, 20}; + init_linear_ir({inputShape0, inputShape1, inputShape2}, linear_ir, 4); apply_transformations(linear_ir, std::make_shared()); // [Inserted Loop number, [ptr_increments, final_offsets] std::map, std::vector>> reference; - reference[0] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner - reference[1] = { std::vector(3, 0), std::vector(3, 4)}; // Blocked Inner - reference[2] = {std::vector(3, 0), std::vector(3, 0)}; // Vector Blocked - reference[3] = { std::vector(3, 0), std::vector(3, 0)}; // Vector Outer - - reference[4] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner - reference[5] = { std::vector(3, 0), std::vector(3, 4)}; // Blocked Inner - reference[6] = { std::vector(3, 0), std::vector(3, 0)}; // Tail Blocked - reference[7] = { std::vector(3, 0), std::vector(3, 0)}; // Tail Blocked + reference[0] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner + reference[1] = { std::vector(3, 0), std::vector(3, 4)}; // Tail Inner + + reference[2] = { std::vector(3, 0), {0, -80, 0}}; // Inner Vector Blocked (-80 - finalization offset for Buffer ptr) + reference[3] = { {16, 0, 0, 0}, std::vector(4, 0)}; // Outer Vector Blocked + + reference[4] = { std::vector(3, 0), {0, -40, 0}}; // Inner Tail Blocked (-40 - finalization offset for Buffer ptr) + reference[5] = { std::vector(4, 0), {32, 0, 0, 0}}; // Outer Tail Blocked validate(linear_ir, reference); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 40da4130b212c9..56c2c75dae9bc2 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -12,6 +12,27 @@ namespace intel_cpu { const size_t CPURuntimeConfigurator::rank6D = 6; +#ifdef SNIPPETS_DEBUG_CAPS +std::string CPURuntimeConfig::to_string() const { + std::stringstream out; + out << RuntimeConfig::to_string(); + out << "Loop Parameters:" << "\n"; + for (size_t i = 0; i < loop_args.size(); ++i) { + const auto& loop = loop_args[i]; + out << "\t[" << i << "] WA: " << loop.m_work_amount << "\n"; + out << "\tPointer Increments: "; + for (int64_t j = 0; j < loop.m_num_data_ptrs; ++j) + out << loop.m_ptr_increments[j] << " "; + out << "\n"; + out << "\tFinalization offsets: "; + for (int64_t j = 0; j < loop.m_num_data_ptrs; ++j) + out << loop.m_finalization_offsets[j] << " "; + out << "\n"; + } + return out.str(); +} +#endif + CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigurator(std::make_shared()) { } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 2104dfbdd42ddb..00bc676678d189 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -17,6 +17,10 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { OPENVINO_RTTI("CPURuntimeConfig", "0", ov::snippets::RuntimeConfig) CPURuntimeConfig() = default; +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const override; +#endif + std::vector loop_args = {}; }; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp index cb6dfeb741109a..b968786eea8d67 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp @@ -69,7 +69,12 @@ void jit_loop_begin_emitter::emit_impl(const std::vector& in, const std: } // if wa < increment, skip the loop - h->cmp(reg_work_amount, wa_increment); + // Note : If the loop should be evaluated once and increment is dynamic, + // we should manually set `increment = 1` to compare the dynamic work amount + // with `1` at least before loop execution + // (work amount can be zero and we should skip this loop even `evaluate_once = 1`) + auto increment = evaluate_once && snippets::utils::is_dynamic_value(wa_increment) ? 1 : wa_increment; + h->cmp(reg_work_amount, increment); h->jl(*loop_end_label, Xbyak::CodeGenerator::T_NEAR); h->L(*loop_begin_label);