From 2ec7fd8bdb1a8ae8880a6aedf48c058e5d2deb61 Mon Sep 17 00:00:00 2001
From: Vladislav Golubev
Date: Thu, 14 Dec 2023 17:16:50 +0100
Subject: [PATCH] [Snippets] Specific loop iterations handler

---
 .../include/snippets/lowered/linear_ir.hpp    |   6 +-
 .../include/snippets/lowered/loop_manager.hpp |  51 +--
 .../pass/insert_specific_iterations.hpp       |  25 ++
 .../lowered/pass/insert_tail_loop.hpp         |  52 ---
 .../snippets/lowered/pass/iter_handler.hpp    |  48 +++
 .../include/snippets/lowered/pass/pass.hpp    |  51 ++-
 .../lowered/pass/propagate_subtensors.hpp     |  28 ++
 .../snippets/include/snippets/op/subgraph.hpp |   6 +-
 .../include/snippets/pass/manager.hpp         |   7 +-
 src/common/snippets/src/generator.cpp         |  14 +-
 .../snippets/src/lowered/loop_manager.cpp     |  86 ++++-
 .../src/lowered/pass/assign_registers.cpp     |   4 +-
 .../snippets/src/lowered/pass/fuse_loops.cpp  |  34 +-
 .../src/lowered/pass/insert_load_store.cpp    |  10 +-
 .../pass/insert_specific_iterations.cpp       | 142 +++++++
 .../src/lowered/pass/insert_tail_loop.cpp     | 360 ------------------
 .../src/lowered/pass/iter_handler.cpp         | 108 ++++++
 src/common/snippets/src/lowered/pass/pass.cpp |  16 +-
 .../src/lowered/pass/propagate_subtensors.cpp | 148 +++++++
 .../lowered/pass/softmax_decomposition.cpp    |  63 +--
 .../snippets/src/lowered/pass/split_loops.cpp |  32 +-
 .../snippets/tests/src/lowered/pass/loop.cpp  |  17 +-
 .../snippets/x64/op/brgemm_cpu.hpp            |  10 +-
 .../x64/pass/lowered/brgemm_blocking.cpp      | 143 +++----
 .../x64/pass/lowered/cpu_iter_handlers.cpp    |  29 ++
 .../x64/pass/lowered/cpu_iter_handlers.hpp    |  25 ++
 .../snippets/matmul.cpp                       |   2 +
 27 files changed, 870 insertions(+), 647 deletions(-)
 create mode 100644 src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp
 delete mode 100644 src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp
 create mode 100644 src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
 create mode 100644 src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
 create mode 100644 src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp
 delete mode 100644 src/common/snippets/src/lowered/pass/insert_tail_loop.cpp
 create mode 100644 src/common/snippets/src/lowered/pass/iter_handler.cpp
 create mode 100644 src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
 create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
 create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp

diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
index 6a62762ae1c806..2f0534233d324f 100644
--- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp
+++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -74,9 +74,9 @@ class LinearIR {
                      LinearIR::container::const_iterator end,
                      ExressionMap& expression_map);
 
-    const container& get_ops() const {return m_expressions; }
-    const io_container& get_IO_ops() const {return m_io_expressions; }
-    Config get_config() {return m_config; }
+    const container& get_ops() const { return m_expressions; }
+    const io_container& get_IO_ops() const { return m_io_expressions; }
+    Config get_config() const { return m_config; }
     void set_loop_depth(size_t loop_depth) { m_config.m_loop_depth = loop_depth; }
 
     const ExpressionPtr& get_expr_by_node(const std::shared_ptr& n) const;
diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp 
b/src/common/snippets/include/snippets/lowered/loop_manager.hpp index 93d1620f5fdbe7..70dfa8d37c4642 100644 --- a/src/common/snippets/include/snippets/lowered/loop_manager.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp @@ -4,11 +4,12 @@ #pragma once -#include "linear_ir.hpp" - #include #include +#include "linear_ir.hpp" +#include "pass/iter_handler.hpp" +#include "pass/pass.hpp" #include "port_descriptor.hpp" namespace ov { @@ -45,9 +46,7 @@ class LinearIR::LoopManager { LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - bool outer_splited_loop = false) - : m_work_amount(work_amount), m_increment(increment), - m_entry_points(entries), m_exit_points(exits), m_outer_splited_loop(outer_splited_loop) {} + bool outer_splited_loop = false); LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, @@ -63,19 +62,6 @@ class LinearIR::LoopManager { const std::vector& get_exit_points() const; bool get_outer_splited_loop() const; - /** - * \brief Inserts a separate body for first loop iteration processing if needed. - * Can also modify both main and first iter loop bodies. - * TODO: replace this temporary solution when ticket 119851 is implemented - * - * \param linear_ir LIR which should be modified - * \param loop_end_it iterator on LoopEnd expression for which the handler is called - * - * \return bool value which indicates whether the linear_ir was changed or not. - */ - using FirstIterHandler = std::function; - const FirstIterHandler& get_first_iter_handler() const; - // Sets dim_idx to all entry and exit points void set_dim_idx(size_t dim_idx); void set_work_amount(size_t work_amount); @@ -83,7 +69,9 @@ class LinearIR::LoopManager { void set_entry_points(std::vector entry_points); void set_exit_points(std::vector exit_points); void set_outer_splited_loop(bool outer_splited_loop); - void set_first_iter_handler(FirstIterHandler handler); + + enum {FIRST_ITER, MAIN_BODY, LAST_ITER}; + std::vector handlers; private: size_t m_work_amount = 0; @@ -96,7 +84,6 @@ class LinearIR::LoopManager { std::vector m_exit_points = {}; // True if this Loop is outer Loop for nested Loops that splits the same dimension bool m_outer_splited_loop = false; - FirstIterHandler m_first_iter_handler = nullptr; }; using LoopInfoPtr = std::shared_ptr; @@ -118,16 +105,22 @@ class LinearIR::LoopManager { size_t mark_loop(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t work_amount, - size_t work_amount_increment, + size_t increment, size_t dim_idx, const std::vector& entries, - const std::vector& exits) { - const auto loop_info = std::make_shared(work_amount, work_amount_increment, entries, exits); + const std::vector& exits, + bool set_default_handlers = true) { + if (increment > work_amount) + increment = work_amount; + const auto loop_info = std::make_shared(work_amount, increment, entries, exits); loop_info->set_dim_idx(dim_idx); const auto loop_id = this->add_loop_info(loop_info); for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); } + if (set_default_handlers) { + set_default_loop_handlers(loop_info); + } return loop_id; } @@ -137,12 +130,18 @@ class LinearIR::LoopManager { size_t work_amount, size_t increment, const std::vector& entries, - const std::vector& exits) { + const std::vector& exits, + bool set_default_handlers = true) { + if (increment > work_amount) + increment = work_amount; const auto loop_info = 
std::make_shared(work_amount, increment, entries, exits); const auto loop_id = this->add_loop_info(loop_info); for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); } + if (set_default_handlers) { + set_default_loop_handlers(loop_info); + } return loop_id; } @@ -197,6 +196,7 @@ class LinearIR::LoopManager { size_t loop_id, bool loop_ops_inserted = false); LoopPort get_loop_port_by_expr_port(const ExpressionPort& expr_port, const size_t loop_id); + static void set_default_loop_handlers(const LoopInfoPtr& loop_info); private: static void get_io_loop_ports(LinearIR::constExprIt loop_begin_pos, @@ -207,6 +207,9 @@ class LinearIR::LoopManager { static void fuse_loop_ports(std::vector& exit_points, std::vector& entry_points, size_t loop_id); + static std::vector fuse_loop_handlers( + std::vector& lhs, + std::vector& rhs); /* ===== The methods for work with Loop IDs of Expression ===== */ // Notes: diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp new file mode 100644 index 00000000000000..099b43a54b2d6b --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "pass.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +class InsertSpecificIterations : public Pass { +public: + OPENVINO_RTTI("InsertSpecificIterations", "Pass") + bool run(LinearIR& linear_ir) override; + + static LinearIR::container copy_loop(const LinearIR& linear_ir, const size_t loop_id); +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp deleted file mode 100644 index faafd8186b8448..00000000000000 --- a/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "pass.hpp" - -#include "snippets/op/loop.hpp" -#include "snippets/lowered/loop_manager.hpp" - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { - -/** - * @interface InsertTailLoop - * @brief Injects tail-processing loop after a vector loop if required. - * Additional optimizations are performed if a loop body is executed only once. 
- * @ingroup snippets - */ -class InsertTailLoop : public Pass { -public: - OPENVINO_RTTI("InsertTailLoop", "Pass") - bool run(LinearIR& linear_ir) override; - static LinearIR::container copy_loop(const LinearIR& linear_ir, const size_t loop_id); - - static constexpr size_t existing_subtensor_value = SIZE_MAX; - static void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, - const LinearIR::LoopManager::LoopInfoPtr& loop_info, - LinearIR::container::const_iterator begin, - LinearIR::container::const_iterator end, - const size_t new_dim_value = existing_subtensor_value); - -private: - static void create_tail_loop(LinearIR& linear_ir, - LinearIR::constExprIt begin, - LinearIR::constExprIt end, - const std::shared_ptr& loop_end, - bool need_vector_loop, - size_t tail_size); - static void tail_transformations(LinearIR& linear_ir, - LinearIR::constExprIt tail_begin, - LinearIR::constExprIt tail_end, - size_t tail_size); -}; - -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp new file mode 100644 index 00000000000000..2111749b4b27d8 --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include "snippets/op/loop.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +class UpdateMemoryAccessOps : public pass::RangedPass { +public: + UpdateMemoryAccessOps(size_t count); + OPENVINO_RTTI("UpdateMemoryAccessOps", "RangedPass") + bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_count; +}; + +class SetFillOffset : public pass::RangedPass { +public: + SetFillOffset(size_t offset); + OPENVINO_RTTI("SetFillOffset", "RangedPass") + bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_offset; +}; + +class TransformInnerSplitLoop : public pass::RangedPass { +public: + TransformInnerSplitLoop(size_t tail_size); + OPENVINO_RTTI("TransformInnerSplitLoop", "RangedPass") + bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_tail_size; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/lowered/pass/pass.hpp b/src/common/snippets/include/snippets/lowered/pass/pass.hpp index 177056d2984d25..ce17fda4b199ee 100644 --- a/src/common/snippets/include/snippets/lowered/pass/pass.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/pass.hpp @@ -16,18 +16,18 @@ namespace lowered { namespace pass { /** - * @interface Pass + * @interface PassBase * @brief Base class for transformations on linear IR * @ingroup snippets */ -class Pass { +class PassBase { public: - Pass() = default; - virtual ~Pass() = default; + PassBase() = default; + virtual ~PassBase() = default; // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface, // so the standard OPENVINO_RTTI(...) macros could be used in derived classes. 
_OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() { - static ::ov::DiscreteTypeInfo type_info_static {"Pass"}; + static ::ov::DiscreteTypeInfo type_info_static {"PassBase"}; type_info_static.hash(); return type_info_static; } @@ -39,7 +39,15 @@ class Pass { const char* get_type_name() const { return get_type_info().name; } +}; +/** + * @interface Pass + * @brief Base class for LIR passes which are performed on a full LIR body + * @ingroup snippets + */ +class Pass : public PassBase { +public: /** * @brief Apply the pass to the Linear IR * @param linear_ir the target Linear IR @@ -48,25 +56,45 @@ class Pass { virtual bool run(lowered::LinearIR& linear_ir) = 0; }; +/** + * @interface Pass + * @brief Base class for LIR passes which are performed on a range of a LIR body + * @ingroup snippets + */ +class RangedPass : public PassBase { +public: + /** + * @brief Apply the pass to the Linear IR + * @param linear_ir the target Linear IR + * @param begin begin of the range on which the pass is performed + * @param end end of the range on which the pass is performed + * @return status of the pass + */ + virtual bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) = 0; +}; + class PassPipeline { public: - using PositionedPassLowered = snippets::pass::PositionedPass; + using PositionedPassLowered = snippets::pass::PositionedPass; PassPipeline(); PassPipeline(const std::shared_ptr& pass_config); - void register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass); - void register_pass(const std::shared_ptr& pass); + const std::vector>& get_passes() const { return m_passes; } + bool empty() const { return m_passes.empty(); } + + void register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass); + void register_pass(const std::shared_ptr& pass); template void register_pass(Args&&... args) { - static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass"); + static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass"); auto pass = std::make_shared(std::forward(args)...); register_pass(pass); } template::value, bool>() = true> void register_pass(const snippets::pass::PassPosition& position, Args&&... 
args) { - static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass"); + static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass"); auto pass = std::make_shared(std::forward(args)...); register_pass(position, pass); } @@ -74,10 +102,11 @@ class PassPipeline { void register_positioned_passes(const std::vector& pos_passes); void run(lowered::LinearIR& linear_ir) const; + void run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) const; private: std::shared_ptr m_pass_config; - std::vector> m_passes; + std::vector> m_passes; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp new file mode 100644 index 00000000000000..88282497475cf0 --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +class UpdateSubtensors : public pass::RangedPass { +public: + UpdateSubtensors(size_t tail_size); + OPENVINO_RTTI("UpdateSubtensors", "Pass") + bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_tail_size; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index af0112f03a9087..cefd64f46b3137 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -5,11 +5,13 @@ #pragma once #include - #include #include -#include "openvino/op/op.hpp" + #include "openvino/core/rt_info.hpp" +#include "openvino/op/op.hpp" +#include "snippets/generator.hpp" +#include "snippets/lowered/pass/pass.hpp" #include "snippets/pass/manager.hpp" #include "snippets/shape_inference/shape_inference.hpp" #include "snippets/lowered/pass/pass.hpp" diff --git a/src/common/snippets/include/snippets/pass/manager.hpp b/src/common/snippets/include/snippets/pass/manager.hpp index a9e3c2aec37498..3867366f1b399d 100644 --- a/src/common/snippets/include/snippets/pass/manager.hpp +++ b/src/common/snippets/include/snippets/pass/manager.hpp @@ -10,9 +10,6 @@ #include "openvino/pass/pass.hpp" #include "openvino/pass/validate.hpp" -#include - - namespace ov { namespace snippets { namespace pass { @@ -36,7 +33,7 @@ class Manager : public ov::pass::Manager { std::shared_ptr register_pass(const PassPosition& position, Args&&... 
args) { static_assert(std::is_base_of::value, "Attempt to insert pass that is not derived from PassBase"); auto pass = std::make_shared(std::forward(args)...); - auto rc = insert_pass_instance(position, pass); + auto rc = insert_pass_instance(position, pass); rc->set_pass_config(m_pass_config); if (!m_pass_config->is_enabled()) { m_pass_config->disable(); @@ -48,7 +45,7 @@ class Manager : public ov::pass::Manager { void register_positioned_passes(const std::vector& pos_passes); protected: - std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass); + std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass); }; } // namespace pass diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index b1b179ab4fdfa1..5db1670022bb7f 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -4,28 +4,28 @@ #include "snippets/generator.hpp" +#include "snippets/itt.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/pass/assign_registers.hpp" #include "snippets/lowered/pass/cleanup_loop_offsets.hpp" -#include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/insert_specific_iterations.hpp" #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" - +#include "snippets/lowered/pass/serialize_control_flow.hpp" +#include "snippets/lowered/pass/pass.hpp" #include "snippets/op/kernel.hpp" -#include "snippets/itt.hpp" - namespace ov { namespace snippets { void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, const void* compile_params) const { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate") OV_ITT_TASK_CHAIN(GENERATE, ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::Transformations") - if (!target->is_supported()) - OPENVINO_THROW("unsupported architecture for code generation"); + OPENVINO_ASSERT(target->is_supported(), "unsupported architecture for code generation"); std::function& op)> reg_type_mapper = [&](const std::shared_ptr& op) -> opRegType { return get_op_reg_type(op); }; + lowered::pass::PassPipeline lowered_pipeline; // Note: the order of all passes in this pipeline must not be changed since they have hard dependencies // 1. InsertTailLoop must be called after AssignRegisters since tail loop expressions must have the same @@ -35,7 +35,7 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c // 3. 
OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets // since CleanupLoopOffsets can't handle loops with evaluate_once = true lowered_pipeline.register_pass(reg_type_mapper); - lowered_pipeline.register_pass(); + lowered_pipeline.register_pass(); lowered_pipeline.register_pass(); lowered_pipeline.register_pass(); lowered_pipeline.run(linear_ir); diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index e7e83361ee0a39..7592e681019b14 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -5,6 +5,8 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/expression.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" +#include "snippets/lowered/pass/propagate_subtensors.hpp" #include "snippets/utils.hpp" #include "openvino/core/graph_util.hpp" @@ -37,6 +39,19 @@ std::shared_ptr LoopPort::clone_with_new_expr(const ExpressionPtr& new return new_loop_port; } +LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, + size_t increment, + const std::vector& entries, + const std::vector& exits, + bool outer_splited_loop) + : m_work_amount(work_amount), + m_increment(increment), + m_entry_points(entries), + m_exit_points(exits), + m_outer_splited_loop(outer_splited_loop) { + handlers.resize(3); +} + LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, @@ -51,6 +66,7 @@ LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, m_entry_points.emplace_back(port); for (const auto& port : exits) m_exit_points.emplace_back(port); + handlers.resize(3); } std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr_map) const { @@ -68,7 +84,9 @@ std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr const auto& new_entry_points = clone_loop_ports(m_entry_points); const auto& new_exit_points = clone_loop_ports(m_exit_points); - return std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_outer_splited_loop); + auto new_info = std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_outer_splited_loop); + new_info->handlers = handlers; + return new_info; } size_t LoopInfo::get_work_amount() const { @@ -91,10 +109,6 @@ bool LoopInfo::get_outer_splited_loop() const { return m_outer_splited_loop; } -const LoopInfo::FirstIterHandler& LoopInfo::get_first_iter_handler() const { - return m_first_iter_handler; -} - size_t LinearIR::LoopManager::LoopInfo::get_dim_idx() const { OPENVINO_ASSERT(!m_entry_points.empty(), "Loop info must have at least one entry point"); auto equal_dim_idxes = [&](const LinearIR::LoopManager::LoopPort& p) { @@ -137,10 +151,6 @@ void LoopInfo::set_outer_splited_loop(bool outer_splited_loop) { m_outer_splited_loop = outer_splited_loop; } -void LoopInfo::set_first_iter_handler(LoopInfo::FirstIterHandler first_iter_handler) { - m_first_iter_handler = std::move(first_iter_handler); -} - bool operator==(const LinearIR::LoopManager::LoopPort& lhs, const LinearIR::LoopManager::LoopPort& rhs) { if (&lhs == &rhs) return true; @@ -248,6 +258,14 @@ LinearIR::LoopManager::LoopPort LinearIR::LoopManager::get_loop_port_by_expr_por : get_loop_port(loop_info->get_exit_points()); } +void LinearIR::LoopManager::set_default_loop_handlers(const LoopInfoPtr& loop_info) { + const auto tail_size = loop_info->get_work_amount() % loop_info->get_increment(); + if (tail_size != 0) { + 
loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size); + loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size); + } +} + void LinearIR::LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, std::vector &entries, @@ -330,18 +348,16 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, } for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) { - if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) { + OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup"); + const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx); + if (subtensor_value == PortDescriptor::ServiceDimensions::FULL_DIM) { continue; } OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup"); - const auto work_amount = - loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx) - : 0; - const auto work_amount_increment = - loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx) - : (dim_idx == 0 ? vector_size : 1); - mark_loop(loop_begin_pos, loop_end_pos, work_amount, work_amount_increment, dim_idx, loop_entry_points, loop_exit_points); + const auto work_amount = *(loop_tensor.rbegin() + dim_idx); + const auto increment = subtensor_value; + mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points); } } @@ -399,6 +415,15 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, loop_info->set_entry_points(new_entries); loop_info->set_exit_points(new_exits); + loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers); + // Since fusion can be called for broadcastable loops (one of the loops has work_amount = increment = 1), + // maximum value is set to the fused loop + loop_info->set_work_amount(std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount())); + loop_info->set_increment(std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment())); + // If one of the Loops is outer for nested loops that splits the same dimension, + // after fusion new common Loop saves this status + loop_info->set_outer_splited_loop(loop_info_upper->get_outer_splited_loop() || loop_info_lower->get_outer_splited_loop()); + const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper; const auto& to = fuse_into_upper ? loop_id_upper : loop_id_lower; for (auto it = loop_begin_target; it != loop_end_target; ++it) { @@ -409,6 +434,31 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, remove_loop_info(from); } +std::vector LinearIR::LoopManager::fuse_loop_handlers( + std::vector& from, + std::vector& to) { + const auto min_size = std::min(from.size(), to.size()); + std::vector merged_handlers; + merged_handlers.resize(min_size); + for (size_t i = 0; i < min_size; ++i) { + merged_handlers[i] = from[i]; + const auto& res_passes = merged_handlers[i].get_passes(); + for (const auto& pass : to[i].get_passes()) { + auto pred = [&pass](const std::shared_ptr& p) { + return p->get_type_info() == pass->get_type_info(); + }; + if (std::find_if(res_passes.begin(), res_passes.end(), pred) == res_passes.end()) { + merged_handlers[i].register_pass(pass); + } + } + } + auto& handlers_with_larger_size = from.size() > to.size() ? 
from : to; + for (size_t i = min_size; i < handlers_with_larger_size.size(); ++i) { + merged_handlers.emplace_back(std::move(handlers_with_larger_size[i])); + } + return merged_handlers; +} + void LinearIR::LoopManager::fuse_loop_ports(std::vector& exit_points, std::vector& entry_points, size_t loop_id) { @@ -543,7 +593,7 @@ void LinearIR::LoopManager::insert_loop_id(const ExpressionPtr& expr, size_t new OPENVINO_ASSERT(m_map.count(new_id) == 1, "Failed marking expression by Loop ID: the Loop with this ID hasn't registered"); auto& loop_ids = expr->m_loop_ids; OPENVINO_ASSERT(std::find(loop_ids.cbegin(), loop_ids.cend(), new_id) == loop_ids.cend(), - "Expression cannot have several the same Loop IDs"); + "Expression cannot have several identical Loop IDs"); auto insert_it = before ? loop_ids.cbegin() : loop_ids.cend(); if (target_id != SIZE_MAX) { insert_it = std::find(loop_ids.cbegin(), loop_ids.cend(), target_id); diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index d49cf8d63155a7..fbeef30888fa85 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -80,10 +80,10 @@ bool AssignRegisters::run(LinearIR& linear_ir) { for (const auto& tensor : input_expr_input_tensors) { const auto parent_expr = tensor->get_source().get_expr(); if (ov::is_type(parent_expr->get_node())) { - manually_assigned_vecs[tensor] = static_cast(accumulator_reg); if (ov::is_type(parent_expr->get_input_port_connector(0)->get_source().get_expr()->get_node())) { + manually_assigned_vecs[tensor] = static_cast(accumulator_reg); manually_assigned_vecs[parent_expr->get_input_port_connector(0)] = static_cast(accumulator_reg); - } + } } } const auto& output_tensor = expr->get_output_port_connector(0); diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index 1738d6d8fe9574..dc7dac6eed4095 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -44,20 +44,29 @@ bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_m } bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& loop_target) { - auto current_work_amount = loop_current->get_work_amount(); - auto target_work_amount = loop_target->get_work_amount(); - // Loop fusion is supported only if Loops have equal increments and the equal/broadcastable work amounts. + const auto current_work_amount = loop_current->get_work_amount(); + const auto target_work_amount = loop_target->get_work_amount(); + const auto current_increment = loop_current->get_increment(); + const auto target_increment = loop_target->get_increment(); + // Loop fusion is supported only if Loops have equal/broadcastable increments and work amounts. 
// Note: For example, Broadcastable work amounts are possible in the following case: // Relu_0 [16x1] Relu_1 [16x128] // \ / // Add [16x128] // Because of expression order in linear IR and work of MarkLoop algorithm, there are 2 Inner Loops: - // - Relu_0 with work amount `1` and increment `vector size` + // - Relu_0 with work amount `1` and increment `1` // - Relu_1 and Add with work amount `128` and increment `vector size` // We can fuse them into one Loop with work amount `128` and increment `vector size` - const auto supported_work_amount = current_work_amount == target_work_amount || current_work_amount == 1 || target_work_amount == 1; - const auto supported_increment = loop_current->get_increment() == loop_target->get_increment(); - return supported_work_amount && supported_increment; + + // WA: we can't fuse 2 loops if one of them has first iteration handler but second hasn't, + // because in this case Main/Tail body handlers of the loop wo first iter handler must be reset with new parameters + // (e.g. tail size). This logic is not implemented for now, so fusion for such loops is skipped. + const bool first_iter_handlers_match = loop_current->handlers[LoopManager::LoopInfo::FIRST_ITER].empty() == + loop_target->handlers[LoopManager::LoopInfo::FIRST_ITER].empty(); + const bool equal_parameters = current_work_amount == target_work_amount && current_increment == target_increment; + const bool current_bcastable = current_work_amount == 1 && current_increment == 1; + const bool target_bcastable = target_work_amount == 1 && target_increment == 1; + return first_iter_handlers_match && (equal_parameters || current_bcastable || target_bcastable); } void FuseLoops::move(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id, @@ -124,12 +133,6 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos; loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos); loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, target_loop_id, current_loop_id, false); - // Update work_amount for Loop (increment is constant because increments must be the identical for fusion): - loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount())); - // If one of the Loops is outer for nested loops that splits the same dimension, - // after fusion new common Loop save this status - loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop()); - const auto insertion_place = current_loop_begin_pos; const auto is_move_needed = target_loop_end_pos != current_loop_begin_pos; if (is_move_needed) @@ -169,11 +172,6 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos; loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos); loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, current_loop_id, target_loop_id); - // Update work_amount for Loop (increment is constant because increments must be the identical for fusion): - loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount())); - // If one of the Loops is outer for nested loops that splits the same dimension, - // after fusion new common Loop save this status - 
loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop()); const auto insertion_place = current_loop_end_pos; const auto is_move_needed = insertion_place != target_loop_begin_pos; diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index 75e70c9c553c88..492eb8d17682b1 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -20,13 +20,13 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr; InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {} size_t InsertLoadStore::get_count(const PortDescriptorPtr& port_desc) const { - const auto layout = port_desc->get_layout(); - const auto shape = port_desc->get_shape(); + const auto& layout = port_desc->get_layout(); + const auto& shape = port_desc->get_shape(); // Find last dimension by layout - const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); + const auto& last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); OPENVINO_ASSERT(last_dim_idx != layout.end() && *last_dim_idx < shape.size(), "Load/Store expression have incorrect layout"); - const auto dim = shape[*last_dim_idx]; - return dim == 1 ? 1 : m_vector_size; + const auto& dim = shape[*last_dim_idx]; + return std::min(dim, m_vector_size); } bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp new file mode 100644 index 00000000000000..0ad3877ad29e1a --- /dev/null +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -0,0 +1,142 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/insert_specific_iterations.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +LinearIR::container InsertSpecificIterations::copy_loop(const LinearIR& linear_ir, const size_t loop_id) { + const auto& loop_manager = linear_ir.get_loop_manager(); + LinearIR::constExprIt loop_begin_pos, loop_end_pos; + loop_manager->get_loop_bounds(linear_ir, loop_id, loop_begin_pos, loop_end_pos, true); + ExressionMap expression_map; + const auto& loop_copy_range = LinearIR::deep_copy_range(loop_begin_pos, std::next(loop_end_pos), expression_map); + + const auto original_loop_info = loop_manager->get_loop_info(loop_id); + std::vector new_entry_points, new_exit_points; + // Clone loop ports from original loop info to new loop info + for (const auto& entry : original_loop_info->get_entry_points()) + new_entry_points.push_back(*entry.clone_with_new_expr(expression_map[entry.expr_port->get_expr().get()])); + for (const auto& exit : original_loop_info->get_exit_points()) + new_exit_points.push_back(*exit.clone_with_new_expr(expression_map[exit.expr_port->get_expr().get()])); + + for (const auto& elem : expression_map) { + const auto expr = elem.first->shared_from_this(); + const auto& new_expr = elem.second; + // Loop begin/end ops can't be loop ports + if 
(ov::is_type(expr->get_node())) + continue; + // Update loop info of all outer loops with new loop ports + const auto outer_loop_ids = LinearIR::LoopManager::get_outer_expr_loops(expr, loop_id); + for (size_t i = 0; i < expr->get_input_count(); ++i) + loop_manager->update_loops_port(outer_loop_ids, expr->get_input_port(i), {expr->get_input_port(i), new_expr->get_input_port(i)}, true); + for (size_t i = 0; i < expr->get_output_count(); ++i) + loop_manager->update_loops_port(outer_loop_ids, expr->get_output_port(i), {expr->get_output_port(i), new_expr->get_output_port(i)}, false); + } + + const auto new_loop_begin_pos = loop_copy_range.begin(); + const auto new_loop_end_pos = loop_copy_range.end(); + const auto new_id = loop_manager->replace_with_new_loop(linear_ir, + std::next(new_loop_begin_pos), + std::prev(new_loop_end_pos), + original_loop_info->get_work_amount(), + original_loop_info->get_increment(), + new_entry_points, + new_exit_points, + loop_id); + const auto loop_end = ov::as_type_ptr(std::prev(new_loop_end_pos)->get()->get_node()); + OPENVINO_ASSERT(loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); + loop_end->set_id(new_id); + return loop_copy_range; +} + +using LoopInfo = LinearIR::LoopManager::LoopInfo; + +bool InsertSpecificIterations::run(LinearIR& linear_ir) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertSpecificIterations") + const auto& loop_manager = linear_ir.get_loop_manager(); + + bool modified = false; + for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) { + const auto& expr = *expr_it; + const auto node = expr->get_node(); + const auto loop_end = ov::as_type_ptr(node); + if (!loop_end) + continue; + + const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); + const auto work_amount = loop_info->get_work_amount(); + const auto increment = loop_info->get_increment(); + auto& handlers = loop_info->handlers; + + const auto main_body_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin())); + const auto main_body_end_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end)); + + auto update_loop_params = [&loop_manager](const std::shared_ptr& loop_end_copy, + size_t new_work_amount, + size_t new_increment, + bool zero_finalization_offsets) { + loop_end_copy->set_work_amount(new_work_amount); + loop_end_copy->set_increment(new_increment); + + const auto& loop_info_copy = loop_manager->get_loop_info(loop_end_copy->get_id()); + loop_info_copy->set_work_amount(new_work_amount); + loop_info_copy->set_increment(new_increment); + + if (zero_finalization_offsets) + loop_end_copy->set_finalization_offsets(std::vector(loop_end_copy->get_finalization_offsets().size(), 0)); + }; + + auto copy_and_run_specific_handlers = [&](const PassPipeline& handlers) { + const auto& cloned_body = copy_loop(linear_ir, loop_end->get_id()); + linear_ir.insert(main_body_begin_it, cloned_body.begin(), cloned_body.end()); + const auto& loop_end_it = std::prev(cloned_body.end()); + handlers.run(linear_ir, cloned_body.begin(), loop_end_it); + return ov::as_type_ptr(loop_end_it->get()->get_node()); + }; + + const bool specific_first_iteration = !handlers[LoopInfo::FIRST_ITER].empty(); + if (work_amount == increment) { + handlers[LoopInfo::FIRST_ITER].run(linear_ir, main_body_begin_it, main_body_end_it); + } else { + if (specific_first_iteration) { + const auto loop_end_copy = copy_and_run_specific_handlers(handlers[LoopInfo::FIRST_ITER]); + update_loop_params(loop_end_copy, 
increment, increment, true); + } + + const auto tail_size = work_amount % increment; + if (tail_size != 0) { + if (!specific_first_iteration || work_amount > 2 * increment) { + const auto loop_end_copy = copy_and_run_specific_handlers(handlers[LoopInfo::MAIN_BODY]); + const auto reduce_value = specific_first_iteration ? tail_size + increment : tail_size; + const auto new_work_amount = work_amount - reduce_value; + update_loop_params(loop_end_copy, new_work_amount, increment, true); + } + handlers[LoopInfo::LAST_ITER].run(linear_ir, main_body_begin_it, main_body_end_it); + update_loop_params(loop_end, tail_size, tail_size, false); + } else if (specific_first_iteration) { + handlers[LoopInfo::MAIN_BODY].run(linear_ir, main_body_begin_it, main_body_end_it); + update_loop_params(loop_end, work_amount - increment, increment, false); + } + } + modified = true; + } + return modified; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov + diff --git a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp deleted file mode 100644 index cc685c1851157a..00000000000000 --- a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp +++ /dev/null @@ -1,360 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/lowered/pass/insert_tail_loop.hpp" - -#include "snippets/lowered/linear_ir.hpp" -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/pass/init_loops.hpp" -#include "snippets/snippets_isa.hpp" -#include "snippets/utils.hpp" -#include "snippets/itt.hpp" - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { -void InsertTailLoop::propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, - const LinearIR::LoopManager::LoopInfoPtr& loop_info, - LinearIR::container::const_iterator begin, - LinearIR::container::const_iterator end, - const size_t new_dim_value) { - std::map original_shapes; - // First step: set new dim value to the corresponding entry_points' dimensions - if (new_dim_value != existing_subtensor_value) { - for (const auto& port : loop_info->get_entry_points()) { - if (port.is_incremented) { - const auto& expr = port.expr_port->get_expr(); - const auto node = expr->get_node(); - auto desc = port.expr_port->get_descriptor_ptr(); - auto subtensor = desc->get_subtensor(); - if (port.dim_idx < subtensor.size()) { - *(subtensor.rbegin() + port.dim_idx) = new_dim_value; - desc->set_subtensor(subtensor); - } - - const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); - const auto& layout = parent_desc->get_layout(); - const auto& shape = parent_desc->get_shape(); - if (original_shapes.find(parent_desc) == original_shapes.end()) { - original_shapes[parent_desc] = shape; - } - auto new_shape = shape; - new_shape[*(layout.rbegin() + port.dim_idx)] = new_dim_value; - parent_desc->set_shape(new_shape); - } - } - } - - auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) { - if (port.is_incremented) { - auto desc = port.expr_port->get_descriptor_ptr(); - const auto expr = port.expr_port->get_expr(); - const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); - - const auto& layout = parent_desc->get_layout(); - const auto& shape = parent_desc->get_shape(); - const auto& desc_subtensor = desc->get_subtensor(); - if 
(port.dim_idx < desc_subtensor.size()) { - if (original_shapes.find(parent_desc) == original_shapes.end()) { - original_shapes[parent_desc] = shape; - } - auto new_shape = shape; - new_shape[*(layout.rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx); - parent_desc->set_shape(new_shape); - } - } - }; - - auto update_subtensors = [](const std::vector& descs, bool is_input) { - for (const auto& desc : descs) { - const auto& subtensor = desc->get_subtensor(); - if (!subtensor.empty()) { - auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout()) - : snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout()); - const size_t subtensor_start = planar_dims.size() - subtensor.size(); - VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end()); - for (size_t i = 0; i < new_subtensor.size(); ++i) { - new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]); - } - desc->set_subtensor(new_subtensor); - } - } - }; - - auto shape_inference_end_it = end; - const bool loop_by_last_dim = loop_info->get_dim_idx() == 0; - // Subtensors are updated using shape inference infrastructure: - // For inner loops propagation function is called recursively - for (auto expr_it = begin; expr_it != end; expr_it++) { - const auto expr = *expr_it; - if (ov::is_type(expr->get_node())) - continue; - if (auto loop_begin = ov::as_type_ptr(expr->get_node())) { - const auto loop_end = loop_begin->get_loop_end(); - const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); - const auto inner_begin = std::next(expr_it); - const auto inner_end = linear_ir.find(linear_ir.get_expr_by_node(loop_end)); - - // The corresponding shapes of inner loops entry points must be updated using existing subtensor values - if (new_dim_value == existing_subtensor_value) { - for (const auto& port : loop_info->get_entry_points()) - update_only_dim_idx_with_subtensor_value(port); - } - propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end); - expr_it = inner_end; - continue; - } - if ((ov::is_type(expr_it->get()->get_node()) || - ov::is_type(expr_it->get()->get_node())) && - loop_by_last_dim) { - // WA: we have to break subtensor propagation if we try to propagate new last dim through Broadcast nodes - // which broadcast last dim in original dimension value anyway - // This workaround might be avoided if blocked shape are used for tail size propagation - shape_inference_end_it = expr_it; - break; - } - expr->updateShapes(); - update_subtensors(expr->get_input_port_descriptors(), true); - update_subtensors(expr->get_output_port_descriptors(), false); - } - - // After subtensor propagation, the original shapes must be restored - for (const auto& elem : original_shapes) - elem.first->set_shape(elem.second); - for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++) - (*expr_it)->updateShapes(); -} - -LinearIR::container InsertTailLoop::copy_loop(const LinearIR& linear_ir, const size_t loop_id) { - const auto& loop_manager = linear_ir.get_loop_manager(); - LinearIR::constExprIt loop_begin_pos, loop_end_pos; - loop_manager->get_loop_bounds(linear_ir, loop_id, loop_begin_pos, loop_end_pos, true); - ExressionMap expression_map; - const auto& loop_copy_range = LinearIR::deep_copy_range(loop_begin_pos, std::next(loop_end_pos), expression_map); - - const auto original_loop_info = loop_manager->get_loop_info(loop_id); - std::vector new_entry_points, new_exit_points; 
- // Clone loop ports from original loop info to new loop info - for (const auto& entry : original_loop_info->get_entry_points()) - new_entry_points.push_back(*entry.clone_with_new_expr(expression_map[entry.expr_port->get_expr().get()])); - for (const auto& exit : original_loop_info->get_exit_points()) - new_exit_points.push_back(*exit.clone_with_new_expr(expression_map[exit.expr_port->get_expr().get()])); - - for (const auto& elem : expression_map) { - const auto expr = elem.first->shared_from_this(); - const auto& new_expr = elem.second; - // Loop begin/end ops can't be loop ports - if (ov::is_type(expr->get_node())) - continue; - // Update loop info of all outer loops with new loop ports - const auto outer_loop_ids = LinearIR::LoopManager::get_outer_expr_loops(expr, loop_id); - for (size_t i = 0; i < expr->get_input_count(); ++i) - loop_manager->update_loops_port(outer_loop_ids, expr->get_input_port(i), {expr->get_input_port(i), new_expr->get_input_port(i)}, true); - for (size_t i = 0; i < expr->get_output_count(); ++i) - loop_manager->update_loops_port(outer_loop_ids, expr->get_output_port(i), {expr->get_output_port(i), new_expr->get_output_port(i)}, false); - } - - const auto new_loop_begin_pos = loop_copy_range.begin(); - const auto new_loop_end_pos = loop_copy_range.end(); - const auto new_id = loop_manager->replace_with_new_loop(linear_ir, - std::next(new_loop_begin_pos), - std::prev(new_loop_end_pos), - original_loop_info->get_work_amount(), - original_loop_info->get_increment(), - new_entry_points, - new_exit_points, - loop_id); - const auto loop_end = ov::as_type_ptr(std::prev(new_loop_end_pos)->get()->get_node()); - OPENVINO_ASSERT(loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); - loop_end->set_id(new_id); - return loop_copy_range; -} - -void InsertTailLoop::create_tail_loop(LinearIR& linear_ir, - LinearIR::constExprIt begin, - LinearIR::constExprIt end, - const std::shared_ptr& loop_end, - bool need_vector_loop, - size_t tail_size) { - // tail is required => transform the body into a tail representation - // tail loop is fake loop because for tail we should calculate only - // finalization offsets which are supported by LoopEnd. 
- const auto& loop_manager = linear_ir.get_loop_manager(); - const auto original_loop_id = loop_end->get_id(); - auto original_loop_info = loop_manager->get_loop_info(original_loop_id); - auto tail_loop_info = original_loop_info; - if (need_vector_loop) { - const auto new_loop_range = copy_loop(linear_ir, original_loop_id); - const auto new_loop_end = ov::as_type_ptr(std::prev(new_loop_range.end())->get()->get_node()); - OPENVINO_ASSERT(new_loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); - tail_loop_info = original_loop_info; - original_loop_info = loop_manager->get_loop_info(new_loop_end->get_id()); - - // Note: new loop body is inserted before the original loop - // So new loop becomes a main vector loop, the original loop becomes tail loop - // This is done in such way to have original ops from the main body at the end: - // this allows us to conveniently interact with outer loops in further passes - linear_ir.insert(begin, new_loop_range.begin(), new_loop_range.end()); - - const auto new_vector_loop_wa = original_loop_info->get_work_amount() - tail_size; - original_loop_info->set_work_amount(new_vector_loop_wa); - new_loop_end->set_work_amount(new_vector_loop_wa); - original_loop_info->set_outer_splited_loop(tail_loop_info->get_outer_splited_loop()); - // Note that finalization offsets should be applied after the last iteration. - // So if there is a tail, then we should apply offsets after it, but not now. - new_loop_end->set_finalization_offsets(std::vector(loop_end->get_finalization_offsets().size(), 0)); - } - loop_end->set_increment(tail_size); - loop_end->set_work_amount(tail_size); - tail_loop_info->set_increment(tail_size); - tail_loop_info->set_work_amount(tail_size); - - // We have to check the loop body for any nested loops that work on the same dimension - // and rescale their work_amount and increment accordingly - if (original_loop_info->get_outer_splited_loop()) { - const auto current_dim_idx = original_loop_info->get_dim_idx(); - OPENVINO_ASSERT(current_dim_idx != LinearIR::LoopManager::LoopInfo::UNDEFINED_DIM_IDX, - "Outer splitted loop unexpectedly iterates by several dimension indices"); - for (auto it = std::next(begin); it != std::prev(end); ++it) { - const auto& expr = *it; - const auto inner_loop_end = ov::as_type_ptr(expr->get_node()); - if (!inner_loop_end) - continue; - const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id()); - const auto inner_dim_idx = inner_loop_info->get_dim_idx(); - if (inner_dim_idx != current_dim_idx) - continue; - const auto inner_loop_begin = inner_loop_end->get_loop_begin(); - const auto inner_tail_work_amount = static_cast(inner_loop_end->get_work_amount()); - const auto inner_tail_increment = inner_loop_end->get_increment(); - auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets(); - for (auto& offset : inner_finalization_offsets) { - offset = offset / inner_tail_work_amount * static_cast(tail_size); - } - inner_loop_end->set_work_amount(tail_size); - inner_loop_end->set_increment(std::min(inner_tail_increment, tail_size)); - inner_loop_end->set_finalization_offsets(inner_finalization_offsets); - const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin)); - const auto inner_loop_end_it = std::next(end); - OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!"); - tail_transformations(linear_ir, inner_loop_begin_it, inner_loop_end_it, tail_size); - } - } - tail_transformations(linear_ir, begin, 
end, tail_size); - propagate_updated_subtensor_through_loop(linear_ir, tail_loop_info, std::next(begin), end, tail_size); -} - -void InsertTailLoop::tail_transformations(LinearIR& linear_ir, - LinearIR::constExprIt tail_begin, - LinearIR::constExprIt tail_end, - const size_t tail_size) { - const auto& config = linear_ir.get_config(); - auto insertFill = [tail_size](const ov::Input& input) -> std::shared_ptr { - std::shared_ptr fill = nullptr; - auto& rt = input.get_rt_info(); - auto fill_rt = rt.find("set_fill"); - if (fill_rt != rt.end()) { - const auto fill_value = fill_rt->second.as(); - fill = std::make_shared(input.get_source_output(), tail_size, fill_value); - input.get_node()->set_argument(input.get_index(), fill); - } - return fill; - }; - - for (auto expr_it = std::next(tail_begin); expr_it != tail_end; expr_it++) { - // Skip inner Loops - const auto loop_begin = ov::as_type_ptr(expr_it->get()->get_node()); - if (loop_begin) { - expr_it = linear_ir.find(expr_it, tail_end, linear_ir.get_expr_by_node(loop_begin->get_loop_end())); - continue; - } - // We should fill vector regs by float_min and zero to have - // correct math calculations for ReduceMax and ReduceSum in scalar case. - // Note: We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop, - // so they are missed in - const auto& expr = *expr_it; - const auto op = expr->get_node(); - if (config.m_need_fill_tail_register && - (ov::is_type(op) || - ov::is_type(op))) { - for (size_t i = 0; i < op->inputs().size(); ++i) { - if (auto fill = insertFill(op->input(i))) { - const auto& input = expr->get_input_port_connector(i); - const auto consumers = input->get_consumers(); - // If there are several consumers, fill expression must be inserted before first of them - auto fst_consumer = std::min_element(consumers.cbegin(), consumers.cend(), [&](ExpressionPort lhs, ExpressionPort rhs) { - auto lhs_it = linear_ir.find(lhs.get_expr()); - auto rhs_it = linear_ir.find(rhs.get_expr()); - return std::distance(linear_ir.cbegin(), lhs_it) < std::distance(linear_ir.cbegin(), rhs_it); - }); - const auto insert_pos = linear_ir.find(fst_consumer->get_expr()); - auto fill_expr = linear_ir.create_expression(fill, {input}); - linear_ir.insert(insert_pos, fill_expr); - linear_ir.replace_input(consumers, fill_expr->get_output_port_connector(0)); - // in_reg == out_reg since we want to modify vector reg inplace - const auto reg = expr->get_input_port_descriptor(0)->get_reg(); - fill_expr->get_input_port_descriptor(0)->set_reg(reg); - fill_expr->get_output_port_descriptor(0)->set_reg(reg); - fill_expr->set_loop_ids(expr->get_loop_ids()); - } - } - } else if (const auto memory_access = std::dynamic_pointer_cast(op)) { - for (const auto p : memory_access->get_memory_access_input_ports()) { - const auto port = p.first; - if (memory_access->get_input_count(port) > 1) { - memory_access->set_input_count(tail_size, port); - } - } - for (const auto p : memory_access->get_memory_access_output_ports()) { - const auto port = p.first; - if (memory_access->get_output_count(port) > 1) { - memory_access->set_output_count(tail_size, port); - } - } - } - } -} - -bool InsertTailLoop::run(LinearIR& linear_ir) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::insertTailLoop") - const auto& loop_manager = linear_ir.get_loop_manager(); - bool modified = false; - - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) { - const auto& expr = *expr_it; - const auto node = expr->get_node(); - const auto 
loop_end = ov::as_type_ptr(node); - if (!loop_end) - continue; - - const auto loop_info = loop_manager->get_loop_info(loop_end->get_id()); - const auto& first_iter_handler = loop_info->get_first_iter_handler(); - if (first_iter_handler) { - modified |= first_iter_handler(linear_ir, expr_it); - } - - const auto work_amount = loop_end->get_work_amount(); - const auto increment = loop_end->get_increment(); - const auto tail_size = work_amount % increment; - - // tail is required => transform the body into a tail representation - // tail loop is fake loop because for tail we should calculate only - // finalization offsets which are supported by LoopEnd. - if (tail_size != 0) { - const auto loop_begin = loop_end->get_loop_begin(); - const auto begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_begin)); - const auto need_vector_loop = work_amount >= increment; - create_tail_loop(linear_ir, begin_it, std::next(expr_it), loop_end, need_vector_loop, tail_size); - } - modified = true; - } - return modified; -} - -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov - diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp new file mode 100644 index 00000000000000..cc6351dba168cd --- /dev/null +++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/iter_handler.hpp" + +#include "snippets/itt.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/lowered/pass/propagate_subtensors.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +UpdateMemoryAccessOps::UpdateMemoryAccessOps(size_t count) : RangedPass(), m_count(count) {} + +bool UpdateMemoryAccessOps::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + for (auto expr_it = std::next(begin); expr_it != end; expr_it++) { + // Skip inner Loops + const auto loop_begin = ov::as_type_ptr(expr_it->get()->get_node()); + if (loop_begin) { + expr_it = linear_ir.find(expr_it, end, linear_ir.get_expr_by_node(loop_begin->get_loop_end())); + continue; + } + + const auto& node = expr_it->get()->get_node(); + if (const auto memory_access = ov::as_type_ptr(node)) { + for (const auto p : memory_access->get_memory_access_input_ports()) { + const auto port = p.first; + if (memory_access->get_input_count(port) > 1) { + memory_access->set_input_count(m_count, port); + } + } + for (const auto p : memory_access->get_memory_access_output_ports()) { + const auto port = p.first; + if (memory_access->get_output_count(port) > 1) { + memory_access->set_output_count(m_count, port); + } + } + } + } + return true; +} + +SetFillOffset::SetFillOffset(size_t offset) : RangedPass(), m_offset(offset) {} + +bool SetFillOffset::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + for (auto expr_it = std::next(begin); expr_it != end; expr_it++) { + const auto& node = expr_it->get()->get_node(); + if (const auto fill = ov::as_type_ptr(node)) { + fill->set_offset(m_offset); + } + } + return true; +} + +TransformInnerSplitLoop::TransformInnerSplitLoop(size_t tail_size) : RangedPass(), m_tail_size(tail_size) {} + +bool TransformInnerSplitLoop::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + 
const auto& expr = *end; + const auto node = expr->get_node(); + const auto loop_end = ov::as_type_ptr(node); + const auto& loop_manager = linear_ir.get_loop_manager(); + const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); + const auto current_dim_idx = loop_info->get_dim_idx(); + OPENVINO_ASSERT(current_dim_idx != LinearIR::LoopManager::LoopInfo::UNDEFINED_DIM_IDX, + "Outer splitted loop unexpectedly iterates by several dimension indices"); + + bool modified = false; + for (auto it = std::next(begin); it != end; ++it) { + const auto& expr = *it; + const auto inner_loop_end = ov::as_type_ptr(expr->get_node()); + if (!inner_loop_end) + continue; + const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id()); + const auto inner_dim_idx = inner_loop_info->get_dim_idx(); + if (inner_dim_idx != current_dim_idx) + continue; + const auto inner_loop_begin = inner_loop_end->get_loop_begin(); + const auto inner_loop_work_amount = static_cast(inner_loop_end->get_work_amount()); + const auto inner_loop_increment = inner_loop_end->get_increment(); + auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets(); + for (auto& offset : inner_finalization_offsets) { + offset = offset / inner_loop_work_amount * static_cast(m_tail_size); + } + inner_loop_end->set_work_amount(m_tail_size); + // TODO: if m_tail_size more than inner loop increment, + // handlers of the inner loop must be reset with new tail size + inner_loop_end->set_increment(std::min(inner_loop_increment, m_tail_size)); + inner_loop_end->set_finalization_offsets(inner_finalization_offsets); + const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin)); + const auto inner_loop_end_it = std::next(end); + OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!"); + const auto& last_iter_handlers = inner_loop_info->handlers[LinearIR::LoopManager::LoopInfo::LAST_ITER]; + last_iter_handlers.run(linear_ir, inner_loop_begin_it, inner_loop_end_it); + modified = true; + } + return modified; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov + diff --git a/src/common/snippets/src/lowered/pass/pass.cpp b/src/common/snippets/src/lowered/pass/pass.cpp index 70a05fc30be147..27588a03d431fa 100644 --- a/src/common/snippets/src/lowered/pass/pass.cpp +++ b/src/common/snippets/src/lowered/pass/pass.cpp @@ -16,23 +16,33 @@ PassPipeline::PassPipeline(const std::shared_ptr& pass_config) : m_p OPENVINO_ASSERT(m_pass_config != nullptr, "PassConfig is not initialized!"); } -void PassPipeline::register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass) { +void PassPipeline::register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline cannot register empty pass!"); m_passes.insert(position.get_insert_position(m_passes), pass); } -void PassPipeline::register_pass(const std::shared_ptr& pass) { +void PassPipeline::register_pass(const std::shared_ptr& pass) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline cannot register empty pass!"); m_passes.push_back(pass); } void PassPipeline::run(LinearIR& linear_ir) const { + run(linear_ir, linear_ir.begin(), linear_ir.end()); +} + +void PassPipeline::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) const { for (const auto& pass : m_passes) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline has empty pass!"); if 
(m_pass_config->is_disabled(pass->get_type_info())) { continue; } - pass->run(linear_ir); + if (auto lir_pass = std::dynamic_pointer_cast(pass)) { + lir_pass->run(linear_ir); + } else if (auto ranged_pass = std::dynamic_pointer_cast(pass)) { + ranged_pass->run(linear_ir, begin, end); + } else { + OPENVINO_THROW("Unexpected pass (", pass->get_type_info(), ") is registered in PassPipeline"); + } } } diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp new file mode 100644 index 00000000000000..b39b2762a3c7ad --- /dev/null +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -0,0 +1,148 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/propagate_subtensors.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +namespace { +void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, + const LinearIR::LoopManager::LoopInfoPtr& loop_info, + LinearIR::container::const_iterator begin, + LinearIR::container::const_iterator end, + const size_t new_dim_value) { + std::map original_shapes; + static constexpr size_t existing_subtensor_value = SIZE_MAX; + // First step: set new dim value to the corresponding entry_points' dimensions + if (new_dim_value != existing_subtensor_value) { + for (const auto& port : loop_info->get_entry_points()) { + if (port.is_incremented) { + const auto& expr = port.expr_port->get_expr(); + const auto node = expr->get_node(); + auto desc = port.expr_port->get_descriptor_ptr(); + auto subtensor = desc->get_subtensor(); + if (port.dim_idx < subtensor.size()) { + *(subtensor.rbegin() + port.dim_idx) = new_dim_value; + desc->set_subtensor(subtensor); + } + + const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + const auto& layout = parent_desc->get_layout(); + const auto& shape = parent_desc->get_shape(); + if (original_shapes.find(parent_desc) == original_shapes.end()) { + original_shapes[parent_desc] = shape; + } + auto new_shape = shape; + new_shape[*(layout.rbegin() + port.dim_idx)] = new_dim_value; + parent_desc->set_shape(new_shape); + } + } + } + + auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) { + if (port.is_incremented) { + auto desc = port.expr_port->get_descriptor_ptr(); + const auto expr = port.expr_port->get_expr(); + const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + + const auto& layout = parent_desc->get_layout(); + const auto& shape = parent_desc->get_shape(); + const auto& desc_subtensor = desc->get_subtensor(); + if (port.dim_idx < desc_subtensor.size()) { + if (original_shapes.find(parent_desc) == original_shapes.end()) { + original_shapes[parent_desc] = shape; + } + auto new_shape = shape; + new_shape[*(layout.rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx); + parent_desc->set_shape(new_shape); + } + } + }; + + auto update_subtensors = [](const std::vector& descs, bool is_input) { + for (const auto& desc : descs) { + const auto& subtensor = desc->get_subtensor(); + if (!subtensor.empty()) { + auto planar_dims = is_input ? 
snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout()) + : snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout()); + const size_t subtensor_start = planar_dims.size() - subtensor.size(); + VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end()); + for (size_t i = 0; i < new_subtensor.size(); ++i) { + new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]); + } + desc->set_subtensor(new_subtensor); + } + } + }; + + auto shape_inference_end_it = end; + const bool loop_by_last_dim = loop_info->get_dim_idx() == 0; + // Subtensors are updated using shape inference infrastructure: + // For inner loops propagation function is called recursively + for (auto expr_it = begin; expr_it != end; expr_it++) { + const auto expr = *expr_it; + if (ov::is_type(expr->get_node())) + continue; + if (auto loop_begin = ov::as_type_ptr(expr->get_node())) { + const auto loop_end = loop_begin->get_loop_end(); + const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); + const auto inner_begin = std::next(expr_it); + const auto inner_end = linear_ir.find(linear_ir.get_expr_by_node(loop_end)); + + // The corresponding shapes of inner loops entry points must be updated using existing subtensor values + if (new_dim_value == existing_subtensor_value) { + for (const auto& port : loop_info->get_entry_points()) + update_only_dim_idx_with_subtensor_value(port); + } + propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end, existing_subtensor_value); + expr_it = inner_end; + continue; + } + if ((ov::is_type(expr_it->get()->get_node()) || + ov::is_type(expr_it->get()->get_node())) && + loop_by_last_dim) { + // WA: we have to break subtensor propagation if we try to propagate new last dim through Broadcast nodes + // which broadcast last dim in original dimension value anyway + // This workaround might be avoided if blocked shape are used for tail size propagation + shape_inference_end_it = expr_it; + break; + } + expr->updateShapes(); + update_subtensors(expr->get_input_port_descriptors(), true); + update_subtensors(expr->get_output_port_descriptors(), false); + } + + // After subtensor propagation, the original shapes must be restored + for (const auto& elem : original_shapes) + elem.first->set_shape(elem.second); + for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++) + (*expr_it)->updateShapes(); +} +} // namespace + +UpdateSubtensors::UpdateSubtensors(size_t tail_size) : RangedPass(), m_tail_size(tail_size) {} + +bool UpdateSubtensors::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + const auto& expr = *end; + const auto node = expr->get_node(); + const auto loop_end = ov::as_type_ptr(node); + const auto& loop_manager = linear_ir.get_loop_manager(); + const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); + propagate_updated_subtensor_through_loop(linear_ir, loop_info, std::next(begin), end, m_tail_size); + return true; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov + diff --git a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp index 4174f928352289..b884f3f72f0d8f 100644 --- a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp +++ b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp @@ -7,6 +7,7 @@ #include "snippets/lowered/linear_ir.hpp" #include 
"snippets/lowered/loop_manager.hpp" #include "snippets/lowered/pass/mark_loops.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/itt.hpp" @@ -19,6 +20,8 @@ namespace snippets { namespace lowered { namespace pass { +using LoopInfo = LinearIR::LoopManager::LoopInfo; + SoftmaxDecomposition::SoftmaxDecomposition(size_t vector_size) : m_vector_size{vector_size} {} bool SoftmaxDecomposition::run(LinearIR& linear_ir) { @@ -40,6 +43,7 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { const auto& output_connector = softmax_expr->get_output_port_connector(0); const auto tensor_out = softmax_expr->get_output_port_descriptor(0)->get_shape(); const auto inner_work_amount = *(tensor_out.rbegin()); + const auto inner_increment = std::min(inner_work_amount, m_vector_size); // Float constant values in byte representation const auto float_min_constant = uint32_t(0xff7fffff); @@ -58,15 +62,22 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { // Init value of vector buffer for ReduceMax is -FLOAT_MIN. const auto fill_max = push_node(std::make_shared(vector_buffer_max.second, 0, float_min_constant)); // ReduceMax loop - const auto& max = push_node(std::make_shared(softmax->get_input_source_output(0), fill_max.second)); + const auto fill_max_tail = push_node(std::make_shared(softmax->get_input_source_output(0), inner_increment, float_min_constant)); + + const auto& max = push_node(std::make_shared(fill_max_tail.second, fill_max.second)); const auto horizon_max = push_node(std::make_shared(max.second)); // Markup of ReduceMax Loop - loop_manager->mark_loop(max.first, horizon_max.first, inner_work_amount, m_vector_size, 0, - std::vector{(*max.first)->get_input_port(0), - (*max.first)->get_input_port(1)}, - std::vector{(*max.first)->get_output_port(0)}); + const auto reduce_max_loop_id = loop_manager->mark_loop(fill_max_tail.first, horizon_max.first, inner_work_amount, inner_increment, 0, + std::vector{(*fill_max_tail.first)->get_input_port(0), + (*max.first)->get_input_port(1)}, + std::vector{(*max.first)->get_output_port(0)}); + const auto& reduce_max_loop_info = loop_manager->get_loop_info(reduce_max_loop_id); + const auto tail_size = inner_work_amount % inner_increment; + if (tail_size != 0) { + reduce_max_loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size); + } const auto broadcast_horizon_max = push_node(std::make_shared(horizon_max.second, broadcasted_dim)); const auto vector_buffer_sum = push_node(std::make_shared()); // Init value of vector buffer for ReduceSum is zero. 
@@ -75,38 +86,42 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { // Sub + Exp + ReduceSum Loop const auto sub = push_node(std::make_shared(softmax->get_input_source_output(0), broadcast_horizon_max.second)); const auto exp = push_node(std::make_shared(sub.second)); - const auto sum = push_node(std::make_shared(exp.second, fill_sum.second)); + const auto fill_sum_tail = push_node(std::make_shared(exp.second, inner_increment, zero_constant)); + const auto sum = push_node(std::make_shared(fill_sum_tail.second, fill_sum.second)); const auto horizon_sum = push_node(std::make_shared(sum.second)); - // Markup of ReduceMax Loop - loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, m_vector_size, 0, - std::vector{(*sub.first)->get_input_port(0), - (*sub.first)->get_input_port(1), - (*sum.first)->get_input_port(1)}, - std::vector{(*exp.first)->get_output_port(0), - (*sum.first)->get_output_port(0)}); + // Markup of ReduceSum Loop + const auto reduce_sum_loop_id = loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, inner_increment, 0, + std::vector{(*sub.first)->get_input_port(0), + (*sub.first)->get_input_port(1), + (*sum.first)->get_input_port(1)}, + std::vector{(*fill_sum_tail.first)->get_output_port(0), + (*sum.first)->get_output_port(0)}); + const auto& reduce_sum_loop_info = loop_manager->get_loop_info(reduce_sum_loop_id); + if (tail_size != 0) { + reduce_sum_loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size); + } // Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop const auto pow = push_node(std::make_shared(horizon_sum.second, -1.f)); const auto broadcast_pow = push_node(std::make_shared(pow.second, broadcasted_dim)); // Mul (pseudo-Divide loop) - const auto mul = push_node(std::make_shared(exp.second, broadcast_pow.second)); + const auto mul = push_node(std::make_shared(fill_sum_tail.second, broadcast_pow.second)); // Transfer original ExpressionPorts - linear_ir.replace_input((*max.first)->get_input_port(0), input_connector); + linear_ir.replace_input((*fill_max_tail.first)->get_input_port(0), input_connector); linear_ir.replace_input((*sub.first)->get_input_port(0), input_connector); linear_ir.replace_input(output_connector->get_consumers(), (*mul.first)->get_output_port_connector(0)); // Markup of Mul Loop - loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0, - std::vector{(*mul.first)->get_input_port(0), - (*mul.first)->get_input_port(1)}, + loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, inner_increment, 0, + std::vector{(*mul.first)->get_input_port(0), (*mul.first)->get_input_port(1)}, std::vector{(*mul.first)->get_output_port(0)}); // Update Loop info for outer loops - const auto entry_points = std::vector{(*max.first)->get_input_port(0), + const auto entry_points = std::vector{(*fill_max_tail.first)->get_input_port(0), (*sub.first)->get_input_port(0)}; const auto exit_points = std::vector{(*mul.first)->get_output_port(0)}; for (auto loop_id : softmax_loop_ids) { @@ -114,16 +129,6 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { } expr_it = linear_ir.erase(expr_it); // Remove Softmax - - /* =========================================== */ - - /* ============= Runtime Info ================ */ - - // For tail loop we should fill input of Max by float min and - // input of Sum by zero to avoid math incorrect calculations - // TODO [111383]: It should be covered via general pipeline (for example, via analyze in 
InsertTailLoop?) - max.second->input(0).get_rt_info()["set_fill"] = float_min_constant; - sum.second->input(0).get_rt_info()["set_fill"] = zero_constant; modified = true; } } diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp index ba036eca8011f9..8b764c5a44442e 100644 --- a/src/common/snippets/src/lowered/pass/split_loops.cpp +++ b/src/common/snippets/src/lowered/pass/split_loops.cpp @@ -5,6 +5,7 @@ #include "snippets/lowered/pass/split_loops.hpp" #include "snippets/lowered/pass/fuse_loops.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_manager.hpp" #include "snippets/snippets_isa.hpp" @@ -20,11 +21,14 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr; SplitLoops::SplitLoops() : Pass() {} -bool SplitLoops::can_be_split(const LoopInfoPtr& current, const LoopInfoPtr& parent) { - const auto current_dim_idx = current->get_dim_idx(); - const auto parent_dim_idx = parent->get_dim_idx(); +bool SplitLoops::can_be_split(const LoopInfoPtr& loop_to_split, const LoopInfoPtr& loop_to_fuse) { + const auto current_dim_idx = loop_to_split->get_dim_idx(); + const auto parent_dim_idx = loop_to_fuse->get_dim_idx(); + const auto& handlers = loop_to_split->handlers; const bool equal_dim_idxes = current_dim_idx != LoopInfo::UNDEFINED_DIM_IDX && current_dim_idx == parent_dim_idx; - return current->get_work_amount() == parent->get_work_amount() && current->get_increment() != parent->get_increment() && equal_dim_idxes; + const bool only_main_body = handlers[LoopInfo::FIRST_ITER].empty() && handlers[LoopInfo::LAST_ITER].empty(); + return loop_to_split->get_work_amount() == loop_to_fuse->get_work_amount() && + loop_to_split->get_increment() != loop_to_fuse->get_increment() && equal_dim_idxes && only_main_body; } bool SplitLoops::run(LinearIR& linear_ir) { @@ -59,12 +63,12 @@ bool SplitLoops::run(LinearIR& linear_ir) { continue; const auto parent_loop = loop_manager->get_loop_info(parent_loop_id); - if (can_be_split(loop, parent_loop)) { + const bool split_parent = parent_loop->get_increment() < loop->get_increment(); + const auto& loop_to_split = split_parent ? parent_loop : loop; + const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; + const auto& loop_to_fuse = !split_parent ? parent_loop : loop; + if (can_be_split(loop_to_split, loop_to_fuse)) { loop_was_split = true; - const bool split_parent = parent_loop->get_increment() < loop->get_increment(); - const auto& loop_to_split = split_parent ? parent_loop : loop; - const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; - const auto& loop_to_fuse = !split_parent ?
parent_loop : loop; loop_to_split->set_work_amount(loop_to_fuse->get_increment()); LinearIR::constExprIt loop_begin_pos, loop_end_pos; @@ -81,7 +85,15 @@ bool SplitLoops::run(LinearIR& linear_ir) { loop_to_split->get_dim_idx(), loop_to_split->get_entry_points(), loop_to_split->get_exit_points()); - loop_manager->get_loop_info(split_loop_id)->set_outer_splited_loop(true); + const auto& new_loop_info = loop_manager->get_loop_info(split_loop_id); + new_loop_info->set_outer_splited_loop(true); + new_loop_info->handlers = loop_to_split->handlers; + const auto work_amount = loop_to_fuse->get_work_amount(); + const auto increment = loop_to_fuse->get_increment(); + const auto tail_size = work_amount % increment; + if (tail_size != 0) { + new_loop_info->handlers[LoopInfo::LAST_ITER].register_pass(tail_size); + } break; } } diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp index 455c261cec5109..f5bcc910464841 100644 --- a/src/common/snippets/tests/src/lowered/pass/loop.cpp +++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp @@ -11,7 +11,8 @@ #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_load_store.hpp" #include "snippets/lowered/pass/insert_loops.hpp" -#include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/insert_specific_iterations.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" #include "snippets/lowered/pass/validate_loops.hpp" #include "snippets/shape_inference/shape_inference.hpp" @@ -38,7 +39,7 @@ static void init_linear_ir(const std::vector& in_shapes, Linea const auto in_shape0 = in_shapes[0].get_shape(); const auto in_shape1 = in_shapes[1].get_shape(); const auto inner_wa = std::max(*in_shape0.rbegin(), *in_shape1.rbegin()); - const auto inner_inc = vector_size; + const auto inner_inc = std::min(vector_size, inner_wa); const auto blocked_wa = block_size; const auto blocked_inc = 1; const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1)); @@ -46,7 +47,11 @@ static void init_linear_ir(const std::vector& in_shapes, Linea loop_manager->mark_loop(expr_it, std::next(expr_it), inner_wa, inner_inc, 0, loop_entry_points, loop_exit_points); loop_manager->mark_loop(expr_it, std::next(expr_it), blocked_wa, blocked_inc, 1, loop_entry_points, loop_exit_points); const auto loop_id = loop_manager->mark_loop(expr_it, std::next(expr_it), outer_wa, outer_inc, 1, loop_entry_points, loop_exit_points); - loop_manager->get_loop_info(loop_id)->set_outer_splited_loop(true); + const auto& outer_loop_info = loop_manager->get_loop_info(loop_id); + outer_loop_info->set_outer_splited_loop(true); + const auto outer_tail_size = outer_wa % outer_inc; + if (outer_tail_size != 0) + outer_loop_info->handlers[LinearIR::LoopManager::LoopInfo::LAST_ITER].register_pass(outer_tail_size); } static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr& config) { @@ -55,7 +60,7 @@ static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(); + pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(); pipeline.run(linear_ir); @@ -84,7 +89,7 @@ TEST(Snippets_TailProcessingTransformation, BlockedWOTail_OriginalPtrShifts) { auto config = std::make_shared(); config->disable(); - config->disable(); + config->disable(); config->disable(); 
apply_transformations(linear_ir, config); @@ -104,7 +109,7 @@ TEST(Snippets_TailProcessingTransformation, BlockedWOTail_CleanUpPtrShifts) { init_linear_ir({inputShape0, inputShape1}, linear_ir, 4); auto config = std::make_shared(); - config->disable(); + config->disable(); config->disable(); apply_transformations(linear_ir, config); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp index 1ea2418f995463..f5bfa19a7dcf66 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp @@ -32,19 +32,19 @@ class BrgemmCPU : public snippets::op::Brgemm { BrgemmCPU(const Output& A, const Output& B, const Type type, const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_c = 0, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Output& scratch, const Type type, const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_scratch = 0, const size_t offset_c = 0, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Type type, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_c, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Output& scratch, const Type type, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_scratch, const PortDescriptor& desc_c, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU() = default; void validate_and_infer_types() override; @@ -83,7 +83,7 @@ class BrgemmCPU : public snippets::op::Brgemm { size_t m_M_blk = 0; size_t m_K_blk = 0; size_t m_N_blk = 0; - float m_beta = 0.f; + float m_beta = 1.f; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index fc9aeeac10ee92..ab999b5425eb93 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -4,23 +4,25 @@ #include "brgemm_blocking.hpp" -#include "openvino/pass/pattern/matcher.hpp" -#include "openvino/pass/pattern/op/wrap_type.hpp" +#include 
"cpu_iter_handlers.hpp" #include "snippets/itt.hpp" -#include "snippets/utils.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include "snippets/lowered/pass/propagate_subtensors.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" - namespace ov { namespace intel_cpu { namespace pass { using LinearIR = snippets::lowered::LinearIR; using LoopPort = LinearIR::LoopManager::LoopPort; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; +using LoopInfo = LinearIR::LoopManager::LoopInfo; +using namespace ov::snippets::lowered::pass; BrgemmBlocking::BrgemmBlocking() : Pass() {} @@ -83,22 +85,22 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_m >= m) { *(in_0_subtensor.rbegin() + 1) = m; *(out_subtensor.rbegin() + 1) = m; - } else { - *(in_0_subtensor.rbegin() + 1) = block_size_m; - *(out_subtensor.rbegin() + 1) = block_size_m; - - auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true), - LoopPort(brgemm_expr->get_input_port(1), false)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), false); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; - loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits); + return; + } + + *(in_0_subtensor.rbegin() + 1) = block_size_m; + *(out_subtensor.rbegin() + 1) = block_size_m; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true), + LoopPort(brgemm_expr->get_input_port(1), false)}; + if (brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), false); + } else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; + loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits); }; auto apply_n_blocking = [&]() { @@ -107,22 +109,22 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_n >= n) { *in_1_subtensor.rbegin() = n; *out_subtensor.rbegin() = n; - } else { - *in_1_subtensor.rbegin() = block_size_n; - *out_subtensor.rbegin() = block_size_n; - - auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), false), - LoopPort(brgemm_expr->get_input_port(1), true)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), true); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; - loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits); + return; } + + *in_1_subtensor.rbegin() = block_size_n; + *out_subtensor.rbegin() = block_size_n; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), false), + LoopPort(brgemm_expr->get_input_port(1), true)}; + if 
(brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), true); + } else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); + } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; + loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits); }; auto apply_k_blocking = [&]() { @@ -132,59 +134,25 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_k >= k) { *in_0_subtensor.rbegin() = k; *(in_1_subtensor.rbegin() + 1) = k; - } else { - *in_0_subtensor.rbegin() = block_size_k; - *(in_1_subtensor.rbegin() + 1) = block_size_k; - - auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 0), - LoopPort(brgemm_expr->get_input_port(1), true, 1)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), false, 1); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector exits{LoopPort(brgemm_expr->get_output_port(0), false)}; - auto loop_id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits); - const auto loop_info = loop_manager->get_loop_info(loop_id); - - auto first_iter_handler = [](LinearIR& linear_ir, LinearIR::constExprIt loop_end_it) { - const auto loop_end = ov::as_type_ptr(loop_end_it->get()->get_node()); - OPENVINO_ASSERT(loop_end, "First loop iteraton handler must be called on LoopEnd expression"); - const auto loop_id = loop_end->get_id(); - const auto& loop_manager = linear_ir.get_loop_manager(); - const auto& loop_info = loop_manager->get_loop_info(loop_id); - const auto work_amount = loop_info->get_work_amount(); - const auto increment = loop_info->get_increment(); - if (work_amount <= increment) - return false; - - auto new_loop_range = snippets::lowered::pass::InsertTailLoop::copy_loop(linear_ir, loop_id); - const auto firt_iter_loop_end = ov::as_type_ptr(std::prev(new_loop_range.end())->get()->get_node()); - auto first_iter_loop_info = loop_manager->get_loop_info(firt_iter_loop_end->get_id()); - firt_iter_loop_end->set_work_amount(increment); - first_iter_loop_info->set_work_amount(increment); - firt_iter_loop_end->set_finalization_offsets(std::vector(loop_end->get_finalization_offsets().size(), 0)); - - const auto loop_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin())); - linear_ir.insert(loop_begin_it, new_loop_range.begin(), new_loop_range.end()); - - const auto new_work_amount = work_amount - increment; - loop_info->set_work_amount(new_work_amount); - loop_end->set_work_amount(new_work_amount); - - // Update original body's Brgemms with new beta parameter - for (auto expr_it = loop_begin_it; expr_it != loop_end_it; ++expr_it) { - const auto& expr_node = expr_it->get()->get_node(); - if (const auto brgemm = ov::as_type_ptr(expr_node)) { - brgemm->set_beta(1.f); - } - } - return true; - }; - loop_info->set_first_iter_handler(first_iter_handler); + brgemm->set_beta(0.f); + return; } + + *in_0_subtensor.rbegin() = block_size_k; + *(in_1_subtensor.rbegin() + 1) = block_size_k; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 0), + LoopPort(brgemm_expr->get_input_port(1), true, 1)}; + if (brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), false, 1); + } 
else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); + } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), false)}; + const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits); + const auto loop_info = loop_manager->get_loop_info(id); + loop_info->handlers[LoopInfo::FIRST_ITER].register_pass(0.f); }; apply_k_blocking(); @@ -194,6 +162,7 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { brgemm_expr->get_input_port_descriptor(0)->set_subtensor(in_0_subtensor); brgemm_expr->get_input_port_descriptor(1)->set_subtensor(in_1_subtensor); brgemm_expr->get_output_port_descriptor(0)->set_subtensor(out_subtensor); + modified = true; } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp new file mode 100644 index 00000000000000..41eda2273157d4 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp @@ -0,0 +1,29 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "cpu_iter_handlers.hpp" + +#include "snippets/lowered/loop_manager.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" + +namespace ov { +namespace intel_cpu { +namespace pass { +using LinearIR = snippets::lowered::LinearIR; +using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; + +SetBrgemmBeta::SetBrgemmBeta(float beta) : snippets::lowered::pass::RangedPass(), m_beta(beta) {} + +bool SetBrgemmBeta::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = expr_it->get(); + if (const auto brgemm = ov::as_type_ptr(expr->get_node())) { + brgemm->set_beta(m_beta); + } + } + return true; +} +} // namespace pass +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp new file mode 100644 index 00000000000000..b7a17fa57d3464 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/pass/iter_handler.hpp" + +namespace ov { +namespace intel_cpu { +namespace pass { +class SetBrgemmBeta : public snippets::lowered::pass::RangedPass { +public: + SetBrgemmBeta(float beta); + OPENVINO_RTTI("SetBrgemmBeta", "RangedPass") + bool run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) override; + +private: + float m_beta; +}; +} // namespace pass +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp index 77c78e31ca6b00..11988c5bd58541 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -20,7 +20,9 @@ std::vector> input_shapes{ {{1, 1, 32, 23}, {1, 1, 23, 68}}, {{1, 16,
384, 64}, {1, 16, 64, 384}}, {{1, 1, 100, 700}, {1, 1, 700, 100}}, + {{1, 1, 100, 1024}, {1, 1, 1024, 100}}, {{1, 1, 100, 2500}, {1, 1, 2500, 100}}, + {{1, 1, 100, 4500}, {1, 1, 4500, 100}}, }; static inline std::vector> quantized_precisions() {