From e240aee72aceb68a6e0cb992ccb1902160a0a276 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 14 Dec 2023 17:16:50 +0100 Subject: [PATCH] [Snippets] Specific loop iterations handler --- .../include/snippets/lowered/linear_ir.hpp | 6 +- .../include/snippets/lowered/loop_manager.hpp | 51 +++--- .../pass/insert_specific_iterations.hpp | 25 +++ .../snippets/lowered/pass/iter_handler.hpp | 75 +++++++++ .../include/snippets/lowered/pass/pass.hpp | 57 +++++++ .../lowered/pass/propagate_subtensors.hpp | 28 ++++ .../snippets/include/snippets/op/subgraph.hpp | 6 +- .../include/snippets/pass/manager.hpp | 7 +- src/common/snippets/src/generator.cpp | 12 +- .../snippets/src/lowered/loop_manager.cpp | 88 ++++++++-- .../src/lowered/pass/assign_registers.cpp | 4 +- .../snippets/src/lowered/pass/fuse_loops.cpp | 34 ++-- .../src/lowered/pass/insert_load_store.cpp | 10 +- .../pass/insert_specific_iterations.cpp | 106 ++++++++++++ .../src/lowered/pass/insert_tail_loop.cpp | 8 +- .../src/lowered/pass/iter_handler.cpp | 155 ++++++++++++++++++ src/common/snippets/src/lowered/pass/pass.cpp | 30 ++++ .../src/lowered/pass/propagate_subtensors.cpp | 148 +++++++++++++++++ .../lowered/pass/softmax_decomposition.cpp | 60 +++---- .../snippets/src/lowered/pass/split_loops.cpp | 14 +- src/common/snippets/src/op/subgraph.cpp | 2 + src/common/snippets/src/pass/manager.cpp | 1 - .../snippets/tests/src/lowered/pass/loop.cpp | 1 + .../x64/pass/lowered/brgemm_blocking.cpp | 85 +++++----- .../x64/pass/lowered/cpu_iter_handlers.cpp | 29 ++++ .../x64/pass/lowered/cpu_iter_handlers.hpp | 25 +++ .../snippets/matmul.cpp | 2 + 27 files changed, 908 insertions(+), 161 deletions(-) create mode 100644 src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp create mode 100644 src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp create mode 100644 src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp create mode 100644 src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp create mode 100644 src/common/snippets/src/lowered/pass/iter_handler.cpp create mode 100644 src/common/snippets/src/lowered/pass/propagate_subtensors.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 73d3ab573e6254..4d2deae7a8bee5 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -70,9 +70,9 @@ class LinearIR { LinearIR::container::const_iterator end, ExressionMap& expression_map); - const container& get_ops() const {return m_expressions; } - const io_container& get_IO_ops() const {return m_io_expressions; } - Config get_config() {return m_config; } + const container& get_ops() const { return m_expressions; } + const io_container& get_IO_ops() const { return m_io_expressions; } + Config get_config() const { return m_config; } void set_loop_depth(size_t loop_depth) { m_config.m_loop_depth = loop_depth; } const ExpressionPtr& get_expr_by_node(const std::shared_ptr& n) const; diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp index 93d1620f5fdbe7..f19adbc72e3e42 100644 --- 
a/src/common/snippets/include/snippets/lowered/loop_manager.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp @@ -4,11 +4,12 @@ #pragma once -#include "linear_ir.hpp" - #include #include +#include "linear_ir.hpp" +#include "pass/iter_handler.hpp" +#include "pass/pass.hpp" #include "port_descriptor.hpp" namespace ov { @@ -45,9 +46,7 @@ class LinearIR::LoopManager { LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - bool outer_splited_loop = false) - : m_work_amount(work_amount), m_increment(increment), - m_entry_points(entries), m_exit_points(exits), m_outer_splited_loop(outer_splited_loop) {} + bool outer_splited_loop = false); LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, @@ -63,19 +62,6 @@ class LinearIR::LoopManager { const std::vector& get_exit_points() const; bool get_outer_splited_loop() const; - /** - * \brief Inserts a separate body for first loop iteration processing if needed. - * Can also modify both main and first iter loop bodies. - * TODO: replace this temporary solution when ticket 119851 is implemented - * - * \param linear_ir LIR which should be modified - * \param loop_end_it iterator on LoopEnd expression for which the handler is called - * - * \return bool value which indicates whether the linear_ir was changed or not. - */ - using FirstIterHandler = std::function; - const FirstIterHandler& get_first_iter_handler() const; - // Sets dim_idx to all entry and exit points void set_dim_idx(size_t dim_idx); void set_work_amount(size_t work_amount); @@ -83,7 +69,9 @@ class LinearIR::LoopManager { void set_entry_points(std::vector entry_points); void set_exit_points(std::vector exit_points); void set_outer_splited_loop(bool outer_splited_loop); - void set_first_iter_handler(FirstIterHandler handler); + + enum {FIRST_ITER, MAIN_BODY, LAST_ITER}; + std::vector handlers; private: size_t m_work_amount = 0; @@ -96,7 +84,6 @@ class LinearIR::LoopManager { std::vector m_exit_points = {}; // True if this Loop is outer Loop for nested Loops that splits the same dimension bool m_outer_splited_loop = false; - FirstIterHandler m_first_iter_handler = nullptr; }; using LoopInfoPtr = std::shared_ptr; @@ -118,16 +105,22 @@ class LinearIR::LoopManager { size_t mark_loop(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t work_amount, - size_t work_amount_increment, + size_t increment, size_t dim_idx, const std::vector& entries, - const std::vector& exits) { - const auto loop_info = std::make_shared(work_amount, work_amount_increment, entries, exits); + const std::vector& exits, + bool set_default_handlers = true) { + if (increment > work_amount) + increment = work_amount; + const auto loop_info = std::make_shared(work_amount, increment, entries, exits); loop_info->set_dim_idx(dim_idx); const auto loop_id = this->add_loop_info(loop_info); for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); } + if (set_default_handlers) { + set_default_loop_handlers(loop_info); + } return loop_id; } @@ -137,12 +130,18 @@ class LinearIR::LoopManager { size_t work_amount, size_t increment, const std::vector& entries, - const std::vector& exits) { + const std::vector& exits, + bool set_default_handlers = true) { + if (increment > work_amount) + increment = work_amount; const auto loop_info = std::make_shared(work_amount, increment, entries, exits); const auto loop_id = this->add_loop_info(loop_info); for 
(auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); } + if (set_default_handlers) { + set_default_loop_handlers(loop_info); + } return loop_id; } @@ -197,6 +196,7 @@ class LinearIR::LoopManager { size_t loop_id, bool loop_ops_inserted = false); LoopPort get_loop_port_by_expr_port(const ExpressionPort& expr_port, const size_t loop_id); + static void set_default_loop_handlers(const LoopInfoPtr& loop_info); private: static void get_io_loop_ports(LinearIR::constExprIt loop_begin_pos, @@ -207,6 +207,9 @@ class LinearIR::LoopManager { static void fuse_loop_ports(std::vector& exit_points, std::vector& entry_points, size_t loop_id); + static std::vector fuse_loop_handlers( + std::vector& lhs, + std::vector& rhs); /* ===== The methods for work with Loop IDs of Expression ===== */ // Notes: diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp new file mode 100644 index 00000000000000..099b43a54b2d6b --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "pass.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +class InsertSpecificIterations : public Pass { +public: + OPENVINO_RTTI("InsertSpecificIterations", "Pass") + bool run(LinearIR& linear_ir) override; + + static LinearIR::container copy_loop(const LinearIR& linear_ir, const size_t loop_id); +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp new file mode 100644 index 00000000000000..a65e4a3bbabaa6 --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp @@ -0,0 +1,75 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include "snippets/op/loop.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +class SetSingleIterationWithWorkAmount : public pass::SubgraphPass { +public: + SetSingleIterationWithWorkAmount(size_t work_amount); + OPENVINO_RTTI("SetSingleIterationWithWorkAmount", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_work_amount; +}; + +class UpdateMemoryAccessOps : public pass::SubgraphPass { +public: + UpdateMemoryAccessOps(size_t count); + OPENVINO_RTTI("UpdateMemoryAccessOps", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_count; +}; + +class ReduceWorkAmount : public pass::SubgraphPass { +public: + ReduceWorkAmount(size_t reduce_value); + OPENVINO_RTTI("ReduceWorkAmount", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_reduce_value; +}; + +class ZeroFinalizationOffsets : public pass::SubgraphPass { +public: + OPENVINO_RTTI("ZeroFinalizationOffsets", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; +}; + 
+class SetFillOffset : public pass::SubgraphPass { +public: + SetFillOffset(size_t offset); + OPENVINO_RTTI("SetFillOffset", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_offset; +}; + +class TransformInnerSplitLoop : public pass::SubgraphPass { +public: + TransformInnerSplitLoop(size_t tail_size); + OPENVINO_RTTI("TransformInnerSplitLoop", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_tail_size; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/lowered/pass/pass.hpp b/src/common/snippets/include/snippets/lowered/pass/pass.hpp index 177056d2984d25..bb49f6b3202c4e 100644 --- a/src/common/snippets/include/snippets/lowered/pass/pass.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/pass.hpp @@ -80,6 +80,63 @@ class PassPipeline { std::vector> m_passes; }; +class SubgraphPass { +public: + SubgraphPass() = default; + virtual ~SubgraphPass() = default; + // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface, + // so the standard OPENVINO_RTTI(...) macros could be used in derived classes. + _OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() { + static ::ov::DiscreteTypeInfo type_info_static {"SubgraphPass"}; + type_info_static.hash(); + return type_info_static; + } + + virtual const DiscreteTypeInfo& get_type_info() const { + return get_type_info_static(); + } + + const char* get_type_name() const { + return get_type_info().name; + } + + virtual bool run(const lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) = 0; +}; + +class SubgraphPassPipeline { +public: + using PositionedSubgraphPassLowered = snippets::pass::PositionedPass; + + SubgraphPassPipeline(); + SubgraphPassPipeline(const std::shared_ptr& pass_config); + + void run(const lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) const; + const std::vector>& get_passes() const { return m_passes; } + bool empty() const { return m_passes.empty(); } + + void register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass); + void register_pass(const std::shared_ptr& pass); + + template + void register_pass(Args&&... args) { + static_assert(std::is_base_of::value, "Pass not derived from lowered::SubgraphPass"); + auto pass = std::make_shared(std::forward(args)...); + register_pass(pass); + } + template::value, bool>() = true> + void register_pass(const snippets::pass::PassPosition& position, Args&&... 
args) {
+        static_assert(std::is_base_of<SubgraphPass, T>::value, "Pass not derived from lowered::SubgraphPass");
+        auto pass = std::make_shared<T>(std::forward<Args>(args)...);
+        register_pass(position, pass);
+    }
+
+    void register_positioned_passes(const std::vector<PositionedSubgraphPassLowered>& pos_passes);
+
+private:
+    std::shared_ptr<snippets::pass::PassConfig> m_pass_config;
+    std::vector<std::shared_ptr<SubgraphPass>> m_passes;
+};
+
 } // namespace pass
 } // namespace lowered
 } // namespace snippets
diff --git a/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
new file mode 100644
index 00000000000000..4d4d3df84cf60d
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
@@ -0,0 +1,28 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+class UpdateSubtensors : public pass::SubgraphPass {
+public:
+    UpdateSubtensors(size_t tail_size);
+    OPENVINO_RTTI("UpdateSubtensors", "SubgraphPass")
+    bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+
+private:
+    size_t m_tail_size;
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
index c8ae7929ea2744..46530409426a32 100644
--- a/src/common/snippets/include/snippets/op/subgraph.hpp
+++ b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -5,11 +5,12 @@
 #pragma once

 #include
-
 #include
 #include
-#include "openvino/op/op.hpp"
+
+#include "openvino/core/rt_info.hpp"
+#include "openvino/op/op.hpp"
+#include "snippets/generator.hpp"
 #include "snippets/pass/manager.hpp"
 #include "snippets/shape_inference/shape_inference.hpp"
 #include "snippets/lowered/pass/pass.hpp"
diff --git a/src/common/snippets/include/snippets/pass/manager.hpp b/src/common/snippets/include/snippets/pass/manager.hpp
index a9e3c2aec37498..3867366f1b399d 100644
--- a/src/common/snippets/include/snippets/pass/manager.hpp
+++ b/src/common/snippets/include/snippets/pass/manager.hpp
@@ -10,9 +10,6 @@
 #include "openvino/pass/pass.hpp"
 #include "openvino/pass/validate.hpp"

-#include
-
-
 namespace ov {
 namespace snippets {
 namespace pass {
@@ -36,7 +33,7 @@ class Manager : public ov::pass::Manager {
     std::shared_ptr register_pass(const PassPosition& position, Args&&...
args) { static_assert(std::is_base_of::value, "Attempt to insert pass that is not derived from PassBase"); auto pass = std::make_shared(std::forward(args)...); - auto rc = insert_pass_instance(position, pass); + auto rc = insert_pass_instance(position, pass); rc->set_pass_config(m_pass_config); if (!m_pass_config->is_enabled()) { m_pass_config->disable(); @@ -48,7 +45,7 @@ class Manager : public ov::pass::Manager { void register_positioned_passes(const std::vector& pos_passes); protected: - std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass); + std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass); }; } // namespace pass diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index c0a2583aef23b4..e5242feaeada4f 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -4,28 +4,28 @@ #include "snippets/generator.hpp" +#include "snippets/itt.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/pass/assign_registers.hpp" #include "snippets/lowered/pass/cleanup_loop_offsets.hpp" +#include "snippets/lowered/pass/insert_specific_iterations.hpp" #include "snippets/lowered/pass/insert_tail_loop.hpp" #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" - +#include "snippets/lowered/pass/pass.hpp" #include "snippets/op/kernel.hpp" -#include "snippets/itt.hpp" - namespace ov { namespace snippets { void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, const void* compile_params) const { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate") OV_ITT_TASK_CHAIN(GENERATE, ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::Transformations") - if (!target->is_supported()) - OPENVINO_THROW("unsupported architecture for code generation"); + OPENVINO_ASSERT(target->is_supported(), "unsupported architecture for code generation"); std::function& op)> reg_type_mapper = [&](const std::shared_ptr& op) -> opRegType { return get_op_reg_type(op); }; + lowered::pass::PassPipeline lowered_pipeline; // Note: the order of all passes in this pipeline must not be changed since they have hard dependencies // 1. InsertTailLoop must be called after AssignRegisters since tail loop expressions must have the same @@ -35,7 +35,7 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c // 3. 
OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets // since CleanupLoopOffsets can't handle loops with evaluate_once = true lowered_pipeline.register_pass(reg_type_mapper); - lowered_pipeline.register_pass(); + lowered_pipeline.register_pass(); lowered_pipeline.register_pass(); lowered_pipeline.register_pass(); lowered_pipeline.run(linear_ir); diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index e7e83361ee0a39..88f74782620e88 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -5,6 +5,8 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/expression.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" +#include "snippets/lowered/pass/propagate_subtensors.hpp" #include "snippets/utils.hpp" #include "openvino/core/graph_util.hpp" @@ -37,6 +39,19 @@ std::shared_ptr LoopPort::clone_with_new_expr(const ExpressionPtr& new return new_loop_port; } +LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, + size_t increment, + const std::vector& entries, + const std::vector& exits, + bool outer_splited_loop) + : m_work_amount(work_amount), + m_increment(increment), + m_entry_points(entries), + m_exit_points(exits), + m_outer_splited_loop(outer_splited_loop) { + handlers.resize(3); +} + LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, @@ -51,6 +66,7 @@ LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, m_entry_points.emplace_back(port); for (const auto& port : exits) m_exit_points.emplace_back(port); + handlers.resize(3); } std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr_map) const { @@ -68,7 +84,8 @@ std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr const auto& new_entry_points = clone_loop_ports(m_entry_points); const auto& new_exit_points = clone_loop_ports(m_exit_points); - return std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_outer_splited_loop); + auto new_info = std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_outer_splited_loop); + return new_info; } size_t LoopInfo::get_work_amount() const { @@ -91,10 +108,6 @@ bool LoopInfo::get_outer_splited_loop() const { return m_outer_splited_loop; } -const LoopInfo::FirstIterHandler& LoopInfo::get_first_iter_handler() const { - return m_first_iter_handler; -} - size_t LinearIR::LoopManager::LoopInfo::get_dim_idx() const { OPENVINO_ASSERT(!m_entry_points.empty(), "Loop info must have at least one entry point"); auto equal_dim_idxes = [&](const LinearIR::LoopManager::LoopPort& p) { @@ -137,10 +150,6 @@ void LoopInfo::set_outer_splited_loop(bool outer_splited_loop) { m_outer_splited_loop = outer_splited_loop; } -void LoopInfo::set_first_iter_handler(LoopInfo::FirstIterHandler first_iter_handler) { - m_first_iter_handler = std::move(first_iter_handler); -} - bool operator==(const LinearIR::LoopManager::LoopPort& lhs, const LinearIR::LoopManager::LoopPort& rhs) { if (&lhs == &rhs) return true; @@ -248,6 +257,17 @@ LinearIR::LoopManager::LoopPort LinearIR::LoopManager::get_loop_port_by_expr_por : get_loop_port(loop_info->get_exit_points()); } +void LinearIR::LoopManager::set_default_loop_handlers(const LoopInfoPtr& loop_info) { + const auto tail_size = loop_info->get_work_amount() % loop_info->get_increment(); + if (tail_size != 0) { + 
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<pass::SetSingleIterationWithWorkAmount>(tail_size);
+        loop_info->handlers[LoopInfo::LAST_ITER].register_pass<pass::UpdateMemoryAccessOps>(tail_size);
+        loop_info->handlers[LoopInfo::LAST_ITER].register_pass<pass::UpdateSubtensors>(tail_size);
+        loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<pass::ReduceWorkAmount>(tail_size);
+        loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<pass::ZeroFinalizationOffsets>();
+    }
+}
+
 void LinearIR::LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
                                               LinearIR::constExprIt loop_end_pos,
                                               std::vector<ExpressionPort> &entries,
@@ -330,18 +350,16 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
     }

     for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
-        if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) {
+        OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
+        const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
+        if (subtensor_value == PortDescriptor::ServiceDimensions::FULL_DIM) {
             continue;
         }

         OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
-        const auto work_amount =
-                loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx)
-                                             : 0;
-        const auto work_amount_increment =
-                loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx)
-                                                : (dim_idx == 0 ? vector_size : 1);
-        mark_loop(loop_begin_pos, loop_end_pos, work_amount, work_amount_increment, dim_idx, loop_entry_points, loop_exit_points);
+        const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
+        const auto increment = subtensor_value;
+        mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
     }
 }

@@ -399,6 +417,15 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target,
     loop_info->set_entry_points(new_entries);
     loop_info->set_exit_points(new_exits);

+    loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers);
+    // Since fusion can be called for broadcastable loops (one of the loops has work_amount = increment = 1),
+    // the maximum value is set for the fused loop
+    loop_info->set_work_amount(std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()));
+    loop_info->set_increment(std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()));
+    // If one of the Loops is outer for nested loops that split the same dimension,
+    // the new common Loop keeps this status after fusion
+    loop_info->set_outer_splited_loop(loop_info_upper->get_outer_splited_loop() || loop_info_lower->get_outer_splited_loop());
+
     const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper;
     const auto& to = fuse_into_upper ?
loop_id_upper : loop_id_lower; for (auto it = loop_begin_target; it != loop_end_target; ++it) { @@ -409,6 +436,31 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, remove_loop_info(from); } +std::vector LinearIR::LoopManager::fuse_loop_handlers( + std::vector& from, + std::vector& to) { + const auto min_size = std::min(from.size(), to.size()); + std::vector merged_handlers; + merged_handlers.resize(min_size); + for (size_t i = 0; i < min_size; ++i) { + merged_handlers[i] = from[i]; + const auto& res_passes = merged_handlers[i].get_passes(); + for (const auto& pass : to[i].get_passes()) { + auto pred = [&pass](const std::shared_ptr& p) { + return p->get_type_info() == pass->get_type_info(); + }; + if (std::find_if(res_passes.begin(), res_passes.end(), pred) == res_passes.end()) { + merged_handlers[i].register_pass(pass); + } + } + } + auto& handlers_with_larger_size = from.size() > to.size() ? from : to; + for (size_t i = min_size; i < handlers_with_larger_size.size(); ++i) { + merged_handlers.emplace_back(std::move(handlers_with_larger_size[i])); + } + return merged_handlers; +} + void LinearIR::LoopManager::fuse_loop_ports(std::vector& exit_points, std::vector& entry_points, size_t loop_id) { @@ -543,7 +595,7 @@ void LinearIR::LoopManager::insert_loop_id(const ExpressionPtr& expr, size_t new OPENVINO_ASSERT(m_map.count(new_id) == 1, "Failed marking expression by Loop ID: the Loop with this ID hasn't registered"); auto& loop_ids = expr->m_loop_ids; OPENVINO_ASSERT(std::find(loop_ids.cbegin(), loop_ids.cend(), new_id) == loop_ids.cend(), - "Expression cannot have several the same Loop IDs"); + "Expression cannot have several identical Loop IDs"); auto insert_it = before ? loop_ids.cbegin() : loop_ids.cend(); if (target_id != SIZE_MAX) { insert_it = std::find(loop_ids.cbegin(), loop_ids.cend(), target_id); diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index d49cf8d63155a7..fbeef30888fa85 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -80,10 +80,10 @@ bool AssignRegisters::run(LinearIR& linear_ir) { for (const auto& tensor : input_expr_input_tensors) { const auto parent_expr = tensor->get_source().get_expr(); if (ov::is_type(parent_expr->get_node())) { - manually_assigned_vecs[tensor] = static_cast(accumulator_reg); if (ov::is_type(parent_expr->get_input_port_connector(0)->get_source().get_expr()->get_node())) { + manually_assigned_vecs[tensor] = static_cast(accumulator_reg); manually_assigned_vecs[parent_expr->get_input_port_connector(0)] = static_cast(accumulator_reg); - } + } } } const auto& output_tensor = expr->get_output_port_connector(0); diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index 1738d6d8fe9574..dc7dac6eed4095 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -44,20 +44,29 @@ bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_m } bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& loop_target) { - auto current_work_amount = loop_current->get_work_amount(); - auto target_work_amount = loop_target->get_work_amount(); - // Loop fusion is supported only if Loops have equal increments and the equal/broadcastable work amounts. 
+    const auto current_work_amount = loop_current->get_work_amount();
+    const auto target_work_amount = loop_target->get_work_amount();
+    const auto current_increment = loop_current->get_increment();
+    const auto target_increment = loop_target->get_increment();
+    // Loop fusion is supported only if Loops have equal/broadcastable increments and work amounts.
     // Note: For example, Broadcastable work amounts are possible in the following case:
     //          Relu_0 [16x1]    Relu_1 [16x128]
     //                  \            /
     //                  Add [16x128]
     // Because of expression order in linear IR and work of MarkLoop algorithm, there are 2 Inner Loops:
-    //  - Relu_0 with work amount `1` and increment `vector size`
+    //  - Relu_0 with work amount `1` and increment `1`
     //  - Relu_1 and Add with work amount `128` and increment `vector size`
     // We can fuse them into one Loop with work amount `128` and increment `vector size`
-    const auto supported_work_amount = current_work_amount == target_work_amount || current_work_amount == 1 || target_work_amount == 1;
-    const auto supported_increment = loop_current->get_increment() == loop_target->get_increment();
-    return supported_work_amount && supported_increment;
+
+    // WA: we can't fuse two loops when one of them has a first-iteration handler and the other one hasn't, because
+    // in this case the main/tail body handlers of the loop without the first-iteration handler must be reset
+    // with new parameters (e.g. tail size). This logic is not implemented yet, so fusion of such loops is skipped.
+    const bool first_iter_handlers_match = loop_current->handlers[LoopManager::LoopInfo::FIRST_ITER].empty() ==
+                                           loop_target->handlers[LoopManager::LoopInfo::FIRST_ITER].empty();
+    const bool equal_parameters = current_work_amount == target_work_amount && current_increment == target_increment;
+    const bool current_bcastable = current_work_amount == 1 && current_increment == 1;
+    const bool target_bcastable = target_work_amount == 1 && target_increment == 1;
+    return first_iter_handlers_match && (equal_parameters || current_bcastable || target_bcastable);
 }

 void FuseLoops::move(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id,
@@ -124,12 +133,6 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo
     LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos;
     loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos);
     loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, target_loop_id, current_loop_id, false);
-    // Update work_amount for Loop
(increment is constant because increments must be the identical for fusion): - loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount())); - // If one of the Loops is outer for nested loops that splits the same dimension, - // after fusion new common Loop save this status - loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop()); const auto insertion_place = current_loop_end_pos; const auto is_move_needed = insertion_place != target_loop_begin_pos; diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index 75e70c9c553c88..492eb8d17682b1 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -20,13 +20,13 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr; InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {} size_t InsertLoadStore::get_count(const PortDescriptorPtr& port_desc) const { - const auto layout = port_desc->get_layout(); - const auto shape = port_desc->get_shape(); + const auto& layout = port_desc->get_layout(); + const auto& shape = port_desc->get_shape(); // Find last dimension by layout - const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); + const auto& last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); OPENVINO_ASSERT(last_dim_idx != layout.end() && *last_dim_idx < shape.size(), "Load/Store expression have incorrect layout"); - const auto dim = shape[*last_dim_idx]; - return dim == 1 ? 1 : m_vector_size; + const auto& dim = shape[*last_dim_idx]; + return std::min(dim, m_vector_size); } bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp new file mode 100644 index 00000000000000..4dd6cfa26745b4 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -0,0 +1,106 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/insert_specific_iterations.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +LinearIR::container InsertSpecificIterations::copy_loop(const LinearIR& linear_ir, const size_t loop_id) { + const auto& loop_manager = linear_ir.get_loop_manager(); + LinearIR::constExprIt loop_begin_pos, loop_end_pos; + loop_manager->get_loop_bounds(linear_ir, loop_id, loop_begin_pos, loop_end_pos, true); + ExressionMap expression_map; + const auto& loop_copy_range = LinearIR::deep_copy_range(loop_begin_pos, std::next(loop_end_pos), expression_map); + + const auto original_loop_info = loop_manager->get_loop_info(loop_id); + std::vector new_entry_points, new_exit_points; + // Clone loop ports from original loop info to new loop info + for (const auto& entry : original_loop_info->get_entry_points()) + new_entry_points.push_back(*entry.clone_with_new_expr(expression_map[entry.expr_port->get_expr().get()])); + for (const auto& exit : original_loop_info->get_exit_points()) 
+        new_exit_points.push_back(*exit.clone_with_new_expr(expression_map[exit.expr_port->get_expr().get()]));
+
+    for (const auto& elem : expression_map) {
+        const auto expr = elem.first->shared_from_this();
+        const auto& new_expr = elem.second;
+        // Loop begin/end ops can't be loop ports
+        if (ov::is_type<op::LoopBase>(expr->get_node()))
+            continue;
+        // Update loop info of all outer loops with new loop ports
+        const auto outer_loop_ids = LinearIR::LoopManager::get_outer_expr_loops(expr, loop_id);
+        for (size_t i = 0; i < expr->get_input_count(); ++i)
+            loop_manager->update_loops_port(outer_loop_ids, expr->get_input_port(i), {expr->get_input_port(i), new_expr->get_input_port(i)}, true);
+        for (size_t i = 0; i < expr->get_output_count(); ++i)
+            loop_manager->update_loops_port(outer_loop_ids, expr->get_output_port(i), {expr->get_output_port(i), new_expr->get_output_port(i)}, false);
+    }
+
+    const auto new_loop_begin_pos = loop_copy_range.begin();
+    const auto new_loop_end_pos = loop_copy_range.end();
+    const auto new_id = loop_manager->replace_with_new_loop(linear_ir,
+                                                            std::next(new_loop_begin_pos),
+                                                            std::prev(new_loop_end_pos),
+                                                            original_loop_info->get_work_amount(),
+                                                            original_loop_info->get_increment(),
+                                                            new_entry_points,
+                                                            new_exit_points,
+                                                            loop_id);
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(std::prev(new_loop_end_pos)->get()->get_node());
+    OPENVINO_ASSERT(loop_end, "Cloned Loop does not contain LoopEnd op at the expected place.");
+    loop_end->set_id(new_id);
+    return loop_copy_range;
+}
+
+bool InsertSpecificIterations::run(LinearIR& linear_ir) {
+    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertSpecificIterations")
+    const auto& loop_manager = linear_ir.get_loop_manager();
+
+    bool modified = false;
+    for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) {
+        const auto& expr = *expr_it;
+        const auto node = expr->get_node();
+        const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+        if (!loop_end)
+            continue;
+
+        std::vector<std::reference_wrapper<const SubgraphPassPipeline>> pipelines_to_run;
+        for (const auto& pipeline : loop_manager->get_loop_info(loop_end->get_id())->handlers) {
+            if (!pipeline.empty())
+                pipelines_to_run.emplace_back(pipeline);
+        }
+        if (pipelines_to_run.empty())
+            continue;
+
+        const auto main_body_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin()));
+        const auto main_body_end_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end));
+        auto copy_and_run_specific_handlers = [&](const SubgraphPassPipeline& handlers) {
+            const auto& cloned_body = copy_loop(linear_ir, loop_end->get_id());
+            linear_ir.insert(main_body_begin_it, cloned_body.begin(), cloned_body.end());
+            handlers.run(linear_ir, cloned_body.begin(), std::prev(cloned_body.end()));
+        };
+
+        for (size_t i = 0; i < pipelines_to_run.size() - 1; ++i) {
+            copy_and_run_specific_handlers(pipelines_to_run[i].get());
+        }
+        // The last pipeline is run on the original body to avoid an unnecessary copy
+        pipelines_to_run.back().get().run(linear_ir, main_body_begin_it, main_body_end_it);
+        modified = true;
+    }
+    return modified;
+}
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
+
diff --git a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp
index cc685c1851157a..c8bfffc3360722 100644
--- a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp
+++ b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp
@@ -330,10 +330,6 @@ bool InsertTailLoop::run(LinearIR& linear_ir) {
             continue;

         const auto loop_info = loop_manager->get_loop_info(loop_end->get_id());
-        const auto& first_iter_handler = loop_info->get_first_iter_handler();
-        if (first_iter_handler) {
-            modified |= first_iter_handler(linear_ir, expr_it);
-        }

         const auto work_amount = loop_end->get_work_amount();
         const auto increment = loop_end->get_increment();
diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp
new file mode 100644
index 00000000000000..31bda1589b01ce
--- /dev/null
+++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp
@@ -0,0 +1,155 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/lowered/pass/iter_handler.hpp"
+
+#include "snippets/itt.hpp"
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/loop_manager.hpp"
+#include "snippets/lowered/pass/propagate_subtensors.hpp"
+#include "snippets/snippets_isa.hpp"
+#include "snippets/utils.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+SetSingleIterationWithWorkAmount::SetSingleIterationWithWorkAmount(size_t work_amount)
+    : SubgraphPass(),
+      m_work_amount(work_amount) {}
+
+bool SetSingleIterationWithWorkAmount::run(const LinearIR& linear_ir,
+                                           LinearIR::constExprIt begin,
+                                           LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    if (loop_end->get_work_amount() == m_work_amount && loop_end->get_increment() == m_work_amount)
+        return false;
+    loop_end->set_work_amount(m_work_amount);
+    loop_end->set_increment(m_work_amount);
+    loop_info->set_work_amount(m_work_amount);
+    loop_info->set_increment(m_work_amount);
+    return true;
+}
+
+UpdateMemoryAccessOps::UpdateMemoryAccessOps(size_t count) : SubgraphPass(), m_count(count) {}
+
+bool UpdateMemoryAccessOps::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    for (auto expr_it = std::next(begin); expr_it != end; expr_it++) {
+        // Skip inner Loops
+        const auto loop_begin = ov::as_type_ptr<op::LoopBegin>(expr_it->get()->get_node());
+        if (loop_begin) {
+            expr_it = linear_ir.find(expr_it, end, linear_ir.get_expr_by_node(loop_begin->get_loop_end()));
+            continue;
+        }
+
+        const auto& node = expr_it->get()->get_node();
+        if (const auto memory_access = ov::as_type_ptr<op::MemoryAccess>(node)) {
+            for (const auto p : memory_access->get_memory_access_input_ports()) {
+                const auto port = p.first;
+                if (memory_access->get_input_count(port) > 1) {
+                    memory_access->set_input_count(m_count, port);
+                }
+            }
+            for (const auto p : memory_access->get_memory_access_output_ports()) {
+                const auto port = p.first;
+                if (memory_access->get_output_count(port) > 1) {
+                    memory_access->set_output_count(m_count, port);
+                }
+            }
+        }
+    }
+    return true;
+}
+
+ReduceWorkAmount::ReduceWorkAmount(size_t reduce_value) : SubgraphPass(), m_reduce_value(reduce_value) {}
+
+bool ReduceWorkAmount::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+    const auto work_amount = loop_end->get_work_amount();
+    const auto new_work_amount = work_amount - m_reduce_value;
+    loop_end->set_work_amount(new_work_amount);
+
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    loop_info->set_work_amount(new_work_amount);
+    return true;
+}
+
+bool ZeroFinalizationOffsets::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+    loop_end->set_finalization_offsets(std::vector<int64_t>(loop_end->get_finalization_offsets().size(), 0));
+    return true;
+}
+
+SetFillOffset::SetFillOffset(size_t offset) : SubgraphPass(), m_offset(offset) {}
+
+bool SetFillOffset::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    for (auto expr_it = std::next(begin); expr_it != end; expr_it++) {
+        const auto& node = expr_it->get()->get_node();
+        if (const auto fill = ov::as_type_ptr<op::Fill>(node)) {
+            fill->set_offset(m_offset);
+        }
+    }
+    return true;
+}
+
+TransformInnerSplitLoop::TransformInnerSplitLoop(size_t tail_size) : SubgraphPass(), m_tail_size(tail_size) {}
+
+bool TransformInnerSplitLoop::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    const auto current_dim_idx = loop_info->get_dim_idx();
+    OPENVINO_ASSERT(current_dim_idx != LinearIR::LoopManager::LoopInfo::UNDEFINED_DIM_IDX,
+                    "Outer split loop unexpectedly iterates by several dimension indices");
+
+    bool modified = false;
+    for (auto it = std::next(begin); it != end; ++it) {
+        const auto& expr = *it;
+        const auto inner_loop_end = ov::as_type_ptr<op::LoopEnd>(expr->get_node());
+        if (!inner_loop_end)
+            continue;
+        const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id());
+        const auto inner_dim_idx = inner_loop_info->get_dim_idx();
+        if (inner_dim_idx != current_dim_idx)
+            continue;
+        const auto inner_loop_begin = inner_loop_end->get_loop_begin();
+        const auto inner_tail_work_amount = static_cast<int64_t>(inner_loop_end->get_work_amount());
+        const auto inner_tail_increment = inner_loop_end->get_increment();
+        auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets();
+        for (auto& offset : inner_finalization_offsets) {
+            offset = offset / inner_tail_work_amount * static_cast<int64_t>(m_tail_size);
+        }
+        inner_loop_end->set_work_amount(m_tail_size);
+        // TODO: if a new m_tail_size increment is set, all last-iteration handlers must be updated with the new tail value.
+        // Alternatively, we could avoid splitting loops when the inner loop's increment is not equal to 1
+        inner_loop_end->set_increment(std::min(inner_tail_increment, m_tail_size));
+        inner_loop_end->set_finalization_offsets(inner_finalization_offsets);
+        const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin));
+        const auto inner_loop_end_it = std::next(end);
+        OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!");
+        const auto& last_iter_handlers = inner_loop_info->handlers[LinearIR::LoopManager::LoopInfo::LAST_ITER];
+        last_iter_handlers.run(linear_ir, inner_loop_begin_it, inner_loop_end_it);
+        modified = true;
+    }
+    return modified;
+}
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
+
diff --git
a/src/common/snippets/src/lowered/pass/pass.cpp b/src/common/snippets/src/lowered/pass/pass.cpp index 70a05fc30be147..f69578e6aab9ba 100644 --- a/src/common/snippets/src/lowered/pass/pass.cpp +++ b/src/common/snippets/src/lowered/pass/pass.cpp @@ -41,6 +41,36 @@ void PassPipeline::register_positioned_passes(const std::vector()) {} +SubgraphPassPipeline::SubgraphPassPipeline(const std::shared_ptr& pass_config) : m_pass_config(pass_config) { + OPENVINO_ASSERT(m_pass_config != nullptr, "PassConfig is not initialized!"); +} + +void SubgraphPassPipeline::register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass) { + OPENVINO_ASSERT(pass != nullptr, "SubgraphPassPipeline cannot register empty pass!"); + m_passes.insert(position.get_insert_position(m_passes), pass); +} + +void SubgraphPassPipeline::register_pass(const std::shared_ptr& pass) { + OPENVINO_ASSERT(pass != nullptr, "SubgraphPassPipeline cannot register empty pass!"); + m_passes.push_back(pass); +} + +void SubgraphPassPipeline::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) const { + for (const auto& pass : m_passes) { + OPENVINO_ASSERT(pass != nullptr, "SubgraphPassPipeline has empty pass!"); + if (m_pass_config->is_disabled(pass->get_type_info())) { + continue; + } + pass->run(linear_ir, begin, end); + } +} + +void SubgraphPassPipeline::register_positioned_passes(const std::vector& pos_passes) { + for (const auto& pp : pos_passes) + register_pass(pp.position, pp.pass); +} + } // namespace pass } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp new file mode 100644 index 00000000000000..e41cfe78de7f6d --- /dev/null +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -0,0 +1,148 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/propagate_subtensors.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +namespace { +void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, + const LinearIR::LoopManager::LoopInfoPtr& loop_info, + LinearIR::container::const_iterator begin, + LinearIR::container::const_iterator end, + const size_t new_dim_value) { + std::map original_shapes; + static constexpr size_t existing_subtensor_value = SIZE_MAX; + // First step: set new dim value to the corresponding entry_points' dimensions + if (new_dim_value != existing_subtensor_value) { + for (const auto& port : loop_info->get_entry_points()) { + if (port.is_incremented) { + const auto& expr = port.expr_port->get_expr(); + const auto node = expr->get_node(); + auto desc = port.expr_port->get_descriptor_ptr(); + auto subtensor = desc->get_subtensor(); + if (port.dim_idx < subtensor.size()) { + *(subtensor.rbegin() + port.dim_idx) = new_dim_value; + desc->set_subtensor(subtensor); + } + + const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + const auto& layout = parent_desc->get_layout(); + const auto& shape = parent_desc->get_shape(); + if (original_shapes.find(parent_desc) == original_shapes.end()) { + original_shapes[parent_desc] = shape; + } + auto new_shape = shape; + 
new_shape[*(layout.rbegin() + port.dim_idx)] = new_dim_value;
+                parent_desc->set_shape(new_shape);
+            }
+        }
+    }
+
+    auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) {
+        if (port.is_incremented) {
+            auto desc = port.expr_port->get_descriptor_ptr();
+            const auto expr = port.expr_port->get_expr();
+            const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr();
+
+            const auto& layout = parent_desc->get_layout();
+            const auto& shape = parent_desc->get_shape();
+            const auto& desc_subtensor = desc->get_subtensor();
+            if (port.dim_idx < desc_subtensor.size()) {
+                if (original_shapes.find(parent_desc) == original_shapes.end()) {
+                    original_shapes[parent_desc] = shape;
+                }
+                auto new_shape = shape;
+                new_shape[*(layout.rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx);
+                parent_desc->set_shape(new_shape);
+            }
+        }
+    };
+
+    auto update_subtensors = [](const std::vector<PortDescriptorPtr>& descs, bool is_input) {
+        for (const auto& desc : descs) {
+            const auto& subtensor = desc->get_subtensor();
+            if (!subtensor.empty()) {
+                auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout())
+                                            : snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout());
+                const size_t subtensor_start = planar_dims.size() - subtensor.size();
+                VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end());
+                for (size_t i = 0; i < new_subtensor.size(); ++i) {
+                    new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]);
+                }
+                desc->set_subtensor(new_subtensor);
+            }
+        }
+    };
+
+    auto shape_inference_end_it = end;
+    const bool loop_by_last_dim = loop_info->get_dim_idx() == 0;
+    // Subtensors are updated using the shape inference infrastructure:
+    // for inner loops, the propagation function is called recursively
+    for (auto expr_it = begin; expr_it != end; expr_it++) {
+        const auto expr = *expr_it;
+        if (ov::is_type<op::LoopEnd>(expr->get_node()))
+            continue;
+        if (auto loop_begin = ov::as_type_ptr<op::LoopBegin>(expr->get_node())) {
+            const auto loop_end = loop_begin->get_loop_end();
+            const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id());
+            const auto inner_begin = std::next(expr_it);
+            const auto inner_end = linear_ir.find(linear_ir.get_expr_by_node(loop_end));
+
+            // The corresponding shapes of the inner loops' entry points must be updated using the existing subtensor values
+            if (new_dim_value == existing_subtensor_value) {
+                for (const auto& port : loop_info->get_entry_points())
+                    update_only_dim_idx_with_subtensor_value(port);
+            }
+            propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end, existing_subtensor_value);
+            expr_it = inner_end;
+            continue;
+        }
+        if ((ov::is_type<op::BroadcastMove>(expr_it->get()->get_node()) ||
+             ov::is_type<op::BroadcastLoad>(expr_it->get()->get_node())) &&
+            loop_by_last_dim) {
+            // WA: we have to break subtensor propagation if we try to propagate a new last dim through Broadcast nodes,
+            // which broadcast the last dim to its original value anyway.
+            // This workaround might be avoided if blocked shapes are used for tail size propagation
+            shape_inference_end_it = expr_it;
+            break;
+        }
+        expr->updateShapes();
+        update_subtensors(expr->get_input_port_descriptors(), true);
+        update_subtensors(expr->get_output_port_descriptors(), false);
+    }
+
+    // After subtensor propagation, the original shapes must be restored
+    for (const auto& elem : original_shapes)
+        elem.first->set_shape(elem.second);
+    for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++)
+        (*expr_it)->updateShapes();
+}
+} // namespace
+
+UpdateSubtensors::UpdateSubtensors(size_t tail_size) : SubgraphPass(), m_tail_size(tail_size) {}
+
+bool UpdateSubtensors::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    propagate_updated_subtensor_through_loop(linear_ir, loop_info, std::next(begin), end, m_tail_size);
+    return true;
+}
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
+
diff --git a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
index 4174f928352289..3f3e21509c8adc 100644
--- a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
@@ -7,6 +7,7 @@
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/lowered/pass/mark_loops.hpp"
+#include "snippets/lowered/pass/iter_handler.hpp"
 #include "snippets/snippets_isa.hpp"
 #include "snippets/itt.hpp"
@@ -19,6 +20,8 @@ namespace snippets {
 namespace lowered {
 namespace pass {

+using LoopInfo = LinearIR::LoopManager::LoopInfo;
+
 SoftmaxDecomposition::SoftmaxDecomposition(size_t vector_size) : m_vector_size{vector_size} {}

 bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
@@ -58,15 +61,22 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
         // Init value of vector buffer for ReduceMax is -FLOAT_MIN.
         const auto fill_max = push_node(std::make_shared<op::Fill>(vector_buffer_max.second, 0, float_min_constant));
         // ReduceMax loop
-        const auto& max = push_node(std::make_shared<ov::op::v1::Maximum>(softmax->get_input_source_output(0), fill_max.second));
+        const auto fill_max_tail = push_node(std::make_shared<op::Fill>(softmax->get_input_source_output(0), m_vector_size, float_min_constant));
+
+        const auto& max = push_node(std::make_shared<ov::op::v1::Maximum>(fill_max_tail.second, fill_max.second));
         const auto horizon_max = push_node(std::make_shared<op::HorizonMax>(max.second));

         // Markup of ReduceMax Loop
-        loop_manager->mark_loop(max.first, horizon_max.first, inner_work_amount, m_vector_size, 0,
-                                std::vector<ExpressionPort>{(*max.first)->get_input_port(0),
-                                                            (*max.first)->get_input_port(1)},
-                                std::vector<ExpressionPort>{(*max.first)->get_output_port(0)});
+        const auto reduce_max_loop_id = loop_manager->mark_loop(fill_max_tail.first, horizon_max.first, inner_work_amount, m_vector_size, 0,
+                                                                std::vector<ExpressionPort>{(*fill_max_tail.first)->get_input_port(0),
+                                                                                            (*max.first)->get_input_port(1)},
+                                                                std::vector<ExpressionPort>{(*max.first)->get_output_port(0)});
+        const auto& reduce_max_loop_info = loop_manager->get_loop_info(reduce_max_loop_id);
+        const auto tail_size = inner_work_amount % m_vector_size;
+        if (tail_size != 0) {
+            reduce_max_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
+        }
         const auto broadcast_horizon_max = push_node(std::make_shared<op::BroadcastMove>(horizon_max.second, broadcasted_dim));
         const auto vector_buffer_sum = push_node(std::make_shared<op::VectorBuffer>());
         // Init value of vector buffer for ReduceSum is zero.
@@ -75,38 +85,42 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
         // Sub + Exp + ReduceSum Loop
         const auto sub = push_node(std::make_shared<ov::op::v1::Subtract>(softmax->get_input_source_output(0), broadcast_horizon_max.second));
         const auto exp = push_node(std::make_shared<ov::op::v0::Exp>(sub.second));
-        const auto sum = push_node(std::make_shared<ov::op::v1::Add>(exp.second, fill_sum.second));
+        const auto fill_sum_tail = push_node(std::make_shared<op::Fill>(exp.second, m_vector_size, zero_constant));
+        const auto sum = push_node(std::make_shared<ov::op::v1::Add>(fill_sum_tail.second, fill_sum.second));
         const auto horizon_sum = push_node(std::make_shared<op::HorizonSum>(sum.second));

-        // Markup of ReduceMax Loop
-        loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, m_vector_size, 0,
-                                std::vector<ExpressionPort>{(*sub.first)->get_input_port(0),
-                                                            (*sub.first)->get_input_port(1),
-                                                            (*sum.first)->get_input_port(1)},
-                                std::vector<ExpressionPort>{(*exp.first)->get_output_port(0),
-                                                            (*sum.first)->get_output_port(0)});
+        // Markup of ReduceSum Loop
+        const auto reduce_sum_loop_id = loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, m_vector_size, 0,
+                                                                std::vector<ExpressionPort>{(*sub.first)->get_input_port(0),
+                                                                                            (*sub.first)->get_input_port(1),
+                                                                                            (*sum.first)->get_input_port(1)},
+                                                                std::vector<ExpressionPort>{(*fill_sum_tail.first)->get_output_port(0),
+                                                                                            (*sum.first)->get_output_port(0)});
+        const auto& reduce_sum_loop_info = loop_manager->get_loop_info(reduce_sum_loop_id);
+        if (tail_size != 0) {
+            reduce_sum_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
+        }

         // Divide is an expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside the loop
         const auto pow = push_node(std::make_shared<op::PowerStatic>(horizon_sum.second, -1.f));
         const auto broadcast_pow = push_node(std::make_shared<op::BroadcastMove>(pow.second, broadcasted_dim));

         // Mul (pseudo-Divide loop)
-        const auto mul = push_node(std::make_shared<ov::op::v1::Multiply>(exp.second, broadcast_pow.second));
+        const auto mul = push_node(std::make_shared<ov::op::v1::Multiply>(fill_sum_tail.second, broadcast_pow.second));

         // Transfer original ExpressionPorts
-        linear_ir.replace_input((*max.first)->get_input_port(0), input_connector);
+        linear_ir.replace_input((*fill_max_tail.first)->get_input_port(0), input_connector);
         linear_ir.replace_input((*sub.first)->get_input_port(0), input_connector);
         linear_ir.replace_input(output_connector->get_consumers(), (*mul.first)->get_output_port_connector(0));

         // Markup of Mul Loop
         loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0,
-                                std::vector<ExpressionPort>{(*mul.first)->get_input_port(0),
-                                                            (*mul.first)->get_input_port(1)},
+                                std::vector<ExpressionPort>{(*mul.first)->get_input_port(0), (*mul.first)->get_input_port(1)},
                                 std::vector<ExpressionPort>{(*mul.first)->get_output_port(0)});

         // Update Loop info for outer loops
-        const auto entry_points = std::vector<ExpressionPort>{(*max.first)->get_input_port(0),
+        const auto entry_points = std::vector<ExpressionPort>{(*fill_max_tail.first)->get_input_port(0),
                                                               (*sub.first)->get_input_port(0)};
         const auto exit_points = std::vector<ExpressionPort>{(*mul.first)->get_output_port(0)};
         for (auto loop_id : softmax_loop_ids) {
@@ -114,16 +128,6 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
         }

         expr_it = linear_ir.erase(expr_it);   // Remove Softmax
-
-        /* =========================================== */
-
-        /* ============= Runtime Info ================ */
-
-        // For tail loop we should fill input of Max by float min and
-        // input of Sum by zero to avoid math incorrect calculations
-        // TODO [111383]: It should be covered via general pipeline (for example, via analyze in InsertTailLoop?)
-            max.second->input(0).get_rt_info()["set_fill"] = float_min_constant;
-            sum.second->input(0).get_rt_info()["set_fill"] = zero_constant;
 
             modified = true;
         }
     }
diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp
index ba036eca8011f9..d65e27feca8adb 100644
--- a/src/common/snippets/src/lowered/pass/split_loops.cpp
+++ b/src/common/snippets/src/lowered/pass/split_loops.cpp
@@ -5,6 +5,7 @@
 #include "snippets/lowered/pass/split_loops.hpp"
 
 #include "snippets/lowered/pass/fuse_loops.hpp"
+#include "snippets/lowered/pass/iter_handler.hpp"
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/snippets_isa.hpp"
@@ -81,7 +82,18 @@ bool SplitLoops::run(LinearIR& linear_ir) {
                                                              loop_to_split->get_dim_idx(),
                                                              loop_to_split->get_entry_points(),
                                                              loop_to_split->get_exit_points());
-                loop_manager->get_loop_info(split_loop_id)->set_outer_splited_loop(true);
+                const auto& new_loop_info = loop_manager->get_loop_info(split_loop_id);
+                new_loop_info->set_outer_splited_loop(true);
+                new_loop_info->handlers = loop_to_split->handlers;
+                const auto work_amount = loop_to_fuse->get_work_amount();
+                const auto increment = loop_to_fuse->get_increment();
+                const auto tail_size = work_amount % increment;
+                // TODO: the current logic doesn't handle the case when the loop also has first-iteration handlers.
+                //       We need to skip this transformation for such cases or improve the logic.
+                if (tail_size != 0) {
+                    // TODO: should we remove the previous tail loop handler?
+                    new_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<TransformInnerSplitLoop>(tail_size);
+                }
                 break;
             }
         }
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index 5068d915dc1ebe..2204660086478a 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -62,6 +62,8 @@ namespace ov {
 namespace snippets {
 namespace op {
 
+using PassPipeline = lowered::pass::PassPipeline;
+
 void Subgraph::set_generator(std::shared_ptr<ov::snippets::Generator> generator) {
     m_generator = std::move(generator);
 }
diff --git a/src/common/snippets/src/pass/manager.cpp b/src/common/snippets/src/pass/manager.cpp
index d5a1456c9a8ca5..3ed83085155fa5 100644
--- a/src/common/snippets/src/pass/manager.cpp
+++ b/src/common/snippets/src/pass/manager.cpp
@@ -8,7 +8,6 @@
 namespace ov {
 namespace snippets {
 namespace pass {
-
 std::shared_ptr<Manager::PassBase> Manager::register_pass_instance(const PassPosition& position,
                                                                    const std::shared_ptr<Manager::PassBase>& pass) {
     pass->set_pass_config(m_pass_config);
diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp
index 455c261cec5109..92e91aa8e18400 100644
--- a/src/common/snippets/tests/src/lowered/pass/loop.cpp
+++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp
@@ -13,6 +13,7 @@
 #include "snippets/lowered/pass/insert_loops.hpp"
 #include "snippets/lowered/pass/insert_tail_loop.hpp"
 #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp"
+#include "snippets/lowered/pass/pass.hpp"
 #include "snippets/lowered/pass/validate_loops.hpp"
 #include "snippets/shape_inference/shape_inference.hpp"
 #include "subgraph_simple.hpp"
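Before the brgemm_blocking changes below, a note on the new infrastructure: pass.hpp introduces SubgraphPass and SubgraphPassPipeline, but the header body is not visible in this excerpt. From the call sites (register_pass<T>(args...) at loop-mark time, then execution over one specific iteration's body range), the pipeline plausibly reduces to the sketch below. The same caveat applies to several handler names in the following hunks (SetWorkAmount, ReduceWorkAmount, ZeroFinalizationOffsets): their original template arguments are not recoverable from this excerpt, so those names are reconstructed from their arguments and from the removed first_iter_handler logic they replace.

    #include <memory>
    #include <utility>
    #include <vector>

    #include "snippets/lowered/linear_ir.hpp"

    namespace ov {
    namespace snippets {
    namespace lowered {
    namespace pass {

    // Minimal stand-in for the interface declared in the new pass.hpp: a pass that
    // runs over the expression range [begin, end) of one loop body.
    class SubgraphPass {
    public:
        virtual ~SubgraphPass() = default;
        virtual bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) = 0;
    };

    // Sketch of SubgraphPassPipeline: an ordered list of SubgraphPasses, filled while
    // the loop is marked and replayed later on the cloned body of a specific iteration.
    class SubgraphPassPipeline {
    public:
        template <typename PassT, typename... Args>
        void register_pass(Args&&... args) {
            m_passes.push_back(std::make_shared<PassT>(std::forward<Args>(args)...));
        }

        bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) const {
            bool modified = false;
            for (const auto& pass : m_passes)
                modified = pass->run(linear_ir, begin, end) || modified;
            return modified;
        }

        bool empty() const { return m_passes.empty(); }

    private:
        std::vector<std::shared_ptr<SubgraphPass>> m_passes;
    };

    } // namespace pass
    } // namespace lowered
    } // namespace snippets
    } // namespace ov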
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
index fc9aeeac10ee92..dc8739c7944555 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
@@ -4,23 +4,26 @@
 
 #include "brgemm_blocking.hpp"
 
-#include "openvino/pass/pattern/matcher.hpp"
-#include "openvino/pass/pattern/op/wrap_type.hpp"
+#include "cpu_iter_handlers.hpp"
 #include "snippets/itt.hpp"
-#include "snippets/utils.hpp"
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/lowered/pass/insert_tail_loop.hpp"
+#include "snippets/lowered/pass/iter_handler.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+#include "snippets/lowered/pass/propagate_subtensors.hpp"
 #include "snippets/snippets_isa.hpp"
+#include "snippets/utils.hpp"
 #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
-
 namespace ov {
 namespace intel_cpu {
 namespace pass {
 using LinearIR = snippets::lowered::LinearIR;
 using LoopPort = LinearIR::LoopManager::LoopPort;
 using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
+using LoopInfo = LinearIR::LoopManager::LoopInfo;
+using namespace ov::snippets::lowered::pass;
 
 BrgemmBlocking::BrgemmBlocking() : Pass() {}
@@ -86,7 +89,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
         } else {
             *(in_0_subtensor.rbegin() + 1) = block_size_m;
             *(out_subtensor.rbegin() + 1) = block_size_m;
-
             auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
             std::vector<LoopPort> entries{LoopPort(brgemm_expr->get_input_port(0), true),
                                           LoopPort(brgemm_expr->get_input_port(1), false)};
@@ -110,7 +112,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
         } else {
             *in_1_subtensor.rbegin() = block_size_n;
             *out_subtensor.rbegin() = block_size_n;
-
             auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
             std::vector<LoopPort> entries{LoopPort(brgemm_expr->get_input_port(0), false),
                                           LoopPort(brgemm_expr->get_input_port(1), true)};
@@ -135,7 +136,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
         } else {
             *in_0_subtensor.rbegin() = block_size_k;
             *(in_1_subtensor.rbegin() + 1) = block_size_k;
-
             auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
             std::vector<LoopPort> entries{LoopPort(brgemm_expr->get_input_port(0), true, 0),
                                           LoopPort(brgemm_expr->get_input_port(1), true, 1)};
@@ -146,44 +146,40 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
                 loop_begin_it = std::prev(expr_it);
             }
             std::vector<LoopPort> exits{LoopPort(brgemm_expr->get_output_port(0), false)};
-            auto loop_id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits);
-            const auto loop_info = loop_manager->get_loop_info(loop_id);
-
-            auto first_iter_handler = [](LinearIR& linear_ir, LinearIR::constExprIt loop_end_it) {
-                const auto loop_end = ov::as_type_ptr<snippets::op::LoopEnd>(loop_end_it->get()->get_node());
-                OPENVINO_ASSERT(loop_end, "First loop iteraton handler must be called on LoopEnd expression");
-                const auto loop_id = loop_end->get_id();
-                const auto& loop_manager = linear_ir.get_loop_manager();
-                const auto& loop_info = loop_manager->get_loop_info(loop_id);
-                const auto work_amount = loop_info->get_work_amount();
-                const auto increment = loop_info->get_increment();
-                if (work_amount <= increment)
-                    return false;
-
-                auto new_loop_range = snippets::lowered::pass::InsertTailLoop::copy_loop(linear_ir, loop_id);
-                const auto firt_iter_loop_end = ov::as_type_ptr<snippets::op::LoopEnd>(std::prev(new_loop_range.end())->get()->get_node());
-                auto first_iter_loop_info = loop_manager->get_loop_info(firt_iter_loop_end->get_id());
-                firt_iter_loop_end->set_work_amount(increment);
-                first_iter_loop_info->set_work_amount(increment);
-                firt_iter_loop_end->set_finalization_offsets(std::vector<int64_t>(loop_end->get_finalization_offsets().size(), 0));
-
-                const auto loop_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin()));
-                linear_ir.insert(loop_begin_it, new_loop_range.begin(), new_loop_range.end());
-
-                const auto new_work_amount = work_amount - increment;
-                loop_info->set_work_amount(new_work_amount);
-                loop_end->set_work_amount(new_work_amount);
-
-                // Update original body's Brgemms with new beta parameter
-                for (auto expr_it = loop_begin_it; expr_it != loop_end_it; ++expr_it) {
-                    const auto& expr_node = expr_it->get()->get_node();
-                    if (const auto brgemm = ov::as_type_ptr<BrgemmCPU>(expr_node)) {
-                        brgemm->set_beta(1.f);
-                    }
-                }
-                return true;
+            const bool set_default_handlers = false;
+            const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits, set_default_handlers);
+            const auto loop_info = loop_manager->get_loop_info(id);
+            const auto tail_size = k % block_size_k;
+
+            auto set_last_iter_handlers = [tail_size](SubgraphPassPipeline& pipeline) {
+                pipeline.register_pass<SetWorkAmount>(tail_size);
+                pipeline.register_pass<UpdateMemoryAccessOps>(tail_size);
+                pipeline.register_pass<UpdateSubtensors>(tail_size);
+                pipeline.register_pass<SetBrgemmBeta>(1.f);
+            };
+            auto set_first_iter_handlers = [block_size_k](SubgraphPassPipeline& pipeline) {
+                pipeline.register_pass<SetWorkAmount>(block_size_k);
+                pipeline.register_pass<ZeroFinalizationOffsets>();
             };
-            loop_info->set_first_iter_handler(first_iter_handler);
+
+            if (tail_size != 0) {
+                if (k <= 2 * block_size_k) {
+                    // First iter as main body and tail loop
+                    set_first_iter_handlers(loop_info->handlers[LoopInfo::MAIN_BODY]);
+                    set_last_iter_handlers(loop_info->handlers[LoopInfo::LAST_ITER]);
+                } else {
+                    // First iter, main body and tail loop
+                    set_first_iter_handlers(loop_info->handlers[LoopInfo::FIRST_ITER]);
+                    loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(block_size_k + tail_size);
+                    loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
+                    loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<SetBrgemmBeta>(1.f);
+                    set_last_iter_handlers(loop_info->handlers[LoopInfo::LAST_ITER]);
+                }
+            } else {
+                set_first_iter_handlers(loop_info->handlers[LoopInfo::FIRST_ITER]);
+                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(block_size_k);
+                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<SetBrgemmBeta>(1.f);
+            }
         }
     };
@@ -194,6 +190,7 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
         brgemm_expr->get_input_port_descriptor(0)->set_subtensor(in_0_subtensor);
         brgemm_expr->get_input_port_descriptor(1)->set_subtensor(in_1_subtensor);
         brgemm_expr->get_output_port_descriptor(0)->set_subtensor(out_subtensor);
+        modified = true;
     }
 
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
new file mode 100644
index 00000000000000..688962d1f105d4
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "cpu_iter_handlers.hpp"
+
+#include "snippets/lowered/loop_manager.hpp"
+#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+using LinearIR = snippets::lowered::LinearIR;
+using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
+
+SetBrgemmBeta::SetBrgemmBeta(float beta) : snippets::lowered::pass::SubgraphPass(), m_beta(beta) {}
+
+bool SetBrgemmBeta::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    for (auto expr_it = begin; expr_it != end; ++expr_it) {
+        const auto& expr = expr_it->get();
+        if (const auto brgemm = ov::as_type_ptr<BrgemmCPU>(expr->get_node())) {
+            brgemm->set_beta(m_beta);
+        }
+    }
+    return true;
+}
+} // namespace pass
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp
new file mode 100644
index 00000000000000..b60b958983ab66
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp
@@ -0,0 +1,25 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/pass/iter_handler.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+class SetBrgemmBeta : public snippets::lowered::pass::SubgraphPass {
+public:
+    SetBrgemmBeta(float beta);
+    OPENVINO_RTTI("SetBrgemmBeta", "SubgraphPass")
+    bool run(const snippets::lowered::LinearIR& linear_ir,
+             snippets::lowered::LinearIR::constExprIt begin,
+             snippets::lowered::LinearIR::constExprIt end) override;
+
+private:
+    float m_beta;
+};
+} // namespace pass
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
index 77c78e31ca6b00..11988c5bd58541 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -20,7 +20,9 @@ std::vector<std::vector<ov::test::InputShape>> input_shapes{
         {{1, 1, 32, 23}, {1, 1, 23, 68}},
         {{1, 16, 384, 64}, {1, 16, 64, 384}},
         {{1, 1, 100, 700}, {1, 1, 700, 100}},
+        {{1, 1, 100, 1024}, {1, 1, 1024, 100}},
         {{1, 1, 100, 2500}, {1, 1, 2500, 100}},
+        {{1, 1, 100, 4500}, {1, 1, 4500, 100}},
};

static inline std::vector<std::vector<element::Type>> quantized_precisions() {
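The two added shapes (K = 1024 and K = 4500) line up with the K-loop decompositions introduced in brgemm_blocking.cpp above: a blocked loop with no tail, and a blocked loop with both a peeled first iteration and a tail. The standalone snippet below works through that arithmetic; the block size of 512 is an assumption for illustration, since the real block_size_k is computed inside BrgemmBlocking and is not visible in this excerpt.

    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t block_size_k = 512;  // assumption: the real value comes from BrgemmBlocking
        const std::size_t test_k_dims[] = {700, 1024, 2500, 4500};  // K dims from input_shapes above
        for (const std::size_t k : test_k_dims) {
            const std::size_t tail_size = k % block_size_k;
            if (tail_size == 0) {
                // FIRST_ITER copy (beta = 0) + MAIN_BODY (beta = 1), no LAST_ITER copy
                std::printf("K=%4zu: first iter + main body, no tail\n", k);
            } else if (k <= 2 * block_size_k) {
                // The peeled first iteration is the whole main body, then the tail runs
                std::printf("K=%4zu: first iter as main body + tail of %zu\n", k, tail_size);
            } else {
                std::printf("K=%4zu: first iter + main body + tail of %zu\n", k, tail_size);
            }
        }
        return 0;
    }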