diff --git a/src/common/snippets/include/snippets/lowered/expression.hpp b/src/common/snippets/include/snippets/lowered/expression.hpp
index 0b619370ab47a5..6731e369ae0921 100644
--- a/src/common/snippets/include/snippets/lowered/expression.hpp
+++ b/src/common/snippets/include/snippets/lowered/expression.hpp
@@ -19,7 +19,7 @@ namespace lowered {
 class LinearIR;
 using ExpressionPtr = std::shared_ptr;
-using ExressionMap = std::unordered_map;
+using ExpressionMap = std::unordered_map;
 class Expression : public std::enable_shared_from_this {
     friend class LinearIR;
     friend class ExpressionPort;
@@ -63,7 +63,7 @@ class Expression : public std::enable_shared_from_this {
     void set_loop_ids(const std::vector& loops);
     virtual ExpressionPtr clone_with_new_inputs(const std::vector& new_inputs, const std::shared_ptr& new_node) const;
-    ExpressionPtr clone_with_new_inputs(const ExressionMap& expr_map, const std::shared_ptr& new_node) const;
+    ExpressionPtr clone_with_new_inputs(const ExpressionMap& expr_map, const std::shared_ptr& new_node) const;
 protected:
     Expression(const Expression& other);
diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
index 5034de4e481540..c9c5e6963a2924 100644
--- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp
+++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -70,11 +70,11 @@ class LinearIR {
     std::shared_ptr clone() const;
     static LinearIR::container deep_copy_range(LinearIR::container::const_iterator begin,
                                                LinearIR::container::const_iterator end,
-                                               ExressionMap& expression_map);
+                                               ExpressionMap& expression_map);
-    const container& get_ops() const {return m_expressions; }
-    const io_container& get_IO_ops() const {return m_io_expressions; }
-    Config get_config() {return m_config; }
+    const container& get_ops() const { return m_expressions; }
+    const io_container& get_IO_ops() const { return m_io_expressions; }
+    const Config& get_config() const { return m_config; }
     void set_loop_depth(size_t loop_depth) { m_config.m_loop_depth = loop_depth; }
     const ExpressionPtr& get_expr_by_node(const std::shared_ptr& n) const;
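// Usage sketch for the renamed ExpressionMap (an illustration, not part of the
// patch; `linear_ir` is an assumed, already-built LinearIR, and the template
// arguments elided by the extraction above are not restored here):
//
//   ExpressionMap expression_map;
//   // deep_copy_range fills expression_map with {original expression -> clone}
//   // pairs; clone_with_new_inputs()/clone_with_new_expr() then use the map to
//   // retarget loop ports and port connectors onto the copied expressions.
//   LinearIR::container body_copy =
//       LinearIR::deep_copy_range(linear_ir.cbegin(), linear_ir.cend(), expression_map);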
diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp
index 28904165d3ebef..b5aa9484cbec9e 100644
--- a/src/common/snippets/include/snippets/lowered/loop_manager.hpp
+++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp
@@ -4,11 +4,12 @@
 #pragma once
-#include "linear_ir.hpp"
-
 #include
 #include
+#include "linear_ir.hpp"
+#include "pass/iter_handler.hpp"
+#include "pass/pass.hpp"
 #include "port_descriptor.hpp"
 namespace ov {
@@ -41,19 +42,61 @@ class LinearIR::LoopManager {
     class LoopInfo {
     public:
         enum {UNDEFINED_DIM_IDX = std::numeric_limits::max()};
+        class SpecificIterationHandlers {
+        public:
+            enum class HandlerType { FIRST_ITER, MAIN_BODY, LAST_ITER };
+            SpecificIterationHandlers() = default;
+            SpecificIterationHandlers(size_t loop_work_amount, size_t loop_increment);
+            SpecificIterationHandlers(lowered::pass::PassPipeline first_iter_handlers,
+                                      lowered::pass::PassPipeline main_body_handlers,
+                                      lowered::pass::PassPipeline last_iter_handlers);
+
+            const lowered::pass::PassPipeline& get_first_iter_handlers() const;
+            const lowered::pass::PassPipeline& get_main_iter_handlers() const;
+            const lowered::pass::PassPipeline& get_last_iter_handlers() const;
+            static SpecificIterationHandlers merge_handlers(const SpecificIterationHandlers& lhs, const SpecificIterationHandlers& rhs);
+
+            template ::type = true>
+            void register_handler(Args&&... args) {
+                m_first_iter_handlers.register_pass(args...);
+            }
+
+            template ::type = true>
+            void register_handler(Args&&... args) {
+                m_main_body_handlers.register_pass(args...);
+            }
+
+            template ::type = true>
+            void register_handler(Args&&... args) {
+                m_last_iter_handlers.register_pass(args...);
+            }
+
+        private:
+            lowered::pass::PassPipeline m_first_iter_handlers;
+            lowered::pass::PassPipeline m_main_body_handlers;
+            lowered::pass::PassPipeline m_last_iter_handlers;
+        };
+
         LoopInfo() = default;
         LoopInfo(size_t work_amount, size_t increment,
                  const std::vector& entries,
                  const std::vector& exits,
-                 bool outer_splited_loop = false)
-            : m_work_amount(work_amount), m_increment(increment),
-              m_entry_points(entries), m_exit_points(exits), m_outer_splited_loop(outer_splited_loop) {}
+                 const SpecificIterationHandlers& handlers = SpecificIterationHandlers());
         LoopInfo(size_t work_amount, size_t increment,
                  const std::vector& entries,
                  const std::vector& exits,
-                 bool outer_splited_loop = false);
+                 const SpecificIterationHandlers& handlers = SpecificIterationHandlers());
-        std::shared_ptr clone_with_new_expr(const ExressionMap& expr_map) const;
+        std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map) const;
         // Returns dimension index if dimension indices for all entry and exit points are equal, and UNDEFINED_DIM_IDX otherwise
         size_t get_dim_idx() const;
@@ -61,20 +104,7 @@ class LinearIR::LoopManager {
         size_t get_increment() const;
         const std::vector& get_entry_points() const;
         const std::vector& get_exit_points() const;
-        bool get_outer_splited_loop() const;
-
-        /**
-         * \brief Inserts a separate body for first loop iteration processing if needed.
-         *        Can also modify both main and first iter loop bodies.
-         *        TODO: replace this temporary solution when ticket 119851 is implemented
-         *
-         * \param linear_ir LIR which should be modified
-         * \param loop_end_it iterator on LoopEnd expression for which the handler is called
-         *
-         * \return bool value which indicates whether the linear_ir was changed or not.
-         */
-        using FirstIterHandler = std::function;
-        const FirstIterHandler& get_first_iter_handler() const;
+        const SpecificIterationHandlers& get_handlers() const;
         // Sets dim_idx to all entry and exit points
         void set_dim_idx(size_t dim_idx);
@@ -82,8 +112,12 @@
         void set_increment(size_t increment);
         void set_entry_points(std::vector entry_points);
         void set_exit_points(std::vector exit_points);
-        void set_outer_splited_loop(bool outer_splited_loop);
-        void set_first_iter_handler(FirstIterHandler handler);
+        void set_handlers(SpecificIterationHandlers handlers);
+
+        template
+        void register_handler(Args&&... args) {
+            m_handlers.register_handler(args...);
+        }
         // Update the parameters of existing LoopPorts
         void update_entry_points(const std::function& updater);
@@ -98,9 +132,7 @@
         // Note: Scalars aren't entry expressions but can be before first entry expr in Linear IR
         std::vector m_entry_points = {};
         std::vector m_exit_points = {};
-        // True if this Loop is outer Loop for nested Loops that splits the same dimension
-        bool m_outer_splited_loop = false;
-        FirstIterHandler m_first_iter_handler = nullptr;
+        SpecificIterationHandlers m_handlers = {};
     };
     using LoopInfoPtr = std::shared_ptr;
@@ -109,7 +141,7 @@
     * @param expr_map map of new and old expressions
     * @return the copy
     */
-    std::shared_ptr clone_with_new_expr(const ExressionMap& expr_map) const;
+    std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map) const;
    /**
     * @brief Get target Loop Info
@@ -176,8 +208,13 @@
                    size_t increment,
                    size_t dim_idx,
                    const std::vector& entries,
-                   const std::vector& exits) {
-        const auto loop_info = std::make_shared(work_amount, increment, entries, exits);
+                   const std::vector& exits,
+                   bool set_default_handlers = true) {
+        const auto normalized_increment = std::min(increment, work_amount);
+        const auto handlers = set_default_handlers
+                                  ? LoopInfo::SpecificIterationHandlers(work_amount, normalized_increment)
+                                  : LoopInfo::SpecificIterationHandlers();
+        const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers);
         loop_info->set_dim_idx(dim_idx);
         const auto loop_id = this->add_loop_info(loop_info);
         for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
@@ -201,8 +238,13 @@
                    size_t work_amount,
                    size_t increment,
                    const std::vector& entries,
-                   const std::vector& exits) {
-        const auto loop_info = std::make_shared(work_amount, increment, entries, exits);
+                   const std::vector& exits,
+                   bool set_default_handlers = true) {
+        const auto normalized_increment = std::min(increment, work_amount);
+        const auto handlers = set_default_handlers
+                                  ? LoopInfo::SpecificIterationHandlers(work_amount, normalized_increment)
+                                  : LoopInfo::SpecificIterationHandlers();
+        const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers);
         const auto loop_id = this->add_loop_info(loop_info);
         for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
             insert_loop_id(*expr_it, loop_id);
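// Usage sketch for the new SpecificIterationHandlers API (illustrative only:
// `loop_manager`, `loop_id`, `other_loop_info` and `tail_size` are assumptions,
// and the register_handler template parameters, elided in the flattened diff
// above, are taken to be <HandlerType Type, typename PassT>):
//
//   using LoopInfo = LinearIR::LoopManager::LoopInfo;
//   using HandlerType = LoopInfo::SpecificIterationHandlers::HandlerType;
//
//   const auto& loop_info = loop_manager->get_loop_info(loop_id);
//   // Attach a pass to the last-iteration (tail) body of this particular loop;
//   // mark_loop() already registers the default tail handlers this way.
//   loop_info->register_handler<HandlerType::LAST_ITER, lowered::pass::SetFillOffset>(tail_size);
//   // When two loops are fused, their handlers are combined pipeline by pipeline:
//   const auto merged = LoopInfo::SpecificIterationHandlers::merge_handlers(
//       loop_info->get_handlers(), other_loop_info->get_handlers());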
diff --git a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp
index 1c1ea092d52059..1ec9598ec1d2c2 100644
--- a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp
@@ -23,9 +23,9 @@
 * The buffer scratchpad has one general data pointer. Each buffer has offset relative to the data pointer of buffer scratchpad.
 * @ingroup snippets
 */
-class AllocateBuffers: public Pass {
+class AllocateBuffers: public RangedPass {
 public:
-    OPENVINO_RTTI("AllocateBuffers", "Pass")
+    OPENVINO_RTTI("AllocateBuffers", "RangedPass")
     AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
     /**
@@ -33,7 +33,7 @@ class AllocateBuffers: public Pass {
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
     /**
     * @brief Set offset to Buffer op and propagates it to the connected memory access ops
diff --git a/src/common/snippets/include/snippets/lowered/pass/clean_repeated_ptr_shifts.hpp b/src/common/snippets/include/snippets/lowered/pass/clean_repeated_ptr_shifts.hpp
index 892137747a2776..e6863ef8ae62bd 100644
--- a/src/common/snippets/include/snippets/lowered/pass/clean_repeated_ptr_shifts.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/clean_repeated_ptr_shifts.hpp
@@ -21,15 +21,15 @@
 * This condition should be removed when Buffers stop being inplace by default.
 * @ingroup snippets
 */
-class CleanRepeatedDataPointerShifts: public Pass {
+class CleanRepeatedDataPointerShifts: public RangedPass {
 public:
-    OPENVINO_RTTI("CleanRepeatedDataPointerShifts", "Pass")
+    OPENVINO_RTTI("CleanRepeatedDataPointerShifts", "RangedPass")
     CleanRepeatedDataPointerShifts() = default;
-    bool run(LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
-    bool reuse_increments(const LinearIR& linear_ir, const ExpressionPtr& loop_end_expr);
+    bool reuse_increments(const ExpressionPtr& loop_end_expr);
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/cleanup_loop_offsets.hpp b/src/common/snippets/include/snippets/lowered/pass/cleanup_loop_offsets.hpp
index 5af01ad137e09b..cf72577ea98859 100644
--- a/src/common/snippets/include/snippets/lowered/pass/cleanup_loop_offsets.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/cleanup_loop_offsets.hpp
@@ -17,10 +17,10 @@
 * This transformation "fuses" the offsets with an outer loop's ptr_increments, and zeroes the offsets before Results.
 * @ingroup snippets
 */
-class CleanupLoopOffsets : public Pass {
+class CleanupLoopOffsets : public RangedPass {
 public:
-    OPENVINO_RTTI("CleanupLoopOffsets", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("CleanupLoopOffsets", "RangedPass")
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp
index 87bf8cbea0e77b..67254d879f3351 100644
--- a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp
@@ -31,9 +31,9 @@
 * These passes should be executed separately before this pass!
 * @ingroup snippets
 */
-class DefineBufferClusters : public Pass {
+class DefineBufferClusters : public RangedPass {
 public:
-    OPENVINO_RTTI("DefineBufferClusters", "Pass")
+    OPENVINO_RTTI("DefineBufferClusters", "RangedPass")
     DefineBufferClusters(AllocateBuffers::BufferClusters& clusters)
         : m_clusters(clusters) {}
     /**
@@ -42,7 +42,7 @@
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(lowered::LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     using BufferPorts = std::unordered_map>;
diff --git a/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp
index 64b3a758a0ad8f..2b527d551f6f68 100644
--- a/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp
@@ -36,11 +36,11 @@
 * The main conditions of possible fusion are equal increments and equal/broadcastable work amounts.
 * @ingroup snippets
 */
-class FuseLoops : public Pass {
+class FuseLoops : public RangedPass {
 public:
-    OPENVINO_RTTI("FuseLoops", "Pass")
+    OPENVINO_RTTI("FuseLoops", "RangedPass")
     FuseLoops();
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
     // This method checks that all ports which connect lower and upper loops are incremented.
     // This helps to avoid fusing for the ports with incomplete data
diff --git a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp
index 81d284e4467597..31631b9b0ec638 100644
--- a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp
@@ -27,9 +27,9 @@
 * Note: should be called before ResetBuffer() pass to have correct offsets
 * @ingroup snippets
 */
-class IdentifyBuffers: public Pass {
+class IdentifyBuffers: public RangedPass {
 public:
-    OPENVINO_RTTI("IdentifyBuffers", "Pass")
+    OPENVINO_RTTI("IdentifyBuffers", "RangedPass")
     IdentifyBuffers() = default;
     /**
@@ -37,7 +37,7 @@
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
     struct ShiftPtrParams {
         ShiftPtrParams() = default;
@@ -75,7 +75,7 @@
     * @param pool set of Buffers from the Linear IR
     * @return adjacency matrix where True value means that Buffers are adjacent and cannot have the same ID
     */
-    static std::vector create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool);
+    static std::vector create_adjacency_matrix(lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end, const BufferPool& pool);
    /**
     * @brief Algorithm of Graph coloring where vertices are Buffers
     * @param buffers set of Buffers from the Linear IR
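// The "graph coloring" step mentioned above assigns each Buffer the smallest ID
// not used by any adjacent Buffer. A generic greedy sketch of that idea (an
// illustration with plain STL, not IdentifyBuffers' actual implementation):
//
//   #include <cstdint>
//   #include <set>
//   #include <vector>
//
//   // adj[i * n + j] == true means buffers i and j are adjacent and must not share an ID.
//   std::vector<size_t> color_buffers(const std::vector<bool>& adj, size_t n) {
//       std::vector<size_t> ids(n, SIZE_MAX);  // SIZE_MAX == "not colored yet"
//       for (size_t i = 0; i < n; ++i) {
//           std::set<size_t> taken;
//           for (size_t j = 0; j < n; ++j)
//               if (adj[i * n + j] && ids[j] != SIZE_MAX)
//                   taken.insert(ids[j]);
//           size_t id = 0;
//           while (taken.count(id))
//               ++id;  // smallest ID not taken by an adjacent buffer
//           ids[i] = id;
//       }
//       return ids;
//   }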
diff --git a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp
index 5993b0d41ea1d3..3b085ca2b32f80 100644
--- a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp
@@ -17,9 +17,9 @@
 * @ingroup snippets
 */
-class InitBuffersDefault : public Pass {
+class InitBuffersDefault : public RangedPass {
 public:
-    OPENVINO_RTTI("InitBuffersDefault", "Pass")
+    OPENVINO_RTTI("InitBuffersDefault", "RangedPass")
     InitBuffersDefault(size_t& buffer_scratchpad_size)
         : m_buffer_scratchpad_size(buffer_scratchpad_size) {
         m_buffer_scratchpad_size = 0;
@@ -29,7 +29,7 @@
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(lowered::LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     size_t& m_buffer_scratchpad_size;
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_broadcastmove.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_broadcastmove.hpp
index fe4f9956d81c66..0d4c89c8605703 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_broadcastmove.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_broadcastmove.hpp
@@ -16,10 +16,10 @@
 * @brief Injects explicit MoveBroadcast operations when the most varying dim is broadcasted
 * @ingroup snippets
 */
-class InsertBroadcastMove : public Pass {
+class InsertBroadcastMove : public RangedPass {
 public:
-    OPENVINO_RTTI("InsertBroadcastMove", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("InsertBroadcastMove", "RangedPass")
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp
index 004ea711288ab2..37a03a364e8915 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp
@@ -21,14 +21,17 @@
 * @param m_buffer_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank]
 * @ingroup snippets
 */
-class InsertBuffers : public Pass {
+class InsertBuffers : public RangedPass {
 public:
-    OPENVINO_RTTI("InsertBuffers", "Pass")
+    OPENVINO_RTTI("InsertBuffers", "RangedPass")
     InsertBuffers(int32_t buffer_allocation_rank);
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
-    void insertion(LinearIR& linear_ir, const LinearIR::constExprIt& expr_it, const LinearIR::LoopManagerPtr& loop_manager,
+    void insertion(LinearIR& linear_ir,
+                   const LinearIR::constExprIt& begin_it,
+                   const LinearIR::constExprIt& end_it,
+                   const LinearIR::LoopManagerPtr& loop_manager,
                    const std::vector& loop_entries,
                    const std::vector& loop_exits);
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_load_store.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_load_store.hpp
index dbd4222888ec6d..cb6773fe186a20 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_load_store.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_load_store.hpp
@@ -20,11 +20,11 @@
 * @param m_vector_size - the count of elements for loading/storing
 * @ingroup snippets
 */
-class InsertLoadStore : public Pass {
+class InsertLoadStore : public RangedPass {
 public:
+    OPENVINO_RTTI("InsertLoadStore", "RangedPass")
     explicit InsertLoadStore(size_t vector_size);
-    OPENVINO_RTTI("InsertLoadStore", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     size_t get_count(const PortDescriptorPtr& port_desc) const;
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_loops.hpp
index bcd5c9231e7441..f29c4b558c0513 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_loops.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_loops.hpp
@@ -18,11 +18,11 @@
 * @brief The pass explicitly inserts LoopBegin and LoopEnd in Linear IR using LoopManager::LoopInfo from Loop markup algorithm
 * @ingroup snippets
 */
-class InsertLoops : public Pass {
+class InsertLoops : public RangedPass {
 public:
-    OPENVINO_RTTI("InsertLoops", "Pass")
+    OPENVINO_RTTI("InsertLoops", "RangedPass")
     InsertLoops();
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     static void insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id, bool has_outer_loop);
 };
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
index bad6dd3504fdc5..17d3f4cb2829dc 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
@@ -21,11 +21,11 @@
 * Developers could modify this to insert perf count pairs around a sequence of nodes of interest.
 * @ingroup snippets
 */
-class InsertPerfCount: public Pass {
+class InsertPerfCount: public RangedPass {
 public:
-    OPENVINO_RTTI("InsertPerfCount", "Pass")
+    OPENVINO_RTTI("InsertPerfCount", "RangedPass")
     InsertPerfCount(std::map boundary_op_names);
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     std::map m_boundary_op_names;
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp
new file mode 100644
index 00000000000000..15d2703d3f8e6d
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp
@@ -0,0 +1,40 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+/**
+ * @interface InsertSpecificIterations
+ * @brief Inserts separate loop bodies for first/last iterations if needed.
+ *        Also calls previously registered SpecificIterationHandlers for the inserted bodies and the main body.
+ * @ingroup snippets
+ */
+class InsertSpecificIterations : public RangedPass {
+public:
+    OPENVINO_RTTI("InsertSpecificIterations", "RangedPass")
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
+
+    /**
+     * @brief Makes a copy of a loop body with id 'loop_id' and inserts it to the LinearIR before the 'insert_pos' position
+     * @param linear_ir LinearIR which should be modified
+     * @param loop_id id of the loop which should be copied
+     * @param insert_pos position before which the loop body copy should be inserted
+     * @return iterator which points on the LoopBegin copy
+     */
+    static LinearIR::constExprIt insert_copy_loop(LinearIR& linear_ir,
+                                                  const size_t loop_id,
+                                                  const LinearIR::constExprIt& insert_pos);
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp
deleted file mode 100644
index 5fe8634959fb51..00000000000000
--- a/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (C) 2023 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include "pass.hpp"
-
-#include "snippets/op/loop.hpp"
-#include "snippets/lowered/loop_manager.hpp"
-
-namespace ov {
-namespace snippets {
-namespace lowered {
-namespace pass {
-
-/**
- * @interface InsertTailLoop
- * @brief Injects tail-processing loop after a vector loop if required.
- *        Additional optimizations are performed if a loop body is executed only once.
- * @ingroup snippets
- */
-class InsertTailLoop : public Pass {
-public:
-    OPENVINO_RTTI("InsertTailLoop", "Pass")
-    bool run(LinearIR& linear_ir) override;
-    static LinearIR::constExprIt insert_copy_loop(LinearIR& linear_ir, const size_t loop_id, const LinearIR::constExprIt& insert_pos);
-
-    static constexpr size_t existing_subtensor_value = SIZE_MAX;
-    static void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
-                                                         const LinearIR::LoopManager::LoopInfoPtr& loop_info,
-                                                         LinearIR::container::const_iterator begin,
-                                                         LinearIR::container::const_iterator end,
-                                                         const size_t new_dim_value = existing_subtensor_value);
-
-private:
-    static void create_tail_loop(LinearIR& linear_ir,
-                                 LinearIR::constExprIt begin,
-                                 LinearIR::constExprIt end,
-                                 const std::shared_ptr& loop_end,
-                                 bool need_vector_loop,
-                                 size_t tail_size);
-    static void tail_transformations(LinearIR& linear_ir,
-                                     LinearIR::constExprIt tail_begin,
-                                     LinearIR::constExprIt tail_end,
-                                     size_t tail_size);
-};
-
-} // namespace pass
-} // namespace lowered
-} // namespace snippets
-} // namespace ov
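// How the replacement fits together, sketched from the interfaces above (the
// loop iteration and the `main_begin`/`main_end` bounds are assumptions, not
// the literal implementation):
//
//   // For a loop whose LoopInfo carries SpecificIterationHandlers,
//   // InsertSpecificIterations conceptually does the following:
//   const auto& handlers = loop_info->get_handlers();
//   if (!handlers.get_first_iter_handlers().empty()) {
//       // 1. Clone the loop body in front of the main one...
//       const auto first_begin = InsertSpecificIterations::insert_copy_loop(linear_ir, loop_id, main_begin);
//       // 2. ...and let the registered pipeline specialize the copy.
//       handlers.get_first_iter_handlers().run(linear_ir, first_begin, main_begin);
//   }
//   // The main body and, when work_amount % increment != 0, the tail body are
//   // specialized the same way via get_main_iter_handlers() and
//   // get_last_iter_handlers(); the monolithic InsertTailLoop logic deleted
//   // above now lives in such per-iteration passes (see iter_handler.hpp below).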
diff --git a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
new file mode 100644
index 00000000000000..467e3d5735d123
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
@@ -0,0 +1,70 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+/**
+ * @interface UpdateMemoryAccessCounts
+ * @brief The pass changes counts of all MemoryAccess ops
+ * @attention The pass skips inner loops
+ * @attention The pass ignores memory access ports which have count == 1
+ * @param m_count - count which must be set
+ * @ingroup snippets
+ */
+class UpdateMemoryAccessCounts : public pass::RangedPass {
+public:
+    UpdateMemoryAccessCounts(size_t count);
+    OPENVINO_RTTI("UpdateMemoryAccessCounts", "RangedPass")
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    size_t m_count;
+};
+
+/**
+ * @interface SetFillOffset
+ * @brief The pass changes offset of all Fill ops
+ * @param m_offset - offset which must be set
+ * @ingroup snippets
+ */
+class SetFillOffset : public pass::RangedPass {
+public:
+    SetFillOffset(size_t offset);
+    OPENVINO_RTTI("SetFillOffset", "RangedPass")
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    size_t m_offset;
+};
+
+/**
+ * @interface TransformInnerSplitLoop
+ * @brief The pass updates finalization offsets, work amount and increment of inner Loop based on tail_size of the current Loop
+ * @param m_tail_size - tail_size of the current Loop
+ * @ingroup snippets
+ */
+class TransformInnerSplitLoop : public pass::RangedPass {
+public:
+    TransformInnerSplitLoop(size_t tail_size);
+    OPENVINO_RTTI("TransformInnerSplitLoop", "RangedPass")
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    size_t m_tail_size;
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp b/src/common/snippets/include/snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp
index 769208842e9338..e7aac012480fbc 100644
--- a/src/common/snippets/include/snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp
@@ -16,11 +16,11 @@
 * @brief Fuses consecutive Load and MoveBroadcast into a single load instruction.
 * @ingroup snippets
 */
-class LoadMoveBroadcastToBroadcastLoad: public Pass {
+class LoadMoveBroadcastToBroadcastLoad: public RangedPass {
 public:
     LoadMoveBroadcastToBroadcastLoad() = default;
-    OPENVINO_RTTI("LoadMoveBroadcastToBroadcastLoad", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("LoadMoveBroadcastToBroadcastLoad", "RangedPass")
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/mark_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/mark_loops.hpp
index 048f9457ddb455..f3c1cd4c8f9818 100644
--- a/src/common/snippets/include/snippets/lowered/pass/mark_loops.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/mark_loops.hpp
@@ -20,11 +20,11 @@
 * - the consumer of the expression is explicitly after this expression - the pass marks the branches
 * @ingroup snippets
 */
-class MarkLoops : public Pass {
+class MarkLoops : public RangedPass {
 public:
-    OPENVINO_RTTI("MarkLoops", "Pass")
+    OPENVINO_RTTI("MarkLoops", "RangedPass")
     MarkLoops(size_t vector_size);
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     size_t m_vector_size;
diff --git a/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp b/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp
index 7f80fafda08aeb..81b7536b63edaa 100644
--- a/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp
@@ -23,15 +23,15 @@
 * @ingroup snippets
 */
-class NormalizeBufferIDs : public Pass {
+class NormalizeBufferIDs : public RangedPass {
 public:
-    OPENVINO_RTTI("NormalizeBufferIDs", "Pass")
+    OPENVINO_RTTI("NormalizeBufferIDs", "RangedPass")
    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(lowered::LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp b/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp
index 9ac4181e61e861..b320bd8396e866 100644
--- a/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp
@@ -18,10 +18,10 @@
 * - moves all ptr arithmetic to finalization offsets
 * @ingroup snippets
 */
-class OptimizeLoopSingleEvaluation : public Pass {
+class OptimizeLoopSingleEvaluation : public RangedPass {
 public:
-    OPENVINO_RTTI("OptimizeLoopSingleEvaluation", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("OptimizeLoopSingleEvaluation", "RangedPass")
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/pass.hpp b/src/common/snippets/include/snippets/lowered/pass/pass.hpp
index 177056d2984d25..5833b695b0bba8 100644
--- a/src/common/snippets/include/snippets/lowered/pass/pass.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/pass.hpp
@@ -16,18 +16,18 @@ namespace lowered {
 namespace pass {
 /**
- * @interface Pass
+ * @interface PassBase
 * @brief Base class for transformations on linear IR
 * @ingroup snippets
 */
-class Pass {
+class PassBase {
 public:
-    Pass() = default;
-    virtual ~Pass() = default;
+    PassBase() = default;
+    virtual ~PassBase() = default;
     // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface,
     // so the standard OPENVINO_RTTI(...) macros could be used in derived classes.
     _OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() {
-        static ::ov::DiscreteTypeInfo type_info_static {"Pass"};
+        static ::ov::DiscreteTypeInfo type_info_static {"PassBase"};
         type_info_static.hash();
         return type_info_static;
     }
@@ -40,6 +40,25 @@
         return get_type_info().name;
     }
+    /**
+     * @brief Merges the current pass with another one (e.g. during fusion of two pass pipelines).
+     * @param other Pointer to the other pass.
+     * @return The merged pass
+     * @attention If the 'other' pass is empty (i.e. nullptr), it can be merged with any pass.
+     * @attention If the merge fails, then nullptr is returned.
+     */
+    virtual std::shared_ptr merge(const std::shared_ptr& other) {
+        return nullptr;
+    }
+};
+
+/**
+ * @interface Pass
+ * @brief Base class for LIR passes which are performed on a full LIR body
+ * @ingroup snippets
+ */
+class Pass : public PassBase {
+public:
    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
@@ -48,25 +67,46 @@
     virtual bool run(lowered::LinearIR& linear_ir) = 0;
 };
+/**
+ * @interface RangedPass
+ * @brief Base class for LIR passes which are performed on a range of a LIR body
+ * @ingroup snippets
+ */
+class RangedPass : public PassBase {
+public:
+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @param begin begin of the range on which the pass is performed
+     * @param end end of the range on which the pass is performed
+     * @return status of the pass
+     */
+    virtual bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) = 0;
+};
+
 class PassPipeline {
 public:
-    using PositionedPassLowered = snippets::pass::PositionedPass;
+    using PositionedPassLowered = snippets::pass::PositionedPass;
     PassPipeline();
     PassPipeline(const std::shared_ptr& pass_config);
-    void register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass);
-    void register_pass(const std::shared_ptr& pass);
+    const std::vector>& get_passes() const { return m_passes; }
+    const std::shared_ptr& get_pass_config() const { return m_pass_config; }
+    bool empty() const { return m_passes.empty(); }
+
+    void register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass);
+    void register_pass(const std::shared_ptr& pass);
     template
     void register_pass(Args&&... args) {
-        static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass");
+        static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass");
         auto pass = std::make_shared(std::forward(args)...);
         register_pass(pass);
     }
     template::value, bool>() = true>
     void register_pass(const snippets::pass::PassPosition& position, Args&&... args) {
-        static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass");
+        static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass");
         auto pass = std::make_shared(std::forward(args)...);
         register_pass(position, pass);
     }
@@ -74,10 +114,20 @@ class PassPipeline {
     void register_positioned_passes(const std::vector& pos_passes);
     void run(lowered::LinearIR& linear_ir) const;
+    void run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) const;
+
+    /**
+     * @brief Merges 2 pass pipelines into one
+     * @param lhs first pass pipeline
+     * @param rhs second pass pipeline
+     * @return the merged pass pipeline
+     * @attention the function cannot be used when one of the pipelines contains passes whose running order is important.
+     */
+    static PassPipeline merge_pipelines(const PassPipeline& lhs, const PassPipeline& rhs);
 private:
     std::shared_ptr m_pass_config;
-    std::vector> m_passes;
+    std::vector> m_passes;
 };
 } // namespace pass
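// A sketch of a pass written against the new ranged interface (illustrative;
// the pass name and body are hypothetical, only the base-class API above is real):
//
//   class ExampleRangedPass : public ov::snippets::lowered::pass::RangedPass {
//   public:
//       OPENVINO_RTTI("ExampleRangedPass", "RangedPass")
//       bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override {
//           bool modified = false;
//           for (auto expr_it = begin; expr_it != end; ++expr_it) {
//               // inspect or transform *expr_it here
//           }
//           return modified;
//       }
//   };
//
//   // PassPipeline::run(linear_ir) still covers whole-IR runs, while the
//   // SpecificIterationHandlers pipelines invoke the ranged overload on a
//   // single loop body. merge_pipelines() combines two pipelines, relying on
//   // each pass's merge() to reconcile passes present in both.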
diff --git a/src/common/snippets/include/snippets/lowered/pass/pass_config.hpp b/src/common/snippets/include/snippets/lowered/pass/pass_config.hpp
index 03fe2b3dd6d65d..90a45cc0eba708 100644
--- a/src/common/snippets/include/snippets/lowered/pass/pass_config.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/pass_config.hpp
@@ -48,6 +48,9 @@
         return is_enabled(T::get_type_info_static());
     }
+    friend bool operator==(const PassConfig& lhs, const PassConfig& rhs);
+    friend bool operator!=(const PassConfig& lhs, const PassConfig& rhs);
+
 private:
     std::unordered_set m_disabled;
     std::unordered_set m_enabled;
 };
diff --git a/src/common/snippets/include/snippets/lowered/pass/propagate_layout.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_layout.hpp
index 6ba062b0525556..b77b61e90b480d 100644
--- a/src/common/snippets/include/snippets/lowered/pass/propagate_layout.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/propagate_layout.hpp
@@ -17,10 +17,10 @@
 * proper data pointer offsets in the Kernel;
 * @ingroup snippets
 */
-class PropagateLayout : public Pass {
+class PropagateLayout : public RangedPass {
 public:
-    OPENVINO_RTTI("PropagateLayout", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("PropagateLayout", "RangedPass")
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
new file mode 100644
index 00000000000000..4803e0556b7118
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+/**
+ * @interface UpdateSubtensors
+ * @brief The pass updates subtensors of all operations in Loop based on tail size.
+ *        Firstly, the pass updates subtensors of all Loop entry points.
+ *        After that, shape inference infrastructure is used to update subtensors of all ops in Loop body
+ * @param m_tail_size - tail size which must be set
+ * @ingroup snippets
+ */
+class UpdateSubtensors : public pass::RangedPass {
+public:
+    UpdateSubtensors(size_t tail_size);
+    OPENVINO_RTTI("UpdateSubtensors", "RangedPass")
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    size_t m_tail_size;
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/lowered/pass/softmax_decomposition.hpp b/src/common/snippets/include/snippets/lowered/pass/softmax_decomposition.hpp
index 795dc0d3725f1c..62704dafcfdfa9 100644
--- a/src/common/snippets/include/snippets/lowered/pass/softmax_decomposition.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/softmax_decomposition.hpp
@@ -16,11 +16,11 @@
 * @brief Decomposes Softmax to a range of low-level operations on linear IR
 * @ingroup snippets
 */
-class SoftmaxDecomposition : public Pass {
+class SoftmaxDecomposition : public RangedPass {
 public:
-    OPENVINO_RTTI("SoftmaxDecomposition", "Pass")
+    OPENVINO_RTTI("SoftmaxDecomposition", "RangedPass")
     explicit SoftmaxDecomposition(size_t vector_size);
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     size_t m_vector_size;
diff --git a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp
index 4a99a6f2a4541e..dfa5c3fc54d120 100644
--- a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp
@@ -19,6 +19,7 @@
 * @brief The pass optimally calculates the common buffer scratchpad size and
 *        sets the offsets relative to the common data pointer to all Buffers. The pass uses MemorySolver API.
 *        Note: The pass requires expression enumeration. It should be executed separately before this pass!
+ *       Note: this transformation works only with m_clusters; the Linear IR and its iterators are not actually needed
 * @ingroup snippets
 */
 class SolveBufferMemory : public Pass {
diff --git a/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp
index bb74529cfbfc5f..ccc63d602cf657 100644
--- a/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp
@@ -29,11 +29,11 @@
 * @ingroup snippets
 */
-class SplitLoops : public Pass {
+class SplitLoops : public RangedPass {
 public:
-    OPENVINO_RTTI("SplitLoops", "Pass")
+    OPENVINO_RTTI("SplitLoops", "RangedPass")
     SplitLoops();
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     static bool can_be_split(const LinearIR::LoopManager::LoopInfoPtr& current,
diff --git a/src/common/snippets/include/snippets/lowered/pass/validate_shapes.hpp b/src/common/snippets/include/snippets/lowered/pass/validate_shapes.hpp
index 08243c96beedf5..c650ac21f206c1 100644
--- a/src/common/snippets/include/snippets/lowered/pass/validate_shapes.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/validate_shapes.hpp
@@ -18,11 +18,11 @@
 * @brief The pass checks that there are no dynamic shapes in the IR
 * @ingroup snippets
 */
-class ValidateShapes : public Pass {
+class ValidateShapes : public RangedPass {
 public:
-    OPENVINO_RTTI("ValidateShapes", "Pass")
+    OPENVINO_RTTI("ValidateShapes", "RangedPass")
     ValidateShapes() = default;
-    bool run(LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
index 3829daf539e782..e5d99d59a6d361 100644
--- a/src/common/snippets/include/snippets/op/subgraph.hpp
+++ b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -5,11 +5,13 @@
 #pragma once
 #include
-
 #include
 #include
-#include "openvino/op/op.hpp"
+
 #include "openvino/core/rt_info.hpp"
+#include "openvino/op/op.hpp"
+#include "snippets/generator.hpp"
+#include "snippets/lowered/pass/pass.hpp"
 #include "snippets/pass/manager.hpp"
 #include "snippets/shape_inference/shape_inference.hpp"
 #include "snippets/lowered/pass/pass.hpp"
diff --git a/src/common/snippets/include/snippets/pass/manager.hpp b/src/common/snippets/include/snippets/pass/manager.hpp
index a9e3c2aec37498..3867366f1b399d 100644
--- a/src/common/snippets/include/snippets/pass/manager.hpp
+++ b/src/common/snippets/include/snippets/pass/manager.hpp
@@ -10,9 +10,6 @@
 #include "openvino/pass/pass.hpp"
 #include "openvino/pass/validate.hpp"
-#include
-
-
 namespace ov {
 namespace snippets {
 namespace pass {
@@ -36,7 +33,7 @@
     std::shared_ptr register_pass(const PassPosition& position, Args&&... args) {
         static_assert(std::is_base_of::value, "Attempt to insert pass that is not derived from PassBase");
         auto pass = std::make_shared(std::forward(args)...);
-        auto rc = insert_pass_instance(position, pass);
+        auto rc = insert_pass_instance(position, pass);
         rc->set_pass_config(m_pass_config);
         if (!m_pass_config->is_enabled()) {
             m_pass_config->disable();
@@ -48,7 +45,7 @@
     void register_positioned_passes(const std::vector& pos_passes);
 protected:
-    std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass);
+    std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass);
 };
 } // namespace pass
diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp
index 96972fce825c0c..e44902fe4eebd6 100644
--- a/src/common/snippets/src/generator.cpp
+++ b/src/common/snippets/src/generator.cpp
@@ -4,28 +4,27 @@
 #include "snippets/generator.hpp"
+#include "snippets/itt.hpp"
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/pass/assign_registers.hpp"
 #include "snippets/lowered/pass/cleanup_loop_offsets.hpp"
-#include "snippets/lowered/pass/insert_tail_loop.hpp"
+#include "snippets/lowered/pass/insert_specific_iterations.hpp"
 #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp"
-
+#include "snippets/lowered/pass/pass.hpp"
 #include "snippets/op/kernel.hpp"
-#include "snippets/itt.hpp"
-
 namespace ov {
 namespace snippets {
 void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, const void* compile_params) const {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate")
     OV_ITT_TASK_CHAIN(GENERATE, ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::Transformations")
-    if (!target->is_supported())
-        OPENVINO_THROW("unsupported architecture for code generation");
+    OPENVINO_ASSERT(target->is_supported(), "unsupported architecture for code generation");
     std::function& out)> reg_type_mapper = [&](const ov::Output& out) -> RegType {
         return get_op_out_reg_type(out);
     };
+
     lowered::pass::PassPipeline lowered_pipeline;
     // Note: the order of all passes in this pipeline must not be changed since they have hard dependencies
     // 1. InsertSpecificIterations must be called after AssignRegisters since tail loop expressions must have the same
@@ -35,7 +34,7 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
     // 3. OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets
     //    since CleanupLoopOffsets can't handle loops with evaluate_once = true
     lowered_pipeline.register_pass(reg_type_mapper);
-    lowered_pipeline.register_pass();
+    lowered_pipeline.register_pass();
     lowered_pipeline.register_pass();
     lowered_pipeline.register_pass();
     lowered_pipeline.run(linear_ir);
diff --git a/src/common/snippets/src/lowered/expression.cpp b/src/common/snippets/src/lowered/expression.cpp
index f33f3aeef95fc3..5c2a190dbf66a0 100644
--- a/src/common/snippets/src/lowered/expression.cpp
+++ b/src/common/snippets/src/lowered/expression.cpp
@@ -156,7 +156,7 @@ ExpressionPtr Expression::clone_with_new_inputs(const std::vector& new_node) const {
     std::vector new_inputs;
     new_inputs.reserve(m_input_port_connectors.size());
diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp
index 65eb3f741cc628..b489ca27d5bd6d 100644
--- a/src/common/snippets/src/lowered/linear_ir.cpp
+++ b/src/common/snippets/src/lowered/linear_ir.cpp
@@ -47,7 +47,7 @@ std::shared_ptr LinearIR::clone() const {
     auto cloned = std::make_shared();
     cloned->m_config = m_config;
-    ExressionMap expression_map;
+    ExpressionMap expression_map;
     cloned->m_expressions = deep_copy_range(m_expressions.cbegin(), m_expressions.cend(), expression_map);
     for (const auto& expr : cloned->m_expressions) {
         cloned->m_node2expression_map[expr->get_node()] = expr;
@@ -161,7 +161,7 @@ std::vector> clone_nodes(const std::vector LoopPort::clone_with_new_expr(const ExpressionPtr& new
     return new_loop_port;
 }
-LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount,
-                                          size_t increment,
-                                          const std::vector& entries,
-                                          const std::vector& exits,
-                                          bool outer_splited_loop)
+LoopInfo::SpecificIterationHandlers::SpecificIterationHandlers(size_t loop_work_amount, size_t loop_increment) {
+    const auto tail_size = loop_work_amount % loop_increment;
+    if (tail_size != 0) {
+        m_last_iter_handlers.register_pass(tail_size);
+        m_last_iter_handlers.register_pass(tail_size);
+    }
+}
+
+LoopInfo::SpecificIterationHandlers::SpecificIterationHandlers(lowered::pass::PassPipeline first_iter_handlers,
+                                                               lowered::pass::PassPipeline main_body_handlers,
+                                                               lowered::pass::PassPipeline last_iter_handlers)
+    : m_first_iter_handlers(std::move(first_iter_handlers)),
+      m_main_body_handlers(std::move(main_body_handlers)),
+      m_last_iter_handlers(std::move(last_iter_handlers)) {}
+
+const lowered::pass::PassPipeline& LoopInfo::SpecificIterationHandlers::get_first_iter_handlers() const {
+    return m_first_iter_handlers;
+}
+
+const lowered::pass::PassPipeline& LoopInfo::SpecificIterationHandlers::get_main_iter_handlers() const {
+    return m_main_body_handlers;
+}
+
+const lowered::pass::PassPipeline& LoopInfo::SpecificIterationHandlers::get_last_iter_handlers() const {
+    return m_last_iter_handlers;
+}
+
+LoopInfo::SpecificIterationHandlers LoopInfo::SpecificIterationHandlers::merge_handlers(
+    const SpecificIterationHandlers& lhs,
+    const SpecificIterationHandlers& rhs) {
+    return LoopInfo::SpecificIterationHandlers(
+        lowered::pass::PassPipeline::merge_pipelines(lhs.get_first_iter_handlers(), rhs.get_first_iter_handlers()),
+        lowered::pass::PassPipeline::merge_pipelines(lhs.get_main_iter_handlers(), rhs.get_main_iter_handlers()),
+        lowered::pass::PassPipeline::merge_pipelines(lhs.get_last_iter_handlers(), rhs.get_last_iter_handlers()));
+}
+
+LoopInfo::LoopInfo(size_t work_amount,
+                   size_t increment,
+                   const std::vector& entries,
+                   const std::vector& exits,
+                   const LoopInfo::SpecificIterationHandlers& handlers)
     : m_work_amount(work_amount), m_increment(increment),
-      m_outer_splited_loop(outer_splited_loop) {
+      m_entry_points(entries),
+      m_exit_points(exits),
+      m_handlers(handlers) {}
+
+LoopInfo::LoopInfo(size_t work_amount,
+                   size_t increment,
+                   const std::vector& entries,
+                   const std::vector& exits,
+                   const LoopInfo::SpecificIterationHandlers& handlers)
+    : m_work_amount(work_amount),
+      m_increment(increment),
+      m_handlers(handlers) {
     m_entry_points.reserve(entries.size());
     m_exit_points.reserve(exits.size());
     for (const auto& port : entries)
@@ -53,7 +102,7 @@ LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount,
         m_exit_points.emplace_back(port);
 }
-std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr_map) const {
+std::shared_ptr LoopInfo::clone_with_new_expr(const ExpressionMap& expr_map) const {
     auto clone_loop_ports = [&expr_map](const std::vector& port_points) {
         std::vector cloned_port_points;
         cloned_port_points.reserve(port_points.size());
@@ -68,7 +117,7 @@ std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr
     const auto& new_entry_points = clone_loop_ports(m_entry_points);
     const auto& new_exit_points = clone_loop_ports(m_exit_points);
-    return std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_outer_splited_loop);
+    return std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_handlers);
 }
 size_t LoopInfo::get_work_amount() const {
@@ -87,15 +136,11 @@ const std::vector& LoopInfo::get_exit_points() const {
     return m_exit_points;
 }
-bool LoopInfo::get_outer_splited_loop() const {
-    return m_outer_splited_loop;
+const LoopInfo::SpecificIterationHandlers& LoopInfo::get_handlers() const {
+    return m_handlers;
 }
-const LoopInfo::FirstIterHandler& LoopInfo::get_first_iter_handler() const {
-    return m_first_iter_handler;
-}
-
-size_t LinearIR::LoopManager::LoopInfo::get_dim_idx() const {
+size_t LoopInfo::get_dim_idx() const {
     OPENVINO_ASSERT(!m_entry_points.empty(), "Loop info must have at least one entry point");
     auto equal_dim_idxes = [&](const LinearIR::LoopManager::LoopPort& p) {
         return p.dim_idx == m_entry_points[0].dim_idx;
@@ -130,15 +175,11 @@ void LoopInfo::set_entry_points(std::vector entry_points) {
 }
 void LoopInfo::set_exit_points(std::vector exit_points) {
-    m_exit_points = std::move(exit_points);;
-}
-
-void LoopInfo::set_outer_splited_loop(bool outer_splited_loop) {
-    m_outer_splited_loop = outer_splited_loop;
+    m_exit_points = std::move(exit_points);
 }
-void LoopInfo::set_first_iter_handler(LoopInfo::FirstIterHandler first_iter_handler) {
-    m_first_iter_handler = std::move(first_iter_handler);
+void LoopInfo::set_handlers(LoopInfo::SpecificIterationHandlers handlers) {
+    m_handlers = std::move(handlers);
 }
 void LoopInfo::update_entry_points(const std::function& updater) {
@@ -164,7 +205,7 @@ bool operator<(const LinearIR::LoopManager::LoopPort& lhs, const LinearIR::LoopM
             (lhs.is_incremented == rhs.is_incremented && lhs.dim_idx < rhs.dim_idx)));
 }
-std::shared_ptr LoopManager::clone_with_new_expr(const ExressionMap& expr_map) const {
+std::shared_ptr LoopManager::clone_with_new_expr(const ExpressionMap& expr_map) const {
     auto new_loop_manager = std::make_shared();
     for (const auto& id_info : m_map)
         new_loop_manager->m_map.insert({id_info.first, id_info.second->clone_with_new_expr(expr_map)});
@@ -370,18 +411,16 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
     }
     for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
-        if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) {
+        OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
+        const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
+        if (subtensor_value == PortDescriptor::ServiceDimensions::FULL_DIM) {
             continue;
         }
         OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
-        const auto work_amount =
-            loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx)
-                                         : 0;
-        const auto work_amount_increment =
-            loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx)
-                                            : (dim_idx == 0 ? vector_size : 1);
-        mark_loop(loop_begin_pos, loop_end_pos, work_amount, work_amount_increment, dim_idx, loop_entry_points, loop_exit_points);
+        const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
+        const auto increment = subtensor_value;
+        mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
     }
 }
@@ -444,6 +483,12 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target,
     loop_info->set_entry_points(new_entries);
     loop_info->set_exit_points(new_exits);
+    loop_info->set_handlers(LoopInfo::SpecificIterationHandlers::merge_handlers(loop_info_upper->get_handlers(), loop_info_lower->get_handlers()));
+    // Since fusion can be called for broadcastable loops (one of the loops has work_amount = increment = 1),
+    // the maximum values are assigned to the fused loop
+    loop_info->set_work_amount(std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()));
+    loop_info->set_increment(std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()));
+
     const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper;
     const auto& to = fuse_into_upper ? loop_id_upper : loop_id_lower;
     for (auto it = loop_begin_target; it != loop_end_target; ++it) {
diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp
index d34b442fd33051..c7cf6b67abd8ea 100644
--- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp
+++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp
@@ -64,7 +64,7 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const
     }
 }
-bool AllocateBuffers::run(lowered::LinearIR& linear_ir) {
+bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AllocateBuffers");
     m_buffer_scratchpad_size = 0;
@@ -78,7 +78,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir) {
         pipeline.register_pass();
         pipeline.run(linear_ir);
     } else {
-        InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir);
+        InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend());
     }
     return m_buffer_scratchpad_size > 0;
diff --git a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp
index 644a5dd1509f7f..ebe802168f5871 100644
--- a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp
+++ b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp
@@ -13,7 +13,7 @@ namespace snippets {
 namespace lowered {
 namespace pass {
-bool CleanRepeatedDataPointerShifts::reuse_increments(const LinearIR& linear_ir, const ExpressionPtr& loop_end_expr) {
+bool CleanRepeatedDataPointerShifts::reuse_increments(const ExpressionPtr& loop_end_expr) {
     const auto loop_end = ov::as_type_ptr(loop_end_expr->get_node());
     if (!loop_end)
         return false;
@@ -89,14 +89,15 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LinearIR& linear_ir,
     return true;
 }
-bool CleanRepeatedDataPointerShifts::run(LinearIR& linear_ir) {
+bool CleanRepeatedDataPointerShifts::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::CleanRepeatedDataPointerShifts")
     bool modified = false;
-    for (const auto& expr : linear_ir) {
+    for (auto expr_it = begin; expr_it != end; ++expr_it) {
+        const auto& expr = *expr_it;
         const auto& node = expr->get_node();
         if (ov::is_type(node)) {
-            modified |= reuse_increments(linear_ir, expr);
+            modified |= reuse_increments(expr);
         }
     }
diff --git a/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp b/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp
index 5e5cc43b13c835..f503e116824960 100644
--- a/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp
+++ b/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp
@@ -13,14 +13,10 @@ namespace snippets {
 namespace lowered {
 namespace pass {
-bool CleanupLoopOffsets::run(LinearIR& linear_ir) {
+bool CleanupLoopOffsets::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::CleanupLoopOffsets")
-    if (linear_ir.empty())
-        return false;
     bool is_modified = false;
-    // Note: it doesn't make sense to check the last expression - it must always be Result
-    const auto before_last = std::prev(linear_ir.end());
-    for (auto expr_it = linear_ir.begin(); expr_it != before_last; expr_it++) {
expr_it != before_last; expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& node = expr_it->get()->get_node(); if (auto loop_end = as_type_ptr(node)) { auto next_expr_it = std::next(expr_it); diff --git a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp index 9e51b169a5deff..dc2eae08947163 100644 --- a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp +++ b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp @@ -320,10 +320,10 @@ void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) { } } -bool DefineBufferClusters::run(LinearIR& linear_ir) { +bool DefineBufferClusters::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::DefineBufferClusters"); - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { const auto& expr = *expr_it; const auto op = expr->get_node(); if (ov::is_type(op)) { diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index 004d2fa62f9da9..8fe892628f4f1c 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -22,7 +22,7 @@ bool is_loop_id_found(const std::vector& ids, size_t id) { using LoopManager = LinearIR::LoopManager; using LoopInfoPtr = LoopManager::LoopInfoPtr; -FuseLoops::FuseLoops() : Pass() {} +FuseLoops::FuseLoops() : RangedPass() {} bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_manager, const size_t loop_lower_id, @@ -44,20 +44,29 @@ bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_m } bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& loop_target) { - auto current_work_amount = loop_current->get_work_amount(); - auto target_work_amount = loop_target->get_work_amount(); - // Loop fusion is supported only if Loops have equal increments and the equal/broadcastable work amounts. + const auto current_work_amount = loop_current->get_work_amount(); + const auto target_work_amount = loop_target->get_work_amount(); + const auto current_increment = loop_current->get_increment(); + const auto target_increment = loop_target->get_increment(); + // Loop fusion is supported only if Loops have equal/broadcastable increments and work amounts. 
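To make that rule concrete before the detailed example in the comment below, here is a minimal, self-contained sketch of the fusibility predicate that the rewritten can_be_fused implements. All names here are illustrative, not the snippets API, and the real check additionally consults the loops' first-iteration handlers, as noted further down:

```cpp
#include <cassert>
#include <cstddef>

// Illustrative sketch only: names are hypothetical, not the snippets API.
struct LoopParams {
    std::size_t work_amount;
    std::size_t increment;
};

// Two loops are fusible when their parameters match exactly, or when one of
// them is a degenerate "broadcast" loop that handles a single element.
bool can_be_fused(const LoopParams& current, const LoopParams& target) {
    const bool equal_parameters =
        current.work_amount == target.work_amount && current.increment == target.increment;
    const bool current_bcastable = current.work_amount == 1 && current.increment == 1;
    const bool target_bcastable = target.work_amount == 1 && target.increment == 1;
    return equal_parameters || current_bcastable || target_bcastable;
}

int main() {
    assert(can_be_fused({1, 1}, {128, 8}));    // broadcast loop absorbed by a full loop
    assert(can_be_fused({128, 8}, {128, 8}));  // identical parameters
    assert(!can_be_fused({128, 8}, {64, 8}));  // mismatched work amounts
    return 0;
}
```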
// Note: For example, Broadcastable work amounts are possible in the following case: // Relu_0 [16x1] Relu_1 [16x128] // \ / // Add [16x128] // Because of expression order in linear IR and work of MarkLoop algorithm, there are 2 Inner Loops: - // - Relu_0 with work amount `1` and increment `vector size` + // - Relu_0 with work amount `1` and increment `1` // - Relu_1 and Add with work amount `128` and increment `vector size` // We can fuse them into one Loop with work amount `128` and increment `vector size` - const auto supported_work_amount = current_work_amount == target_work_amount || current_work_amount == 1 || target_work_amount == 1; - const auto supported_increment = loop_current->get_increment() == loop_target->get_increment(); - return supported_work_amount && supported_increment; + + // WA: we can't fuse 2 loops if one of them has a first-iteration handler and the other doesn't, + // because in this case the Main/Tail body handlers of the loop without the first-iteration handler must be reset with new parameters + // (e.g. tail size). This logic is not implemented yet, so fusion for such loops is skipped. + const bool first_iter_handlers_match = loop_current->get_handlers().get_first_iter_handelrs().empty() == + loop_target->get_handlers().get_first_iter_handelrs().empty(); + const bool equal_parameters = current_work_amount == target_work_amount && current_increment == target_increment; + const bool current_bcastable = current_work_amount == 1 && current_increment == 1; + const bool target_bcastable = target_work_amount == 1 && target_increment == 1; + return first_iter_handlers_match && (equal_parameters || current_bcastable || target_bcastable); } void FuseLoops::move(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id, @@ -123,12 +132,6 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos; std::tie(target_loop_begin_pos, target_loop_end_pos) = loop_manager->get_loop_bounds(linear_ir, target_loop_id); loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, target_loop_id, current_loop_id, false); - // Update work_amount for Loop (increment is constant because increments must be the identical for fusion): - loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount())); - // If one of the Loops is outer for nested loops that splits the same dimension, - // after fusion new common Loop save this status - loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop()); - const auto insertion_place = current_loop_begin_pos; const auto is_move_needed = target_loop_end_pos != current_loop_begin_pos; if (is_move_needed) @@ -168,11 +171,6 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos; std::tie(target_loop_begin_pos, target_loop_end_pos) = loop_manager->get_loop_bounds(linear_ir, target_loop_id); loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, current_loop_id, target_loop_id); - // Update work_amount for Loop (increment is constant because increments must be the identical for fusion): - loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount())); - // If one of the Loops is outer for nested loops that splits the same dimension, - // after fusion new common Loop save this status -
loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop()); const auto insertion_place = current_loop_end_pos; const auto is_move_needed = insertion_place != target_loop_begin_pos; @@ -184,15 +182,12 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo return true; } -bool FuseLoops::run(LinearIR& linear_ir) { +bool FuseLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::FuseLoops") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); std::set prev_fused_loops; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto& node = expr->get_node(); if (ov::is_type(node) || diff --git a/src/common/snippets/src/lowered/pass/identify_buffers.cpp b/src/common/snippets/src/lowered/pass/identify_buffers.cpp index 6b04701ff155d5..485252b1ae7f5d 100644 --- a/src/common/snippets/src/lowered/pass/identify_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/identify_buffers.cpp @@ -77,7 +77,7 @@ void IdentifyBuffers::update_adj_matrix(const std::pair IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool) { +std::vector IdentifyBuffers::create_adjacency_matrix(LinearIR::constExprIt begin, LinearIR::constExprIt end, const BufferPool& pool) { // The sync point to check for adjacency is Loop because only in Loop we increment pointers. // So if some Buffers in the one Loop have conflict (cannot be inplace: the different ptr increment and data sizes) // they are called as adjacent @@ -86,7 +86,7 @@ std::vector IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea for (size_t i = 0; i < size; ++i) adj[index(size, i, i)] = true; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto &expr = *expr_it; if (!ov::is_type(expr->get_node())) continue; @@ -214,19 +214,20 @@ auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector& adj) -> s return color_groups; } -bool IdentifyBuffers::run(LinearIR& linear_ir) { +bool IdentifyBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::IdentifyBuffers") // Identify Buffers using Graph coloring algorithm. 
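As background for the comment above: buffers that conflict inside a loop (different pointer increments or data sizes) are marked adjacent, and a greedy graph coloring then assigns each buffer the smallest color (ID) not taken by an adjacent buffer, so non-conflicting buffers can share memory. A minimal sketch of that idea, simplified from the real adjacency criteria:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Greedy graph coloring over a boolean adjacency matrix, in the spirit of
// IdentifyBuffers: buffers with the same color may be reused in place.
std::vector<std::size_t> color_buffers(const std::vector<std::vector<bool>>& adj) {
    const std::size_t n = adj.size();
    std::vector<std::size_t> color(n, 0);
    for (std::size_t i = 0; i < n; ++i) {
        std::vector<bool> used(n, false);
        for (std::size_t j = 0; j < i; ++j)  // colors of already-colored neighbors
            if (adj[i][j])
                used[color[j]] = true;
        std::size_t c = 0;
        while (used[c]) ++c;                 // smallest free color
        color[i] = c;
    }
    return color;
}

int main() {
    // 3 buffers: 0 conflicts with 1 (same loop, different pointer increments),
    // 2 conflicts with nobody, so it can reuse color 0.
    std::vector<std::vector<bool>> adj = {{true, true, false},
                                          {true, true, false},
                                          {false, false, true}};
    const auto colors = color_buffers(adj);
    assert(colors[0] != colors[1]);
    assert(colors[2] == colors[0]);
    return 0;
}
```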
BufferPool buffer_pool; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; if (ov::is_type(expr->get_node())) { buffer_pool.push_back(expr); } } // Creation of Adj matrix - auto adj = create_adjacency_matrix(linear_ir, buffer_pool); + auto adj = create_adjacency_matrix(begin, end, buffer_pool); // Graph coloring algorithm const auto color_groups = coloring(buffer_pool, adj); diff --git a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp index b525428dd344d3..36cb41d3b9c96e 100644 --- a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp +++ b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp @@ -14,12 +14,13 @@ namespace snippets { namespace lowered { namespace pass { -bool InitBuffersDefault::run(LinearIR& linear_ir) { +bool InitBuffersDefault::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InitBuffersDefault"); size_t id = 0; size_t offset = 0; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto op = expr->get_node(); if (const auto buffer = ov::as_type_ptr(op)) { AllocateBuffers::set_buffer_offset(expr, offset); diff --git a/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp b/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp index d76a2b1af35147..575e73057625ac 100644 --- a/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp +++ b/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp @@ -14,7 +14,7 @@ namespace snippets { namespace lowered { namespace pass { -bool InsertBroadcastMove::run(LinearIR& linear_ir) { +bool InsertBroadcastMove::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBroadcastMove") bool modified = false; @@ -32,7 +32,7 @@ bool InsertBroadcastMove::run(LinearIR& linear_ir) { ov::is_type(v.get_node_shared_ptr()) || ov::is_type(v.get_node_shared_ptr()); }; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& expr = *expr_it; const auto& node = expr->get_node(); const auto& descriptors = expr->get_input_port_descriptors(); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index d977570fce4a3b..eb72f971ced1c4 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -101,7 +101,7 @@ ov::Shape compute_allocation_shape(const LinearIR::LoopManagerPtr& loop_manager, } // namespace InsertBuffers::InsertBuffers(int32_t buffer_allocation_rank) - : Pass(), m_buffer_allocation_rank(buffer_allocation_rank) {} + : RangedPass(), m_buffer_allocation_rank(buffer_allocation_rank) {} LinearIR::constExprIt InsertBuffers::insertion_position(const LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, const ExpressionPtr& up_expr, const ExpressionPtr& down_expr) { @@ -136,7 +136,10 @@ LinearIR::constExprIt InsertBuffers::insertion_position(const LinearIR& linear_i OPENVINO_THROW("Incorrect configuration for Buffer insertion!"); } -void InsertBuffers::insertion(LinearIR& 
linear_ir, const LinearIR::constExprIt& expr_it, const LinearIR::LoopManagerPtr& loop_manager, +void InsertBuffers::insertion(LinearIR& linear_ir, + const LinearIR::constExprIt& begin_it, + const LinearIR::constExprIt& end_it, + const LinearIR::LoopManagerPtr& loop_manager, const std::vector& loop_entries, const std::vector& loop_exits) { for (const auto& entry_point : loop_entries) { @@ -230,7 +233,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::constExprIt& const auto buffer_consumers_inputs = buffer_out->get_consumers(); replace_input_port_connectors(buffer_consumers_inputs, output_connector); potential_consumers.insert(buffer_consumers_inputs.begin(), buffer_consumers_inputs.end()); - linear_ir.erase(linear_ir.find_after(expr_it, buffer)); + linear_ir.erase(linear_ir.find_after(begin_it, buffer)); } } @@ -243,9 +246,9 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::constExprIt& std::set consumers; for (const auto& port : potential_consumers) consumers.insert(port.get_expr()); - const auto it = std::find_if(expr_it, linear_ir.cend(), + const auto it = std::find_if(begin_it, end_it, [&consumers](const ExpressionPtr& expr) { return consumers.count(expr) > 0; }); - OPENVINO_ASSERT(it != linear_ir.cend(), "Consumer of Buffer has not been found in Linear IR"); + OPENVINO_ASSERT(it != end_it, "Consumer of Buffer has not been found in Linear IR"); consumer_expr = *it; } @@ -275,11 +278,8 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::constExprIt& } } -bool InsertBuffers::run(LinearIR& linear_ir) { +bool InsertBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBuffers") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); const auto loop_data_map = loop_manager->get_map(); for (const auto& loop_data : loop_data_map) { @@ -287,10 +287,10 @@ bool InsertBuffers::run(LinearIR& linear_ir) { const auto loop_entries = loop_info->get_entry_points(); const auto loop_exits = loop_info->get_exit_points(); // using begin() as expr_it because we work with LoopInfo, not expressions in Linear IR - insertion(linear_ir, linear_ir.cbegin(), loop_manager, loop_entries, loop_exits); + insertion(linear_ir, begin, end, loop_manager, loop_entries, loop_exits); } - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto node = (*expr_it)->get_node(); const auto ma = ov::as_type_ptr(node); @@ -307,7 +307,7 @@ bool InsertBuffers::run(LinearIR& linear_ir) { loop_exits[p.first] = expr->get_output_port(p.first); } - insertion(linear_ir, expr_it, loop_manager, loop_entries, loop_exits); + insertion(linear_ir, expr_it, end, loop_manager, loop_entries, loop_exits); } return true; diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index eb70e3d26042b8..64c01a489ba668 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -20,13 +20,13 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr; InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {} size_t InsertLoadStore::get_count(const PortDescriptorPtr& port_desc) const { - const auto layout = port_desc->get_layout(); - 
const auto shape = port_desc->get_shape(); + const auto& layout = port_desc->get_layout(); + const auto& shape = port_desc->get_shape(); // Find last dimension by layout - const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); + const auto& last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); OPENVINO_ASSERT(last_dim_idx != layout.end() && *last_dim_idx < shape.size(), "Load/Store expression have incorrect layout"); - const auto dim = shape[*last_dim_idx]; - return dim == 1 ? 1 : m_vector_size; + const auto& dim = shape[*last_dim_idx]; + return std::min(dim, m_vector_size); } bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { @@ -72,11 +72,11 @@ bool InsertLoadStore::insert_store(LinearIR& linear_ir, const LinearIR::constExp return true; } -bool InsertLoadStore::run(LinearIR& linear_ir) { +bool InsertLoadStore::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertLoadStore") bool modified = false; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto& node = expr->get_node(); if (ov::is_type(node)) { diff --git a/src/common/snippets/src/lowered/pass/insert_loops.cpp b/src/common/snippets/src/lowered/pass/insert_loops.cpp index 2155cab55f201d..08c993c188795f 100644 --- a/src/common/snippets/src/lowered/pass/insert_loops.cpp +++ b/src/common/snippets/src/lowered/pass/insert_loops.cpp @@ -25,7 +25,7 @@ std::vector get_outer_loop_ids(const ExpressionPtr& expr, size_t loop_id } } // namespace -InsertLoops::InsertLoops() : Pass() {} +InsertLoops::InsertLoops() : RangedPass() {} void InsertLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id, bool has_outer_loop) { const auto loop_info = loop_manager->get_loop_info(loop_id); @@ -72,15 +72,12 @@ void InsertLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& linear_ir.insert_node(loop_end, loop_end_inputs, outer_loop_ids, false, loop_bounds.second); } -bool InsertLoops::run(LinearIR& linear_ir) { +bool InsertLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertLoops") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); std::set inserted_loops; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto& node = expr->get_node(); if (ov::is_type(node) || diff --git a/src/common/snippets/src/lowered/pass/insert_perf_count.cpp b/src/common/snippets/src/lowered/pass/insert_perf_count.cpp index 6ccfbf1094cdc3..9f68b45c8d0857 100644 --- a/src/common/snippets/src/lowered/pass/insert_perf_count.cpp +++ b/src/common/snippets/src/lowered/pass/insert_perf_count.cpp @@ -13,13 +13,11 @@ namespace lowered { namespace pass { InsertPerfCount::InsertPerfCount(std::map boundary_op_names) - : Pass(), m_boundary_op_names(std::move(boundary_op_names)) { + : RangedPass(), m_boundary_op_names(std::move(boundary_op_names)) { } -bool InsertPerfCount::run(LinearIR& linear_ir) { +bool InsertPerfCount::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, 
lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertPerfCount") - if (linear_ir.empty()) - return false; if (m_boundary_op_names.empty()) { const auto& first_op_name = linear_ir.begin()->get()->get_node()->get_friendly_name(); const auto& last_op_name = linear_ir.rbegin()->get()->get_node()->get_friendly_name(); @@ -27,7 +25,7 @@ bool InsertPerfCount::run(LinearIR& linear_ir) { } size_t seq_number = 0; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& op_name = expr_it->get()->get_node()->get_friendly_name(); const auto& found = m_boundary_op_names.find(op_name); if (found != m_boundary_op_names.end()) { diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp new file mode 100644 index 00000000000000..f6c7faf27b4cfb --- /dev/null +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -0,0 +1,142 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/insert_specific_iterations.hpp" + +#include "snippets/itt.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +LinearIR::constExprIt InsertSpecificIterations::insert_copy_loop(LinearIR& linear_ir, const size_t loop_id, const LinearIR::constExprIt& insert_pos) { + const auto& loop_manager = linear_ir.get_loop_manager(); + const auto loop_bounds = loop_manager->get_loop_bounds(linear_ir, loop_id); + ExpressionMap expression_map; + const auto& loop_copy_range = LinearIR::deep_copy_range(loop_bounds.first, std::next(loop_bounds.second), expression_map); + const auto new_loop_begin_pos = linear_ir.insert(insert_pos, loop_copy_range.begin(), loop_copy_range.end()); + const auto new_loop_end_pos = insert_pos; + + const auto original_loop_info = loop_manager->get_loop_info(loop_id); + std::vector new_entry_points, new_exit_points; + // Clone loop ports from original loop info to new loop info + for (const auto& entry : original_loop_info->get_entry_points()) + new_entry_points.push_back(*entry.clone_with_new_expr(expression_map[entry.expr_port->get_expr().get()])); + for (const auto& exit : original_loop_info->get_exit_points()) + new_exit_points.push_back(*exit.clone_with_new_expr(expression_map[exit.expr_port->get_expr().get()])); + + for (const auto& elem : expression_map) { + const auto expr = elem.first->shared_from_this(); + const auto& new_expr = elem.second; + // Loop begin/end ops can't be loop ports + if (ov::is_type(expr->get_node())) + continue; + // Update loop info of all outer loops with new loop ports + const auto outer_loop_ids = LinearIR::LoopManager::get_outer_expr_loops(expr, loop_id); + for (size_t i = 0; i < expr->get_input_count(); ++i) + loop_manager->update_loops_port(outer_loop_ids, expr->get_input_port(i), {expr->get_input_port(i), new_expr->get_input_port(i)}, true); + for (size_t i = 0; i < expr->get_output_count(); ++i) + loop_manager->update_loops_port(outer_loop_ids, expr->get_output_port(i), {expr->get_output_port(i), new_expr->get_output_port(i)}, false); + } + + const auto new_id = loop_manager->replace_with_new_loop(linear_ir, new_loop_begin_pos, new_loop_end_pos, + 
original_loop_info->get_work_amount(), original_loop_info->get_increment(), + new_entry_points, new_exit_points, loop_id); + const auto loop_end = ov::as_type_ptr(std::prev(new_loop_end_pos)->get()->get_node()); + OPENVINO_ASSERT(loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); + loop_end->set_id(new_id); + return new_loop_begin_pos; +} + +using LoopInfo = LinearIR::LoopManager::LoopInfo; + +bool InsertSpecificIterations::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertSpecificIterations") + const auto& loop_manager = linear_ir.get_loop_manager(); + + bool modified = false; + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; + const auto node = expr->get_node(); + const auto loop_end = ov::as_type_ptr(node); + if (!loop_end) + continue; + + const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); + const auto work_amount = loop_info->get_work_amount(); + const auto increment = loop_info->get_increment(); + const auto& handlers = loop_info->get_handlers(); + + const auto main_loop_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin())); + const auto main_loop_end_it = linear_ir.find_after(main_loop_begin_it, linear_ir.get_expr_by_node(loop_end)); + // Note: handlers must be run on the range starting from the first operation in the loop body. + const auto main_first_body_op_it = std::next(main_loop_begin_it); + + auto update_loop_params = [&loop_manager](const std::shared_ptr& loop_end_copy, + size_t new_work_amount, + size_t new_increment, + bool zero_finalization_offsets) { + loop_end_copy->set_work_amount(new_work_amount); + loop_end_copy->set_increment(new_increment); + + const auto& loop_info_copy = loop_manager->get_loop_info(loop_end_copy->get_id()); + loop_info_copy->set_work_amount(new_work_amount); + loop_info_copy->set_increment(new_increment); + + if (zero_finalization_offsets) + loop_end_copy->set_finalization_offsets(std::vector(loop_end_copy->get_finalization_offsets().size(), 0)); + }; + + auto copy_and_run_specific_handlers = [&](const PassPipeline& handlers) { + const auto new_loop_begin_pos = insert_copy_loop(linear_ir, loop_end->get_id(), main_loop_begin_it); + const auto new_loop_begin = ov::as_type_ptr(new_loop_begin_pos->get()->get_node()); + OPENVINO_ASSERT(new_loop_begin, "Cloned Loop does not contain LoopBegin op at the expected place."); + const auto new_loop_end = new_loop_begin->get_loop_end(); + const auto new_loop_end_pos = linear_ir.find_after(new_loop_begin_pos, linear_ir.get_expr_by_node(new_loop_end)); + OPENVINO_ASSERT(new_loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); + + // Note: handlers must be run on the range starting from the first operation in the loop body.
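The run() body around this point splits one loop into up to three specialized copies (first iteration, main body, tail). A toy sketch of just the work-amount bookkeeping, under the simplifying assumption that a first-iteration copy is peeled off whenever work_amount exceeds the increment; the real pass clones LinearIR ranges and runs the registered handler pipelines on the copies instead of printing:

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical sketch of the work-amount arithmetic only.
void split_iterations(std::size_t work_amount, std::size_t increment, bool has_first_iter_handlers) {
    if (has_first_iter_handlers && work_amount > increment) {
        std::printf("first-iteration loop: work_amount=%zu\n", increment);
        work_amount -= increment;
    }
    const std::size_t tail = work_amount % increment;
    if (work_amount >= increment)
        std::printf("main loop: work_amount=%zu, increment=%zu\n", work_amount - tail, increment);
    if (tail != 0)
        std::printf("tail loop: work_amount=increment=%zu\n", tail);
}

int main() {
    // work_amount=17, increment=4 with a first-iteration handler:
    // first copy runs 4, main copy runs 12 (3 iterations), tail runs 1.
    split_iterations(17, 4, true);
    return 0;
}
```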
+ handlers.run(linear_ir, std::next(new_loop_begin_pos), new_loop_end_pos); + return new_loop_end; + }; + + const bool specific_first_iteration = !handlers.get_first_iter_handelrs().empty(); + if (work_amount == increment) { + handlers.get_first_iter_handelrs().run(linear_ir, main_first_body_op_it, main_loop_end_it); + } else { + if (specific_first_iteration) { + const auto loop_end_copy = copy_and_run_specific_handlers(handlers.get_first_iter_handelrs()); + update_loop_params(loop_end_copy, increment, increment, true); + } + + const auto tail_size = work_amount % increment; + if (tail_size != 0) { + if (!specific_first_iteration || work_amount > 2 * increment) { + const auto loop_end_copy = copy_and_run_specific_handlers(handlers.get_main_iter_handelrs()); + const auto reduce_value = specific_first_iteration ? tail_size + increment : tail_size; + const auto new_work_amount = work_amount - reduce_value; + update_loop_params(loop_end_copy, new_work_amount, increment, true); + } + handlers.get_last_iter_handelrs().run(linear_ir, main_first_body_op_it, main_loop_end_it); + update_loop_params(loop_end, tail_size, tail_size, false); + } else if (specific_first_iteration) { + handlers.get_main_iter_handelrs().run(linear_ir, main_first_body_op_it, main_loop_end_it); + update_loop_params(loop_end, work_amount - increment, increment, false); + } + } + modified = true; + } + return modified; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov + diff --git a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp deleted file mode 100644 index 7774883aa86b1d..00000000000000 --- a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/lowered/pass/insert_tail_loop.hpp" - -#include "snippets/lowered/linear_ir.hpp" -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/pass/init_loops.hpp" -#include "snippets/snippets_isa.hpp" -#include "snippets/utils.hpp" -#include "snippets/itt.hpp" - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { -void InsertTailLoop::propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, - const LinearIR::LoopManager::LoopInfoPtr& loop_info, - LinearIR::container::const_iterator begin, - LinearIR::container::const_iterator end, - const size_t new_dim_value) { - std::map original_shapes; - // First step: set new dim value to the corresponding entry_points' dimensions - if (new_dim_value != existing_subtensor_value) { - for (const auto& port : loop_info->get_entry_points()) { - const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; - if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { - const auto& expr = port.expr_port->get_expr(); - const auto node = expr->get_node(); - auto desc = port.expr_port->get_descriptor_ptr(); - auto subtensor = desc->get_subtensor(); - if (port.dim_idx < subtensor.size()) { - *(subtensor.rbegin() + port.dim_idx) = new_dim_value; - desc->set_subtensor(subtensor); - } - - const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); - const auto& layout = parent_desc->get_layout(); - const auto& shape = parent_desc->get_shape(); - if (original_shapes.find(parent_desc) == original_shapes.end()) { - original_shapes[parent_desc] = shape; - } - 
auto new_shape = shape; - new_shape[*(layout.rbegin() + port.dim_idx)] = new_dim_value; - parent_desc->set_shape(new_shape); - } - } - } - - auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) { - const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; - if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { - auto desc = port.expr_port->get_descriptor_ptr(); - const auto expr = port.expr_port->get_expr(); - const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); - - const auto& layout = parent_desc->get_layout(); - const auto& shape = parent_desc->get_shape(); - const auto& desc_subtensor = desc->get_subtensor(); - if (port.dim_idx < desc_subtensor.size()) { - if (original_shapes.find(parent_desc) == original_shapes.end()) { - original_shapes[parent_desc] = shape; - } - auto new_shape = shape; - new_shape[*(layout.rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx); - parent_desc->set_shape(new_shape); - } - } - }; - - auto update_subtensors = [](const std::vector& descs, bool is_input) { - for (const auto& desc : descs) { - const auto& subtensor = desc->get_subtensor(); - if (!subtensor.empty()) { - auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout()) - : snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout()); - const size_t subtensor_start = planar_dims.size() - subtensor.size(); - VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end()); - for (size_t i = 0; i < new_subtensor.size(); ++i) { - new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]); - } - desc->set_subtensor(new_subtensor); - } - } - }; - - auto shape_inference_end_it = end; - const bool loop_by_last_dim = loop_info->get_dim_idx() == 0; - // Subtensors are updated using shape inference infrastructure: - // For inner loops propagation function is called recursively - for (auto expr_it = begin; expr_it != end; expr_it++) { - const auto expr = *expr_it; - if (ov::is_type(expr->get_node())) - continue; - if (auto loop_begin = ov::as_type_ptr(expr->get_node())) { - const auto loop_end = loop_begin->get_loop_end(); - const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); - const auto inner_begin = std::next(expr_it); - const auto inner_end = linear_ir.find(linear_ir.get_expr_by_node(loop_end)); - - // The corresponding shapes of inner loops entry points must be updated using existing subtensor values - if (new_dim_value == existing_subtensor_value) { - for (const auto& port : loop_info->get_entry_points()) - update_only_dim_idx_with_subtensor_value(port); - } - propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end); - expr_it = inner_end; - continue; - } - if ((ov::is_type(expr_it->get()->get_node()) || - ov::is_type(expr_it->get()->get_node())) && - loop_by_last_dim) { - // WA: we have to break subtensor propagation if we try to propagate new last dim through Broadcast nodes - // which broadcast last dim in original dimension value anyway - // This workaround might be avoided if blocked shape are used for tail size propagation - shape_inference_end_it = expr_it; - break; - } - expr->updateShapes(); - update_subtensors(expr->get_input_port_descriptors(), true); - update_subtensors(expr->get_output_port_descriptors(), false); - } - - // After subtensor propagation, the 
original shapes must be restored - for (const auto& elem : original_shapes) - elem.first->set_shape(elem.second); - for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++) - (*expr_it)->updateShapes(); -} - -LinearIR::constExprIt InsertTailLoop::insert_copy_loop(LinearIR& linear_ir, const size_t loop_id, const LinearIR::constExprIt& insert_pos) { - const auto& loop_manager = linear_ir.get_loop_manager(); - const auto loop_bounds = loop_manager->get_loop_bounds(linear_ir, loop_id); - - ExressionMap expression_map; - const auto& loop_copy_range = LinearIR::deep_copy_range(loop_bounds.first, std::next(loop_bounds.second), expression_map); - const auto new_loop_begin_pos = linear_ir.insert(insert_pos, loop_copy_range.begin(), loop_copy_range.end()); - const auto new_loop_end_pos = insert_pos; - - const auto original_loop_info = loop_manager->get_loop_info(loop_id); - std::vector new_entry_points, new_exit_points; - // Clone loop ports from original loop info to new loop info - for (const auto& entry : original_loop_info->get_entry_points()) - new_entry_points.push_back(*entry.clone_with_new_expr(expression_map[entry.expr_port->get_expr().get()])); - for (const auto& exit : original_loop_info->get_exit_points()) - new_exit_points.push_back(*exit.clone_with_new_expr(expression_map[exit.expr_port->get_expr().get()])); - - for (const auto& elem : expression_map) { - const auto expr = elem.first->shared_from_this(); - const auto& new_expr = elem.second; - // Loop begin/end ops can't be loop ports - if (ov::is_type(expr->get_node())) - continue; - // Update loop info of all outer loops with new loop ports - const auto outer_loop_ids = LinearIR::LoopManager::get_outer_expr_loops(expr, loop_id); - for (size_t i = 0; i < expr->get_input_count(); ++i) - loop_manager->update_loops_port(outer_loop_ids, expr->get_input_port(i), {expr->get_input_port(i), new_expr->get_input_port(i)}, true); - for (size_t i = 0; i < expr->get_output_count(); ++i) - loop_manager->update_loops_port(outer_loop_ids, expr->get_output_port(i), {expr->get_output_port(i), new_expr->get_output_port(i)}, false); - } - - const auto new_id = loop_manager->replace_with_new_loop(linear_ir, new_loop_begin_pos, new_loop_end_pos, - original_loop_info->get_work_amount(), original_loop_info->get_increment(), - new_entry_points, new_exit_points, loop_id); - const auto loop_end = ov::as_type_ptr(std::prev(new_loop_end_pos)->get()->get_node()); - OPENVINO_ASSERT(loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); - loop_end->set_id(new_id); - return new_loop_begin_pos; -} - -void InsertTailLoop::create_tail_loop(LinearIR& linear_ir, - LinearIR::constExprIt begin, - LinearIR::constExprIt end, - const std::shared_ptr& loop_end, - bool need_vector_loop, - size_t tail_size) { - // tail is required => transform the body into a tail representation - // tail loop is fake loop because for tail we should calculate only - // finalization offsets which are supported by LoopEnd. 
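Both this removed create_tail_loop and its replacement TransformInnerSplitLoop (in iter_handler.cpp) rescale the finalization offsets of nested loops in proportion to the tail size, relying on each offset being a multiple of the executed work amount. A self-contained toy illustration of that integer arithmetic, with hypothetical values:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Finalization offsets are proportional to the executed work amount, so when a
// nested loop is re-parameterized from `work_amount` to `tail_size`, each offset
// is rescaled by tail_size / work_amount (integer math, as in the pass).
std::vector<int64_t> rescale_offsets(std::vector<int64_t> offsets,
                                     int64_t work_amount,
                                     int64_t tail_size) {
    for (auto& offset : offsets)
        offset = offset / work_amount * tail_size;
    return offsets;
}

int main() {
    // An inner loop that walked 16 elements (-16 to return the pointer) now
    // only walks a tail of 3 elements, so it must return by -3.
    const auto rescaled = rescale_offsets({-16, 16, 0}, 16, 3);
    assert(rescaled[0] == -3 && rescaled[1] == 3 && rescaled[2] == 0);
    return 0;
}
```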
- const auto& loop_manager = linear_ir.get_loop_manager(); - const auto original_loop_id = loop_end->get_id(); - auto original_loop_info = loop_manager->get_loop_info(original_loop_id); - auto tail_loop_info = original_loop_info; - if (need_vector_loop) { - // Note: new loop body is inserted before the original loop - // So new loop becomes a main vector loop, the original loop becomes tail loop - // This is done in such way to have original ops from the main body at the end: - // this allows us to conveniently interact with outer loops in further passes - const auto new_loop_begin_pos = insert_copy_loop(linear_ir, original_loop_id, begin); - const auto new_loop_begin = ov::as_type_ptr(new_loop_begin_pos->get()->get_node()); - OPENVINO_ASSERT(new_loop_begin, "Cloned Loop does not contain LoopBegin op at the expected place."); - const auto new_loop_end = new_loop_begin->get_loop_end(); - tail_loop_info = original_loop_info; - original_loop_info = loop_manager->get_loop_info(new_loop_end->get_id()); - - const auto new_vector_loop_wa = original_loop_info->get_work_amount() - tail_size; - original_loop_info->set_work_amount(new_vector_loop_wa); - new_loop_end->set_work_amount(new_vector_loop_wa); - original_loop_info->set_outer_splited_loop(tail_loop_info->get_outer_splited_loop()); - // Note that finalization offsets should be applied after the last iteration. - // So if there is a tail, then we should apply offsets after it, but not now. - new_loop_end->set_finalization_offsets(std::vector(loop_end->get_finalization_offsets().size(), 0)); - } - loop_end->set_increment(tail_size); - loop_end->set_work_amount(tail_size); - tail_loop_info->set_increment(tail_size); - tail_loop_info->set_work_amount(tail_size); - - // We have to check the loop body for any nested loops that work on the same dimension - // and rescale their work_amount and increment accordingly - if (original_loop_info->get_outer_splited_loop()) { - const auto current_dim_idx = original_loop_info->get_dim_idx(); - OPENVINO_ASSERT(current_dim_idx != LinearIR::LoopManager::LoopInfo::UNDEFINED_DIM_IDX, - "Outer splitted loop unexpectedly iterates by several dimension indices"); - for (auto it = std::next(begin); it != std::prev(end); ++it) { - const auto& expr = *it; - const auto inner_loop_end = ov::as_type_ptr(expr->get_node()); - if (!inner_loop_end) - continue; - const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id()); - const auto inner_dim_idx = inner_loop_info->get_dim_idx(); - if (inner_dim_idx != current_dim_idx) - continue; - const auto inner_loop_begin = inner_loop_end->get_loop_begin(); - const auto inner_tail_work_amount = static_cast(inner_loop_end->get_work_amount()); - const auto inner_tail_increment = inner_loop_end->get_increment(); - auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets(); - for (auto& offset : inner_finalization_offsets) { - offset = offset / inner_tail_work_amount * static_cast(tail_size); - } - inner_loop_end->set_work_amount(tail_size); - inner_loop_end->set_increment(std::min(inner_tail_increment, tail_size)); - inner_loop_end->set_finalization_offsets(inner_finalization_offsets); - const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin)); - const auto inner_loop_end_it = std::next(end); - OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!"); - tail_transformations(linear_ir, inner_loop_begin_it, inner_loop_end_it, tail_size); - } - } - tail_transformations(linear_ir, begin, end, 
tail_size); - propagate_updated_subtensor_through_loop(linear_ir, tail_loop_info, std::next(begin), end, tail_size); -} - -void InsertTailLoop::tail_transformations(LinearIR& linear_ir, - LinearIR::constExprIt tail_begin, - LinearIR::constExprIt tail_end, - const size_t tail_size) { - const auto& config = linear_ir.get_config(); - auto insertFill = [tail_size](const ov::Input& input, const ExpressionPort& source) -> std::shared_ptr { - std::shared_ptr fill = nullptr; - auto& rt = input.get_rt_info(); - auto fill_rt = rt.find("set_fill"); - if (fill_rt != rt.end()) { - const auto fill_value = fill_rt->second.as(); - fill = std::make_shared(source.get_expr()->get_node()->output(source.get_index()), tail_size, fill_value); - } - return fill; - }; - - for (auto expr_it = std::next(tail_begin); expr_it != tail_end; expr_it++) { - // Skip inner Loops - const auto loop_begin = ov::as_type_ptr(expr_it->get()->get_node()); - if (loop_begin) { - expr_it = linear_ir.find(expr_it, tail_end, linear_ir.get_expr_by_node(loop_begin->get_loop_end())); - continue; - } - // We should fill vector regs by float_min and zero to have - // correct math calculations for ReduceMax and ReduceSum in scalar case. - // Note: We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop, - // so they are missed in - const auto& expr = *expr_it; - const auto op = expr->get_node(); - if (config.m_need_fill_tail_register && - (ov::is_type(op) || - ov::is_type(op))) { - for (size_t i = 0; i < expr->get_input_count(); ++i) { - const auto& input = expr->get_input_port_connector(i); - if (auto fill = insertFill(op->input(i), input->get_source())) { - const auto consumers = input->get_consumers(); - // If there are several consumers, fill expression must be inserted before first of them - auto fst_consumer = std::min_element(consumers.cbegin(), consumers.cend(), [&](ExpressionPort lhs, ExpressionPort rhs) { - auto lhs_it = linear_ir.find(lhs.get_expr()); - auto rhs_it = linear_ir.find(rhs.get_expr()); - return std::distance(linear_ir.cbegin(), lhs_it) < std::distance(linear_ir.cbegin(), rhs_it); - }); - const auto fill_expr = *linear_ir.insert_node(fill, std::vector{ input->get_source() }, expr->get_loop_ids(), true, - linear_ir.find(fst_consumer->get_expr()), consumers); - - // in_reg == out_reg since we want to modify vector reg inplace - const auto reg = expr->get_input_port_descriptor(0)->get_reg(); - fill_expr->get_input_port_descriptor(0)->set_reg(reg); - fill_expr->get_output_port_descriptor(0)->set_reg(reg); - } - } - } else if (const auto memory_access = std::dynamic_pointer_cast(op)) { - for (const auto p : memory_access->get_memory_access_input_ports()) { - const auto port = p.first; - if (memory_access->get_input_count(port) > 1) { - memory_access->set_input_count(tail_size, port); - } - } - for (const auto p : memory_access->get_memory_access_output_ports()) { - const auto port = p.first; - if (memory_access->get_output_count(port) > 1) { - memory_access->set_output_count(tail_size, port); - } - } - } - } -} - -bool InsertTailLoop::run(LinearIR& linear_ir) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::insertTailLoop") - const auto& loop_manager = linear_ir.get_loop_manager(); - bool modified = false; - - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) { - const auto& expr = *expr_it; - const auto node = expr->get_node(); - const auto loop_end = ov::as_type_ptr(node); - if (!loop_end) - continue; - - const auto loop_info = 
loop_manager->get_loop_info(loop_end->get_id()); - const auto& first_iter_handler = loop_info->get_first_iter_handler(); - if (first_iter_handler) { - modified |= first_iter_handler(linear_ir, expr_it); - } - - const auto work_amount = loop_end->get_work_amount(); - const auto increment = loop_end->get_increment(); - const auto tail_size = work_amount % increment; - - // tail is required => transform the body into a tail representation - // tail loop is fake loop because for tail we should calculate only - // finalization offsets which are supported by LoopEnd. - if (tail_size != 0) { - const auto loop_begin = loop_end->get_loop_begin(); - const auto begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_begin)); - const auto need_vector_loop = work_amount >= increment; - create_tail_loop(linear_ir, begin_it, std::next(expr_it), loop_end, need_vector_loop, tail_size); - } - modified = true; - } - return modified; -} - -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov - diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp new file mode 100644 index 00000000000000..8b396329644017 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp @@ -0,0 +1,143 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/iter_handler.hpp" + +#include "snippets/itt.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +UpdateMemoryAccessCounts::UpdateMemoryAccessCounts(size_t count) : RangedPass(), m_count(count) {} + +bool UpdateMemoryAccessCounts::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + bool status = false; + for (auto expr_it = begin; expr_it != end; expr_it++) { + // Skip inner Loops + const auto loop_begin = ov::as_type_ptr(expr_it->get()->get_node()); + if (loop_begin) { + expr_it = linear_ir.find(expr_it, end, linear_ir.get_expr_by_node(loop_begin->get_loop_end())); + if (expr_it == end) + return status; + continue; + } + + const auto& node = expr_it->get()->get_node(); + if (const auto memory_access = ov::as_type_ptr(node)) { + for (const auto p : memory_access->get_memory_access_input_ports()) { + const auto port = p.first; + if (memory_access->get_input_count(port) > 1) { + memory_access->set_input_count(m_count, port); + } + } + for (const auto p : memory_access->get_memory_access_output_ports()) { + const auto port = p.first; + if (memory_access->get_output_count(port) > 1) { + memory_access->set_output_count(m_count, port); + } + } + status = true; + } + } + return status; +} + +std::shared_ptr UpdateMemoryAccessCounts::merge(const std::shared_ptr& other) { + const auto merged_pass = std::make_shared(m_count); + if (other == nullptr) + return merged_pass; + const auto casted_pass = ov::as_type_ptr(other); + if (!casted_pass || m_count != casted_pass->m_count) + return nullptr; + return merged_pass; +} + +SetFillOffset::SetFillOffset(size_t offset) : RangedPass(), m_offset(offset) {} + +bool SetFillOffset::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + for (auto expr_it = begin; expr_it != end; expr_it++) { + const auto& node = expr_it->get()->get_node(); + if (const auto fill = ov::as_type_ptr(node)) { + fill->set_offset(m_offset); + } + 
} + return true; +} + +std::shared_ptr SetFillOffset::merge(const std::shared_ptr& other) { + const auto merged_pass = std::make_shared(m_offset); + if (other == nullptr) + return merged_pass; + const auto casted_pass = ov::as_type_ptr(other); + if (!casted_pass || m_offset != casted_pass->m_offset) + return nullptr; + return merged_pass; +} + +TransformInnerSplitLoop::TransformInnerSplitLoop(size_t tail_size) : RangedPass(), m_tail_size(tail_size) {} + +bool TransformInnerSplitLoop::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + const auto& expr = *end; + const auto node = expr->get_node(); + const auto loop_end = ov::as_type_ptr(node); + OPENVINO_ASSERT(loop_end, "the last operation in the range must be LoopEnd"); + + const auto& loop_manager = linear_ir.get_loop_manager(); + const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); + const auto current_dim_idx = loop_info->get_dim_idx(); + OPENVINO_ASSERT(current_dim_idx != LinearIR::LoopManager::LoopInfo::UNDEFINED_DIM_IDX, + "Outer split loop unexpectedly iterates by several dimension indices"); + + bool modified = false; + for (auto it = begin; it != end; ++it) { + const auto& expr = *it; + const auto inner_loop_end = ov::as_type_ptr(expr->get_node()); + if (!inner_loop_end) + continue; + const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id()); + const auto inner_dim_idx = inner_loop_info->get_dim_idx(); + if (inner_dim_idx != current_dim_idx) + continue; + const auto inner_loop_begin = inner_loop_end->get_loop_begin(); + const auto inner_loop_work_amount = static_cast(inner_loop_end->get_work_amount()); + const auto inner_loop_increment = inner_loop_end->get_increment(); + auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets(); + for (auto& offset : inner_finalization_offsets) { + offset = offset / inner_loop_work_amount * static_cast(m_tail_size); + } + inner_loop_end->set_work_amount(m_tail_size); + // TODO: if m_tail_size is greater than the inner loop increment, + // the handlers of the inner loop must be reset with the new tail size + inner_loop_end->set_increment(std::min(inner_loop_increment, m_tail_size)); + inner_loop_end->set_finalization_offsets(inner_finalization_offsets); + const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin)); + const auto inner_loop_end_it = std::next(it); + OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!"); + const auto& last_iter_handlers = inner_loop_info->get_handlers().get_last_iter_handelrs(); + last_iter_handlers.run(linear_ir, std::next(inner_loop_begin_it), inner_loop_end_it); + modified = true; + } + return modified; +} + +std::shared_ptr TransformInnerSplitLoop::merge(const std::shared_ptr& other) { + const auto merged_pass = std::make_shared(m_tail_size); + if (other == nullptr) + return merged_pass; + const auto casted_pass = ov::as_type_ptr(other); + if (!casted_pass || m_tail_size != casted_pass->m_tail_size) + return nullptr; + return merged_pass; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov + diff --git a/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp index 3f9de12a5a0523..3c5b5f3060f7a8 100644 --- a/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp @@ -14,11 +14,11 @@
namespace snippets { namespace lowered { namespace pass { -bool LoadMoveBroadcastToBroadcastLoad::run(LinearIR& linear_ir) { +bool LoadMoveBroadcastToBroadcastLoad::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::LoadMoveBroadcastToBroadcastLoad") bool modified = false; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& expr = *expr_it; const auto& op = expr->get_node(); // Match on MoveBroadcast because MoveBroadcast is rare node in bodies diff --git a/src/common/snippets/src/lowered/pass/mark_loops.cpp b/src/common/snippets/src/lowered/pass/mark_loops.cpp index 05d38e111927c4..82b65eb3e8ee91 100644 --- a/src/common/snippets/src/lowered/pass/mark_loops.cpp +++ b/src/common/snippets/src/lowered/pass/mark_loops.cpp @@ -14,13 +14,10 @@ namespace snippets { namespace lowered { namespace pass { -MarkLoops::MarkLoops(size_t vector_size) : Pass(), m_vector_size(vector_size) {} +MarkLoops::MarkLoops(size_t vector_size) : RangedPass(), m_vector_size(vector_size) {} -bool MarkLoops::run(LinearIR& linear_ir) { +bool MarkLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::MarkLoops") - if (linear_ir.empty()) - return false; - const auto& lowering_config = linear_ir.get_config(); const auto& loop_manager = linear_ir.get_loop_manager(); auto loop_depth = lowering_config.m_loop_depth; @@ -41,7 +38,7 @@ bool MarkLoops::run(LinearIR& linear_ir) { lhs_desc->get_shape() != rhs_desc->get_shape(); }; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto& node = expr->get_node(); if (is_not_start_point(node)) @@ -55,7 +52,7 @@ bool MarkLoops::run(LinearIR& linear_ir) { const auto& prev_expr = *loop_end_pos; loop_end_pos++; // If iterator is the last, we should finish Loop - if (loop_end_pos == linear_ir.end()) + if (loop_end_pos == end) break; // If iterator is the last, we should finish Loop diff --git a/src/common/snippets/src/lowered/pass/normalize_bufer_ids.cpp b/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp similarity index 80% rename from src/common/snippets/src/lowered/pass/normalize_bufer_ids.cpp rename to src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp index 7e99302743a0b3..76ef3562760daa 100644 --- a/src/common/snippets/src/lowered/pass/normalize_bufer_ids.cpp +++ b/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp @@ -13,12 +13,13 @@ namespace snippets { namespace lowered { namespace pass { -bool NormalizeBufferIDs::run(LinearIR& linear_ir) { +bool NormalizeBufferIDs::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::NormalizeBufferIDs"); // [ original Buffer ID -> normalized ] std::map buffer_ids; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto op = expr->get_node(); if (const auto buffer = ov::as_type_ptr(op)) { const auto buffer_id = buffer->get_id(); diff --git a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp 
b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp index 317eb32f7ab1fe..b8391964ef783d 100644 --- a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp +++ b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp @@ -13,13 +13,11 @@ namespace snippets { namespace lowered { namespace pass { -bool OptimizeLoopSingleEvaluation::run(LinearIR& linear_ir) { +bool OptimizeLoopSingleEvaluation::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::OptimizeLoopSingleEvaluation") - if (linear_ir.empty()) - return false; - bool is_modified = false; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; if (auto loop_end = ov::as_type_ptr(expr->get_node())) { // *1* solo vector/tail loop + empty outer loop // => skip increments (both counter & ptr) : set evaluate_once flag diff --git a/src/common/snippets/src/lowered/pass/pass.cpp b/src/common/snippets/src/lowered/pass/pass.cpp index 70a05fc30be147..db13c90ae97673 100644 --- a/src/common/snippets/src/lowered/pass/pass.cpp +++ b/src/common/snippets/src/lowered/pass/pass.cpp @@ -16,23 +16,33 @@ PassPipeline::PassPipeline(const std::shared_ptr& pass_config) : m_p OPENVINO_ASSERT(m_pass_config != nullptr, "PassConfig is not initialized!"); } -void PassPipeline::register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass) { +void PassPipeline::register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline cannot register empty pass!"); m_passes.insert(position.get_insert_position(m_passes), pass); } -void PassPipeline::register_pass(const std::shared_ptr& pass) { +void PassPipeline::register_pass(const std::shared_ptr& pass) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline cannot register empty pass!"); m_passes.push_back(pass); } void PassPipeline::run(LinearIR& linear_ir) const { + run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); +} + +void PassPipeline::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) const { for (const auto& pass : m_passes) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline has empty pass!"); if (m_pass_config->is_disabled(pass->get_type_info())) { continue; } - pass->run(linear_ir); + if (auto lir_pass = std::dynamic_pointer_cast(pass)) { + lir_pass->run(linear_ir); + } else if (auto ranged_pass = std::dynamic_pointer_cast(pass)) { + ranged_pass->run(linear_ir, begin, end); + } else { + OPENVINO_THROW("Unexpected pass (", pass->get_type_info(), ") is registered in PassPipeline"); + } } } @@ -41,6 +51,29 @@ void PassPipeline::register_positioned_passes(const std::vector> lhs_passes_map; + for (const auto& pass : lhs_passes) { + lhs_passes_map[pass->get_type_info()] = pass; + } + OPENVINO_ASSERT(lhs_passes_map.size() == lhs_passes.size(), "The pass pipeline must not contain several passes with equal type info"); + + PassPipeline merged_pipeline; + for (const auto& rhs_pass : rhs.get_passes()) { + const auto lhs_pass = rhs_pass->merge(lhs_passes_map[rhs_pass->get_type_info()]); + OPENVINO_ASSERT(lhs_pass, "2 passes with type info ", rhs_pass->get_type_info(), " can't be merged."); + merged_pipeline.register_pass(lhs_pass); + lhs_passes_map.erase(rhs_pass->get_type_info()); + } + + for (const auto& rest_pass : lhs_passes_map) { + 
merged_pipeline.register_pass(rest_pass.second); + } + return merged_pipeline; +} + } // namespace pass } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/src/lowered/pass/pass_config.cpp b/src/common/snippets/src/lowered/pass/pass_config.cpp index ae73f88c55805a..6d4888e81c7420 100644 --- a/src/common/snippets/src/lowered/pass/pass_config.cpp +++ b/src/common/snippets/src/lowered/pass/pass_config.cpp @@ -28,6 +28,14 @@ bool PassConfig::is_enabled(const DiscreteTypeInfo& type_info) const { return m_enabled.count(type_info); } +bool operator==(const PassConfig& lhs, const PassConfig& rhs) { + return lhs.m_disabled == rhs.m_disabled && lhs.m_enabled == rhs.m_enabled; +} + +bool operator!=(const PassConfig& lhs, const PassConfig& rhs) { + return !(lhs == rhs); +} + } // namespace pass } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/src/lowered/pass/propagate_layout.cpp b/src/common/snippets/src/lowered/pass/propagate_layout.cpp index aea3cf99858622..8023516e159ba3 100644 --- a/src/common/snippets/src/lowered/pass/propagate_layout.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_layout.cpp @@ -14,12 +14,10 @@ namespace snippets { namespace lowered { namespace pass { -bool PropagateLayout::run(LinearIR& linear_ir) { +bool PropagateLayout::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::PropagateLayout") - if (linear_ir.empty()) - return false; - - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto io_expr = std::dynamic_pointer_cast(expr); if (!io_expr) continue; diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp new file mode 100644 index 00000000000000..cd06f6d163c479 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -0,0 +1,161 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/propagate_subtensors.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +namespace { +void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, + const LinearIR::LoopManager::LoopInfoPtr& loop_info, + LinearIR::container::const_iterator begin, + LinearIR::container::const_iterator end, + bool most_outer_loop, + const size_t new_dim_value = SIZE_MAX) { + OPENVINO_ASSERT(snippets::utils::implication(most_outer_loop, new_dim_value != SIZE_MAX), + "if the updated subtensor propagation was called for the outer loop, new_dim_value must not be equal to default value"); + std::map original_shapes; + // First step: set new dim value to the corresponding entry_points' dimensions + if (most_outer_loop) { + for (const auto& port : loop_info->get_entry_points()) { + const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; + if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { + const auto& expr = port.expr_port->get_expr(); + const auto& desc = port.expr_port->get_descriptor_ptr(); + auto subtensor = desc->get_subtensor(); + if (port.dim_idx < subtensor.size()) { + 
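The pipeline-merge logic above keys the left pipeline's passes by type info, merges every right-hand pass with its same-type counterpart (possibly nullptr), then appends the left-only leftovers. An illustrative sketch under simplified assumptions: `std::string` stands in for DiscreteTypeInfo and `MergeablePass` for the real pass base:

```cpp
#include <map>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct MergeablePass {
    virtual ~MergeablePass() = default;
    virtual std::string type() const = 0;
    // Returns the merged pass, or nullptr if the two passes are incompatible.
    virtual std::shared_ptr<MergeablePass> merge(const std::shared_ptr<MergeablePass>& other) = 0;
};

using Pipeline = std::vector<std::shared_ptr<MergeablePass>>;

Pipeline merge_pipelines(const Pipeline& lhs, const Pipeline& rhs) {
    std::map<std::string, std::shared_ptr<MergeablePass>> lhs_by_type;
    for (const auto& p : lhs)
        lhs_by_type[p->type()] = p;  // the real code asserts type infos are unique

    Pipeline merged;
    for (const auto& r : rhs) {
        auto m = r->merge(lhs_by_type[r->type()]);  // counterpart may be nullptr
        if (!m)
            throw std::runtime_error("passes of type " + r->type() + " can't be merged");
        merged.push_back(m);
        lhs_by_type.erase(r->type());
    }
    for (const auto& rest : lhs_by_type)  // lhs-only passes are kept as-is
        merged.push_back(rest.second);
    return merged;
}
```

This is what lets per-loop handler pipelines (e.g. from two fused loops) be combined without duplicating same-type passes.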
*(subtensor.rbegin() + port.dim_idx) = new_dim_value; + desc->set_subtensor(subtensor); + } + + const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + const auto& parent_shape = parent_desc->get_shape(); + if (original_shapes.find(parent_desc) == original_shapes.end()) { + original_shapes[parent_desc] = parent_shape; + } + auto new_shape = parent_shape; + new_shape[*(desc->get_layout().rbegin() + port.dim_idx)] = new_dim_value; + parent_desc->set_shape(new_shape); + } + } + } + + auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) { + const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; + if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { + const auto desc = port.expr_port->get_descriptor_ptr(); + const auto expr = port.expr_port->get_expr(); + const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + + const auto& parent_shape = parent_desc->get_shape(); + const auto& desc_subtensor = desc->get_subtensor(); + if (port.dim_idx < desc_subtensor.size()) { + if (original_shapes.find(parent_desc) == original_shapes.end()) { + original_shapes[parent_desc] = parent_shape; + } + auto new_shape = parent_shape; + new_shape[*(desc->get_layout().rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx); + parent_desc->set_shape(new_shape); + } + } + }; + + auto update_subtensors = [](const std::vector& descs, bool is_input) { + for (const auto& desc : descs) { + const auto& subtensor = desc->get_subtensor(); + if (!subtensor.empty()) { + auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout()) + : snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout()); + const size_t subtensor_start = planar_dims.size() - subtensor.size(); + VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end()); + for (size_t i = 0; i < new_subtensor.size(); ++i) { + new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]); + } + desc->set_subtensor(new_subtensor); + } + } + }; + + auto shape_inference_end_it = end; + const bool loop_by_last_dim = loop_info->get_dim_idx() == 0; + // Subtensors are updated using shape inference infrastructure: + // For inner loops propagation function is called recursively + for (auto expr_it = begin; expr_it != end; expr_it++) { + const auto expr = *expr_it; + if (ov::is_type(expr->get_node())) + continue; + if (auto loop_begin = ov::as_type_ptr(expr->get_node())) { + const auto loop_end = loop_begin->get_loop_end(); + const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); + const auto inner_begin = std::next(expr_it); + const auto inner_end = linear_ir.find_after(inner_begin, linear_ir.get_expr_by_node(loop_end)); + + // The corresponding shapes of inner loops entry points must be updated using existing subtensor values + if (!most_outer_loop) { + for (const auto& port : loop_info->get_entry_points()) + update_only_dim_idx_with_subtensor_value(port); + } + propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end, false); + expr_it = inner_end; + continue; + } + if ((ov::is_type(expr_it->get()->get_node()) || + ov::is_type(expr_it->get()->get_node())) && + loop_by_last_dim) { + // WA: we have to break subtensor propagation if we try to propagate new last dim through Broadcast nodes + 
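Throughout this new file, loop dimensions are counted from the innermost axis, so subtensors and layouts are addressed via `rbegin() + dim_idx`. A tiny standalone illustration of that convention (names are hypothetical):

```cpp
#include <cstddef>
#include <vector>

// dim_idx == 0 addresses the last (innermost) element, 1 the one before it, ...
void set_dim_from_inner(std::vector<size_t>& dims, size_t dim_idx, size_t value) {
    *(dims.rbegin() + dim_idx) = value;
}

int main() {
    std::vector<size_t> subtensor{16, 32};  // {rows, cols} of the processed block
    set_dim_from_inner(subtensor, 0, 7);    // write a tail size into the innermost dim
    // subtensor is now {16, 7}
    return subtensor[1] == 7 ? 0 : 1;
}
```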
// which broadcasts the last dim to its original dimension value anyway
+            // This workaround might be avoided if blocked shapes are used for tail size propagation
+            shape_inference_end_it = expr_it;
+            break;
+        }
+        expr->updateShapes();
+        update_subtensors(expr->get_input_port_descriptors(), true);
+        update_subtensors(expr->get_output_port_descriptors(), false);
+    }
+
+    // After subtensor propagation, the original shapes must be restored
+    for (const auto& elem : original_shapes)
+        elem.first->set_shape(elem.second);
+    for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++)
+        (*expr_it)->updateShapes();
+}
+}  // namespace
+
+UpdateSubtensors::UpdateSubtensors(size_t tail_size) : RangedPass(), m_tail_size(tail_size) {}
+
+bool UpdateSubtensors::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& last_expr = *end;
+    const auto last_node = last_expr->get_node();
+    const auto loop_end = ov::as_type_ptr(last_node);
+    OPENVINO_ASSERT(loop_end, "the last operation in range must be LoopEnd");
+
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    propagate_updated_subtensor_through_loop(linear_ir, loop_info, begin, end, true, m_tail_size);
+    return true;
+}
+
+std::shared_ptr UpdateSubtensors::merge(const std::shared_ptr& other) {
+    const auto merged_pass = std::make_shared(m_tail_size);
+    if (other == nullptr)
+        return merged_pass;
+    const auto casted_pass = ov::as_type_ptr(other);
+    if (!casted_pass || m_tail_size != casted_pass->m_tail_size)
+        return nullptr;
+    return merged_pass;
+}
+
+}  // namespace pass
+}  // namespace lowered
+}  // namespace snippets
+}  // namespace ov
+
diff --git a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
index 2ec613495e9a13..7497eb19c82cb0 100644
--- a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
@@ -19,9 +19,12 @@
 namespace snippets {
 namespace lowered {
 namespace pass {
 
+using LoopInfo = LinearIR::LoopManager::LoopInfo;
+using HandlerType = LoopInfo::SpecificIterationHandlers::HandlerType;
+
 SoftmaxDecomposition::SoftmaxDecomposition(size_t vector_size) : m_vector_size{vector_size} {}
 
-bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
+bool SoftmaxDecomposition::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SoftmaxDecompositionLowered")
     bool modified = false;
     const auto& loop_manager = linear_ir.get_loop_manager();
@@ -29,7 +32,7 @@
     auto match_softmax = ov::pass::pattern::wrap_type();
     auto matcher = std::make_shared(match_softmax, "SoftmaxDecompositionLowered");
 
-    for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
+    for (auto expr_it = begin; expr_it != end; expr_it++) {
         const auto& op = (*expr_it)->get_node();
         if (matcher->match(op)) {
             const auto& pm = matcher->get_pattern_map();
@@ -40,6 +43,7 @@
             const auto& output_connector = softmax_expr->get_output_port_connector(0);
             const auto tensor_out = softmax_expr->get_output_port_descriptor(0)->get_shape();
             const auto inner_work_amount = *(tensor_out.rbegin());
+            const auto inner_increment = std::min(inner_work_amount, m_vector_size);
 
             // Float constant
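The `update_subtensors` lambda above clamps each subtensor against the trailing planar dimensions of the current shape, so a subtensor never claims more data than the shape provides. A self-contained sketch of that clamping step (`clamp_subtensor` is an illustrative name, not the real helper):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<size_t> clamp_subtensor(const std::vector<size_t>& planar_dims,
                                    const std::vector<size_t>& subtensor) {
    // Take the trailing part of the shape that the subtensor covers...
    const size_t start = planar_dims.size() - subtensor.size();
    std::vector<size_t> result(planar_dims.begin() + start, planar_dims.end());
    // ...and clamp each subtensor value to it.
    for (size_t i = 0; i < result.size(); ++i)
        result[i] = std::min(result[i], subtensor[i]);
    return result;
}

int main() {
    const std::vector<size_t> planar{1, 8, 5};
    const std::vector<size_t> sub{16, 16};
    const auto clamped = clamp_subtensor(planar, sub);  // {8, 5}
    return (clamped[0] == 8 && clamped[1] == 5) ? 0 : 1;
}
```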
values in byte representation const auto float_min_constant = uint32_t(0xff7fffff); @@ -58,15 +62,21 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { // Init value of vector buffer for ReduceMax is -FLOAT_MIN. const auto fill_max = push_node(std::make_shared(vector_buffer_max.second, 0, float_min_constant)); // ReduceMax loop - const auto& max = push_node(std::make_shared(softmax->get_input_source_output(0), fill_max.second)); + const auto fill_max_tail = push_node(std::make_shared(softmax->get_input_source_output(0), inner_increment, float_min_constant)); + + const auto& max = push_node(std::make_shared(fill_max_tail.second, fill_max.second)); const auto horizon_max = push_node(std::make_shared(max.second)); // Markup of ReduceMax Loop - loop_manager->mark_loop(max.first, horizon_max.first, inner_work_amount, m_vector_size, 0, - std::vector{(*max.first)->get_input_port(0), - (*max.first)->get_input_port(1)}, - std::vector{(*max.first)->get_output_port(0)}); + const auto reduce_max_loop_id = loop_manager->mark_loop(fill_max_tail.first, horizon_max.first, inner_work_amount, inner_increment, 0, + std::vector{(*fill_max_tail.first)->get_input_port(0), + (*max.first)->get_input_port(1)}, + std::vector{(*max.first)->get_output_port(0)}); + const auto tail_size = inner_work_amount % inner_increment; + if (tail_size != 0) { + loop_manager->get_loop_info(reduce_max_loop_id)->register_handler(tail_size); + } const auto broadcast_horizon_max = push_node(std::make_shared(horizon_max.second, broadcasted_dim)); const auto vector_buffer_sum = push_node(std::make_shared()); // Init value of vector buffer for ReduceSum is zero. @@ -75,37 +85,40 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { // Sub + Exp + ReduceSum Loop const auto sub = push_node(std::make_shared(softmax->get_input_source_output(0), broadcast_horizon_max.second)); const auto exp = push_node(std::make_shared(sub.second)); - const auto sum = push_node(std::make_shared(exp.second, fill_sum.second)); + const auto fill_sum_tail = push_node(std::make_shared(exp.second, inner_increment, zero_constant)); + const auto sum = push_node(std::make_shared(fill_sum_tail.second, fill_sum.second)); const auto horizon_sum = push_node(std::make_shared(sum.second)); - // Markup of ReduceMax Loop - loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, m_vector_size, 0, - std::vector{(*sub.first)->get_input_port(0), - (*sub.first)->get_input_port(1), - (*sum.first)->get_input_port(1)}, - std::vector{(*exp.first)->get_output_port(0), - (*sum.first)->get_output_port(0)}); + // Markup of ReduceSum Loop + const auto reduce_sum_loop_id = loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, inner_increment, 0, + std::vector{(*sub.first)->get_input_port(0), + (*sub.first)->get_input_port(1), + (*sum.first)->get_input_port(1)}, + std::vector{(*fill_sum_tail.first)->get_output_port(0), + (*sum.first)->get_output_port(0)}); + if (tail_size != 0) { + loop_manager->get_loop_info(reduce_sum_loop_id)->register_handler(tail_size); + } // Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop const auto pow = push_node(std::make_shared(horizon_sum.second, -1.f)); const auto broadcast_pow = push_node(std::make_shared(pow.second, broadcasted_dim)); // Mul (pseudo-Divide loop) - const auto mul = push_node(std::make_shared(exp.second, broadcast_pow.second)); + const auto mul = push_node(std::make_shared(fill_sum_tail.second, broadcast_pow.second)); // Transfer original 
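The decomposition now clamps the loop increment to the work amount and derives the tail that a last-iteration handler must cover; a handler is registered only when that tail is non-zero. The arithmetic, as a tiny standalone example with illustrative values:

```cpp
#include <algorithm>
#include <cstddef>

int main() {
    const size_t vector_size = 8;          // SIMD lanes processed per iteration
    const size_t inner_work_amount = 100;  // innermost dimension size
    const size_t inner_increment = std::min(inner_work_amount, vector_size);  // 8
    const size_t tail_size = inner_work_amount % inner_increment;             // 100 % 8 == 4
    // tail_size != 0  => a LAST_ITER handler (adjusting the Fill offsets) is
    //                    registered on the loop;
    // tail_size == 0  => the main loop body suffices.
    return tail_size == 4 ? 0 : 1;
}
```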
ExpressionPorts - replace_input_port_connectors({ max.first->get()->get_input_port(0), sub.first->get()->get_input_port(0) }, input_connector); + replace_input_port_connectors({ fill_max_tail.first->get()->get_input_port(0), sub.first->get()->get_input_port(0) }, input_connector); replace_input_port_connectors(output_connector->get_consumers(), (*mul.first)->get_output_port_connector(0)); // Markup of Mul Loop - loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0, - std::vector{(*mul.first)->get_input_port(0), - (*mul.first)->get_input_port(1)}, + loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, inner_increment, 0, + std::vector{(*mul.first)->get_input_port(0), (*mul.first)->get_input_port(1)}, std::vector{(*mul.first)->get_output_port(0)}); // Update Loop info for outer loops - const auto entry_points = std::vector{(*max.first)->get_input_port(0), + const auto entry_points = std::vector{(*fill_max_tail.first)->get_input_port(0), (*sub.first)->get_input_port(0)}; const auto exit_points = std::vector{(*mul.first)->get_output_port(0)}; for (auto loop_id : softmax_loop_ids) { @@ -113,16 +126,6 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { } expr_it = linear_ir.erase(expr_it); // Remove Softmax - - /* =========================================== */ - - /* ============= Runtime Info ================ */ - - // For tail loop we should fill input of Max by float min and - // input of Sum by zero to avoid math incorrect calculations - // TODO [111383]: It should be covered via general pipeline (for example, via analyze in InsertTailLoop?) - max.second->input(0).get_rt_info()["set_fill"] = float_min_constant; - sum.second->input(0).get_rt_info()["set_fill"] = zero_constant; modified = true; } } diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp index 02df0868f4c607..70b9b0ff0d72f8 100644 --- a/src/common/snippets/src/lowered/pass/split_loops.cpp +++ b/src/common/snippets/src/lowered/pass/split_loops.cpp @@ -18,23 +18,24 @@ using LoopManager = LinearIR::LoopManager; using LoopInfo = LoopManager::LoopInfo; using LoopInfoPtr = LoopManager::LoopInfoPtr; -SplitLoops::SplitLoops() : Pass() {} +SplitLoops::SplitLoops() : RangedPass() {} -bool SplitLoops::can_be_split(const LoopInfoPtr& current, const LoopInfoPtr& parent) { - const auto current_dim_idx = current->get_dim_idx(); - const auto parent_dim_idx = parent->get_dim_idx(); +bool SplitLoops::can_be_split(const LoopInfoPtr& loop_to_split, const LoopInfoPtr& loop_to_fuse) { + const auto current_dim_idx = loop_to_split->get_dim_idx(); + const auto parent_dim_idx = loop_to_fuse->get_dim_idx(); + const auto& handlers = loop_to_split->get_handlers(); const bool equal_dim_idxes = current_dim_idx != LoopInfo::UNDEFINED_DIM_IDX && current_dim_idx == parent_dim_idx; - return current->get_work_amount() == parent->get_work_amount() && current->get_increment() != parent->get_increment() && equal_dim_idxes; + const bool only_main_body = handlers.get_first_iter_handelrs().empty() && handlers.get_last_iter_handelrs().empty(); + return loop_to_split->get_work_amount() == loop_to_fuse->get_work_amount() && + loop_to_split->get_increment() != loop_to_fuse->get_increment() && equal_dim_idxes && only_main_body; } -bool SplitLoops::run(LinearIR& linear_ir) { +bool SplitLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, 
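For reference, the dataflow this decomposition builds corresponds to the following scalar softmax, including the PowerStatic(-1) + Multiply replacement for the expensive division; a plain C++ sketch, not the generated kernel:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax_row(const std::vector<float>& x) {
    const float max_v = *std::max_element(x.begin(), x.end());  // ReduceMax loop
    std::vector<float> y(x.size());
    float sum = 0.f;
    for (size_t i = 0; i < x.size(); ++i) {                     // Sub + Exp + ReduceSum loop
        y[i] = std::exp(x[i] - max_v);
        sum += y[i];
    }
    const float inv_sum = std::pow(sum, -1.f);  // PowerStatic(-1), computed once outside the loop
    for (auto& v : y)                           // Mul (pseudo-Divide) loop
        v *= inv_sum;
    return y;
}
```

The Fill-based tail handling replaces the old rt_info "set_fill" annotations removed above: the tail lanes of the ReduceMax input are padded with -FLT_MAX and those of the ReduceSum input with zero, so partial vectors don't corrupt the reductions.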
"Snippets::SplitLoops") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); bool loop_was_split = false; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto& loop_ids = expr->get_loop_ids(); if (loop_ids.empty()) continue; @@ -59,12 +60,12 @@ bool SplitLoops::run(LinearIR& linear_ir) { continue; const auto parent_loop = loop_manager->get_loop_info(parent_loop_id); - if (can_be_split(loop, parent_loop)) { + const bool split_parent = parent_loop->get_increment() < loop->get_increment(); + const auto& loop_to_split = split_parent ? parent_loop : loop; + const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; + const auto& loop_to_fuse = !split_parent ? parent_loop : loop; + if (can_be_split(loop_to_split, loop_to_fuse)) { loop_was_split = true; - const bool split_parent = parent_loop->get_increment() < loop->get_increment(); - const auto& loop_to_split = split_parent ? parent_loop : loop; - const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; - const auto& loop_to_fuse = !split_parent ? parent_loop : loop; loop_to_split->set_work_amount(loop_to_fuse->get_increment()); const auto loop_bounds = LoopManager::get_loop_bounds(linear_ir, loop_to_split_id, @@ -77,7 +78,15 @@ bool SplitLoops::run(LinearIR& linear_ir) { loop_to_split->get_dim_idx(), loop_to_split->get_entry_points(), loop_to_split->get_exit_points()); - loop_manager->get_loop_info(split_loop_id)->set_outer_splited_loop(true); + const auto& new_loop_info = loop_manager->get_loop_info(split_loop_id); + const auto work_amount = loop_to_fuse->get_work_amount(); + const auto increment = loop_to_fuse->get_increment(); + const auto tail_size = work_amount % increment; + auto new_handlers = loop_to_split->get_handlers(); + if (tail_size != 0) { + new_handlers.register_handler(tail_size); + } + new_loop_info->set_handlers(new_handlers); break; } } @@ -86,7 +95,7 @@ bool SplitLoops::run(LinearIR& linear_ir) { // FuseLoops pass is explicitly run here in order to avoid unnecessary computations // in case if loops are not split but FuseLoops is registered in pass manager after SplitLoops if (loop_was_split) - FuseLoops().run(linear_ir); + FuseLoops().run(linear_ir, begin, end); return loop_was_split; } } // namespace pass diff --git a/src/common/snippets/src/lowered/pass/validate_loops.cpp b/src/common/snippets/src/lowered/pass/validate_loops.cpp index 99698a6b4329bd..43afdc12e63551 100644 --- a/src/common/snippets/src/lowered/pass/validate_loops.cpp +++ b/src/common/snippets/src/lowered/pass/validate_loops.cpp @@ -63,8 +63,6 @@ bool ValidateLoops::run(LinearIR& linear_ir) { "Incorrect Loop ID configuration: the Loops with splitted dimension should be successively nested"); OPENVINO_ASSERT(loop_manager->get_loop_info(loop_ids[i - 1])->get_increment() == loop_manager->get_loop_info(id)->get_work_amount(), "Incorrect Loop ID configuration: the Loops with splitted dimension should be successively nested"); - OPENVINO_ASSERT(loop_manager->get_loop_info(loop_ids[i - 1])->get_outer_splited_loop(), - "Incorrect Loop ID configuration: the outer Loop with splitted dimension should have `outer_splited_loop=True`"); } dim_indexes.push_back(dim_idx); } diff --git a/src/common/snippets/src/lowered/pass/validate_shapes.cpp b/src/common/snippets/src/lowered/pass/validate_shapes.cpp index 8d12004313e0bf..8e90cc723576fa 100644 --- a/src/common/snippets/src/lowered/pass/validate_shapes.cpp +++ 
b/src/common/snippets/src/lowered/pass/validate_shapes.cpp @@ -13,10 +13,11 @@ namespace snippets { namespace lowered { namespace pass { -bool ValidateShapes::run(LinearIR& linear_ir) { +bool ValidateShapes::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ValidateShapes") - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto num_inputs = expr->get_input_count(); const auto& port_connectors = expr->get_input_port_connectors(); const auto& port_descriptors = expr->get_input_port_descriptors(); diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 3b23da523d53ca..5ff96826a74f71 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -473,7 +473,7 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const std::shared_ptrgenerate(linear_ir, lowering_result, compile_params); diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp index 455c261cec5109..209ecb4592368a 100644 --- a/src/common/snippets/tests/src/lowered/pass/loop.cpp +++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp @@ -11,7 +11,8 @@ #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_load_store.hpp" #include "snippets/lowered/pass/insert_loops.hpp" -#include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/insert_specific_iterations.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" #include "snippets/lowered/pass/validate_loops.hpp" #include "snippets/shape_inference/shape_inference.hpp" @@ -38,7 +39,7 @@ static void init_linear_ir(const std::vector& in_shapes, Linea const auto in_shape0 = in_shapes[0].get_shape(); const auto in_shape1 = in_shapes[1].get_shape(); const auto inner_wa = std::max(*in_shape0.rbegin(), *in_shape1.rbegin()); - const auto inner_inc = vector_size; + const auto inner_inc = std::min(vector_size, inner_wa); const auto blocked_wa = block_size; const auto blocked_inc = 1; const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1)); @@ -46,7 +47,12 @@ static void init_linear_ir(const std::vector& in_shapes, Linea loop_manager->mark_loop(expr_it, std::next(expr_it), inner_wa, inner_inc, 0, loop_entry_points, loop_exit_points); loop_manager->mark_loop(expr_it, std::next(expr_it), blocked_wa, blocked_inc, 1, loop_entry_points, loop_exit_points); const auto loop_id = loop_manager->mark_loop(expr_it, std::next(expr_it), outer_wa, outer_inc, 1, loop_entry_points, loop_exit_points); - loop_manager->get_loop_info(loop_id)->set_outer_splited_loop(true); + const auto& outer_loop_info = loop_manager->get_loop_info(loop_id); + const auto outer_tail_size = outer_wa % outer_inc; + if (outer_tail_size != 0) { + outer_loop_info->register_handler(outer_tail_size); + } } static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr& config) { @@ -55,7 +61,7 @@ static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(); + pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(); pipeline.run(linear_ir); @@ -84,7 +90,7 @@ 
TEST(Snippets_TailProcessingTransformation, BlockedWOTail_OriginalPtrShifts) { auto config = std::make_shared(); config->disable(); - config->disable(); + config->disable(); config->disable(); apply_transformations(linear_ir, config); @@ -104,7 +110,7 @@ TEST(Snippets_TailProcessingTransformation, BlockedWOTail_CleanUpPtrShifts) { init_linear_ir({inputShape0, inputShape1}, linear_ir, 4); auto config = std::make_shared(); - config->disable(); + config->disable(); config->disable(); apply_transformations(linear_ir, config); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp index b32a78cdbe8e5f..1c05100317ae5f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp @@ -59,10 +59,19 @@ void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector Vmm src_vmm = Vmm(in[0]); Vmm dst_vmm = Vmm(out[0]); - if (is_full_reg()) + const size_t supported_et_size = 4; + const auto register_capacity = (src_vmm.getBit() / 8) / supported_et_size; + if (offset == register_capacity) { + // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be removed from the LIR + // TODO: when inplace is supported, remove such Fill ops from the LIR and remove this logic. + // Ticket: 126270 + if (src_vmm.getIdx() != dst_vmm.getIdx()) + h->uni_vmovups(dst_vmm, src_vmm); + } else if (is_full_reg()) { fill_full(dst_vmm); - else + } else { fill_tail(src_vmm, dst_vmm); + } } template diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp index 1ea2418f995463..f5bfa19a7dcf66 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp @@ -32,19 +32,19 @@ class BrgemmCPU : public snippets::op::Brgemm { BrgemmCPU(const Output& A, const Output& B, const Type type, const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_c = 0, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Output& scratch, const Type type, const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_scratch = 0, const size_t offset_c = 0, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Type type, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_c, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Output& scratch, 
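The fill-emitter workaround above compares the Fill offset with the register's capacity in 4-byte elements; when they are equal there is nothing left to fill, so the emitter degenerates to a plain register move. The capacity arithmetic as compile-time checks (assuming the usual XMM/YMM/ZMM widths):

```cpp
#include <cstddef>

constexpr size_t element_size = 4;  // the emitter currently supports 4-byte types only
constexpr size_t register_capacity_in_elements(size_t register_bits) {
    return (register_bits / 8) / element_size;
}
static_assert(register_capacity_in_elements(128) == 4,  "XMM holds 4 floats");
static_assert(register_capacity_in_elements(256) == 8,  "YMM holds 8 floats");
static_assert(register_capacity_in_elements(512) == 16, "ZMM holds 16 floats");
```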
const Type type, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_scratch, const PortDescriptor& desc_c, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU() = default; void validate_and_infer_types() override; @@ -83,7 +83,7 @@ class BrgemmCPU : public snippets::op::Brgemm { size_t m_M_blk = 0; size_t m_K_blk = 0; size_t m_N_blk = 0; - float m_beta = 0.f; + float m_beta = 1.f; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index 5a6236d1916b13..16e6f897af5691 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -4,25 +4,25 @@ #include "brgemm_blocking.hpp" -#include "openvino/pass/pattern/matcher.hpp" -#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "cpu_iter_handlers.hpp" #include "snippets/itt.hpp" -#include "snippets/utils.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/pass.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" - namespace ov { namespace intel_cpu { namespace pass { using LinearIR = snippets::lowered::LinearIR; using LoopPort = LinearIR::LoopManager::LoopPort; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; +using LoopInfo = LinearIR::LoopManager::LoopInfo; +using namespace ov::snippets::lowered::pass; -BrgemmBlocking::BrgemmBlocking() : Pass() {} +BrgemmBlocking::BrgemmBlocking() : RangedPass() {} void BrgemmBlocking::move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it) { const auto& brgemm_expr = brgemm_it->get(); @@ -36,11 +36,8 @@ void BrgemmBlocking::move_new_memory_buffer(snippets::lowered::LinearIR& linear_ } } -bool BrgemmBlocking::run(LinearIR& linear_ir) { +bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmBlocking") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); auto blocking_loop_exists = [&](const ExpressionPtr& brgemm_expr, const std::shared_ptr& brgemm) { auto check_port = [&](const LoopPort& p) { @@ -59,7 +56,7 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { }; bool modified = false; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& brgemm_expr = *expr_it; const auto brgemm = ov::as_type_ptr(brgemm_expr->get_node()); if (!brgemm || blocking_loop_exists(brgemm_expr, brgemm)) @@ -83,22 +80,22 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_m >= m) { *(in_0_subtensor.rbegin() + 1) = m; *(out_subtensor.rbegin() + 1) = m; - } else { - *(in_0_subtensor.rbegin() + 1) = block_size_m; - *(out_subtensor.rbegin() + 1) = block_size_m; - - auto 
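The default beta flips from 0.f to 1.f because, under K-blocking, only the first K-block may overwrite the accumulator; every later block must accumulate into it. The standard GEMM beta semantics (C = A*B + beta*C), as a plain reference loop rather than the Brgemm kernel itself:

```cpp
#include <cstddef>
#include <vector>

// beta == 0.f overwrites c; beta == 1.f accumulates into it.
void gemm_reference(const std::vector<float>& a, const std::vector<float>& b,
                    std::vector<float>& c, size_t M, size_t N, size_t K, float beta) {
    for (size_t m = 0; m < M; ++m)
        for (size_t n = 0; n < N; ++n) {
            float acc = 0.f;
            for (size_t k = 0; k < K; ++k)
                acc += a[m * K + k] * b[k * N + n];
            c[m * N + n] = beta * c[m * N + n] + acc;
        }
}
```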
loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true), - LoopPort(brgemm_expr->get_input_port(1), false)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), false); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; - loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits); + return; } + + *(in_0_subtensor.rbegin() + 1) = block_size_m; + *(out_subtensor.rbegin() + 1) = block_size_m; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true), + LoopPort(brgemm_expr->get_input_port(1), false)}; + if (brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), false); + } else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); + } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; + loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits); }; auto apply_n_blocking = [&]() { @@ -107,22 +104,22 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_n >= n) { *in_1_subtensor.rbegin() = n; *out_subtensor.rbegin() = n; - } else { - *in_1_subtensor.rbegin() = block_size_n; - *out_subtensor.rbegin() = block_size_n; - - auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), false), - LoopPort(brgemm_expr->get_input_port(1), true)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), true); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; - loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits); + return; + } + + *in_1_subtensor.rbegin() = block_size_n; + *out_subtensor.rbegin() = block_size_n; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), false), + LoopPort(brgemm_expr->get_input_port(1), true)}; + if (brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), true); + } else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; + loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits); }; auto apply_k_blocking = [&]() { @@ -132,59 +129,25 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_k >= k) { *in_0_subtensor.rbegin() = k; *(in_1_subtensor.rbegin() + 1) = k; - } else { - *in_0_subtensor.rbegin() = block_size_k; - *(in_1_subtensor.rbegin() + 1) = block_size_k; - - auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 0), - LoopPort(brgemm_expr->get_input_port(1), true, 1)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), false, 1); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector 
exits{LoopPort(brgemm_expr->get_output_port(0), false)}; - auto loop_id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits); - const auto loop_info = loop_manager->get_loop_info(loop_id); - - auto first_iter_handler = [](LinearIR& linear_ir, LinearIR::constExprIt loop_end_it) { - const auto loop_end = ov::as_type_ptr(loop_end_it->get()->get_node()); - OPENVINO_ASSERT(loop_end, "First loop iteraton handler must be called on LoopEnd expression"); - const auto loop_id = loop_end->get_id(); - const auto& loop_manager = linear_ir.get_loop_manager(); - const auto& loop_info = loop_manager->get_loop_info(loop_id); - const auto work_amount = loop_info->get_work_amount(); - const auto increment = loop_info->get_increment(); - if (work_amount <= increment) - return false; - - const auto loop_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin())); - const auto new_loop_begin_pos = snippets::lowered::pass::InsertTailLoop::insert_copy_loop(linear_ir, loop_id, loop_begin_it); - const auto new_loop_begin = ov::as_type_ptr(new_loop_begin_pos->get()->get_node()); - OPENVINO_ASSERT(new_loop_begin, "Cloned Loop does not contain LoopBegin op at the expected place."); - const auto firt_iter_loop_end = new_loop_begin->get_loop_end(); - auto first_iter_loop_info = loop_manager->get_loop_info(firt_iter_loop_end->get_id()); - firt_iter_loop_end->set_work_amount(increment); - first_iter_loop_info->set_work_amount(increment); - firt_iter_loop_end->set_finalization_offsets(std::vector(loop_end->get_finalization_offsets().size(), 0)); - - const auto new_work_amount = work_amount - increment; - loop_info->set_work_amount(new_work_amount); - loop_end->set_work_amount(new_work_amount); - - // Update original body's Brgemms with new beta parameter - for (auto expr_it = loop_begin_it; expr_it != loop_end_it; ++expr_it) { - const auto& expr_node = expr_it->get()->get_node(); - if (const auto brgemm = ov::as_type_ptr(expr_node)) { - brgemm->set_beta(1.f); - } - } - return true; - }; - loop_info->set_first_iter_handler(first_iter_handler); + brgemm->set_beta(0.f); + return; + } + + *in_0_subtensor.rbegin() = block_size_k; + *(in_1_subtensor.rbegin() + 1) = block_size_k; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 0), + LoopPort(brgemm_expr->get_input_port(1), true, 1)}; + if (brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), false, 1); + } else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), false)}; + const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits); + const auto loop_info = loop_manager->get_loop_info(id); + loop_info->register_handler(0.f); }; apply_k_blocking(); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp index 81ae47aa3c6948..483a2c5ba53100 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp @@ -16,11 +16,13 @@ namespace pass { * @ingroup snippets */ -class BrgemmBlocking : public snippets::lowered::pass::Pass { +class BrgemmBlocking : public 
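The removed `first_iter_handler` lambda cloned the loop body by hand and patched beta on the copied Brgemms; the replacement registers SetBrgemmBeta(0.f) on the loop info (presumably as a FIRST_ITER handler), and the generic specific-iterations machinery materializes the peeled body. The control flow both versions aim for, sketched with plain loops and placeholder comments:

```cpp
#include <cstddef>

void k_blocked_brgemm(size_t K, size_t block_k) {
    size_t k = 0;
    if (k < K) {
        // Peeled first K-block: beta = 0, the accumulator is overwritten.
        // brgemm(..., /*beta=*/0.f) over [0, min(block_k, K))
        k += block_k;
    }
    for (; k < K; k += block_k) {
        // Remaining K-blocks: beta = 1, partial products accumulate.
        // brgemm(..., /*beta=*/1.f) over [k, min(k + block_k, K))
    }
}
```

When no K-blocking is applied (`block_size_k >= k` above), the pass simply sets beta to 0.f on the single Brgemm, since there is nothing to accumulate across blocks.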
snippets::lowered::pass::RangedPass {
 public:
     OPENVINO_RTTI("BrgemmBlocking", "Pass")
     BrgemmBlocking();
-    bool run(snippets::lowered::LinearIR& linear_ir) override;
+    bool run(snippets::lowered::LinearIR& linear_ir,
+             snippets::lowered::LinearIR::constExprIt begin,
+             snippets::lowered::LinearIR::constExprIt end) override;
 
 private:
     static void move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it);
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
new file mode 100644
index 00000000000000..382ee78d8be59e
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "cpu_iter_handlers.hpp"
+
+#include "snippets/lowered/loop_manager.hpp"
+#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+using LinearIR = snippets::lowered::LinearIR;
+using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
+
+SetBrgemmBeta::SetBrgemmBeta(float beta) : snippets::lowered::pass::RangedPass(), m_beta(beta) {}
+
+bool SetBrgemmBeta::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    for (auto expr_it = begin; expr_it != end; ++expr_it) {
+        const auto& expr = expr_it->get();
+        if (const auto brgemm = ov::as_type_ptr(expr->get_node())) {
+            brgemm->set_beta(m_beta);
+        }
+    }
+    return true;
+}
+
+std::shared_ptr SetBrgemmBeta::merge(const std::shared_ptr& other) {
+    const auto merged_pass = std::make_shared(m_beta);
+    if (other == nullptr)
+        return merged_pass;
+    const auto casted_pass = ov::as_type_ptr(other);
+    if (!casted_pass || m_beta != casted_pass->m_beta)
+        return nullptr;
+    return merged_pass;
+}
+}  // namespace pass
+}  // namespace intel_cpu
+}  // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp
new file mode 100644
index 00000000000000..5da97e29796f70
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/pass/iter_handler.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+/**
+ * @interface SetBrgemmBeta
+ * @brief The pass updates all BrgemmCPU nodes with a new beta value
+ * @param m_beta - beta which must be set
+ * @ingroup snippets
+ */
+class SetBrgemmBeta : public snippets::lowered::pass::RangedPass {
+public:
+    SetBrgemmBeta(float beta);
+    OPENVINO_RTTI("SetBrgemmBeta", "RangedPass")
+    bool run(snippets::lowered::LinearIR& linear_ir,
+             snippets::lowered::LinearIR::constExprIt begin,
+             snippets::lowered::LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    float m_beta;
+};
+}  // namespace pass
+}  // namespace intel_cpu
+}  // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp
b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp index 165f9626014290..722ead2258a3ba 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp @@ -92,12 +92,14 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_store_convert(snippets::low return true; } -bool ov::intel_cpu::pass::FuseLoadStoreConvert::run(snippets::lowered::LinearIR& linear_ir) { +bool ov::intel_cpu::pass::FuseLoadStoreConvert::run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::FuseLoadStoreConvert") bool modified = false; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& expr = *expr_it; const auto& convert = expr->get_node(); if (!ov::is_type(convert)) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp index 0ff16116fa9002..32d862bc8d8356 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp @@ -18,11 +18,13 @@ namespace pass { * Fuse Store and ConvertTruncation into one op StoreConvertTruncation * @ingroup snippets */ -class FuseLoadStoreConvert: public snippets::lowered::pass::Pass { +class FuseLoadStoreConvert: public snippets::lowered::pass::RangedPass { public: FuseLoadStoreConvert() = default; - OPENVINO_RTTI("FuseLoadStoreConvert", "Pass"); - bool run(snippets::lowered::LinearIR& linear_ir) override; + OPENVINO_RTTI("FuseLoadStoreConvert", "RangedPass"); + bool run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) override; private: bool fuse_load_convert(snippets::lowered::LinearIR& linear_ir, diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp index 0f14f9a7dc5d8a..68fdda2f7f83df 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp @@ -10,7 +10,9 @@ #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" -bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::LinearIR& linear_ir) { +bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SetBrgemmCopyBBuffersShape") auto get_buffer_from_output = [](const snippets::lowered::ExpressionPtr& expr, const size_t out_idx) { @@ -22,7 +24,8 @@ bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::Lin }; bool modified = false; - for (const auto& expr : linear_ir) { + for (auto expr_it 
= begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; if (auto copy_b = ov::as_type_ptr(expr->get_node())) { const auto buffer = get_buffer_from_output(expr, 0); const auto& out_desc = expr->get_output_port_descriptor(0); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp index c7eec92700a16a..81c4629907e0d7 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp @@ -17,11 +17,13 @@ namespace pass { * Ticket: 113744 * @ingroup snippets */ -class SetBrgemmCopyBBuffersShape: public snippets::lowered::pass::Pass { +class SetBrgemmCopyBBuffersShape: public snippets::lowered::pass::RangedPass { public: SetBrgemmCopyBBuffersShape() = default; OPENVINO_RTTI("SetBrgemmCopyBBuffersShape", "Pass"); - bool run(snippets::lowered::LinearIR& linear_ir) override; + bool run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) override; }; } // namespace pass diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp index dc25378528199c..abecd3c954e0cb 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -21,7 +21,9 @@ std::vector> input_shapes{ {{1, 1, 32, 23}, {1, 1, 23, 68}}, {{1, 16, 384, 64}, {1, 16, 64, 384}}, {{1, 1, 100, 700}, {1, 1, 700, 100}}, + {{1, 1, 100, 1024}, {1, 1, 1024, 100}}, {{1, 1, 100, 2500}, {1, 1, 2500, 100}}, + {{1, 1, 100, 4500}, {1, 1, 4500, 100}}, }; static inline std::vector> quantized_precisions() {