From e240aee72aceb68a6e0cb992ccb1902160a0a276 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 14 Dec 2023 17:16:50 +0100 Subject: [PATCH] [Snippets] Specific loop iterations handler --- .../include/snippets/lowered/linear_ir.hpp | 6 +- .../include/snippets/lowered/loop_manager.hpp | 51 +++--- .../pass/insert_specific_iterations.hpp | 25 +++ .../snippets/lowered/pass/iter_handler.hpp | 75 +++++++++ .../include/snippets/lowered/pass/pass.hpp | 57 +++++++ .../lowered/pass/propagate_subtensors.hpp | 28 ++++ .../snippets/include/snippets/op/subgraph.hpp | 6 +- .../include/snippets/pass/manager.hpp | 7 +- src/common/snippets/src/generator.cpp | 12 +- .../snippets/src/lowered/loop_manager.cpp | 88 ++++++++-- .../src/lowered/pass/assign_registers.cpp | 4 +- .../snippets/src/lowered/pass/fuse_loops.cpp | 34 ++-- .../src/lowered/pass/insert_load_store.cpp | 10 +- .../pass/insert_specific_iterations.cpp | 106 ++++++++++++ .../src/lowered/pass/insert_tail_loop.cpp | 8 +- .../src/lowered/pass/iter_handler.cpp | 155 ++++++++++++++++++ src/common/snippets/src/lowered/pass/pass.cpp | 30 ++++ .../src/lowered/pass/propagate_subtensors.cpp | 148 +++++++++++++++++ .../lowered/pass/softmax_decomposition.cpp | 60 +++---- .../snippets/src/lowered/pass/split_loops.cpp | 14 +- src/common/snippets/src/op/subgraph.cpp | 2 + src/common/snippets/src/pass/manager.cpp | 1 - .../snippets/tests/src/lowered/pass/loop.cpp | 1 + .../x64/pass/lowered/brgemm_blocking.cpp | 85 +++++----- .../x64/pass/lowered/cpu_iter_handlers.cpp | 29 ++++ .../x64/pass/lowered/cpu_iter_handlers.hpp | 25 +++ .../snippets/matmul.cpp | 2 + 27 files changed, 908 insertions(+), 161 deletions(-) create mode 100644 src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp create mode 100644 src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp create mode 100644 src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp create mode 100644 src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp create mode 100644 src/common/snippets/src/lowered/pass/iter_handler.cpp create mode 100644 src/common/snippets/src/lowered/pass/propagate_subtensors.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 73d3ab573e6254..4d2deae7a8bee5 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -70,9 +70,9 @@ class LinearIR { LinearIR::container::const_iterator end, ExressionMap& expression_map); - const container& get_ops() const {return m_expressions; } - const io_container& get_IO_ops() const {return m_io_expressions; } - Config get_config() {return m_config; } + const container& get_ops() const { return m_expressions; } + const io_container& get_IO_ops() const { return m_io_expressions; } + Config get_config() const { return m_config; } void set_loop_depth(size_t loop_depth) { m_config.m_loop_depth = loop_depth; } const ExpressionPtr& get_expr_by_node(const std::shared_ptr& n) const; diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp index 93d1620f5fdbe7..f19adbc72e3e42 100644 --- 
a/src/common/snippets/include/snippets/lowered/loop_manager.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp @@ -4,11 +4,12 @@ #pragma once -#include "linear_ir.hpp" - #include #include +#include "linear_ir.hpp" +#include "pass/iter_handler.hpp" +#include "pass/pass.hpp" #include "port_descriptor.hpp" namespace ov { @@ -45,9 +46,7 @@ class LinearIR::LoopManager { LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - bool outer_splited_loop = false) - : m_work_amount(work_amount), m_increment(increment), - m_entry_points(entries), m_exit_points(exits), m_outer_splited_loop(outer_splited_loop) {} + bool outer_splited_loop = false); LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, @@ -63,19 +62,6 @@ class LinearIR::LoopManager { const std::vector& get_exit_points() const; bool get_outer_splited_loop() const; - /** - * \brief Inserts a separate body for first loop iteration processing if needed. - * Can also modify both main and first iter loop bodies. - * TODO: replace this temporary solution when ticket 119851 is implemented - * - * \param linear_ir LIR which should be modified - * \param loop_end_it iterator on LoopEnd expression for which the handler is called - * - * \return bool value which indicates whether the linear_ir was changed or not. - */ - using FirstIterHandler = std::function; - const FirstIterHandler& get_first_iter_handler() const; - // Sets dim_idx to all entry and exit points void set_dim_idx(size_t dim_idx); void set_work_amount(size_t work_amount); @@ -83,7 +69,9 @@ class LinearIR::LoopManager { void set_entry_points(std::vector entry_points); void set_exit_points(std::vector exit_points); void set_outer_splited_loop(bool outer_splited_loop); - void set_first_iter_handler(FirstIterHandler handler); + + enum {FIRST_ITER, MAIN_BODY, LAST_ITER}; + std::vector handlers; private: size_t m_work_amount = 0; @@ -96,7 +84,6 @@ class LinearIR::LoopManager { std::vector m_exit_points = {}; // True if this Loop is outer Loop for nested Loops that splits the same dimension bool m_outer_splited_loop = false; - FirstIterHandler m_first_iter_handler = nullptr; }; using LoopInfoPtr = std::shared_ptr; @@ -118,16 +105,22 @@ class LinearIR::LoopManager { size_t mark_loop(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t work_amount, - size_t work_amount_increment, + size_t increment, size_t dim_idx, const std::vector& entries, - const std::vector& exits) { - const auto loop_info = std::make_shared(work_amount, work_amount_increment, entries, exits); + const std::vector& exits, + bool set_default_handlers = true) { + if (increment > work_amount) + increment = work_amount; + const auto loop_info = std::make_shared(work_amount, increment, entries, exits); loop_info->set_dim_idx(dim_idx); const auto loop_id = this->add_loop_info(loop_info); for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); } + if (set_default_handlers) { + set_default_loop_handlers(loop_info); + } return loop_id; } @@ -137,12 +130,18 @@ class LinearIR::LoopManager { size_t work_amount, size_t increment, const std::vector& entries, - const std::vector& exits) { + const std::vector& exits, + bool set_default_handlers = true) { + if (increment > work_amount) + increment = work_amount; const auto loop_info = std::make_shared(work_amount, increment, entries, exits); const auto loop_id = this->add_loop_info(loop_info); for 
(auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); } + if (set_default_handlers) { + set_default_loop_handlers(loop_info); + } return loop_id; } @@ -197,6 +196,7 @@ class LinearIR::LoopManager { size_t loop_id, bool loop_ops_inserted = false); LoopPort get_loop_port_by_expr_port(const ExpressionPort& expr_port, const size_t loop_id); + static void set_default_loop_handlers(const LoopInfoPtr& loop_info); private: static void get_io_loop_ports(LinearIR::constExprIt loop_begin_pos, @@ -207,6 +207,9 @@ class LinearIR::LoopManager { static void fuse_loop_ports(std::vector& exit_points, std::vector& entry_points, size_t loop_id); + static std::vector fuse_loop_handlers( + std::vector& lhs, + std::vector& rhs); /* ===== The methods for work with Loop IDs of Expression ===== */ // Notes: diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp new file mode 100644 index 00000000000000..099b43a54b2d6b --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "pass.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +class InsertSpecificIterations : public Pass { +public: + OPENVINO_RTTI("InsertSpecificIterations", "Pass") + bool run(LinearIR& linear_ir) override; + + static LinearIR::container copy_loop(const LinearIR& linear_ir, const size_t loop_id); +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp new file mode 100644 index 00000000000000..a65e4a3bbabaa6 --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp @@ -0,0 +1,75 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include "snippets/op/loop.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +class SetSingleIterationWithWorkAmount : public pass::SubgraphPass { +public: + SetSingleIterationWithWorkAmount(size_t work_amount); + OPENVINO_RTTI("SetSingleIterationWithWorkAmount", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_work_amount; +}; + +class UpdateMemoryAccessOps : public pass::SubgraphPass { +public: + UpdateMemoryAccessOps(size_t count); + OPENVINO_RTTI("UpdateMemoryAccessOps", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_count; +}; + +class ReduceWorkAmount : public pass::SubgraphPass { +public: + ReduceWorkAmount(size_t reduce_value); + OPENVINO_RTTI("ReduceWorkAmount", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_reduce_value; +}; + +class ZeroFinalizationOffsets : public pass::SubgraphPass { +public: + OPENVINO_RTTI("ZeroFinalizationOffsets", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; +}; + 
+class SetFillOffset : public pass::SubgraphPass { +public: + SetFillOffset(size_t offset); + OPENVINO_RTTI("SetFillOffset", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_offset; +}; + +class TransformInnerSplitLoop : public pass::SubgraphPass { +public: + TransformInnerSplitLoop(size_t tail_size); + OPENVINO_RTTI("TransformInnerSplitLoop", "SubgraphPass") + bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override; + +private: + size_t m_tail_size; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/lowered/pass/pass.hpp b/src/common/snippets/include/snippets/lowered/pass/pass.hpp index 177056d2984d25..bb49f6b3202c4e 100644 --- a/src/common/snippets/include/snippets/lowered/pass/pass.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/pass.hpp @@ -80,6 +80,63 @@ class PassPipeline { std::vector> m_passes; }; +class SubgraphPass { +public: + SubgraphPass() = default; + virtual ~SubgraphPass() = default; + // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface, + // so the standard OPENVINO_RTTI(...) macros could be used in derived classes. + _OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() { + static ::ov::DiscreteTypeInfo type_info_static {"SubgraphPass"}; + type_info_static.hash(); + return type_info_static; + } + + virtual const DiscreteTypeInfo& get_type_info() const { + return get_type_info_static(); + } + + const char* get_type_name() const { + return get_type_info().name; + } + + virtual bool run(const lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) = 0; +}; + +class SubgraphPassPipeline { +public: + using PositionedSubgraphPassLowered = snippets::pass::PositionedPass; + + SubgraphPassPipeline(); + SubgraphPassPipeline(const std::shared_ptr& pass_config); + + void run(const lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) const; + const std::vector>& get_passes() const { return m_passes; } + bool empty() const { return m_passes.empty(); } + + void register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass); + void register_pass(const std::shared_ptr& pass); + + template + void register_pass(Args&&... args) { + static_assert(std::is_base_of::value, "Pass not derived from lowered::SubgraphPass"); + auto pass = std::make_shared(std::forward(args)...); + register_pass(pass); + } + template::value, bool>() = true> + void register_pass(const snippets::pass::PassPosition& position, Args&&... 
args) {
+        static_assert(std::is_base_of<SubgraphPass, T>::value, "Pass not derived from lowered::SubgraphPass");
+        auto pass = std::make_shared<T>(std::forward<Args>(args)...);
+        register_pass(position, pass);
+    }
+
+    void register_positioned_passes(const std::vector<PositionedSubgraphPassLowered>& pos_passes);
+
+private:
+    std::shared_ptr<snippets::pass::PassConfig> m_pass_config;
+    std::vector<std::shared_ptr<SubgraphPass>> m_passes;
+};
+
 } // namespace pass
 } // namespace lowered
 } // namespace snippets
diff --git a/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
new file mode 100644
index 00000000000000..4d4d3df84cf60d
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
@@ -0,0 +1,28 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+class UpdateSubtensors : public pass::SubgraphPass {
+public:
+    UpdateSubtensors(size_t tail_size);
+    OPENVINO_RTTI("UpdateSubtensors", "SubgraphPass")
+    bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+
+private:
+    size_t m_tail_size;
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
index c8ae7929ea2744..46530409426a32 100644
--- a/src/common/snippets/include/snippets/op/subgraph.hpp
+++ b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -5,11 +5,12 @@
 #pragma once

 #include
-
 #include
 #include
-#include "openvino/op/op.hpp"
+
+#include "openvino/core/rt_info.hpp"
+#include "openvino/op/op.hpp"
+#include "snippets/generator.hpp"
 #include "snippets/pass/manager.hpp"
 #include "snippets/shape_inference/shape_inference.hpp"
 #include "snippets/lowered/pass/pass.hpp"
diff --git a/src/common/snippets/include/snippets/pass/manager.hpp b/src/common/snippets/include/snippets/pass/manager.hpp
index a9e3c2aec37498..3867366f1b399d 100644
--- a/src/common/snippets/include/snippets/pass/manager.hpp
+++ b/src/common/snippets/include/snippets/pass/manager.hpp
@@ -10,9 +10,6 @@
 #include "openvino/pass/pass.hpp"
 #include "openvino/pass/validate.hpp"

-#include
-
-
 namespace ov {
 namespace snippets {
 namespace pass {
@@ -36,7 +33,7 @@ class Manager : public ov::pass::Manager {
     std::shared_ptr register_pass(const PassPosition& position, Args&&...
args) { static_assert(std::is_base_of::value, "Attempt to insert pass that is not derived from PassBase"); auto pass = std::make_shared(std::forward(args)...); - auto rc = insert_pass_instance(position, pass); + auto rc = insert_pass_instance(position, pass); rc->set_pass_config(m_pass_config); if (!m_pass_config->is_enabled()) { m_pass_config->disable(); @@ -48,7 +45,7 @@ class Manager : public ov::pass::Manager { void register_positioned_passes(const std::vector& pos_passes); protected: - std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass); + std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass); }; } // namespace pass diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index c0a2583aef23b4..e5242feaeada4f 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -4,28 +4,28 @@ #include "snippets/generator.hpp" +#include "snippets/itt.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/pass/assign_registers.hpp" #include "snippets/lowered/pass/cleanup_loop_offsets.hpp" +#include "snippets/lowered/pass/insert_specific_iterations.hpp" #include "snippets/lowered/pass/insert_tail_loop.hpp" #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" - +#include "snippets/lowered/pass/pass.hpp" #include "snippets/op/kernel.hpp" -#include "snippets/itt.hpp" - namespace ov { namespace snippets { void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, const void* compile_params) const { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate") OV_ITT_TASK_CHAIN(GENERATE, ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::Transformations") - if (!target->is_supported()) - OPENVINO_THROW("unsupported architecture for code generation"); + OPENVINO_ASSERT(target->is_supported(), "unsupported architecture for code generation"); std::function& op)> reg_type_mapper = [&](const std::shared_ptr& op) -> opRegType { return get_op_reg_type(op); }; + lowered::pass::PassPipeline lowered_pipeline; // Note: the order of all passes in this pipeline must not be changed since they have hard dependencies // 1. InsertTailLoop must be called after AssignRegisters since tail loop expressions must have the same @@ -35,7 +35,7 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c // 3. 
OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets // since CleanupLoopOffsets can't handle loops with evaluate_once = true lowered_pipeline.register_pass(reg_type_mapper); - lowered_pipeline.register_pass(); + lowered_pipeline.register_pass(); lowered_pipeline.register_pass(); lowered_pipeline.register_pass(); lowered_pipeline.run(linear_ir); diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index e7e83361ee0a39..88f74782620e88 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -5,6 +5,8 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/expression.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" +#include "snippets/lowered/pass/propagate_subtensors.hpp" #include "snippets/utils.hpp" #include "openvino/core/graph_util.hpp" @@ -37,6 +39,19 @@ std::shared_ptr LoopPort::clone_with_new_expr(const ExpressionPtr& new return new_loop_port; } +LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, + size_t increment, + const std::vector& entries, + const std::vector& exits, + bool outer_splited_loop) + : m_work_amount(work_amount), + m_increment(increment), + m_entry_points(entries), + m_exit_points(exits), + m_outer_splited_loop(outer_splited_loop) { + handlers.resize(3); +} + LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, @@ -51,6 +66,7 @@ LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount, m_entry_points.emplace_back(port); for (const auto& port : exits) m_exit_points.emplace_back(port); + handlers.resize(3); } std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr_map) const { @@ -68,7 +84,8 @@ std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr const auto& new_entry_points = clone_loop_ports(m_entry_points); const auto& new_exit_points = clone_loop_ports(m_exit_points); - return std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_outer_splited_loop); + auto new_info = std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_outer_splited_loop); + return new_info; } size_t LoopInfo::get_work_amount() const { @@ -91,10 +108,6 @@ bool LoopInfo::get_outer_splited_loop() const { return m_outer_splited_loop; } -const LoopInfo::FirstIterHandler& LoopInfo::get_first_iter_handler() const { - return m_first_iter_handler; -} - size_t LinearIR::LoopManager::LoopInfo::get_dim_idx() const { OPENVINO_ASSERT(!m_entry_points.empty(), "Loop info must have at least one entry point"); auto equal_dim_idxes = [&](const LinearIR::LoopManager::LoopPort& p) { @@ -137,10 +150,6 @@ void LoopInfo::set_outer_splited_loop(bool outer_splited_loop) { m_outer_splited_loop = outer_splited_loop; } -void LoopInfo::set_first_iter_handler(LoopInfo::FirstIterHandler first_iter_handler) { - m_first_iter_handler = std::move(first_iter_handler); -} - bool operator==(const LinearIR::LoopManager::LoopPort& lhs, const LinearIR::LoopManager::LoopPort& rhs) { if (&lhs == &rhs) return true; @@ -248,6 +257,17 @@ LinearIR::LoopManager::LoopPort LinearIR::LoopManager::get_loop_port_by_expr_por : get_loop_port(loop_info->get_exit_points()); } +void LinearIR::LoopManager::set_default_loop_handlers(const LoopInfoPtr& loop_info) { + const auto tail_size = loop_info->get_work_amount() % loop_info->get_increment(); + if (tail_size != 0) { + 
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<pass::SetSingleIterationWithWorkAmount>(tail_size);
+        loop_info->handlers[LoopInfo::LAST_ITER].register_pass<pass::UpdateMemoryAccessOps>(tail_size);
+        loop_info->handlers[LoopInfo::LAST_ITER].register_pass<pass::UpdateSubtensors>(tail_size);
+        loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<pass::ReduceWorkAmount>(tail_size);
+        loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<pass::ZeroFinalizationOffsets>();
+    }
+}
+
 void LinearIR::LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
                                               LinearIR::constExprIt loop_end_pos,
                                               std::vector<ExpressionPort> &entries,
@@ -330,18 +350,16 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
     }

     for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
-        if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) {
+        OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
+        const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
+        if (subtensor_value == PortDescriptor::ServiceDimensions::FULL_DIM) {
             continue;
         }

         OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
-        const auto work_amount =
-                loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx)
-                                             : 0;
-        const auto work_amount_increment =
-                loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx)
-                                                : (dim_idx == 0 ? vector_size : 1);
-        mark_loop(loop_begin_pos, loop_end_pos, work_amount, work_amount_increment, dim_idx, loop_entry_points, loop_exit_points);
+        const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
+        const auto increment = subtensor_value;
+        mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
     }
 }

@@ -399,6 +417,15 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target,
     loop_info->set_entry_points(new_entries);
     loop_info->set_exit_points(new_exits);

+    loop_info->handlers = fuse_loop_handlers(loop_info_upper->handlers, loop_info_lower->handlers);
+    // Since fusion can be called for broadcastable loops (one of the loops has work_amount = increment = 1),
+    // the maximum value is set for the fused loop
+    loop_info->set_work_amount(std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()));
+    loop_info->set_increment(std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()));
+    // If one of the Loops is outer for nested loops that split the same dimension,
+    // the new common Loop keeps this status after fusion
+    loop_info->set_outer_splited_loop(loop_info_upper->get_outer_splited_loop() || loop_info_lower->get_outer_splited_loop());
+
     const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper;
     const auto& to = fuse_into_upper ?
loop_id_upper : loop_id_lower; for (auto it = loop_begin_target; it != loop_end_target; ++it) { @@ -409,6 +436,31 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, remove_loop_info(from); } +std::vector LinearIR::LoopManager::fuse_loop_handlers( + std::vector& from, + std::vector& to) { + const auto min_size = std::min(from.size(), to.size()); + std::vector merged_handlers; + merged_handlers.resize(min_size); + for (size_t i = 0; i < min_size; ++i) { + merged_handlers[i] = from[i]; + const auto& res_passes = merged_handlers[i].get_passes(); + for (const auto& pass : to[i].get_passes()) { + auto pred = [&pass](const std::shared_ptr& p) { + return p->get_type_info() == pass->get_type_info(); + }; + if (std::find_if(res_passes.begin(), res_passes.end(), pred) == res_passes.end()) { + merged_handlers[i].register_pass(pass); + } + } + } + auto& handlers_with_larger_size = from.size() > to.size() ? from : to; + for (size_t i = min_size; i < handlers_with_larger_size.size(); ++i) { + merged_handlers.emplace_back(std::move(handlers_with_larger_size[i])); + } + return merged_handlers; +} + void LinearIR::LoopManager::fuse_loop_ports(std::vector& exit_points, std::vector& entry_points, size_t loop_id) { @@ -543,7 +595,7 @@ void LinearIR::LoopManager::insert_loop_id(const ExpressionPtr& expr, size_t new OPENVINO_ASSERT(m_map.count(new_id) == 1, "Failed marking expression by Loop ID: the Loop with this ID hasn't registered"); auto& loop_ids = expr->m_loop_ids; OPENVINO_ASSERT(std::find(loop_ids.cbegin(), loop_ids.cend(), new_id) == loop_ids.cend(), - "Expression cannot have several the same Loop IDs"); + "Expression cannot have several identical Loop IDs"); auto insert_it = before ? loop_ids.cbegin() : loop_ids.cend(); if (target_id != SIZE_MAX) { insert_it = std::find(loop_ids.cbegin(), loop_ids.cend(), target_id); diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index d49cf8d63155a7..fbeef30888fa85 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -80,10 +80,10 @@ bool AssignRegisters::run(LinearIR& linear_ir) { for (const auto& tensor : input_expr_input_tensors) { const auto parent_expr = tensor->get_source().get_expr(); if (ov::is_type(parent_expr->get_node())) { - manually_assigned_vecs[tensor] = static_cast(accumulator_reg); if (ov::is_type(parent_expr->get_input_port_connector(0)->get_source().get_expr()->get_node())) { + manually_assigned_vecs[tensor] = static_cast(accumulator_reg); manually_assigned_vecs[parent_expr->get_input_port_connector(0)] = static_cast(accumulator_reg); - } + } } } const auto& output_tensor = expr->get_output_port_connector(0); diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index 1738d6d8fe9574..dc7dac6eed4095 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -44,20 +44,29 @@ bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_m } bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& loop_target) { - auto current_work_amount = loop_current->get_work_amount(); - auto target_work_amount = loop_target->get_work_amount(); - // Loop fusion is supported only if Loops have equal increments and the equal/broadcastable work amounts. 
+    const auto current_work_amount = loop_current->get_work_amount();
+    const auto target_work_amount = loop_target->get_work_amount();
+    const auto current_increment = loop_current->get_increment();
+    const auto target_increment = loop_target->get_increment();
+    // Loop fusion is supported only if Loops have equal/broadcastable increments and work amounts.
     // Note: For example, Broadcastable work amounts are possible in the following case:
     //          Relu_0 [16x1]    Relu_1 [16x128]
     //                  \            /
     //                  Add [16x128]
     // Because of expression order in linear IR and work of MarkLoop algorithm, there are 2 Inner Loops:
-    //  - Relu_0 with work amount `1` and increment `vector size`
+    //  - Relu_0 with work amount `1` and increment `1`
     //  - Relu_1 and Add with work amount `128` and increment `vector size`
     // We can fuse them into one Loop with work amount `128` and increment `vector size`
-    const auto supported_work_amount = current_work_amount == target_work_amount || current_work_amount == 1 || target_work_amount == 1;
-    const auto supported_increment = loop_current->get_increment() == loop_target->get_increment();
-    return supported_work_amount && supported_increment;
+
+    // WA: we can't fuse two loops when one of them has a first-iteration handler and the other one hasn't, because
+    // in this case the main/tail body handlers of the loop without the first-iteration handler must be reset
+    // with new parameters (e.g. tail size). This logic is not implemented yet, so fusion of such loops is skipped.
+    const bool first_iter_handlers_match = loop_current->handlers[LoopManager::LoopInfo::FIRST_ITER].empty() ==
+                                           loop_target->handlers[LoopManager::LoopInfo::FIRST_ITER].empty();
+    const bool equal_parameters = current_work_amount == target_work_amount && current_increment == target_increment;
+    const bool current_bcastable = current_work_amount == 1 && current_increment == 1;
+    const bool target_bcastable = target_work_amount == 1 && target_increment == 1;
+    return first_iter_handlers_match && (equal_parameters || current_bcastable || target_bcastable);
 }

 void FuseLoops::move(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id,
@@ -124,12 +133,6 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo
     LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos;
     loop_manager->get_loop_bounds(linear_ir, target_loop_id, target_loop_begin_pos, target_loop_end_pos);
     loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, target_loop_id, current_loop_id, false);
-    // Update work_amount for Loop
(increment is constant because increments must be the identical for fusion): - loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount())); - // If one of the Loops is outer for nested loops that splits the same dimension, - // after fusion new common Loop save this status - loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop()); const auto insertion_place = current_loop_end_pos; const auto is_move_needed = insertion_place != target_loop_begin_pos; diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index 75e70c9c553c88..492eb8d17682b1 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -20,13 +20,13 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr; InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {} size_t InsertLoadStore::get_count(const PortDescriptorPtr& port_desc) const { - const auto layout = port_desc->get_layout(); - const auto shape = port_desc->get_shape(); + const auto& layout = port_desc->get_layout(); + const auto& shape = port_desc->get_shape(); // Find last dimension by layout - const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); + const auto& last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); OPENVINO_ASSERT(last_dim_idx != layout.end() && *last_dim_idx < shape.size(), "Load/Store expression have incorrect layout"); - const auto dim = shape[*last_dim_idx]; - return dim == 1 ? 1 : m_vector_size; + const auto& dim = shape[*last_dim_idx]; + return std::min(dim, m_vector_size); } bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp new file mode 100644 index 00000000000000..4dd6cfa26745b4 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -0,0 +1,106 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/insert_specific_iterations.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +LinearIR::container InsertSpecificIterations::copy_loop(const LinearIR& linear_ir, const size_t loop_id) { + const auto& loop_manager = linear_ir.get_loop_manager(); + LinearIR::constExprIt loop_begin_pos, loop_end_pos; + loop_manager->get_loop_bounds(linear_ir, loop_id, loop_begin_pos, loop_end_pos, true); + ExressionMap expression_map; + const auto& loop_copy_range = LinearIR::deep_copy_range(loop_begin_pos, std::next(loop_end_pos), expression_map); + + const auto original_loop_info = loop_manager->get_loop_info(loop_id); + std::vector new_entry_points, new_exit_points; + // Clone loop ports from original loop info to new loop info + for (const auto& entry : original_loop_info->get_entry_points()) + new_entry_points.push_back(*entry.clone_with_new_expr(expression_map[entry.expr_port->get_expr().get()])); + for (const auto& exit : original_loop_info->get_exit_points()) 
+        new_exit_points.push_back(*exit.clone_with_new_expr(expression_map[exit.expr_port->get_expr().get()]));
+
+    for (const auto& elem : expression_map) {
+        const auto expr = elem.first->shared_from_this();
+        const auto& new_expr = elem.second;
+        // Loop begin/end ops can't be loop ports
+        if (ov::is_type<op::LoopBase>(expr->get_node()))
+            continue;
+        // Update loop info of all outer loops with new loop ports
+        const auto outer_loop_ids = LinearIR::LoopManager::get_outer_expr_loops(expr, loop_id);
+        for (size_t i = 0; i < expr->get_input_count(); ++i)
+            loop_manager->update_loops_port(outer_loop_ids, expr->get_input_port(i), {expr->get_input_port(i), new_expr->get_input_port(i)}, true);
+        for (size_t i = 0; i < expr->get_output_count(); ++i)
+            loop_manager->update_loops_port(outer_loop_ids, expr->get_output_port(i), {expr->get_output_port(i), new_expr->get_output_port(i)}, false);
+    }
+
+    const auto new_loop_begin_pos = loop_copy_range.begin();
+    const auto new_loop_end_pos = loop_copy_range.end();
+    const auto new_id = loop_manager->replace_with_new_loop(linear_ir,
+                                                            std::next(new_loop_begin_pos),
+                                                            std::prev(new_loop_end_pos),
+                                                            original_loop_info->get_work_amount(),
+                                                            original_loop_info->get_increment(),
+                                                            new_entry_points,
+                                                            new_exit_points,
+                                                            loop_id);
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(std::prev(new_loop_end_pos)->get()->get_node());
+    OPENVINO_ASSERT(loop_end, "Cloned Loop does not contain LoopEnd op at the expected place.");
+    loop_end->set_id(new_id);
+    return loop_copy_range;
+}
+
+bool InsertSpecificIterations::run(LinearIR& linear_ir) {
+    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertSpecificIterations")
+    const auto& loop_manager = linear_ir.get_loop_manager();
+
+    bool modified = false;
+    for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) {
+        const auto& expr = *expr_it;
+        const auto node = expr->get_node();
+        const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+        if (!loop_end)
+            continue;
+
+        std::vector<std::reference_wrapper<const SubgraphPassPipeline>> pipelines_to_run;
+        for (const auto& pipeline : loop_manager->get_loop_info(loop_end->get_id())->handlers) {
+            if (!pipeline.empty())
+                pipelines_to_run.emplace_back(pipeline);
+        }
+        if (pipelines_to_run.empty())
+            continue;
+
+        const auto main_body_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin()));
+        const auto main_body_end_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end));
+        auto copy_and_run_specific_handlers = [&](const SubgraphPassPipeline& handlers) {
+            const auto& cloned_body = copy_loop(linear_ir, loop_end->get_id());
+            linear_ir.insert(main_body_begin_it, cloned_body.begin(), cloned_body.end());
+            handlers.run(linear_ir, cloned_body.begin(), std::prev(cloned_body.end()));
+        };
+
+        for (size_t i = 0; i < pipelines_to_run.size() - 1; ++i) {
+            copy_and_run_specific_handlers(pipelines_to_run[i].get());
+        }
+        // The last pipeline is run on the original body to avoid an unnecessary copy
+        pipelines_to_run.back().get().run(linear_ir, main_body_begin_it, main_body_end_it);
+        modified = true;
+    }
+    return modified;
+}
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
+
diff --git a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp
index cc685c1851157a..c8bfffc3360722 100644
--- a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp
+++ b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp
@@ -330,10 +330,6 @@ bool InsertTailLoop::run(LinearIR& linear_ir) {
             continue;

         const auto loop_info = loop_manager->get_loop_info(loop_end->get_id());
-        const auto& first_iter_handler = loop_info->get_first_iter_handler();
-        if (first_iter_handler) {
-            modified |= first_iter_handler(linear_ir, expr_it);
-        }

         const auto work_amount = loop_end->get_work_amount();
         const auto increment = loop_end->get_increment();
diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp
new file mode 100644
index 00000000000000..31bda1589b01ce
--- /dev/null
+++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp
@@ -0,0 +1,155 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/lowered/pass/iter_handler.hpp"
+
+#include "snippets/itt.hpp"
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/loop_manager.hpp"
+#include "snippets/lowered/pass/propagate_subtensors.hpp"
+#include "snippets/snippets_isa.hpp"
+#include "snippets/utils.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+SetSingleIterationWithWorkAmount::SetSingleIterationWithWorkAmount(size_t work_amount)
+    : SubgraphPass(),
+      m_work_amount(work_amount) {}
+
+bool SetSingleIterationWithWorkAmount::run(const LinearIR& linear_ir,
+                                           LinearIR::constExprIt begin,
+                                           LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    if (loop_end->get_work_amount() == m_work_amount && loop_end->get_increment() == m_work_amount)
+        return false;
+    loop_end->set_work_amount(m_work_amount);
+    loop_end->set_increment(m_work_amount);
+    loop_info->set_work_amount(m_work_amount);
+    loop_info->set_increment(m_work_amount);
+    return true;
+}
+
+UpdateMemoryAccessOps::UpdateMemoryAccessOps(size_t count) : SubgraphPass(), m_count(count) {}
+
+bool UpdateMemoryAccessOps::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    for (auto expr_it = std::next(begin); expr_it != end; expr_it++) {
+        // Skip inner Loops
+        const auto loop_begin = ov::as_type_ptr<op::LoopBegin>(expr_it->get()->get_node());
+        if (loop_begin) {
+            expr_it = linear_ir.find(expr_it, end, linear_ir.get_expr_by_node(loop_begin->get_loop_end()));
+            continue;
+        }
+
+        const auto& node = expr_it->get()->get_node();
+        if (const auto memory_access = ov::as_type_ptr<op::MemoryAccess>(node)) {
+            for (const auto p : memory_access->get_memory_access_input_ports()) {
+                const auto port = p.first;
+                if (memory_access->get_input_count(port) > 1) {
+                    memory_access->set_input_count(m_count, port);
+                }
+            }
+            for (const auto p : memory_access->get_memory_access_output_ports()) {
+                const auto port = p.first;
+                if (memory_access->get_output_count(port) > 1) {
+                    memory_access->set_output_count(m_count, port);
+                }
+            }
+        }
+    }
+    return true;
+}
+
+ReduceWorkAmount::ReduceWorkAmount(size_t reduce_value) : SubgraphPass(), m_reduce_value(reduce_value) {}
+
+bool ReduceWorkAmount::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+    const auto work_amount = loop_end->get_work_amount();
+    const auto new_work_amount = work_amount - m_reduce_value;
+    loop_end->set_work_amount(new_work_amount);
+
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    loop_info->set_work_amount(new_work_amount);
+    return true;
+}
+
+bool ZeroFinalizationOffsets::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+    loop_end->set_finalization_offsets(std::vector<int64_t>(loop_end->get_finalization_offsets().size(), 0));
+    return true;
+}
+
+SetFillOffset::SetFillOffset(size_t offset) : SubgraphPass(), m_offset(offset) {}
+
+bool SetFillOffset::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    for (auto expr_it = std::next(begin); expr_it != end; expr_it++) {
+        const auto& node = expr_it->get()->get_node();
+        if (const auto fill = ov::as_type_ptr<op::Fill>(node)) {
+            fill->set_offset(m_offset);
+        }
+    }
+    return true;
+}
+
+TransformInnerSplitLoop::TransformInnerSplitLoop(size_t tail_size) : SubgraphPass(), m_tail_size(tail_size) {}
+
+bool TransformInnerSplitLoop::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    const auto current_dim_idx = loop_info->get_dim_idx();
+    OPENVINO_ASSERT(current_dim_idx != LinearIR::LoopManager::LoopInfo::UNDEFINED_DIM_IDX,
+                    "Outer split loop unexpectedly iterates by several dimension indices");
+
+    bool modified = false;
+    for (auto it = std::next(begin); it != end; ++it) {
+        const auto& expr = *it;
+        const auto inner_loop_end = ov::as_type_ptr<op::LoopEnd>(expr->get_node());
+        if (!inner_loop_end)
+            continue;
+        const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id());
+        const auto inner_dim_idx = inner_loop_info->get_dim_idx();
+        if (inner_dim_idx != current_dim_idx)
+            continue;
+        const auto inner_loop_begin = inner_loop_end->get_loop_begin();
+        const auto inner_tail_work_amount = static_cast<int64_t>(inner_loop_end->get_work_amount());
+        const auto inner_tail_increment = inner_loop_end->get_increment();
+        auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets();
+        for (auto& offset : inner_finalization_offsets) {
+            offset = offset / inner_tail_work_amount * static_cast<int64_t>(m_tail_size);
+        }
+        inner_loop_end->set_work_amount(m_tail_size);
+        // TODO: if a new m_tail_size increment is set, all last-iteration handlers must be updated with the new tail value.
+        // Alternatively, we could avoid splitting loops when the inner loop's increment is not equal to 1
+        inner_loop_end->set_increment(std::min(inner_tail_increment, m_tail_size));
+        inner_loop_end->set_finalization_offsets(inner_finalization_offsets);
+        const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin));
+        const auto inner_loop_end_it = std::next(end);
+        OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!");
+        const auto& last_iter_handlers = inner_loop_info->handlers[LinearIR::LoopManager::LoopInfo::LAST_ITER];
+        last_iter_handlers.run(linear_ir, inner_loop_begin_it, inner_loop_end_it);
+        modified = true;
+    }
+    return modified;
+}
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
+
diff --git
a/src/common/snippets/src/lowered/pass/pass.cpp b/src/common/snippets/src/lowered/pass/pass.cpp index 70a05fc30be147..f69578e6aab9ba 100644 --- a/src/common/snippets/src/lowered/pass/pass.cpp +++ b/src/common/snippets/src/lowered/pass/pass.cpp @@ -41,6 +41,36 @@ void PassPipeline::register_positioned_passes(const std::vector()) {} +SubgraphPassPipeline::SubgraphPassPipeline(const std::shared_ptr& pass_config) : m_pass_config(pass_config) { + OPENVINO_ASSERT(m_pass_config != nullptr, "PassConfig is not initialized!"); +} + +void SubgraphPassPipeline::register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass) { + OPENVINO_ASSERT(pass != nullptr, "SubgraphPassPipeline cannot register empty pass!"); + m_passes.insert(position.get_insert_position(m_passes), pass); +} + +void SubgraphPassPipeline::register_pass(const std::shared_ptr& pass) { + OPENVINO_ASSERT(pass != nullptr, "SubgraphPassPipeline cannot register empty pass!"); + m_passes.push_back(pass); +} + +void SubgraphPassPipeline::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) const { + for (const auto& pass : m_passes) { + OPENVINO_ASSERT(pass != nullptr, "SubgraphPassPipeline has empty pass!"); + if (m_pass_config->is_disabled(pass->get_type_info())) { + continue; + } + pass->run(linear_ir, begin, end); + } +} + +void SubgraphPassPipeline::register_positioned_passes(const std::vector& pos_passes) { + for (const auto& pp : pos_passes) + register_pass(pp.position, pp.pass); +} + } // namespace pass } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp new file mode 100644 index 00000000000000..e41cfe78de7f6d --- /dev/null +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -0,0 +1,148 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/propagate_subtensors.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +namespace { +void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, + const LinearIR::LoopManager::LoopInfoPtr& loop_info, + LinearIR::container::const_iterator begin, + LinearIR::container::const_iterator end, + const size_t new_dim_value) { + std::map original_shapes; + static constexpr size_t existing_subtensor_value = SIZE_MAX; + // First step: set new dim value to the corresponding entry_points' dimensions + if (new_dim_value != existing_subtensor_value) { + for (const auto& port : loop_info->get_entry_points()) { + if (port.is_incremented) { + const auto& expr = port.expr_port->get_expr(); + const auto node = expr->get_node(); + auto desc = port.expr_port->get_descriptor_ptr(); + auto subtensor = desc->get_subtensor(); + if (port.dim_idx < subtensor.size()) { + *(subtensor.rbegin() + port.dim_idx) = new_dim_value; + desc->set_subtensor(subtensor); + } + + const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + const auto& layout = parent_desc->get_layout(); + const auto& shape = parent_desc->get_shape(); + if (original_shapes.find(parent_desc) == original_shapes.end()) { + original_shapes[parent_desc] = shape; + } + auto new_shape = shape; + 
new_shape[*(layout.rbegin() + port.dim_idx)] = new_dim_value;
+                parent_desc->set_shape(new_shape);
+            }
+        }
+    }
+
+    auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) {
+        if (port.is_incremented) {
+            auto desc = port.expr_port->get_descriptor_ptr();
+            const auto expr = port.expr_port->get_expr();
+            const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr();
+
+            const auto& layout = parent_desc->get_layout();
+            const auto& shape = parent_desc->get_shape();
+            const auto& desc_subtensor = desc->get_subtensor();
+            if (port.dim_idx < desc_subtensor.size()) {
+                if (original_shapes.find(parent_desc) == original_shapes.end()) {
+                    original_shapes[parent_desc] = shape;
+                }
+                auto new_shape = shape;
+                new_shape[*(layout.rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx);
+                parent_desc->set_shape(new_shape);
+            }
+        }
+    };
+
+    auto update_subtensors = [](const std::vector<PortDescriptorPtr>& descs, bool is_input) {
+        for (const auto& desc : descs) {
+            const auto& subtensor = desc->get_subtensor();
+            if (!subtensor.empty()) {
+                auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout())
+                                            : snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout());
+                const size_t subtensor_start = planar_dims.size() - subtensor.size();
+                VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end());
+                for (size_t i = 0; i < new_subtensor.size(); ++i) {
+                    new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]);
+                }
+                desc->set_subtensor(new_subtensor);
+            }
+        }
+    };
+
+    auto shape_inference_end_it = end;
+    const bool loop_by_last_dim = loop_info->get_dim_idx() == 0;
+    // Subtensors are updated using the shape inference infrastructure:
+    // for inner loops, the propagation function is called recursively
+    for (auto expr_it = begin; expr_it != end; expr_it++) {
+        const auto expr = *expr_it;
+        if (ov::is_type<op::LoopEnd>(expr->get_node()))
+            continue;
+        if (auto loop_begin = ov::as_type_ptr<op::LoopBegin>(expr->get_node())) {
+            const auto loop_end = loop_begin->get_loop_end();
+            const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id());
+            const auto inner_begin = std::next(expr_it);
+            const auto inner_end = linear_ir.find(linear_ir.get_expr_by_node(loop_end));
+
+            // The corresponding shapes of the inner loops' entry points must be updated using the existing subtensor values
+            if (new_dim_value == existing_subtensor_value) {
+                for (const auto& port : loop_info->get_entry_points())
+                    update_only_dim_idx_with_subtensor_value(port);
+            }
+            propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end, existing_subtensor_value);
+            expr_it = inner_end;
+            continue;
+        }
+        if ((ov::is_type<op::BroadcastMove>(expr_it->get()->get_node()) ||
+             ov::is_type<op::BroadcastLoad>(expr_it->get()->get_node())) &&
+            loop_by_last_dim) {
+            // WA: we have to break subtensor propagation if we try to propagate a new last dim through Broadcast nodes,
+            // which broadcast the last dim to its original value anyway.
+            // This workaround might be avoided if blocked shapes are used for tail size propagation
+            shape_inference_end_it = expr_it;
+            break;
+        }
+        expr->updateShapes();
+        update_subtensors(expr->get_input_port_descriptors(), true);
+        update_subtensors(expr->get_output_port_descriptors(), false);
+    }
+
+    // After subtensor propagation, the original shapes must be restored
+    for (const auto& elem : original_shapes)
+        elem.first->set_shape(elem.second);
+    for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++)
+        (*expr_it)->updateShapes();
+}
+} // namespace
+
+UpdateSubtensors::UpdateSubtensors(size_t tail_size) : SubgraphPass(), m_tail_size(tail_size) {}
+
+bool UpdateSubtensors::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& expr = *end;
+    const auto node = expr->get_node();
+    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    propagate_updated_subtensor_through_loop(linear_ir, loop_info, std::next(begin), end, m_tail_size);
+    return true;
+}
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
+
diff --git a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
index 4174f928352289..3f3e21509c8adc 100644
--- a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
@@ -7,6 +7,7 @@
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/lowered/pass/mark_loops.hpp"
+#include "snippets/lowered/pass/iter_handler.hpp"
 #include "snippets/snippets_isa.hpp"
 #include "snippets/itt.hpp"
@@ -19,6 +20,8 @@ namespace snippets {
 namespace lowered {
 namespace pass {

+using LoopInfo = LinearIR::LoopManager::LoopInfo;
+
 SoftmaxDecomposition::SoftmaxDecomposition(size_t vector_size) : m_vector_size{vector_size} {}

 bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
@@ -58,15 +61,22 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
         // Init value of vector buffer for ReduceMax is -FLOAT_MIN.
         const auto fill_max = push_node(std::make_shared<op::Fill>(vector_buffer_max.second, 0, float_min_constant));
         // ReduceMax loop
-        const auto& max = push_node(std::make_shared<ov::op::v1::Maximum>(softmax->get_input_source_output(0), fill_max.second));
+        const auto fill_max_tail = push_node(std::make_shared<op::Fill>(softmax->get_input_source_output(0), m_vector_size, float_min_constant));
+
+        const auto& max = push_node(std::make_shared<ov::op::v1::Maximum>(fill_max_tail.second, fill_max.second));
         const auto horizon_max = push_node(std::make_shared<op::HorizonMax>(max.second));

         // Markup of ReduceMax Loop
-        loop_manager->mark_loop(max.first, horizon_max.first, inner_work_amount, m_vector_size, 0,
-                                std::vector<ExpressionPort>{(*max.first)->get_input_port(0),
-                                                            (*max.first)->get_input_port(1)},
-                                std::vector<ExpressionPort>{(*max.first)->get_output_port(0)});
+        const auto reduce_max_loop_id = loop_manager->mark_loop(fill_max_tail.first, horizon_max.first, inner_work_amount, m_vector_size, 0,
+                                                                std::vector<ExpressionPort>{(*fill_max_tail.first)->get_input_port(0),
+                                                                                            (*max.first)->get_input_port(1)},
+                                                                std::vector<ExpressionPort>{(*max.first)->get_output_port(0)});
+        const auto& reduce_max_loop_info = loop_manager->get_loop_info(reduce_max_loop_id);
+        const auto tail_size = inner_work_amount % m_vector_size;
+        if (tail_size != 0) {
+            reduce_max_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
+        }
         const auto broadcast_horizon_max = push_node(std::make_shared<op::BroadcastMove>(horizon_max.second, broadcasted_dim));
         const auto vector_buffer_sum = push_node(std::make_shared<op::VectorBuffer>());
         // Init value of vector buffer for ReduceSum is zero.
@@ -75,38 +85,42 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
         // Sub + Exp + ReduceSum Loop
         const auto sub = push_node(std::make_shared<ov::op::v1::Subtract>(softmax->get_input_source_output(0), broadcast_horizon_max.second));
         const auto exp = push_node(std::make_shared<ov::op::v0::Exp>(sub.second));
-        const auto sum = push_node(std::make_shared<ov::op::v1::Add>(exp.second, fill_sum.second));
+        const auto fill_sum_tail = push_node(std::make_shared<op::Fill>(exp.second, m_vector_size, zero_constant));
+        const auto sum = push_node(std::make_shared<ov::op::v1::Add>(fill_sum_tail.second, fill_sum.second));
         const auto horizon_sum = push_node(std::make_shared<op::HorizonSum>(sum.second));

-        // Markup of ReduceMax Loop
-        loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, m_vector_size, 0,
-                                std::vector<ExpressionPort>{(*sub.first)->get_input_port(0),
-                                                            (*sub.first)->get_input_port(1),
-                                                            (*sum.first)->get_input_port(1)},
-                                std::vector<ExpressionPort>{(*exp.first)->get_output_port(0),
-                                                            (*sum.first)->get_output_port(0)});
+        // Markup of ReduceSum Loop
+        const auto reduce_sum_loop_id = loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, m_vector_size, 0,
+                                                                std::vector<ExpressionPort>{(*sub.first)->get_input_port(0),
+                                                                                            (*sub.first)->get_input_port(1),
+                                                                                            (*sum.first)->get_input_port(1)},
+                                                                std::vector<ExpressionPort>{(*fill_sum_tail.first)->get_output_port(0),
+                                                                                            (*sum.first)->get_output_port(0)});
+        const auto& reduce_sum_loop_info = loop_manager->get_loop_info(reduce_sum_loop_id);
+        if (tail_size != 0) {
+            reduce_sum_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
+        }

         // Divide is an expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside the loop
         const auto pow = push_node(std::make_shared<op::PowerStatic>(horizon_sum.second, -1.f));
         const auto broadcast_pow = push_node(std::make_shared<op::BroadcastMove>(pow.second, broadcasted_dim));

         // Mul (pseudo-Divide loop)
-        const auto mul = push_node(std::make_shared<ov::op::v1::Multiply>(exp.second, broadcast_pow.second));
+        const auto mul = push_node(std::make_shared<ov::op::v1::Multiply>(fill_sum_tail.second, broadcast_pow.second));

         // Transfer original ExpressionPorts
-        linear_ir.replace_input((*max.first)->get_input_port(0), input_connector);
+        linear_ir.replace_input((*fill_max_tail.first)->get_input_port(0), input_connector);
         linear_ir.replace_input((*sub.first)->get_input_port(0), input_connector);
         linear_ir.replace_input(output_connector->get_consumers(), (*mul.first)->get_output_port_connector(0));

         // Markup of Mul Loop
         loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0,
-                                std::vector<ExpressionPort>{(*mul.first)->get_input_port(0),
-                                                            (*mul.first)->get_input_port(1)},
+                                std::vector<ExpressionPort>{(*mul.first)->get_input_port(0), (*mul.first)->get_input_port(1)},
                                 std::vector<ExpressionPort>{(*mul.first)->get_output_port(0)});

         // Update Loop info for outer loops
-        const auto entry_points = std::vector<ExpressionPort>{(*max.first)->get_input_port(0),
+        const auto entry_points = std::vector<ExpressionPort>{(*fill_max_tail.first)->get_input_port(0),
                                                               (*sub.first)->get_input_port(0)};
         const auto exit_points = std::vector<ExpressionPort>{(*mul.first)->get_output_port(0)};
         for (auto loop_id : softmax_loop_ids) {
@@ -114,16 +128,6 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
         }

         expr_it = linear_ir.erase(expr_it);   // Remove Softmax
-
-        /* =========================================== */
-
-        /* ============= Runtime Info ================ */
-
-        // For tail loop we should fill input of Max by float min and
-        // input of Sum by zero to avoid math incorrect calculations
-        // TODO [111383]: It should be covered via general pipeline (for example, via analyze in InsertTailLoop?)
-            max.second->input(0).get_rt_info()["set_fill"] = float_min_constant;
-            sum.second->input(0).get_rt_info()["set_fill"] = zero_constant;
 
             modified = true;
         }
     }
diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp
index ba036eca8011f9..d65e27feca8adb 100644
--- a/src/common/snippets/src/lowered/pass/split_loops.cpp
+++ b/src/common/snippets/src/lowered/pass/split_loops.cpp
@@ -5,6 +5,7 @@
 #include "snippets/lowered/pass/split_loops.hpp"
 
 #include "snippets/lowered/pass/fuse_loops.hpp"
+#include "snippets/lowered/pass/iter_handler.hpp"
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/snippets_isa.hpp"
@@ -81,7 +82,18 @@ bool SplitLoops::run(LinearIR& linear_ir) {
                                                              loop_to_split->get_dim_idx(),
                                                              loop_to_split->get_entry_points(),
                                                              loop_to_split->get_exit_points());
-                loop_manager->get_loop_info(split_loop_id)->set_outer_splited_loop(true);
+                const auto& new_loop_info = loop_manager->get_loop_info(split_loop_id);
+                new_loop_info->set_outer_splited_loop(true);
+                new_loop_info->handlers = loop_to_split->handlers;
+                const auto work_amount = loop_to_fuse->get_work_amount();
+                const auto increment = loop_to_fuse->get_increment();
+                const auto tail_size = work_amount % increment;
+                // TODO: the current logic doesn't handle the case when the loop also has first-iteration handlers.
+                //       We need to skip this transformation for such cases or improve the logic.
+                if (tail_size != 0) {
+                    // TODO: should we remove the previous tail loop handler?
+                    new_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<TransformInnerSplitLoop>(tail_size);
+                }
                 break;
             }
         }
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index 5068d915dc1ebe..2204660086478a 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -62,6 +62,8 @@ namespace ov {
 namespace snippets {
 namespace op {
 
+using PassPipeline = lowered::pass::PassPipeline;
+
 void Subgraph::set_generator(std::shared_ptr<ov::snippets::Generator> generator) {
     m_generator = std::move(generator);
 }
diff --git a/src/common/snippets/src/pass/manager.cpp b/src/common/snippets/src/pass/manager.cpp
index d5a1456c9a8ca5..3ed83085155fa5 100644
--- a/src/common/snippets/src/pass/manager.cpp
+++ b/src/common/snippets/src/pass/manager.cpp
@@ -8,7 +8,6 @@
 namespace ov {
 namespace snippets {
 namespace pass {
-
 std::shared_ptr<Manager::PassBase> Manager::register_pass_instance(const PassPosition& position,
                                                                    const std::shared_ptr<Manager::PassBase>& pass) {
     pass->set_pass_config(m_pass_config);
diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp
index 455c261cec5109..92e91aa8e18400 100644
--- a/src/common/snippets/tests/src/lowered/pass/loop.cpp
+++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp
@@ -13,6 +13,7 @@
 #include "snippets/lowered/pass/insert_loops.hpp"
 #include "snippets/lowered/pass/insert_tail_loop.hpp"
 #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp"
+#include "snippets/lowered/pass/pass.hpp"
 #include "snippets/lowered/pass/validate_loops.hpp"
 #include "snippets/shape_inference/shape_inference.hpp"
 #include "subgraph_simple.hpp"
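Before the brgemm_blocking changes below, a note on the new infrastructure: pass.hpp introduces SubgraphPass and SubgraphPassPipeline, but the header body is not visible in this excerpt. From the call sites (register_pass<T>(args...) at loop-mark time, then execution over one specific iteration's body range), the pipeline plausibly reduces to the sketch below. The same caveat applies to several handler names in the following hunks (SetWorkAmount, ReduceWorkAmount, ZeroFinalizationOffsets): their original template arguments are not recoverable from this excerpt, so those names are reconstructed from their arguments and from the removed first_iter_handler logic they replace.

    #include <memory>
    #include <utility>
    #include <vector>

    #include "snippets/lowered/linear_ir.hpp"

    namespace ov {
    namespace snippets {
    namespace lowered {
    namespace pass {

    // Minimal stand-in for the interface declared in the new pass.hpp: a pass that
    // runs over the expression range [begin, end) of one loop body.
    class SubgraphPass {
    public:
        virtual ~SubgraphPass() = default;
        virtual bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) = 0;
    };

    // Sketch of SubgraphPassPipeline: an ordered list of SubgraphPasses, filled while
    // the loop is marked and replayed later on the cloned body of a specific iteration.
    class SubgraphPassPipeline {
    public:
        template <typename PassT, typename... Args>
        void register_pass(Args&&... args) {
            m_passes.push_back(std::make_shared<PassT>(std::forward<Args>(args)...));
        }

        bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) const {
            bool modified = false;
            for (const auto& pass : m_passes)
                modified = pass->run(linear_ir, begin, end) || modified;
            return modified;
        }

        bool empty() const { return m_passes.empty(); }

    private:
        std::vector<std::shared_ptr<SubgraphPass>> m_passes;
    };

    } // namespace pass
    } // namespace lowered
    } // namespace snippets
    } // namespace ov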
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
index fc9aeeac10ee92..dc8739c7944555 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp
@@ -4,23 +4,26 @@
 
 #include "brgemm_blocking.hpp"
 
-#include "openvino/pass/pattern/matcher.hpp"
-#include "openvino/pass/pattern/op/wrap_type.hpp"
+#include "cpu_iter_handlers.hpp"
 #include "snippets/itt.hpp"
-#include "snippets/utils.hpp"
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/lowered/pass/insert_tail_loop.hpp"
+#include "snippets/lowered/pass/iter_handler.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+#include "snippets/lowered/pass/propagate_subtensors.hpp"
 #include "snippets/snippets_isa.hpp"
+#include "snippets/utils.hpp"
 #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
-
 namespace ov {
 namespace intel_cpu {
 namespace pass {
 using LinearIR = snippets::lowered::LinearIR;
 using LoopPort = LinearIR::LoopManager::LoopPort;
 using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
+using LoopInfo = LinearIR::LoopManager::LoopInfo;
+using namespace ov::snippets::lowered::pass;
 
 BrgemmBlocking::BrgemmBlocking() : Pass() {}
@@ -86,7 +89,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
         } else {
             *(in_0_subtensor.rbegin() + 1) = block_size_m;
             *(out_subtensor.rbegin() + 1) = block_size_m;
-
             auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
             std::vector<LoopPort> entries{LoopPort(brgemm_expr->get_input_port(0), true),
                                           LoopPort(brgemm_expr->get_input_port(1), false)};
@@ -110,7 +112,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
         } else {
             *in_1_subtensor.rbegin() = block_size_n;
             *out_subtensor.rbegin() = block_size_n;
-
             auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
             std::vector<LoopPort> entries{LoopPort(brgemm_expr->get_input_port(0), false),
                                           LoopPort(brgemm_expr->get_input_port(1), true)};
@@ -135,7 +136,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
         } else {
             *in_0_subtensor.rbegin() = block_size_k;
             *(in_1_subtensor.rbegin() + 1) = block_size_k;
-
             auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
             std::vector<LoopPort> entries{LoopPort(brgemm_expr->get_input_port(0), true, 0),
                                           LoopPort(brgemm_expr->get_input_port(1), true, 1)};
@@ -146,44 +146,40 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
                 loop_begin_it = std::prev(expr_it);
             }
             std::vector<LoopPort> exits{LoopPort(brgemm_expr->get_output_port(0), false)};
-            auto loop_id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits);
-            const auto loop_info = loop_manager->get_loop_info(loop_id);
-
-            auto first_iter_handler = [](LinearIR& linear_ir, LinearIR::constExprIt loop_end_it) {
-                const auto loop_end = ov::as_type_ptr<snippets::op::LoopEnd>(loop_end_it->get()->get_node());
-                OPENVINO_ASSERT(loop_end, "First loop iteraton handler must be called on LoopEnd expression");
-                const auto loop_id = loop_end->get_id();
-                const auto& loop_manager = linear_ir.get_loop_manager();
-                const auto& loop_info = loop_manager->get_loop_info(loop_id);
-                const auto work_amount = loop_info->get_work_amount();
-                const auto increment = loop_info->get_increment();
-                if (work_amount <= increment)
-                    return false;
-
-                auto new_loop_range = snippets::lowered::pass::InsertTailLoop::copy_loop(linear_ir, loop_id);
-                const auto firt_iter_loop_end = ov::as_type_ptr<snippets::op::LoopEnd>(std::prev(new_loop_range.end())->get()->get_node());
-                auto first_iter_loop_info = loop_manager->get_loop_info(firt_iter_loop_end->get_id());
-                firt_iter_loop_end->set_work_amount(increment);
-                first_iter_loop_info->set_work_amount(increment);
-                firt_iter_loop_end->set_finalization_offsets(std::vector<int64_t>(loop_end->get_finalization_offsets().size(), 0));
-
-                const auto loop_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin()));
-                linear_ir.insert(loop_begin_it, new_loop_range.begin(), new_loop_range.end());
-
-                const auto new_work_amount = work_amount - increment;
-                loop_info->set_work_amount(new_work_amount);
-                loop_end->set_work_amount(new_work_amount);
-
-                // Update original body's Brgemms with new beta parameter
-                for (auto expr_it = loop_begin_it; expr_it != loop_end_it; ++expr_it) {
-                    const auto& expr_node = expr_it->get()->get_node();
-                    if (const auto brgemm = ov::as_type_ptr<BrgemmCPU>(expr_node)) {
-                        brgemm->set_beta(1.f);
-                    }
-                }
-                return true;
+            const bool set_default_handlers = false;
+            const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits, set_default_handlers);
+            const auto loop_info = loop_manager->get_loop_info(id);
+            const auto tail_size = k % block_size_k;
+
+            auto set_last_iter_handlers = [tail_size](SubgraphPassPipeline& pipeline) {
+                pipeline.register_pass<SetWorkAmount>(tail_size);
+                pipeline.register_pass<UpdateMemoryAccessOps>(tail_size);
+                pipeline.register_pass<UpdateSubtensors>(tail_size);
+                pipeline.register_pass<SetBrgemmBeta>(1.f);
+            };
+            auto set_first_iter_handlers = [block_size_k](SubgraphPassPipeline& pipeline) {
+                pipeline.register_pass<SetWorkAmount>(block_size_k);
+                pipeline.register_pass<ZeroFinalizationOffsets>();
             };
-            loop_info->set_first_iter_handler(first_iter_handler);
+
+            if (tail_size != 0) {
+                if (k <= 2 * block_size_k) {
+                    // First iter as main body and tail loop
+                    set_first_iter_handlers(loop_info->handlers[LoopInfo::MAIN_BODY]);
+                    set_last_iter_handlers(loop_info->handlers[LoopInfo::LAST_ITER]);
+                } else {
+                    // First iter, main body and tail loop
+                    set_first_iter_handlers(loop_info->handlers[LoopInfo::FIRST_ITER]);
+                    loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(block_size_k + tail_size);
+                    loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
+                    loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<SetBrgemmBeta>(1.f);
+                    set_last_iter_handlers(loop_info->handlers[LoopInfo::LAST_ITER]);
+                }
+            } else {
+                set_first_iter_handlers(loop_info->handlers[LoopInfo::FIRST_ITER]);
+                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(block_size_k);
+                loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<SetBrgemmBeta>(1.f);
+            }
         }
     };
@@ -194,6 +190,7 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
         brgemm_expr->get_input_port_descriptor(0)->set_subtensor(in_0_subtensor);
         brgemm_expr->get_input_port_descriptor(1)->set_subtensor(in_1_subtensor);
         brgemm_expr->get_output_port_descriptor(0)->set_subtensor(out_subtensor);
+        modified = true;
     }
 
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
new file mode 100644
index 00000000000000..688962d1f105d4
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "cpu_iter_handlers.hpp"
+
+#include "snippets/lowered/loop_manager.hpp"
+#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+using LinearIR = snippets::lowered::LinearIR;
+using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
+
+SetBrgemmBeta::SetBrgemmBeta(float beta) : snippets::lowered::pass::SubgraphPass(), m_beta(beta) {}
+
+bool SetBrgemmBeta::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    for (auto expr_it = begin; expr_it != end; ++expr_it) {
+        const auto& expr = expr_it->get();
+        if (const auto brgemm = ov::as_type_ptr<BrgemmCPU>(expr->get_node())) {
+            brgemm->set_beta(m_beta);
+        }
+    }
+    return true;
+}
+} // namespace pass
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp
new file mode 100644
index 00000000000000..b60b958983ab66
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp
@@ -0,0 +1,25 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/pass/iter_handler.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+class SetBrgemmBeta : public snippets::lowered::pass::SubgraphPass {
+public:
+    SetBrgemmBeta(float beta);
+    OPENVINO_RTTI("SetBrgemmBeta", "SubgraphPass")
+    bool run(const snippets::lowered::LinearIR& linear_ir,
+             snippets::lowered::LinearIR::constExprIt begin,
+             snippets::lowered::LinearIR::constExprIt end) override;
+
+private:
+    float m_beta;
+};
+} // namespace pass
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
index 77c78e31ca6b00..11988c5bd58541 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -20,7 +20,9 @@ std::vector<std::vector<ov::test::InputShape>> input_shapes{
         {{1, 1, 32, 23}, {1, 1, 23, 68}},
         {{1, 16, 384, 64}, {1, 16, 64, 384}},
         {{1, 1, 100, 700}, {1, 1, 700, 100}},
+        {{1, 1, 100, 1024}, {1, 1, 1024, 100}},
         {{1, 1, 100, 2500}, {1, 1, 2500, 100}},
+        {{1, 1, 100, 4500}, {1, 1, 4500, 100}},
};

static inline std::vector<std::vector<element::Type>> quantized_precisions() {
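The two added shapes (K = 1024 and K = 4500) line up with the K-loop decompositions introduced in brgemm_blocking.cpp above: a blocked loop with no tail, and a blocked loop with both a peeled first iteration and a tail. The standalone snippet below works through that arithmetic; the block size of 512 is an assumption for illustration, since the real block_size_k is computed inside BrgemmBlocking and is not visible in this excerpt.

    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t block_size_k = 512;  // assumption: the real value comes from BrgemmBlocking
        const std::size_t test_k_dims[] = {700, 1024, 2500, 4500};  // K dims from input_shapes above
        for (const std::size_t k : test_k_dims) {
            const std::size_t tail_size = k % block_size_k;
            if (tail_size == 0) {
                // FIRST_ITER copy (beta = 0) + MAIN_BODY (beta = 1), no LAST_ITER copy
                std::printf("K=%4zu: first iter + main body, no tail\n", k);
            } else if (k <= 2 * block_size_k) {
                // The peeled first iteration is the whole main body, then the tail runs
                std::printf("K=%4zu: first iter as main body + tail of %zu\n", k, tail_size);
            } else {
                std::printf("K=%4zu: first iter + main body + tail of %zu\n", k, tail_size);
            }
        }
        return 0;
    }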