diff --git a/src/common/snippets/include/snippets/lowered/expression.hpp b/src/common/snippets/include/snippets/lowered/expression.hpp
index 0b619370ab47a5..6731e369ae0921 100644
--- a/src/common/snippets/include/snippets/lowered/expression.hpp
+++ b/src/common/snippets/include/snippets/lowered/expression.hpp
@@ -19,7 +19,7 @@ namespace lowered {
 class LinearIR;
 using ExpressionPtr = std::shared_ptr;
-using ExressionMap = std::unordered_map;
+using ExpressionMap = std::unordered_map;
 class Expression : public std::enable_shared_from_this {
     friend class LinearIR;
     friend class ExpressionPort;
@@ -63,7 +63,7 @@ class Expression : public std::enable_shared_from_this {
     void set_loop_ids(const std::vector& loops);
     virtual ExpressionPtr clone_with_new_inputs(const std::vector& new_inputs, const std::shared_ptr& new_node) const;
-    ExpressionPtr clone_with_new_inputs(const ExressionMap& expr_map, const std::shared_ptr& new_node) const;
+    ExpressionPtr clone_with_new_inputs(const ExpressionMap& expr_map, const std::shared_ptr& new_node) const;
 protected:
     Expression(const Expression& other);
diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
index 5034de4e481540..c9c5e6963a2924 100644
--- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp
+++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -70,11 +70,11 @@ class LinearIR {
     std::shared_ptr clone() const;
     static LinearIR::container deep_copy_range(LinearIR::container::const_iterator begin,
                                                LinearIR::container::const_iterator end,
-                                               ExressionMap& expression_map);
+                                               ExpressionMap& expression_map);
-    const container& get_ops() const {return m_expressions; }
-    const io_container& get_IO_ops() const {return m_io_expressions; }
-    Config get_config() {return m_config; }
+    const container& get_ops() const { return m_expressions; }
+    const io_container& get_IO_ops() const { return m_io_expressions; }
+    const Config& get_config() const { return m_config; }
     void set_loop_depth(size_t loop_depth) { m_config.m_loop_depth = loop_depth; }
     const ExpressionPtr& get_expr_by_node(const std::shared_ptr& n) const;
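// Usage sketch for the renamed ExpressionMap (an illustration, not part of the
// patch; `linear_ir` is an assumed, already-built LinearIR, and the template
// arguments elided by the extraction above are not restored here):
//
//   ExpressionMap expression_map;
//   // deep_copy_range fills expression_map with {original expression -> clone}
//   // pairs; clone_with_new_inputs()/clone_with_new_expr() then use the map to
//   // retarget loop ports and port connectors onto the copied expressions.
//   LinearIR::container body_copy =
//       LinearIR::deep_copy_range(linear_ir.cbegin(), linear_ir.cend(), expression_map);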
diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp
index 28904165d3ebef..b5aa9484cbec9e 100644
--- a/src/common/snippets/include/snippets/lowered/loop_manager.hpp
+++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp
@@ -4,11 +4,12 @@
 #pragma once
-#include "linear_ir.hpp"
-
 #include
 #include
+#include "linear_ir.hpp"
+#include "pass/iter_handler.hpp"
+#include "pass/pass.hpp"
 #include "port_descriptor.hpp"
 namespace ov {
@@ -41,19 +42,61 @@ class LinearIR::LoopManager {
     class LoopInfo {
     public:
         enum {UNDEFINED_DIM_IDX = std::numeric_limits::max()};
+        class SpecificIterationHandlers {
+        public:
+            enum class HandlerType { FIRST_ITER, MAIN_BODY, LAST_ITER };
+            SpecificIterationHandlers() = default;
+            SpecificIterationHandlers(size_t loop_work_amount, size_t loop_increment);
+            SpecificIterationHandlers(lowered::pass::PassPipeline first_iter_handlers,
+                                      lowered::pass::PassPipeline main_body_handlers,
+                                      lowered::pass::PassPipeline last_iter_handlers);
+
+            const lowered::pass::PassPipeline& get_first_iter_handlers() const;
+            const lowered::pass::PassPipeline& get_main_iter_handlers() const;
+            const lowered::pass::PassPipeline& get_last_iter_handlers() const;
+            static SpecificIterationHandlers merge_handlers(const SpecificIterationHandlers& lhs, const SpecificIterationHandlers& rhs);
+
+            template ::type = true>
+            void register_handler(Args&&... args) {
+                m_first_iter_handlers.register_pass(args...);
+            }
+
+            template ::type = true>
+            void register_handler(Args&&... args) {
+                m_main_body_handlers.register_pass(args...);
+            }
+
+            template ::type = true>
+            void register_handler(Args&&... args) {
+                m_last_iter_handlers.register_pass(args...);
+            }
+
+        private:
+            lowered::pass::PassPipeline m_first_iter_handlers;
+            lowered::pass::PassPipeline m_main_body_handlers;
+            lowered::pass::PassPipeline m_last_iter_handlers;
+        };
+
         LoopInfo() = default;
         LoopInfo(size_t work_amount, size_t increment,
                  const std::vector& entries,
                  const std::vector& exits,
-                 bool outer_splited_loop = false)
-            : m_work_amount(work_amount), m_increment(increment),
-              m_entry_points(entries), m_exit_points(exits), m_outer_splited_loop(outer_splited_loop) {}
+                 const SpecificIterationHandlers& handlers = SpecificIterationHandlers());
         LoopInfo(size_t work_amount, size_t increment,
                  const std::vector& entries,
                  const std::vector& exits,
-                 bool outer_splited_loop = false);
+                 const SpecificIterationHandlers& handlers = SpecificIterationHandlers());
-        std::shared_ptr clone_with_new_expr(const ExressionMap& expr_map) const;
+        std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map) const;
         // Returns dimension index if dimension indices for all entry and exit points are equal, and UNDEFINED_DIM_IDX otherwise
         size_t get_dim_idx() const;
@@ -61,20 +104,7 @@ class LinearIR::LoopManager {
         size_t get_increment() const;
         const std::vector& get_entry_points() const;
         const std::vector& get_exit_points() const;
-        bool get_outer_splited_loop() const;
-
-        /**
-         * \brief Inserts a separate body for first loop iteration processing if needed.
-         *        Can also modify both main and first iter loop bodies.
-         *        TODO: replace this temporary solution when ticket 119851 is implemented
-         *
-         * \param linear_ir LIR which should be modified
-         * \param loop_end_it iterator on LoopEnd expression for which the handler is called
-         *
-         * \return bool value which indicates whether the linear_ir was changed or not.
-         */
-        using FirstIterHandler = std::function;
-        const FirstIterHandler& get_first_iter_handler() const;
+        const SpecificIterationHandlers& get_handlers() const;
         // Sets dim_idx to all entry and exit points
         void set_dim_idx(size_t dim_idx);
@@ -82,8 +112,12 @@
         void set_increment(size_t increment);
         void set_entry_points(std::vector entry_points);
         void set_exit_points(std::vector exit_points);
-        void set_outer_splited_loop(bool outer_splited_loop);
-        void set_first_iter_handler(FirstIterHandler handler);
+        void set_handlers(SpecificIterationHandlers handlers);
+
+        template
+        void register_handler(Args&&... args) {
+            m_handlers.register_handler(args...);
+        }
         // Update the parameters of existing LoopPorts
         void update_entry_points(const std::function& updater);
@@ -98,9 +132,7 @@
         // Note: Scalars aren't entry expressions but can be before first entry expr in Linear IR
         std::vector m_entry_points = {};
         std::vector m_exit_points = {};
-        // True if this Loop is outer Loop for nested Loops that splits the same dimension
-        bool m_outer_splited_loop = false;
-        FirstIterHandler m_first_iter_handler = nullptr;
+        SpecificIterationHandlers m_handlers = {};
     };
     using LoopInfoPtr = std::shared_ptr;
@@ -109,7 +141,7 @@
     * @param expr_map map of new and old expressions
     * @return the copy
     */
-    std::shared_ptr clone_with_new_expr(const ExressionMap& expr_map) const;
+    std::shared_ptr clone_with_new_expr(const ExpressionMap& expr_map) const;
    /**
     * @brief Get target Loop Info
@@ -176,8 +208,13 @@
                    size_t increment,
                    size_t dim_idx,
                    const std::vector& entries,
-                   const std::vector& exits) {
-        const auto loop_info = std::make_shared(work_amount, increment, entries, exits);
+                   const std::vector& exits,
+                   bool set_default_handlers = true) {
+        const auto normalized_increment = std::min(increment, work_amount);
+        const auto handlers = set_default_handlers
+                                  ? LoopInfo::SpecificIterationHandlers(work_amount, normalized_increment)
+                                  : LoopInfo::SpecificIterationHandlers();
+        const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers);
         loop_info->set_dim_idx(dim_idx);
         const auto loop_id = this->add_loop_info(loop_info);
         for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
@@ -201,8 +238,13 @@
                    size_t work_amount,
                    size_t increment,
                    const std::vector& entries,
-                   const std::vector& exits) {
-        const auto loop_info = std::make_shared(work_amount, increment, entries, exits);
+                   const std::vector& exits,
+                   bool set_default_handlers = true) {
+        const auto normalized_increment = std::min(increment, work_amount);
+        const auto handlers = set_default_handlers
+                                  ? LoopInfo::SpecificIterationHandlers(work_amount, normalized_increment)
+                                  : LoopInfo::SpecificIterationHandlers();
+        const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers);
         const auto loop_id = this->add_loop_info(loop_info);
         for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
             insert_loop_id(*expr_it, loop_id);
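// Usage sketch for the new SpecificIterationHandlers API (illustrative only:
// `loop_manager`, `loop_id`, `other_loop_info` and `tail_size` are assumptions,
// and the register_handler template parameters, elided in the flattened diff
// above, are taken to be <HandlerType Type, typename PassT>):
//
//   using LoopInfo = LinearIR::LoopManager::LoopInfo;
//   using HandlerType = LoopInfo::SpecificIterationHandlers::HandlerType;
//
//   const auto& loop_info = loop_manager->get_loop_info(loop_id);
//   // Attach a pass to the last-iteration (tail) body of this particular loop;
//   // mark_loop() already registers the default tail handlers this way.
//   loop_info->register_handler<HandlerType::LAST_ITER, lowered::pass::SetFillOffset>(tail_size);
//   // When two loops are fused, their handlers are combined pipeline by pipeline:
//   const auto merged = LoopInfo::SpecificIterationHandlers::merge_handlers(
//       loop_info->get_handlers(), other_loop_info->get_handlers());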
diff --git a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp
index 1c1ea092d52059..1ec9598ec1d2c2 100644
--- a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp
@@ -23,9 +23,9 @@
 * The buffer scratchpad has one general data pointer. Each buffer has offset relative to the data pointer of buffer scratchpad.
 * @ingroup snippets
 */
-class AllocateBuffers: public Pass {
+class AllocateBuffers: public RangedPass {
 public:
-    OPENVINO_RTTI("AllocateBuffers", "Pass")
+    OPENVINO_RTTI("AllocateBuffers", "RangedPass")
     AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
     /**
@@ -33,7 +33,7 @@ class AllocateBuffers: public Pass {
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
     /**
     * @brief Set offset to Buffer op and propagates it to the connected memory access ops
diff --git a/src/common/snippets/include/snippets/lowered/pass/clean_repeated_ptr_shifts.hpp b/src/common/snippets/include/snippets/lowered/pass/clean_repeated_ptr_shifts.hpp
index 892137747a2776..e6863ef8ae62bd 100644
--- a/src/common/snippets/include/snippets/lowered/pass/clean_repeated_ptr_shifts.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/clean_repeated_ptr_shifts.hpp
@@ -21,15 +21,15 @@
 * This condition should be removed when Buffers stop being inplace by default.
 * @ingroup snippets
 */
-class CleanRepeatedDataPointerShifts: public Pass {
+class CleanRepeatedDataPointerShifts: public RangedPass {
 public:
-    OPENVINO_RTTI("CleanRepeatedDataPointerShifts", "Pass")
+    OPENVINO_RTTI("CleanRepeatedDataPointerShifts", "RangedPass")
     CleanRepeatedDataPointerShifts() = default;
-    bool run(LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
-    bool reuse_increments(const LinearIR& linear_ir, const ExpressionPtr& loop_end_expr);
+    bool reuse_increments(const ExpressionPtr& loop_end_expr);
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/cleanup_loop_offsets.hpp b/src/common/snippets/include/snippets/lowered/pass/cleanup_loop_offsets.hpp
index 5af01ad137e09b..cf72577ea98859 100644
--- a/src/common/snippets/include/snippets/lowered/pass/cleanup_loop_offsets.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/cleanup_loop_offsets.hpp
@@ -17,10 +17,10 @@
 * This transformation "fuses" the offsets with an outer loop's ptr_increments, and zeroes the offsets before Results.
 * @ingroup snippets
 */
-class CleanupLoopOffsets : public Pass {
+class CleanupLoopOffsets : public RangedPass {
 public:
-    OPENVINO_RTTI("CleanupLoopOffsets", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("CleanupLoopOffsets", "RangedPass")
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp
index 87bf8cbea0e77b..67254d879f3351 100644
--- a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp
@@ -31,9 +31,9 @@
 * These passes should be executed separately before this pass!
 * @ingroup snippets
 */
-class DefineBufferClusters : public Pass {
+class DefineBufferClusters : public RangedPass {
 public:
-    OPENVINO_RTTI("DefineBufferClusters", "Pass")
+    OPENVINO_RTTI("DefineBufferClusters", "RangedPass")
     DefineBufferClusters(AllocateBuffers::BufferClusters& clusters)
         : m_clusters(clusters) {}
     /**
@@ -42,7 +42,7 @@
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(lowered::LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     using BufferPorts = std::unordered_map>;
diff --git a/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp
index 64b3a758a0ad8f..2b527d551f6f68 100644
--- a/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp
@@ -36,11 +36,11 @@
 * The main conditions of possible fusion are equal increments and equal/broadcastable work amounts.
 * @ingroup snippets
 */
-class FuseLoops : public Pass {
+class FuseLoops : public RangedPass {
 public:
-    OPENVINO_RTTI("FuseLoops", "Pass")
+    OPENVINO_RTTI("FuseLoops", "RangedPass")
     FuseLoops();
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
     // This method checks that all ports which connect lower and upper loops are incremented.
     // This helps to avoid fusing for the ports with incomplete data
diff --git a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp
index 81d284e4467597..31631b9b0ec638 100644
--- a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp
@@ -27,9 +27,9 @@
 * Note: should be called before ResetBuffer() pass to have correct offsets
 * @ingroup snippets
 */
-class IdentifyBuffers: public Pass {
+class IdentifyBuffers: public RangedPass {
 public:
-    OPENVINO_RTTI("IdentifyBuffers", "Pass")
+    OPENVINO_RTTI("IdentifyBuffers", "RangedPass")
     IdentifyBuffers() = default;
     /**
@@ -37,7 +37,7 @@
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
     struct ShiftPtrParams {
         ShiftPtrParams() = default;
@@ -75,7 +75,7 @@
     * @param pool set of Buffers from the Linear IR
     * @return adjacency matrix where True value means that Buffers are adjacent and cannot have the same ID
     */
-    static std::vector create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool);
+    static std::vector create_adjacency_matrix(lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end, const BufferPool& pool);
    /**
     * @brief Algorithm of Graph coloring where vertices are Buffers
     * @param buffers set of Buffers from the Linear IR
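// The "graph coloring" step mentioned above assigns each Buffer the smallest ID
// not used by any adjacent Buffer. A generic greedy sketch of that idea (an
// illustration with plain STL, not IdentifyBuffers' actual implementation):
//
//   #include <cstdint>
//   #include <set>
//   #include <vector>
//
//   // adj[i * n + j] == true means buffers i and j are adjacent and must not share an ID.
//   std::vector<size_t> color_buffers(const std::vector<bool>& adj, size_t n) {
//       std::vector<size_t> ids(n, SIZE_MAX);  // SIZE_MAX == "not colored yet"
//       for (size_t i = 0; i < n; ++i) {
//           std::set<size_t> taken;
//           for (size_t j = 0; j < n; ++j)
//               if (adj[i * n + j] && ids[j] != SIZE_MAX)
//                   taken.insert(ids[j]);
//           size_t id = 0;
//           while (taken.count(id))
//               ++id;  // smallest ID not taken by an adjacent buffer
//           ids[i] = id;
//       }
//       return ids;
//   }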
diff --git a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp
index 5993b0d41ea1d3..3b085ca2b32f80 100644
--- a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp
@@ -17,9 +17,9 @@
 * @ingroup snippets
 */
-class InitBuffersDefault : public Pass {
+class InitBuffersDefault : public RangedPass {
 public:
-    OPENVINO_RTTI("InitBuffersDefault", "Pass")
+    OPENVINO_RTTI("InitBuffersDefault", "RangedPass")
     InitBuffersDefault(size_t& buffer_scratchpad_size)
         : m_buffer_scratchpad_size(buffer_scratchpad_size) {
         m_buffer_scratchpad_size = 0;
@@ -29,7 +29,7 @@
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(lowered::LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     size_t& m_buffer_scratchpad_size;
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_broadcastmove.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_broadcastmove.hpp
index fe4f9956d81c66..0d4c89c8605703 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_broadcastmove.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_broadcastmove.hpp
@@ -16,10 +16,10 @@
 * @brief Injects explicit MoveBroadcast operations when the most varying dim is broadcasted
 * @ingroup snippets
 */
-class InsertBroadcastMove : public Pass {
+class InsertBroadcastMove : public RangedPass {
 public:
-    OPENVINO_RTTI("InsertBroadcastMove", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("InsertBroadcastMove", "RangedPass")
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp
index 004ea711288ab2..37a03a364e8915 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp
@@ -21,14 +21,17 @@
 * @param m_buffer_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank]
 * @ingroup snippets
 */
-class InsertBuffers : public Pass {
+class InsertBuffers : public RangedPass {
 public:
-    OPENVINO_RTTI("InsertBuffers", "Pass")
+    OPENVINO_RTTI("InsertBuffers", "RangedPass")
     InsertBuffers(int32_t buffer_allocation_rank);
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
-    void insertion(LinearIR& linear_ir, const LinearIR::constExprIt& expr_it, const LinearIR::LoopManagerPtr& loop_manager,
+    void insertion(LinearIR& linear_ir,
+                   const LinearIR::constExprIt& begin_it,
+                   const LinearIR::constExprIt& end_it,
+                   const LinearIR::LoopManagerPtr& loop_manager,
                    const std::vector& loop_entries,
                    const std::vector& loop_exits);
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_load_store.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_load_store.hpp
index dbd4222888ec6d..cb6773fe186a20 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_load_store.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_load_store.hpp
@@ -20,11 +20,11 @@
 * @param m_vector_size - the count of elements for loading/storing
 * @ingroup snippets
 */
-class InsertLoadStore : public Pass {
+class InsertLoadStore : public RangedPass {
 public:
+    OPENVINO_RTTI("InsertLoadStore", "RangedPass")
     explicit InsertLoadStore(size_t vector_size);
-    OPENVINO_RTTI("InsertLoadStore", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     size_t get_count(const PortDescriptorPtr& port_desc) const;
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_loops.hpp
index bcd5c9231e7441..f29c4b558c0513 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_loops.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_loops.hpp
@@ -18,11 +18,11 @@
 * @brief The pass explicitly inserts LoopBegin and LoopEnd in Linear IR using LoopManager::LoopInfo from Loop markup algorithm
 * @ingroup snippets
 */
-class InsertLoops : public Pass {
+class InsertLoops : public RangedPass {
 public:
-    OPENVINO_RTTI("InsertLoops", "Pass")
+    OPENVINO_RTTI("InsertLoops", "RangedPass")
     InsertLoops();
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     static void insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id, bool has_outer_loop);
 };
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
index bad6dd3504fdc5..17d3f4cb2829dc 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
@@ -21,11 +21,11 @@
 * Developers could modify this to insert perf count pairs around a sequence of nodes of interest.
 * @ingroup snippets
 */
-class InsertPerfCount: public Pass {
+class InsertPerfCount: public RangedPass {
 public:
-    OPENVINO_RTTI("InsertPerfCount", "Pass")
+    OPENVINO_RTTI("InsertPerfCount", "RangedPass")
     InsertPerfCount(std::map boundary_op_names);
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     std::map m_boundary_op_names;
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp
new file mode 100644
index 00000000000000..15d2703d3f8e6d
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_specific_iterations.hpp
@@ -0,0 +1,40 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+/**
+ * @interface InsertSpecificIterations
+ * @brief Inserts separate loop bodies for first/last iterations if needed.
+ *        Also calls previously registered SpecificIterationHandlers for the inserted bodies and the main body.
+ * @ingroup snippets
+ */
+class InsertSpecificIterations : public RangedPass {
+public:
+    OPENVINO_RTTI("InsertSpecificIterations", "RangedPass")
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
+
+    /**
+     * @brief Makes a copy of a loop body with id 'loop_id' and inserts it to the LinearIR before the 'insert_pos' position
+     * @param linear_ir LinearIR which should be modified
+     * @param loop_id id of the loop which should be copied
+     * @param insert_pos position before which the loop body copy should be inserted
+     * @return iterator which points on the LoopBegin copy
+     */
+    static LinearIR::constExprIt insert_copy_loop(LinearIR& linear_ir,
+                                                  const size_t loop_id,
+                                                  const LinearIR::constExprIt& insert_pos);
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp
deleted file mode 100644
index 5fe8634959fb51..00000000000000
--- a/src/common/snippets/include/snippets/lowered/pass/insert_tail_loop.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (C) 2023 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include "pass.hpp"
-
-#include "snippets/op/loop.hpp"
-#include "snippets/lowered/loop_manager.hpp"
-
-namespace ov {
-namespace snippets {
-namespace lowered {
-namespace pass {
-
-/**
- * @interface InsertTailLoop
- * @brief Injects tail-processing loop after a vector loop if required.
- *        Additional optimizations are performed if a loop body is executed only once.
- * @ingroup snippets
- */
-class InsertTailLoop : public Pass {
-public:
-    OPENVINO_RTTI("InsertTailLoop", "Pass")
-    bool run(LinearIR& linear_ir) override;
-    static LinearIR::constExprIt insert_copy_loop(LinearIR& linear_ir, const size_t loop_id, const LinearIR::constExprIt& insert_pos);
-
-    static constexpr size_t existing_subtensor_value = SIZE_MAX;
-    static void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
-                                                         const LinearIR::LoopManager::LoopInfoPtr& loop_info,
-                                                         LinearIR::container::const_iterator begin,
-                                                         LinearIR::container::const_iterator end,
-                                                         const size_t new_dim_value = existing_subtensor_value);
-
-private:
-    static void create_tail_loop(LinearIR& linear_ir,
-                                 LinearIR::constExprIt begin,
-                                 LinearIR::constExprIt end,
-                                 const std::shared_ptr& loop_end,
-                                 bool need_vector_loop,
-                                 size_t tail_size);
-    static void tail_transformations(LinearIR& linear_ir,
-                                     LinearIR::constExprIt tail_begin,
-                                     LinearIR::constExprIt tail_end,
-                                     size_t tail_size);
-};
-
-} // namespace pass
-} // namespace lowered
-} // namespace snippets
-} // namespace ov
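// How the replacement fits together, sketched from the interfaces above (the
// loop iteration and the `main_begin`/`main_end` bounds are assumptions, not
// the literal implementation):
//
//   // For a loop whose LoopInfo carries SpecificIterationHandlers,
//   // InsertSpecificIterations conceptually does the following:
//   const auto& handlers = loop_info->get_handlers();
//   if (!handlers.get_first_iter_handlers().empty()) {
//       // 1. Clone the loop body in front of the main one...
//       const auto first_begin = InsertSpecificIterations::insert_copy_loop(linear_ir, loop_id, main_begin);
//       // 2. ...and let the registered pipeline specialize the copy.
//       handlers.get_first_iter_handlers().run(linear_ir, first_begin, main_begin);
//   }
//   // The main body and, when work_amount % increment != 0, the tail body are
//   // specialized the same way via get_main_iter_handlers() and
//   // get_last_iter_handlers(); the monolithic InsertTailLoop logic deleted
//   // above now lives in such per-iteration passes (see iter_handler.hpp below).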
diff --git a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
new file mode 100644
index 00000000000000..467e3d5735d123
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
@@ -0,0 +1,70 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+/**
+ * @interface UpdateMemoryAccessCounts
+ * @brief The pass changes counts of all MemoryAccess ops
+ * @attention The pass skips inner loops
+ * @attention The pass ignores memory access ports which have count == 1
+ * @param m_count - count which must be set
+ * @ingroup snippets
+ */
+class UpdateMemoryAccessCounts : public pass::RangedPass {
+public:
+    UpdateMemoryAccessCounts(size_t count);
+    OPENVINO_RTTI("UpdateMemoryAccessCounts", "RangedPass")
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    size_t m_count;
+};
+
+/**
+ * @interface SetFillOffset
+ * @brief The pass changes offset of all Fill ops
+ * @param m_offset - offset which must be set
+ * @ingroup snippets
+ */
+class SetFillOffset : public pass::RangedPass {
+public:
+    SetFillOffset(size_t offset);
+    OPENVINO_RTTI("SetFillOffset", "RangedPass")
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    size_t m_offset;
+};
+
+/**
+ * @interface TransformInnerSplitLoop
+ * @brief The pass updates finalization offsets, work amount and increment of inner Loop based on tail_size of the current Loop
+ * @param m_tail_size - tail_size of the current Loop
+ * @ingroup snippets
+ */
+class TransformInnerSplitLoop : public pass::RangedPass {
+public:
+    TransformInnerSplitLoop(size_t tail_size);
+    OPENVINO_RTTI("TransformInnerSplitLoop", "RangedPass")
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    size_t m_tail_size;
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp b/src/common/snippets/include/snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp
index 769208842e9338..e7aac012480fbc 100644
--- a/src/common/snippets/include/snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp
@@ -16,11 +16,11 @@
 * @brief Fuses consecutive Load and MoveBroadcast into a single load instruction.
 * @ingroup snippets
 */
-class LoadMoveBroadcastToBroadcastLoad: public Pass {
+class LoadMoveBroadcastToBroadcastLoad: public RangedPass {
 public:
     LoadMoveBroadcastToBroadcastLoad() = default;
-    OPENVINO_RTTI("LoadMoveBroadcastToBroadcastLoad", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("LoadMoveBroadcastToBroadcastLoad", "RangedPass")
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/mark_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/mark_loops.hpp
index 048f9457ddb455..f3c1cd4c8f9818 100644
--- a/src/common/snippets/include/snippets/lowered/pass/mark_loops.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/mark_loops.hpp
@@ -20,11 +20,11 @@
 * - the consumer of the expression is explicitly after this expression - the pass marks the branches
 * @ingroup snippets
 */
-class MarkLoops : public Pass {
+class MarkLoops : public RangedPass {
 public:
-    OPENVINO_RTTI("MarkLoops", "Pass")
+    OPENVINO_RTTI("MarkLoops", "RangedPass")
     MarkLoops(size_t vector_size);
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     size_t m_vector_size;
diff --git a/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp b/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp
index 7f80fafda08aeb..81b7536b63edaa 100644
--- a/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp
@@ -23,15 +23,15 @@
 * @ingroup snippets
 */
-class NormalizeBufferIDs : public Pass {
+class NormalizeBufferIDs : public RangedPass {
 public:
-    OPENVINO_RTTI("NormalizeBufferIDs", "Pass")
+    OPENVINO_RTTI("NormalizeBufferIDs", "RangedPass")
    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
-    bool run(lowered::LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp b/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp
index 9ac4181e61e861..b320bd8396e866 100644
--- a/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp
@@ -18,10 +18,10 @@
 * - moves all ptr arithmetic to finalization offsets
 * @ingroup snippets
 */
-class OptimizeLoopSingleEvaluation : public Pass {
+class OptimizeLoopSingleEvaluation : public RangedPass {
 public:
-    OPENVINO_RTTI("OptimizeLoopSingleEvaluation", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("OptimizeLoopSingleEvaluation", "RangedPass")
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/pass.hpp b/src/common/snippets/include/snippets/lowered/pass/pass.hpp
index 177056d2984d25..5833b695b0bba8 100644
--- a/src/common/snippets/include/snippets/lowered/pass/pass.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/pass.hpp
@@ -16,18 +16,18 @@ namespace lowered {
 namespace pass {
 /**
- * @interface Pass
+ * @interface PassBase
 * @brief Base class for transformations on linear IR
 * @ingroup snippets
 */
-class Pass {
+class PassBase {
 public:
-    Pass() = default;
-    virtual ~Pass() = default;
+    PassBase() = default;
+    virtual ~PassBase() = default;
     // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface,
     // so the standard OPENVINO_RTTI(...) macros could be used in derived classes.
     _OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() {
-        static ::ov::DiscreteTypeInfo type_info_static {"Pass"};
+        static ::ov::DiscreteTypeInfo type_info_static {"PassBase"};
         type_info_static.hash();
         return type_info_static;
     }
@@ -40,6 +40,25 @@
         return get_type_info().name;
     }
+    /**
+     * @brief Merges the current pass with another one (e.g. during fusion of two pass pipelines).
+     * @param other Pointer to the other pass.
+     * @return The merged pass
+     * @attention If the 'other' pass is empty (i.e. nullptr), it can be merged with any pass.
+     * @attention If the merge fails, then nullptr is returned.
+     */
+    virtual std::shared_ptr merge(const std::shared_ptr& other) {
+        return nullptr;
+    }
+};
+
+/**
+ * @interface Pass
+ * @brief Base class for LIR passes which are performed on a full LIR body
+ * @ingroup snippets
+ */
+class Pass : public PassBase {
+public:
    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
@@ -48,25 +67,46 @@
     virtual bool run(lowered::LinearIR& linear_ir) = 0;
 };
+/**
+ * @interface RangedPass
+ * @brief Base class for LIR passes which are performed on a range of a LIR body
+ * @ingroup snippets
+ */
+class RangedPass : public PassBase {
+public:
+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @param begin begin of the range on which the pass is performed
+     * @param end end of the range on which the pass is performed
+     * @return status of the pass
+     */
+    virtual bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) = 0;
+};
+
 class PassPipeline {
 public:
-    using PositionedPassLowered = snippets::pass::PositionedPass;
+    using PositionedPassLowered = snippets::pass::PositionedPass;
     PassPipeline();
     PassPipeline(const std::shared_ptr& pass_config);
-    void register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass);
-    void register_pass(const std::shared_ptr& pass);
+    const std::vector>& get_passes() const { return m_passes; }
+    const std::shared_ptr& get_pass_config() const { return m_pass_config; }
+    bool empty() const { return m_passes.empty(); }
+
+    void register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass);
+    void register_pass(const std::shared_ptr& pass);
     template
     void register_pass(Args&&... args) {
-        static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass");
+        static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass");
         auto pass = std::make_shared(std::forward(args)...);
         register_pass(pass);
     }
     template::value, bool>() = true>
     void register_pass(const snippets::pass::PassPosition& position, Args&&... args) {
-        static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass");
+        static_assert(std::is_base_of::value, "Pass not derived from lowered::Pass");
         auto pass = std::make_shared(std::forward(args)...);
         register_pass(position, pass);
     }
@@ -74,10 +114,20 @@ class PassPipeline {
     void register_positioned_passes(const std::vector& pos_passes);
     void run(lowered::LinearIR& linear_ir) const;
+    void run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) const;
+
+    /**
+     * @brief Merges 2 pass pipelines into one
+     * @param lhs first pass pipeline
+     * @param rhs second pass pipeline
+     * @return the merged pass pipeline
+     * @attention the function cannot be used when one of the pipelines contains passes whose running order is important.
+     */
+    static PassPipeline merge_pipelines(const PassPipeline& lhs, const PassPipeline& rhs);
 private:
     std::shared_ptr m_pass_config;
-    std::vector> m_passes;
+    std::vector> m_passes;
 };
 } // namespace pass
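// A sketch of a pass written against the new ranged interface (illustrative;
// the pass name and body are hypothetical, only the base-class API above is real):
//
//   class ExampleRangedPass : public ov::snippets::lowered::pass::RangedPass {
//   public:
//       OPENVINO_RTTI("ExampleRangedPass", "RangedPass")
//       bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override {
//           bool modified = false;
//           for (auto expr_it = begin; expr_it != end; ++expr_it) {
//               // inspect or transform *expr_it here
//           }
//           return modified;
//       }
//   };
//
//   // PassPipeline::run(linear_ir) still covers whole-IR runs, while the
//   // SpecificIterationHandlers pipelines invoke the ranged overload on a
//   // single loop body. merge_pipelines() combines two pipelines, relying on
//   // each pass's merge() to reconcile passes present in both.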
diff --git a/src/common/snippets/include/snippets/lowered/pass/pass_config.hpp b/src/common/snippets/include/snippets/lowered/pass/pass_config.hpp
index 03fe2b3dd6d65d..90a45cc0eba708 100644
--- a/src/common/snippets/include/snippets/lowered/pass/pass_config.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/pass_config.hpp
@@ -48,6 +48,9 @@
         return is_enabled(T::get_type_info_static());
     }
+    friend bool operator==(const PassConfig& lhs, const PassConfig& rhs);
+    friend bool operator!=(const PassConfig& lhs, const PassConfig& rhs);
+
 private:
     std::unordered_set m_disabled;
     std::unordered_set m_enabled;
 };
diff --git a/src/common/snippets/include/snippets/lowered/pass/propagate_layout.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_layout.hpp
index 6ba062b0525556..b77b61e90b480d 100644
--- a/src/common/snippets/include/snippets/lowered/pass/propagate_layout.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/propagate_layout.hpp
@@ -17,10 +17,10 @@
 * proper data pointer offsets in the Kernel;
 * @ingroup snippets
 */
-class PropagateLayout : public Pass {
+class PropagateLayout : public RangedPass {
 public:
-    OPENVINO_RTTI("PropagateLayout", "Pass")
-    bool run(LinearIR& linear_ir) override;
+    OPENVINO_RTTI("PropagateLayout", "RangedPass")
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
new file mode 100644
index 00000000000000..4803e0556b7118
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/propagate_subtensors.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+/**
+ * @interface UpdateSubtensors
+ * @brief The pass updates subtensors of all operations in Loop based on tail size.
+ *        Firstly, the pass updates subtensors of all Loop entry points.
+ *        After that, shape inference infrastructure is used to update subtensors of all ops in Loop body
+ * @param m_tail_size - tail size which must be set
+ * @ingroup snippets
+ */
+class UpdateSubtensors : public pass::RangedPass {
+public:
+    UpdateSubtensors(size_t tail_size);
+    OPENVINO_RTTI("UpdateSubtensors", "RangedPass")
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    size_t m_tail_size;
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/lowered/pass/softmax_decomposition.hpp b/src/common/snippets/include/snippets/lowered/pass/softmax_decomposition.hpp
index 795dc0d3725f1c..62704dafcfdfa9 100644
--- a/src/common/snippets/include/snippets/lowered/pass/softmax_decomposition.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/softmax_decomposition.hpp
@@ -16,11 +16,11 @@
 * @brief Decomposes Softmax to a range of low-level operations on linear IR
 * @ingroup snippets
 */
-class SoftmaxDecomposition : public Pass {
+class SoftmaxDecomposition : public RangedPass {
 public:
-    OPENVINO_RTTI("SoftmaxDecomposition", "Pass")
+    OPENVINO_RTTI("SoftmaxDecomposition", "RangedPass")
     explicit SoftmaxDecomposition(size_t vector_size);
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     size_t m_vector_size;
diff --git a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp
index 4a99a6f2a4541e..dfa5c3fc54d120 100644
--- a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp
@@ -19,6 +19,7 @@
 * @brief The pass optimally calculates the common buffer scratchpad size and
 *        sets the offsets relative to the common data pointer to all Buffers. The pass uses MemorySolver API.
 *        Note: The pass requires expression enumeration. It should be executed separately before this pass!
+ *       Note: this transformation works only with m_clusters; the Linear IR and its iterators are not actually needed
 * @ingroup snippets
 */
 class SolveBufferMemory : public Pass {
diff --git a/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp
index bb74529cfbfc5f..ccc63d602cf657 100644
--- a/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/split_loops.hpp
@@ -29,11 +29,11 @@
 * @ingroup snippets
 */
-class SplitLoops : public Pass {
+class SplitLoops : public RangedPass {
 public:
-    OPENVINO_RTTI("SplitLoops", "Pass")
+    OPENVINO_RTTI("SplitLoops", "RangedPass")
     SplitLoops();
-    bool run(LinearIR& linear_ir) override;
+    bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 private:
     static bool can_be_split(const LinearIR::LoopManager::LoopInfoPtr& current,
diff --git a/src/common/snippets/include/snippets/lowered/pass/validate_shapes.hpp b/src/common/snippets/include/snippets/lowered/pass/validate_shapes.hpp
index 08243c96beedf5..c650ac21f206c1 100644
--- a/src/common/snippets/include/snippets/lowered/pass/validate_shapes.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/validate_shapes.hpp
@@ -18,11 +18,11 @@
 * @brief The pass checks that there are no dynamic shapes in the IR
 * @ingroup snippets
 */
-class ValidateShapes : public Pass {
+class ValidateShapes : public RangedPass {
 public:
-    OPENVINO_RTTI("ValidateShapes", "Pass")
+    OPENVINO_RTTI("ValidateShapes", "RangedPass")
     ValidateShapes() = default;
-    bool run(LinearIR& linear_ir) override;
+    bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override;
 };
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
index 3829daf539e782..e5d99d59a6d361 100644
--- a/src/common/snippets/include/snippets/op/subgraph.hpp
+++ b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -5,11 +5,13 @@
 #pragma once
 #include
-
 #include
 #include
-#include "openvino/op/op.hpp"
+
 #include "openvino/core/rt_info.hpp"
+#include "openvino/op/op.hpp"
+#include "snippets/generator.hpp"
+#include "snippets/lowered/pass/pass.hpp"
 #include "snippets/pass/manager.hpp"
 #include "snippets/shape_inference/shape_inference.hpp"
 #include "snippets/lowered/pass/pass.hpp"
diff --git a/src/common/snippets/include/snippets/pass/manager.hpp b/src/common/snippets/include/snippets/pass/manager.hpp
index a9e3c2aec37498..3867366f1b399d 100644
--- a/src/common/snippets/include/snippets/pass/manager.hpp
+++ b/src/common/snippets/include/snippets/pass/manager.hpp
@@ -10,9 +10,6 @@
 #include "openvino/pass/pass.hpp"
 #include "openvino/pass/validate.hpp"
-#include
-
-
 namespace ov {
 namespace snippets {
 namespace pass {
@@ -36,7 +33,7 @@
     std::shared_ptr register_pass(const PassPosition& position, Args&&... args) {
         static_assert(std::is_base_of::value, "Attempt to insert pass that is not derived from PassBase");
         auto pass = std::make_shared(std::forward(args)...);
-        auto rc = insert_pass_instance(position, pass);
+        auto rc = insert_pass_instance(position, pass);
         rc->set_pass_config(m_pass_config);
         if (!m_pass_config->is_enabled()) {
             m_pass_config->disable();
@@ -48,7 +45,7 @@
     void register_positioned_passes(const std::vector& pos_passes);
 protected:
-    std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass);
+    std::shared_ptr insert_pass_instance(const PassPosition& position, const std::shared_ptr& pass);
 };
 } // namespace pass
diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp
index 96972fce825c0c..e44902fe4eebd6 100644
--- a/src/common/snippets/src/generator.cpp
+++ b/src/common/snippets/src/generator.cpp
@@ -4,28 +4,27 @@
 #include "snippets/generator.hpp"
+#include "snippets/itt.hpp"
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/pass/assign_registers.hpp"
 #include "snippets/lowered/pass/cleanup_loop_offsets.hpp"
-#include "snippets/lowered/pass/insert_tail_loop.hpp"
+#include "snippets/lowered/pass/insert_specific_iterations.hpp"
 #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp"
-
+#include "snippets/lowered/pass/pass.hpp"
 #include "snippets/op/kernel.hpp"
-#include "snippets/itt.hpp"
-
 namespace ov {
 namespace snippets {
 void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, const void* compile_params) const {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate")
     OV_ITT_TASK_CHAIN(GENERATE, ov::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::Transformations")
-    if (!target->is_supported())
-        OPENVINO_THROW("unsupported architecture for code generation");
+    OPENVINO_ASSERT(target->is_supported(), "unsupported architecture for code generation");
     std::function& out)> reg_type_mapper = [&](const ov::Output& out) -> RegType {
         return get_op_out_reg_type(out);
     };
+
     lowered::pass::PassPipeline lowered_pipeline;
     // Note: the order of all passes in this pipeline must not be changed since they have hard dependencies
     // 1. InsertSpecificIterations must be called after AssignRegisters since tail loop expressions must have the same
@@ -35,7 +34,7 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
     // 3. OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets
     //    since CleanupLoopOffsets can't handle loops with evaluate_once = true
     lowered_pipeline.register_pass(reg_type_mapper);
-    lowered_pipeline.register_pass();
+    lowered_pipeline.register_pass();
     lowered_pipeline.register_pass();
     lowered_pipeline.register_pass();
     lowered_pipeline.run(linear_ir);
diff --git a/src/common/snippets/src/lowered/expression.cpp b/src/common/snippets/src/lowered/expression.cpp
index f33f3aeef95fc3..5c2a190dbf66a0 100644
--- a/src/common/snippets/src/lowered/expression.cpp
+++ b/src/common/snippets/src/lowered/expression.cpp
@@ -156,7 +156,7 @@ ExpressionPtr Expression::clone_with_new_inputs(const std::vector& new_node) const {
     std::vector new_inputs;
     new_inputs.reserve(m_input_port_connectors.size());
diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp
index 65eb3f741cc628..b489ca27d5bd6d 100644
--- a/src/common/snippets/src/lowered/linear_ir.cpp
+++ b/src/common/snippets/src/lowered/linear_ir.cpp
@@ -47,7 +47,7 @@ std::shared_ptr LinearIR::clone() const {
     auto cloned = std::make_shared();
     cloned->m_config = m_config;
-    ExressionMap expression_map;
+    ExpressionMap expression_map;
     cloned->m_expressions = deep_copy_range(m_expressions.cbegin(), m_expressions.cend(), expression_map);
     for (const auto& expr : cloned->m_expressions) {
         cloned->m_node2expression_map[expr->get_node()] = expr;
@@ -161,7 +161,7 @@ std::vector> clone_nodes(const std::vector LoopPort::clone_with_new_expr(const ExpressionPtr& new
     return new_loop_port;
 }
-LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount,
-                                          size_t increment,
-                                          const std::vector& entries,
-                                          const std::vector& exits,
-                                          bool outer_splited_loop)
+LoopInfo::SpecificIterationHandlers::SpecificIterationHandlers(size_t loop_work_amount, size_t loop_increment) {
+    const auto tail_size = loop_work_amount % loop_increment;
+    if (tail_size != 0) {
+        m_last_iter_handlers.register_pass(tail_size);
+        m_last_iter_handlers.register_pass(tail_size);
+    }
+}
+
+LoopInfo::SpecificIterationHandlers::SpecificIterationHandlers(lowered::pass::PassPipeline first_iter_handlers,
+                                                               lowered::pass::PassPipeline main_body_handlers,
+                                                               lowered::pass::PassPipeline last_iter_handlers)
+    : m_first_iter_handlers(std::move(first_iter_handlers)),
+      m_main_body_handlers(std::move(main_body_handlers)),
+      m_last_iter_handlers(std::move(last_iter_handlers)) {}
+
+const lowered::pass::PassPipeline& LoopInfo::SpecificIterationHandlers::get_first_iter_handlers() const {
+    return m_first_iter_handlers;
+}
+
+const lowered::pass::PassPipeline& LoopInfo::SpecificIterationHandlers::get_main_iter_handlers() const {
+    return m_main_body_handlers;
+}
+
+const lowered::pass::PassPipeline& LoopInfo::SpecificIterationHandlers::get_last_iter_handlers() const {
+    return m_last_iter_handlers;
+}
+
+LoopInfo::SpecificIterationHandlers LoopInfo::SpecificIterationHandlers::merge_handlers(
+    const SpecificIterationHandlers& lhs,
+    const SpecificIterationHandlers& rhs) {
+    return LoopInfo::SpecificIterationHandlers(
+        lowered::pass::PassPipeline::merge_pipelines(lhs.get_first_iter_handlers(), rhs.get_first_iter_handlers()),
+        lowered::pass::PassPipeline::merge_pipelines(lhs.get_main_iter_handlers(), rhs.get_main_iter_handlers()),
+        lowered::pass::PassPipeline::merge_pipelines(lhs.get_last_iter_handlers(), rhs.get_last_iter_handlers()));
+}
+
+LoopInfo::LoopInfo(size_t work_amount,
+                   size_t increment,
+                   const std::vector& entries,
+                   const std::vector& exits,
+                   const LoopInfo::SpecificIterationHandlers& handlers)
     : m_work_amount(work_amount), m_increment(increment),
-      m_outer_splited_loop(outer_splited_loop) {
+      m_entry_points(entries),
+      m_exit_points(exits),
+      m_handlers(handlers) {}
+
+LoopInfo::LoopInfo(size_t work_amount,
+                   size_t increment,
+                   const std::vector& entries,
+                   const std::vector& exits,
+                   const LoopInfo::SpecificIterationHandlers& handlers)
+    : m_work_amount(work_amount),
+      m_increment(increment),
+      m_handlers(handlers) {
     m_entry_points.reserve(entries.size());
     m_exit_points.reserve(exits.size());
     for (const auto& port : entries)
@@ -53,7 +102,7 @@ LinearIR::LoopManager::LoopInfo::LoopInfo(size_t work_amount,
         m_exit_points.emplace_back(port);
 }
-std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr_map) const {
+std::shared_ptr LoopInfo::clone_with_new_expr(const ExpressionMap& expr_map) const {
     auto clone_loop_ports = [&expr_map](const std::vector& port_points) {
         std::vector cloned_port_points;
         cloned_port_points.reserve(port_points.size());
@@ -68,7 +117,7 @@ std::shared_ptr LoopInfo::clone_with_new_expr(const ExressionMap& expr
     const auto& new_entry_points = clone_loop_ports(m_entry_points);
     const auto& new_exit_points = clone_loop_ports(m_exit_points);
-    return std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_outer_splited_loop);
+    return std::make_shared(m_work_amount, m_increment, new_entry_points, new_exit_points, m_handlers);
 }
 size_t LoopInfo::get_work_amount() const {
@@ -87,15 +136,11 @@ const std::vector& LoopInfo::get_exit_points() const {
     return m_exit_points;
 }
-bool LoopInfo::get_outer_splited_loop() const {
-    return m_outer_splited_loop;
+const LoopInfo::SpecificIterationHandlers& LoopInfo::get_handlers() const {
+    return m_handlers;
 }
-const LoopInfo::FirstIterHandler& LoopInfo::get_first_iter_handler() const {
-    return m_first_iter_handler;
-}
-
-size_t LinearIR::LoopManager::LoopInfo::get_dim_idx() const {
+size_t LoopInfo::get_dim_idx() const {
     OPENVINO_ASSERT(!m_entry_points.empty(), "Loop info must have at least one entry point");
     auto equal_dim_idxes = [&](const LinearIR::LoopManager::LoopPort& p) {
         return p.dim_idx == m_entry_points[0].dim_idx;
@@ -130,15 +175,11 @@ void LoopInfo::set_entry_points(std::vector entry_points) {
 }
 void LoopInfo::set_exit_points(std::vector exit_points) {
-    m_exit_points = std::move(exit_points);;
-}
-
-void LoopInfo::set_outer_splited_loop(bool outer_splited_loop) {
-    m_outer_splited_loop = outer_splited_loop;
+    m_exit_points = std::move(exit_points);
 }
-void LoopInfo::set_first_iter_handler(LoopInfo::FirstIterHandler first_iter_handler) {
-    m_first_iter_handler = std::move(first_iter_handler);
+void LoopInfo::set_handlers(LoopInfo::SpecificIterationHandlers handlers) {
+    m_handlers = std::move(handlers);
 }
 void LoopInfo::update_entry_points(const std::function& updater) {
@@ -164,7 +205,7 @@ bool operator<(const LinearIR::LoopManager::LoopPort& lhs, const LinearIR::LoopM
             (lhs.is_incremented == rhs.is_incremented && lhs.dim_idx < rhs.dim_idx)));
 }
-std::shared_ptr LoopManager::clone_with_new_expr(const ExressionMap& expr_map) const {
+std::shared_ptr LoopManager::clone_with_new_expr(const ExpressionMap& expr_map) const {
     auto new_loop_manager = std::make_shared();
     for (const auto& id_info : m_map)
         new_loop_manager->m_map.insert({id_info.first, id_info.second->clone_with_new_expr(expr_map)});
@@ -370,18 +411,16 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
     }
     for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
-        if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) {
+        OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
+        const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
+        if (subtensor_value == PortDescriptor::ServiceDimensions::FULL_DIM) {
             continue;
         }
         OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
-        const auto work_amount =
-            loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx)
-                                         : 0;
-        const auto work_amount_increment =
-            loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx)
-                                            : (dim_idx == 0 ? vector_size : 1);
-        mark_loop(loop_begin_pos, loop_end_pos, work_amount, work_amount_increment, dim_idx, loop_entry_points, loop_exit_points);
+        const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
+        const auto increment = subtensor_value;
+        mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
     }
 }
@@ -444,6 +483,12 @@ void LinearIR::LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target,
     loop_info->set_entry_points(new_entries);
     loop_info->set_exit_points(new_exits);
+    loop_info->set_handlers(LoopInfo::SpecificIterationHandlers::merge_handlers(loop_info_upper->get_handlers(), loop_info_lower->get_handlers()));
+    // Since fusion can be called for broadcastable loops (one of the loops has work_amount = increment = 1),
+    // the maximum values are assigned to the fused loop
+    loop_info->set_work_amount(std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()));
+    loop_info->set_increment(std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()));
+
     const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper;
     const auto& to = fuse_into_upper ? loop_id_upper : loop_id_lower;
     for (auto it = loop_begin_target; it != loop_end_target; ++it) {
diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp
index d34b442fd33051..c7cf6b67abd8ea 100644
--- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp
+++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp
@@ -64,7 +64,7 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const
     }
 }
-bool AllocateBuffers::run(lowered::LinearIR& linear_ir) {
+bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AllocateBuffers");
     m_buffer_scratchpad_size = 0;
@@ -78,7 +78,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir) {
         pipeline.register_pass();
         pipeline.run(linear_ir);
     } else {
-        InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir);
+        InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend());
     }
     return m_buffer_scratchpad_size > 0;
diff --git a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp
index 644a5dd1509f7f..ebe802168f5871 100644
--- a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp
+++ b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp
@@ -13,7 +13,7 @@ namespace snippets {
 namespace lowered {
 namespace pass {
-bool CleanRepeatedDataPointerShifts::reuse_increments(const LinearIR& linear_ir, const ExpressionPtr& loop_end_expr) {
+bool CleanRepeatedDataPointerShifts::reuse_increments(const ExpressionPtr& loop_end_expr) {
     const auto loop_end = ov::as_type_ptr(loop_end_expr->get_node());
     if (!loop_end)
         return false;
@@ -89,14 +89,15 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LinearIR& linear_ir,
     return true;
 }
-bool CleanRepeatedDataPointerShifts::run(LinearIR& linear_ir) {
+bool CleanRepeatedDataPointerShifts::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::CleanRepeatedDataPointerShifts")
     bool modified = false;
-    for (const auto& expr : linear_ir) {
+    for (auto expr_it = begin; expr_it != end; ++expr_it) {
+        const auto& expr = *expr_it;
         const auto& node = expr->get_node();
         if (ov::is_type(node)) {
-            modified |= reuse_increments(linear_ir, expr);
+            modified |= reuse_increments(expr);
         }
     }
diff --git a/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp b/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp
index 5e5cc43b13c835..f503e116824960 100644
--- a/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp
+++ b/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp
@@ -13,14 +13,10 @@ namespace snippets {
 namespace lowered {
 namespace pass {
-bool CleanupLoopOffsets::run(LinearIR& linear_ir) {
+bool CleanupLoopOffsets::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::CleanupLoopOffsets")
-    if (linear_ir.empty())
-        return false;
     bool is_modified = false;
-    // Note: it doesn't make sense to check the last expression - it must always be Result
-    const auto before_last = std::prev(linear_ir.end());
-    for (auto expr_it = linear_ir.begin(); expr_it != before_last; expr_it++) {
expr_it != before_last; expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& node = expr_it->get()->get_node(); if (auto loop_end = as_type_ptr(node)) { auto next_expr_it = std::next(expr_it); diff --git a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp index 9e51b169a5deff..dc2eae08947163 100644 --- a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp +++ b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp @@ -320,10 +320,10 @@ void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) { } } -bool DefineBufferClusters::run(LinearIR& linear_ir) { +bool DefineBufferClusters::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::DefineBufferClusters"); - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { const auto& expr = *expr_it; const auto op = expr->get_node(); if (ov::is_type(op)) { diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index 004d2fa62f9da9..8fe892628f4f1c 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -22,7 +22,7 @@ bool is_loop_id_found(const std::vector& ids, size_t id) { using LoopManager = LinearIR::LoopManager; using LoopInfoPtr = LoopManager::LoopInfoPtr; -FuseLoops::FuseLoops() : Pass() {} +FuseLoops::FuseLoops() : RangedPass() {} bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_manager, const size_t loop_lower_id, @@ -44,20 +44,29 @@ bool FuseLoops::loop_ports_are_compatible(const LinearIR::LoopManagerPtr& loop_m } bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& loop_target) { - auto current_work_amount = loop_current->get_work_amount(); - auto target_work_amount = loop_target->get_work_amount(); - // Loop fusion is supported only if Loops have equal increments and the equal/broadcastable work amounts. + const auto current_work_amount = loop_current->get_work_amount(); + const auto target_work_amount = loop_target->get_work_amount(); + const auto current_increment = loop_current->get_increment(); + const auto target_increment = loop_target->get_increment(); + // Loop fusion is supported only if Loops have equal/broadcastable increments and work amounts. 
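To make that rule concrete before the detailed example in the comment below, here is a minimal, self-contained sketch of the fusibility predicate that the rewritten can_be_fused implements. All names here are illustrative, not the snippets API, and the real check additionally consults the loops' first-iteration handlers, as noted further down:

```cpp
#include <cassert>
#include <cstddef>

// Illustrative sketch only: names are hypothetical, not the snippets API.
struct LoopParams {
    std::size_t work_amount;
    std::size_t increment;
};

// Two loops are fusible when their parameters match exactly, or when one of
// them is a degenerate "broadcast" loop that handles a single element.
bool can_be_fused(const LoopParams& current, const LoopParams& target) {
    const bool equal_parameters =
        current.work_amount == target.work_amount && current.increment == target.increment;
    const bool current_bcastable = current.work_amount == 1 && current.increment == 1;
    const bool target_bcastable = target.work_amount == 1 && target.increment == 1;
    return equal_parameters || current_bcastable || target_bcastable;
}

int main() {
    assert(can_be_fused({1, 1}, {128, 8}));    // broadcast loop absorbed by a full loop
    assert(can_be_fused({128, 8}, {128, 8}));  // identical parameters
    assert(!can_be_fused({128, 8}, {64, 8}));  // mismatched work amounts
    return 0;
}
```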
// Note: For example, Broadcastable work amounts are possible in the following case: // Relu_0 [16x1] Relu_1 [16x128] // \ / // Add [16x128] // Because of expression order in linear IR and work of MarkLoop algorithm, there are 2 Inner Loops: - // - Relu_0 with work amount `1` and increment `vector size` + // - Relu_0 with work amount `1` and increment `1` // - Relu_1 and Add with work amount `128` and increment `vector size` // We can fuse them into one Loop with work amount `128` and increment `vector size` - const auto supported_work_amount = current_work_amount == target_work_amount || current_work_amount == 1 || target_work_amount == 1; - const auto supported_increment = loop_current->get_increment() == loop_target->get_increment(); - return supported_work_amount && supported_increment; + + // WA: we can't fuse 2 loops if one of them has a first-iteration handler and the other doesn't, + // because in this case the Main/Tail body handlers of the loop without the first-iteration handler must be reset with new parameters + // (e.g. tail size). This logic is not implemented yet, so fusion for such loops is skipped. + const bool first_iter_handlers_match = loop_current->get_handlers().get_first_iter_handelrs().empty() == + loop_target->get_handlers().get_first_iter_handelrs().empty(); + const bool equal_parameters = current_work_amount == target_work_amount && current_increment == target_increment; + const bool current_bcastable = current_work_amount == 1 && current_increment == 1; + const bool target_bcastable = target_work_amount == 1 && target_increment == 1; + return first_iter_handlers_match && (equal_parameters || current_bcastable || target_bcastable); } void FuseLoops::move(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id, @@ -123,12 +132,6 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos; std::tie(target_loop_begin_pos, target_loop_end_pos) = loop_manager->get_loop_bounds(linear_ir, target_loop_id); loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, target_loop_id, current_loop_id, false); - // Update work_amount for Loop (increment is constant because increments must be the identical for fusion): - loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount())); - // If one of the Loops is outer for nested loops that splits the same dimension, - // after fusion new common Loop save this status - loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop()); - const auto insertion_place = current_loop_begin_pos; const auto is_move_needed = target_loop_end_pos != current_loop_begin_pos; if (is_move_needed) @@ -168,11 +171,6 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo LinearIR::constExprIt target_loop_begin_pos, target_loop_end_pos; std::tie(target_loop_begin_pos, target_loop_end_pos) = loop_manager->get_loop_bounds(linear_ir, target_loop_id); loop_manager->fuse_loops(target_loop_begin_pos, target_loop_end_pos, current_loop_id, target_loop_id); - // Update work_amount for Loop (increment is constant because increments must be the identical for fusion): - loop_current->set_work_amount(std::max(loop_current->get_work_amount(), loop_target->get_work_amount())); - // If one of the Loops is outer for nested loops that splits the same dimension, - // after fusion new common Loop save this status -
loop_current->set_outer_splited_loop(loop_current->get_outer_splited_loop() || loop_target->get_outer_splited_loop()); const auto insertion_place = current_loop_end_pos; const auto is_move_needed = insertion_place != target_loop_begin_pos; @@ -184,15 +182,12 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo return true; } -bool FuseLoops::run(LinearIR& linear_ir) { +bool FuseLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::FuseLoops") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); std::set prev_fused_loops; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto& node = expr->get_node(); if (ov::is_type(node) || diff --git a/src/common/snippets/src/lowered/pass/identify_buffers.cpp b/src/common/snippets/src/lowered/pass/identify_buffers.cpp index 6b04701ff155d5..485252b1ae7f5d 100644 --- a/src/common/snippets/src/lowered/pass/identify_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/identify_buffers.cpp @@ -77,7 +77,7 @@ void IdentifyBuffers::update_adj_matrix(const std::pair IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool) { +std::vector IdentifyBuffers::create_adjacency_matrix(LinearIR::constExprIt begin, LinearIR::constExprIt end, const BufferPool& pool) { // The sync point to check for adjacency is Loop because only in Loop we increment pointers. // So if some Buffers in the one Loop have conflict (cannot be inplace: the different ptr increment and data sizes) // they are called as adjacent @@ -86,7 +86,7 @@ std::vector IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea for (size_t i = 0; i < size; ++i) adj[index(size, i, i)] = true; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto &expr = *expr_it; if (!ov::is_type(expr->get_node())) continue; @@ -214,19 +214,20 @@ auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector& adj) -> s return color_groups; } -bool IdentifyBuffers::run(LinearIR& linear_ir) { +bool IdentifyBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::IdentifyBuffers") // Identify Buffers using Graph coloring algorithm. 
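As background for the comment above: buffers that conflict inside a loop (different pointer increments or data sizes) are marked adjacent, and a greedy graph coloring then assigns each buffer the smallest color (ID) not taken by an adjacent buffer, so non-conflicting buffers can share memory. A minimal sketch of that idea, simplified from the real adjacency criteria:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Greedy graph coloring over a boolean adjacency matrix, in the spirit of
// IdentifyBuffers: buffers with the same color may be reused in place.
std::vector<std::size_t> color_buffers(const std::vector<std::vector<bool>>& adj) {
    const std::size_t n = adj.size();
    std::vector<std::size_t> color(n, 0);
    for (std::size_t i = 0; i < n; ++i) {
        std::vector<bool> used(n, false);
        for (std::size_t j = 0; j < i; ++j)  // colors of already-colored neighbors
            if (adj[i][j])
                used[color[j]] = true;
        std::size_t c = 0;
        while (used[c]) ++c;                 // smallest free color
        color[i] = c;
    }
    return color;
}

int main() {
    // 3 buffers: 0 conflicts with 1 (same loop, different pointer increments),
    // 2 conflicts with nobody, so it can reuse color 0.
    std::vector<std::vector<bool>> adj = {{true, true, false},
                                          {true, true, false},
                                          {false, false, true}};
    const auto colors = color_buffers(adj);
    assert(colors[0] != colors[1]);
    assert(colors[2] == colors[0]);
    return 0;
}
```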
BufferPool buffer_pool; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; if (ov::is_type(expr->get_node())) { buffer_pool.push_back(expr); } } // Creation of Adj matrix - auto adj = create_adjacency_matrix(linear_ir, buffer_pool); + auto adj = create_adjacency_matrix(begin, end, buffer_pool); // Graph coloring algorithm const auto color_groups = coloring(buffer_pool, adj); diff --git a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp index b525428dd344d3..36cb41d3b9c96e 100644 --- a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp +++ b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp @@ -14,12 +14,13 @@ namespace snippets { namespace lowered { namespace pass { -bool InitBuffersDefault::run(LinearIR& linear_ir) { +bool InitBuffersDefault::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InitBuffersDefault"); size_t id = 0; size_t offset = 0; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto op = expr->get_node(); if (const auto buffer = ov::as_type_ptr(op)) { AllocateBuffers::set_buffer_offset(expr, offset); diff --git a/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp b/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp index d76a2b1af35147..575e73057625ac 100644 --- a/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp +++ b/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp @@ -14,7 +14,7 @@ namespace snippets { namespace lowered { namespace pass { -bool InsertBroadcastMove::run(LinearIR& linear_ir) { +bool InsertBroadcastMove::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBroadcastMove") bool modified = false; @@ -32,7 +32,7 @@ bool InsertBroadcastMove::run(LinearIR& linear_ir) { ov::is_type(v.get_node_shared_ptr()) || ov::is_type(v.get_node_shared_ptr()); }; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& expr = *expr_it; const auto& node = expr->get_node(); const auto& descriptors = expr->get_input_port_descriptors(); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index d977570fce4a3b..eb72f971ced1c4 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -101,7 +101,7 @@ ov::Shape compute_allocation_shape(const LinearIR::LoopManagerPtr& loop_manager, } // namespace InsertBuffers::InsertBuffers(int32_t buffer_allocation_rank) - : Pass(), m_buffer_allocation_rank(buffer_allocation_rank) {} + : RangedPass(), m_buffer_allocation_rank(buffer_allocation_rank) {} LinearIR::constExprIt InsertBuffers::insertion_position(const LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, const ExpressionPtr& up_expr, const ExpressionPtr& down_expr) { @@ -136,7 +136,10 @@ LinearIR::constExprIt InsertBuffers::insertion_position(const LinearIR& linear_i OPENVINO_THROW("Incorrect configuration for Buffer insertion!"); } -void InsertBuffers::insertion(LinearIR& 
linear_ir, const LinearIR::constExprIt& expr_it, const LinearIR::LoopManagerPtr& loop_manager, +void InsertBuffers::insertion(LinearIR& linear_ir, + const LinearIR::constExprIt& begin_it, + const LinearIR::constExprIt& end_it, + const LinearIR::LoopManagerPtr& loop_manager, const std::vector& loop_entries, const std::vector& loop_exits) { for (const auto& entry_point : loop_entries) { @@ -230,7 +233,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::constExprIt& const auto buffer_consumers_inputs = buffer_out->get_consumers(); replace_input_port_connectors(buffer_consumers_inputs, output_connector); potential_consumers.insert(buffer_consumers_inputs.begin(), buffer_consumers_inputs.end()); - linear_ir.erase(linear_ir.find_after(expr_it, buffer)); + linear_ir.erase(linear_ir.find_after(begin_it, buffer)); } } @@ -243,9 +246,9 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::constExprIt& std::set consumers; for (const auto& port : potential_consumers) consumers.insert(port.get_expr()); - const auto it = std::find_if(expr_it, linear_ir.cend(), + const auto it = std::find_if(begin_it, end_it, [&consumers](const ExpressionPtr& expr) { return consumers.count(expr) > 0; }); - OPENVINO_ASSERT(it != linear_ir.cend(), "Consumer of Buffer has not been found in Linear IR"); + OPENVINO_ASSERT(it != end_it, "Consumer of Buffer has not been found in Linear IR"); consumer_expr = *it; } @@ -275,11 +278,8 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::constExprIt& } } -bool InsertBuffers::run(LinearIR& linear_ir) { +bool InsertBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBuffers") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); const auto loop_data_map = loop_manager->get_map(); for (const auto& loop_data : loop_data_map) { @@ -287,10 +287,10 @@ bool InsertBuffers::run(LinearIR& linear_ir) { const auto loop_entries = loop_info->get_entry_points(); const auto loop_exits = loop_info->get_exit_points(); // using begin() as expr_it because we work with LoopInfo, not expressions in Linear IR - insertion(linear_ir, linear_ir.cbegin(), loop_manager, loop_entries, loop_exits); + insertion(linear_ir, begin, end, loop_manager, loop_entries, loop_exits); } - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto node = (*expr_it)->get_node(); const auto ma = ov::as_type_ptr(node); @@ -307,7 +307,7 @@ bool InsertBuffers::run(LinearIR& linear_ir) { loop_exits[p.first] = expr->get_output_port(p.first); } - insertion(linear_ir, expr_it, loop_manager, loop_entries, loop_exits); + insertion(linear_ir, expr_it, end, loop_manager, loop_entries, loop_exits); } return true; diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index eb70e3d26042b8..64c01a489ba668 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -20,13 +20,13 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr; InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {} size_t InsertLoadStore::get_count(const PortDescriptorPtr& port_desc) const { - const auto layout = port_desc->get_layout(); - 
const auto shape = port_desc->get_shape(); + const auto& layout = port_desc->get_layout(); + const auto& shape = port_desc->get_shape(); // Find last dimension by layout - const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); + const auto& last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); OPENVINO_ASSERT(last_dim_idx != layout.end() && *last_dim_idx < shape.size(), "Load/Store expression have incorrect layout"); - const auto dim = shape[*last_dim_idx]; - return dim == 1 ? 1 : m_vector_size; + const auto& dim = shape[*last_dim_idx]; + return std::min(dim, m_vector_size); } bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { @@ -72,11 +72,11 @@ bool InsertLoadStore::insert_store(LinearIR& linear_ir, const LinearIR::constExp return true; } -bool InsertLoadStore::run(LinearIR& linear_ir) { +bool InsertLoadStore::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertLoadStore") bool modified = false; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto& node = expr->get_node(); if (ov::is_type(node)) { diff --git a/src/common/snippets/src/lowered/pass/insert_loops.cpp b/src/common/snippets/src/lowered/pass/insert_loops.cpp index 2155cab55f201d..08c993c188795f 100644 --- a/src/common/snippets/src/lowered/pass/insert_loops.cpp +++ b/src/common/snippets/src/lowered/pass/insert_loops.cpp @@ -25,7 +25,7 @@ std::vector get_outer_loop_ids(const ExpressionPtr& expr, size_t loop_id } } // namespace -InsertLoops::InsertLoops() : Pass() {} +InsertLoops::InsertLoops() : RangedPass() {} void InsertLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id, bool has_outer_loop) { const auto loop_info = loop_manager->get_loop_info(loop_id); @@ -72,15 +72,12 @@ void InsertLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& linear_ir.insert_node(loop_end, loop_end_inputs, outer_loop_ids, false, loop_bounds.second); } -bool InsertLoops::run(LinearIR& linear_ir) { +bool InsertLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertLoops") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); std::set inserted_loops; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto& node = expr->get_node(); if (ov::is_type(node) || diff --git a/src/common/snippets/src/lowered/pass/insert_perf_count.cpp b/src/common/snippets/src/lowered/pass/insert_perf_count.cpp index 6ccfbf1094cdc3..9f68b45c8d0857 100644 --- a/src/common/snippets/src/lowered/pass/insert_perf_count.cpp +++ b/src/common/snippets/src/lowered/pass/insert_perf_count.cpp @@ -13,13 +13,11 @@ namespace lowered { namespace pass { InsertPerfCount::InsertPerfCount(std::map boundary_op_names) - : Pass(), m_boundary_op_names(std::move(boundary_op_names)) { + : RangedPass(), m_boundary_op_names(std::move(boundary_op_names)) { } -bool InsertPerfCount::run(LinearIR& linear_ir) { +bool InsertPerfCount::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, 
lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertPerfCount") - if (linear_ir.empty()) - return false; if (m_boundary_op_names.empty()) { const auto& first_op_name = linear_ir.begin()->get()->get_node()->get_friendly_name(); const auto& last_op_name = linear_ir.rbegin()->get()->get_node()->get_friendly_name(); @@ -27,7 +25,7 @@ bool InsertPerfCount::run(LinearIR& linear_ir) { } size_t seq_number = 0; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& op_name = expr_it->get()->get_node()->get_friendly_name(); const auto& found = m_boundary_op_names.find(op_name); if (found != m_boundary_op_names.end()) { diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp new file mode 100644 index 00000000000000..f6c7faf27b4cfb --- /dev/null +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -0,0 +1,142 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/insert_specific_iterations.hpp" + +#include "snippets/itt.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +LinearIR::constExprIt InsertSpecificIterations::insert_copy_loop(LinearIR& linear_ir, const size_t loop_id, const LinearIR::constExprIt& insert_pos) { + const auto& loop_manager = linear_ir.get_loop_manager(); + const auto loop_bounds = loop_manager->get_loop_bounds(linear_ir, loop_id); + ExpressionMap expression_map; + const auto& loop_copy_range = LinearIR::deep_copy_range(loop_bounds.first, std::next(loop_bounds.second), expression_map); + const auto new_loop_begin_pos = linear_ir.insert(insert_pos, loop_copy_range.begin(), loop_copy_range.end()); + const auto new_loop_end_pos = insert_pos; + + const auto original_loop_info = loop_manager->get_loop_info(loop_id); + std::vector new_entry_points, new_exit_points; + // Clone loop ports from original loop info to new loop info + for (const auto& entry : original_loop_info->get_entry_points()) + new_entry_points.push_back(*entry.clone_with_new_expr(expression_map[entry.expr_port->get_expr().get()])); + for (const auto& exit : original_loop_info->get_exit_points()) + new_exit_points.push_back(*exit.clone_with_new_expr(expression_map[exit.expr_port->get_expr().get()])); + + for (const auto& elem : expression_map) { + const auto expr = elem.first->shared_from_this(); + const auto& new_expr = elem.second; + // Loop begin/end ops can't be loop ports + if (ov::is_type(expr->get_node())) + continue; + // Update loop info of all outer loops with new loop ports + const auto outer_loop_ids = LinearIR::LoopManager::get_outer_expr_loops(expr, loop_id); + for (size_t i = 0; i < expr->get_input_count(); ++i) + loop_manager->update_loops_port(outer_loop_ids, expr->get_input_port(i), {expr->get_input_port(i), new_expr->get_input_port(i)}, true); + for (size_t i = 0; i < expr->get_output_count(); ++i) + loop_manager->update_loops_port(outer_loop_ids, expr->get_output_port(i), {expr->get_output_port(i), new_expr->get_output_port(i)}, false); + } + + const auto new_id = loop_manager->replace_with_new_loop(linear_ir, new_loop_begin_pos, new_loop_end_pos, + 
original_loop_info->get_work_amount(), original_loop_info->get_increment(), + new_entry_points, new_exit_points, loop_id); + const auto loop_end = ov::as_type_ptr(std::prev(new_loop_end_pos)->get()->get_node()); + OPENVINO_ASSERT(loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); + loop_end->set_id(new_id); + return new_loop_begin_pos; +} + +using LoopInfo = LinearIR::LoopManager::LoopInfo; + +bool InsertSpecificIterations::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertSpecificIterations") + const auto& loop_manager = linear_ir.get_loop_manager(); + + bool modified = false; + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; + const auto node = expr->get_node(); + const auto loop_end = ov::as_type_ptr(node); + if (!loop_end) + continue; + + const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); + const auto work_amount = loop_info->get_work_amount(); + const auto increment = loop_info->get_increment(); + const auto& handlers = loop_info->get_handlers(); + + const auto main_loop_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin())); + const auto main_loop_end_it = linear_ir.find_after(main_loop_begin_it, linear_ir.get_expr_by_node(loop_end)); + // Note: handlers must be run on the range starting from the first operation in the loop body. + const auto main_first_body_op_it = std::next(main_loop_begin_it); + + auto update_loop_params = [&loop_manager](const std::shared_ptr& loop_end_copy, + size_t new_work_amount, + size_t new_increment, + bool zero_finalization_offsets) { + loop_end_copy->set_work_amount(new_work_amount); + loop_end_copy->set_increment(new_increment); + + const auto& loop_info_copy = loop_manager->get_loop_info(loop_end_copy->get_id()); + loop_info_copy->set_work_amount(new_work_amount); + loop_info_copy->set_increment(new_increment); + + if (zero_finalization_offsets) + loop_end_copy->set_finalization_offsets(std::vector(loop_end_copy->get_finalization_offsets().size(), 0)); + }; + + auto copy_and_run_specific_handlers = [&](const PassPipeline& handlers) { + const auto new_loop_begin_pos = insert_copy_loop(linear_ir, loop_end->get_id(), main_loop_begin_it); + const auto new_loop_begin = ov::as_type_ptr(new_loop_begin_pos->get()->get_node()); + OPENVINO_ASSERT(new_loop_begin, "Cloned Loop does not contain LoopBegin op at the expected place."); + const auto new_loop_end = new_loop_begin->get_loop_end(); + const auto new_loop_end_pos = linear_ir.find_after(new_loop_begin_pos, linear_ir.get_expr_by_node(new_loop_end)); + OPENVINO_ASSERT(new_loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); + + // Note: handlers must be run on the range starting from the first operation in the loop body.
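The run() body around this point splits one loop into up to three specialized copies (first iteration, main body, tail). A toy sketch of just the work-amount bookkeeping, under the simplifying assumption that a first-iteration copy is peeled off whenever work_amount exceeds the increment; the real pass clones LinearIR ranges and runs the registered handler pipelines on the copies instead of printing:

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical sketch of the work-amount arithmetic only.
void split_iterations(std::size_t work_amount, std::size_t increment, bool has_first_iter_handlers) {
    if (has_first_iter_handlers && work_amount > increment) {
        std::printf("first-iteration loop: work_amount=%zu\n", increment);
        work_amount -= increment;
    }
    const std::size_t tail = work_amount % increment;
    if (work_amount >= increment)
        std::printf("main loop: work_amount=%zu, increment=%zu\n", work_amount - tail, increment);
    if (tail != 0)
        std::printf("tail loop: work_amount=increment=%zu\n", tail);
}

int main() {
    // work_amount=17, increment=4 with a first-iteration handler:
    // first copy runs 4, main copy runs 12 (3 iterations), tail runs 1.
    split_iterations(17, 4, true);
    return 0;
}
```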
+ handlers.run(linear_ir, std::next(new_loop_begin_pos), new_loop_end_pos); + return new_loop_end; + }; + + const bool specific_first_iteration = !handlers.get_first_iter_handelrs().empty(); + if (work_amount == increment) { + handlers.get_first_iter_handelrs().run(linear_ir, main_first_body_op_it, main_loop_end_it); + } else { + if (specific_first_iteration) { + const auto loop_end_copy = copy_and_run_specific_handlers(handlers.get_first_iter_handelrs()); + update_loop_params(loop_end_copy, increment, increment, true); + } + + const auto tail_size = work_amount % increment; + if (tail_size != 0) { + if (!specific_first_iteration || work_amount > 2 * increment) { + const auto loop_end_copy = copy_and_run_specific_handlers(handlers.get_main_iter_handelrs()); + const auto reduce_value = specific_first_iteration ? tail_size + increment : tail_size; + const auto new_work_amount = work_amount - reduce_value; + update_loop_params(loop_end_copy, new_work_amount, increment, true); + } + handlers.get_last_iter_handelrs().run(linear_ir, main_first_body_op_it, main_loop_end_it); + update_loop_params(loop_end, tail_size, tail_size, false); + } else if (specific_first_iteration) { + handlers.get_main_iter_handelrs().run(linear_ir, main_first_body_op_it, main_loop_end_it); + update_loop_params(loop_end, work_amount - increment, increment, false); + } + } + modified = true; + } + return modified; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov + diff --git a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp deleted file mode 100644 index 7774883aa86b1d..00000000000000 --- a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/lowered/pass/insert_tail_loop.hpp" - -#include "snippets/lowered/linear_ir.hpp" -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/pass/init_loops.hpp" -#include "snippets/snippets_isa.hpp" -#include "snippets/utils.hpp" -#include "snippets/itt.hpp" - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { -void InsertTailLoop::propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, - const LinearIR::LoopManager::LoopInfoPtr& loop_info, - LinearIR::container::const_iterator begin, - LinearIR::container::const_iterator end, - const size_t new_dim_value) { - std::map original_shapes; - // First step: set new dim value to the corresponding entry_points' dimensions - if (new_dim_value != existing_subtensor_value) { - for (const auto& port : loop_info->get_entry_points()) { - const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; - if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { - const auto& expr = port.expr_port->get_expr(); - const auto node = expr->get_node(); - auto desc = port.expr_port->get_descriptor_ptr(); - auto subtensor = desc->get_subtensor(); - if (port.dim_idx < subtensor.size()) { - *(subtensor.rbegin() + port.dim_idx) = new_dim_value; - desc->set_subtensor(subtensor); - } - - const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); - const auto& layout = parent_desc->get_layout(); - const auto& shape = parent_desc->get_shape(); - if (original_shapes.find(parent_desc) == original_shapes.end()) { - original_shapes[parent_desc] = shape; - } - 
auto new_shape = shape; - new_shape[*(layout.rbegin() + port.dim_idx)] = new_dim_value; - parent_desc->set_shape(new_shape); - } - } - } - - auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) { - const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; - if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { - auto desc = port.expr_port->get_descriptor_ptr(); - const auto expr = port.expr_port->get_expr(); - const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); - - const auto& layout = parent_desc->get_layout(); - const auto& shape = parent_desc->get_shape(); - const auto& desc_subtensor = desc->get_subtensor(); - if (port.dim_idx < desc_subtensor.size()) { - if (original_shapes.find(parent_desc) == original_shapes.end()) { - original_shapes[parent_desc] = shape; - } - auto new_shape = shape; - new_shape[*(layout.rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx); - parent_desc->set_shape(new_shape); - } - } - }; - - auto update_subtensors = [](const std::vector& descs, bool is_input) { - for (const auto& desc : descs) { - const auto& subtensor = desc->get_subtensor(); - if (!subtensor.empty()) { - auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout()) - : snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout()); - const size_t subtensor_start = planar_dims.size() - subtensor.size(); - VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end()); - for (size_t i = 0; i < new_subtensor.size(); ++i) { - new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]); - } - desc->set_subtensor(new_subtensor); - } - } - }; - - auto shape_inference_end_it = end; - const bool loop_by_last_dim = loop_info->get_dim_idx() == 0; - // Subtensors are updated using shape inference infrastructure: - // For inner loops propagation function is called recursively - for (auto expr_it = begin; expr_it != end; expr_it++) { - const auto expr = *expr_it; - if (ov::is_type(expr->get_node())) - continue; - if (auto loop_begin = ov::as_type_ptr(expr->get_node())) { - const auto loop_end = loop_begin->get_loop_end(); - const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); - const auto inner_begin = std::next(expr_it); - const auto inner_end = linear_ir.find(linear_ir.get_expr_by_node(loop_end)); - - // The corresponding shapes of inner loops entry points must be updated using existing subtensor values - if (new_dim_value == existing_subtensor_value) { - for (const auto& port : loop_info->get_entry_points()) - update_only_dim_idx_with_subtensor_value(port); - } - propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end); - expr_it = inner_end; - continue; - } - if ((ov::is_type(expr_it->get()->get_node()) || - ov::is_type(expr_it->get()->get_node())) && - loop_by_last_dim) { - // WA: we have to break subtensor propagation if we try to propagate new last dim through Broadcast nodes - // which broadcast last dim in original dimension value anyway - // This workaround might be avoided if blocked shape are used for tail size propagation - shape_inference_end_it = expr_it; - break; - } - expr->updateShapes(); - update_subtensors(expr->get_input_port_descriptors(), true); - update_subtensors(expr->get_output_port_descriptors(), false); - } - - // After subtensor propagation, the 
original shapes must be restored - for (const auto& elem : original_shapes) - elem.first->set_shape(elem.second); - for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++) - (*expr_it)->updateShapes(); -} - -LinearIR::constExprIt InsertTailLoop::insert_copy_loop(LinearIR& linear_ir, const size_t loop_id, const LinearIR::constExprIt& insert_pos) { - const auto& loop_manager = linear_ir.get_loop_manager(); - const auto loop_bounds = loop_manager->get_loop_bounds(linear_ir, loop_id); - - ExressionMap expression_map; - const auto& loop_copy_range = LinearIR::deep_copy_range(loop_bounds.first, std::next(loop_bounds.second), expression_map); - const auto new_loop_begin_pos = linear_ir.insert(insert_pos, loop_copy_range.begin(), loop_copy_range.end()); - const auto new_loop_end_pos = insert_pos; - - const auto original_loop_info = loop_manager->get_loop_info(loop_id); - std::vector new_entry_points, new_exit_points; - // Clone loop ports from original loop info to new loop info - for (const auto& entry : original_loop_info->get_entry_points()) - new_entry_points.push_back(*entry.clone_with_new_expr(expression_map[entry.expr_port->get_expr().get()])); - for (const auto& exit : original_loop_info->get_exit_points()) - new_exit_points.push_back(*exit.clone_with_new_expr(expression_map[exit.expr_port->get_expr().get()])); - - for (const auto& elem : expression_map) { - const auto expr = elem.first->shared_from_this(); - const auto& new_expr = elem.second; - // Loop begin/end ops can't be loop ports - if (ov::is_type(expr->get_node())) - continue; - // Update loop info of all outer loops with new loop ports - const auto outer_loop_ids = LinearIR::LoopManager::get_outer_expr_loops(expr, loop_id); - for (size_t i = 0; i < expr->get_input_count(); ++i) - loop_manager->update_loops_port(outer_loop_ids, expr->get_input_port(i), {expr->get_input_port(i), new_expr->get_input_port(i)}, true); - for (size_t i = 0; i < expr->get_output_count(); ++i) - loop_manager->update_loops_port(outer_loop_ids, expr->get_output_port(i), {expr->get_output_port(i), new_expr->get_output_port(i)}, false); - } - - const auto new_id = loop_manager->replace_with_new_loop(linear_ir, new_loop_begin_pos, new_loop_end_pos, - original_loop_info->get_work_amount(), original_loop_info->get_increment(), - new_entry_points, new_exit_points, loop_id); - const auto loop_end = ov::as_type_ptr(std::prev(new_loop_end_pos)->get()->get_node()); - OPENVINO_ASSERT(loop_end, "Cloned Loop does not contain LoopEnd op at the expected place."); - loop_end->set_id(new_id); - return new_loop_begin_pos; -} - -void InsertTailLoop::create_tail_loop(LinearIR& linear_ir, - LinearIR::constExprIt begin, - LinearIR::constExprIt end, - const std::shared_ptr& loop_end, - bool need_vector_loop, - size_t tail_size) { - // tail is required => transform the body into a tail representation - // tail loop is fake loop because for tail we should calculate only - // finalization offsets which are supported by LoopEnd. 
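Both this removed create_tail_loop and its replacement TransformInnerSplitLoop (in iter_handler.cpp) rescale the finalization offsets of nested loops in proportion to the tail size, relying on each offset being a multiple of the executed work amount. A self-contained toy illustration of that integer arithmetic, with hypothetical values:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Finalization offsets are proportional to the executed work amount, so when a
// nested loop is re-parameterized from `work_amount` to `tail_size`, each offset
// is rescaled by tail_size / work_amount (integer math, as in the pass).
std::vector<int64_t> rescale_offsets(std::vector<int64_t> offsets,
                                     int64_t work_amount,
                                     int64_t tail_size) {
    for (auto& offset : offsets)
        offset = offset / work_amount * tail_size;
    return offsets;
}

int main() {
    // An inner loop that walked 16 elements (-16 to return the pointer) now
    // only walks a tail of 3 elements, so it must return by -3.
    const auto rescaled = rescale_offsets({-16, 16, 0}, 16, 3);
    assert(rescaled[0] == -3 && rescaled[1] == 3 && rescaled[2] == 0);
    return 0;
}
```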
- const auto& loop_manager = linear_ir.get_loop_manager(); - const auto original_loop_id = loop_end->get_id(); - auto original_loop_info = loop_manager->get_loop_info(original_loop_id); - auto tail_loop_info = original_loop_info; - if (need_vector_loop) { - // Note: new loop body is inserted before the original loop - // So new loop becomes a main vector loop, the original loop becomes tail loop - // This is done in such way to have original ops from the main body at the end: - // this allows us to conveniently interact with outer loops in further passes - const auto new_loop_begin_pos = insert_copy_loop(linear_ir, original_loop_id, begin); - const auto new_loop_begin = ov::as_type_ptr(new_loop_begin_pos->get()->get_node()); - OPENVINO_ASSERT(new_loop_begin, "Cloned Loop does not contain LoopBegin op at the expected place."); - const auto new_loop_end = new_loop_begin->get_loop_end(); - tail_loop_info = original_loop_info; - original_loop_info = loop_manager->get_loop_info(new_loop_end->get_id()); - - const auto new_vector_loop_wa = original_loop_info->get_work_amount() - tail_size; - original_loop_info->set_work_amount(new_vector_loop_wa); - new_loop_end->set_work_amount(new_vector_loop_wa); - original_loop_info->set_outer_splited_loop(tail_loop_info->get_outer_splited_loop()); - // Note that finalization offsets should be applied after the last iteration. - // So if there is a tail, then we should apply offsets after it, but not now. - new_loop_end->set_finalization_offsets(std::vector(loop_end->get_finalization_offsets().size(), 0)); - } - loop_end->set_increment(tail_size); - loop_end->set_work_amount(tail_size); - tail_loop_info->set_increment(tail_size); - tail_loop_info->set_work_amount(tail_size); - - // We have to check the loop body for any nested loops that work on the same dimension - // and rescale their work_amount and increment accordingly - if (original_loop_info->get_outer_splited_loop()) { - const auto current_dim_idx = original_loop_info->get_dim_idx(); - OPENVINO_ASSERT(current_dim_idx != LinearIR::LoopManager::LoopInfo::UNDEFINED_DIM_IDX, - "Outer splitted loop unexpectedly iterates by several dimension indices"); - for (auto it = std::next(begin); it != std::prev(end); ++it) { - const auto& expr = *it; - const auto inner_loop_end = ov::as_type_ptr(expr->get_node()); - if (!inner_loop_end) - continue; - const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id()); - const auto inner_dim_idx = inner_loop_info->get_dim_idx(); - if (inner_dim_idx != current_dim_idx) - continue; - const auto inner_loop_begin = inner_loop_end->get_loop_begin(); - const auto inner_tail_work_amount = static_cast(inner_loop_end->get_work_amount()); - const auto inner_tail_increment = inner_loop_end->get_increment(); - auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets(); - for (auto& offset : inner_finalization_offsets) { - offset = offset / inner_tail_work_amount * static_cast(tail_size); - } - inner_loop_end->set_work_amount(tail_size); - inner_loop_end->set_increment(std::min(inner_tail_increment, tail_size)); - inner_loop_end->set_finalization_offsets(inner_finalization_offsets); - const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin)); - const auto inner_loop_end_it = std::next(end); - OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!"); - tail_transformations(linear_ir, inner_loop_begin_it, inner_loop_end_it, tail_size); - } - } - tail_transformations(linear_ir, begin, end, 
tail_size); - propagate_updated_subtensor_through_loop(linear_ir, tail_loop_info, std::next(begin), end, tail_size); -} - -void InsertTailLoop::tail_transformations(LinearIR& linear_ir, - LinearIR::constExprIt tail_begin, - LinearIR::constExprIt tail_end, - const size_t tail_size) { - const auto& config = linear_ir.get_config(); - auto insertFill = [tail_size](const ov::Input& input, const ExpressionPort& source) -> std::shared_ptr { - std::shared_ptr fill = nullptr; - auto& rt = input.get_rt_info(); - auto fill_rt = rt.find("set_fill"); - if (fill_rt != rt.end()) { - const auto fill_value = fill_rt->second.as(); - fill = std::make_shared(source.get_expr()->get_node()->output(source.get_index()), tail_size, fill_value); - } - return fill; - }; - - for (auto expr_it = std::next(tail_begin); expr_it != tail_end; expr_it++) { - // Skip inner Loops - const auto loop_begin = ov::as_type_ptr(expr_it->get()->get_node()); - if (loop_begin) { - expr_it = linear_ir.find(expr_it, tail_end, linear_ir.get_expr_by_node(loop_begin->get_loop_end())); - continue; - } - // We should fill vector regs by float_min and zero to have - // correct math calculations for ReduceMax and ReduceSum in scalar case. - // Note: We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop, - // so they are missed in - const auto& expr = *expr_it; - const auto op = expr->get_node(); - if (config.m_need_fill_tail_register && - (ov::is_type(op) || - ov::is_type(op))) { - for (size_t i = 0; i < expr->get_input_count(); ++i) { - const auto& input = expr->get_input_port_connector(i); - if (auto fill = insertFill(op->input(i), input->get_source())) { - const auto consumers = input->get_consumers(); - // If there are several consumers, fill expression must be inserted before first of them - auto fst_consumer = std::min_element(consumers.cbegin(), consumers.cend(), [&](ExpressionPort lhs, ExpressionPort rhs) { - auto lhs_it = linear_ir.find(lhs.get_expr()); - auto rhs_it = linear_ir.find(rhs.get_expr()); - return std::distance(linear_ir.cbegin(), lhs_it) < std::distance(linear_ir.cbegin(), rhs_it); - }); - const auto fill_expr = *linear_ir.insert_node(fill, std::vector{ input->get_source() }, expr->get_loop_ids(), true, - linear_ir.find(fst_consumer->get_expr()), consumers); - - // in_reg == out_reg since we want to modify vector reg inplace - const auto reg = expr->get_input_port_descriptor(0)->get_reg(); - fill_expr->get_input_port_descriptor(0)->set_reg(reg); - fill_expr->get_output_port_descriptor(0)->set_reg(reg); - } - } - } else if (const auto memory_access = std::dynamic_pointer_cast(op)) { - for (const auto p : memory_access->get_memory_access_input_ports()) { - const auto port = p.first; - if (memory_access->get_input_count(port) > 1) { - memory_access->set_input_count(tail_size, port); - } - } - for (const auto p : memory_access->get_memory_access_output_ports()) { - const auto port = p.first; - if (memory_access->get_output_count(port) > 1) { - memory_access->set_output_count(tail_size, port); - } - } - } - } -} - -bool InsertTailLoop::run(LinearIR& linear_ir) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::insertTailLoop") - const auto& loop_manager = linear_ir.get_loop_manager(); - bool modified = false; - - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) { - const auto& expr = *expr_it; - const auto node = expr->get_node(); - const auto loop_end = ov::as_type_ptr(node); - if (!loop_end) - continue; - - const auto loop_info = 
loop_manager->get_loop_info(loop_end->get_id()); - const auto& first_iter_handler = loop_info->get_first_iter_handler(); - if (first_iter_handler) { - modified |= first_iter_handler(linear_ir, expr_it); - } - - const auto work_amount = loop_end->get_work_amount(); - const auto increment = loop_end->get_increment(); - const auto tail_size = work_amount % increment; - - // tail is required => transform the body into a tail representation - // tail loop is fake loop because for tail we should calculate only - // finalization offsets which are supported by LoopEnd. - if (tail_size != 0) { - const auto loop_begin = loop_end->get_loop_begin(); - const auto begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_begin)); - const auto need_vector_loop = work_amount >= increment; - create_tail_loop(linear_ir, begin_it, std::next(expr_it), loop_end, need_vector_loop, tail_size); - } - modified = true; - } - return modified; -} - -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov - diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp new file mode 100644 index 00000000000000..8b396329644017 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp @@ -0,0 +1,143 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/iter_handler.hpp" + +#include "snippets/itt.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +UpdateMemoryAccessCounts::UpdateMemoryAccessCounts(size_t count) : RangedPass(), m_count(count) {} + +bool UpdateMemoryAccessCounts::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + bool status = false; + for (auto expr_it = begin; expr_it != end; expr_it++) { + // Skip inner Loops + const auto loop_begin = ov::as_type_ptr(expr_it->get()->get_node()); + if (loop_begin) { + expr_it = linear_ir.find(expr_it, end, linear_ir.get_expr_by_node(loop_begin->get_loop_end())); + if (expr_it == end) + return status; + continue; + } + + const auto& node = expr_it->get()->get_node(); + if (const auto memory_access = ov::as_type_ptr(node)) { + for (const auto p : memory_access->get_memory_access_input_ports()) { + const auto port = p.first; + if (memory_access->get_input_count(port) > 1) { + memory_access->set_input_count(m_count, port); + } + } + for (const auto p : memory_access->get_memory_access_output_ports()) { + const auto port = p.first; + if (memory_access->get_output_count(port) > 1) { + memory_access->set_output_count(m_count, port); + } + } + status = true; + } + } + return status; +} + +std::shared_ptr UpdateMemoryAccessCounts::merge(const std::shared_ptr& other) { + const auto merged_pass = std::make_shared(m_count); + if (other == nullptr) + return merged_pass; + const auto casted_pass = ov::as_type_ptr(other); + if (!casted_pass || m_count != casted_pass->m_count) + return nullptr; + return merged_pass; +} + +SetFillOffset::SetFillOffset(size_t offset) : RangedPass(), m_offset(offset) {} + +bool SetFillOffset::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + for (auto expr_it = begin; expr_it != end; expr_it++) { + const auto& node = expr_it->get()->get_node(); + if (const auto fill = ov::as_type_ptr(node)) { + fill->set_offset(m_offset); + } + 
} + return true; +} + +std::shared_ptr SetFillOffset::merge(const std::shared_ptr& other) { + const auto merged_pass = std::make_shared(m_offset); + if (other == nullptr) + return merged_pass; + const auto casted_pass = ov::as_type_ptr(other); + if (!casted_pass || m_offset != casted_pass->m_offset) + return nullptr; + return merged_pass; +} + +TransformInnerSplitLoop::TransformInnerSplitLoop(size_t tail_size) : RangedPass(), m_tail_size(tail_size) {} + +bool TransformInnerSplitLoop::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + const auto& expr = *end; + const auto node = expr->get_node(); + const auto loop_end = ov::as_type_ptr(node); + OPENVINO_ASSERT(loop_end, "the last operation in the range must be LoopEnd"); + + const auto& loop_manager = linear_ir.get_loop_manager(); + const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id()); + const auto current_dim_idx = loop_info->get_dim_idx(); + OPENVINO_ASSERT(current_dim_idx != LinearIR::LoopManager::LoopInfo::UNDEFINED_DIM_IDX, + "Outer split loop unexpectedly iterates by several dimension indices"); + + bool modified = false; + for (auto it = begin; it != end; ++it) { + const auto& expr = *it; + const auto inner_loop_end = ov::as_type_ptr(expr->get_node()); + if (!inner_loop_end) + continue; + const auto inner_loop_info = loop_manager->get_loop_info(inner_loop_end->get_id()); + const auto inner_dim_idx = inner_loop_info->get_dim_idx(); + if (inner_dim_idx != current_dim_idx) + continue; + const auto inner_loop_begin = inner_loop_end->get_loop_begin(); + const auto inner_loop_work_amount = static_cast(inner_loop_end->get_work_amount()); + const auto inner_loop_increment = inner_loop_end->get_increment(); + auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets(); + for (auto& offset : inner_finalization_offsets) { + offset = offset / inner_loop_work_amount * static_cast(m_tail_size); + } + inner_loop_end->set_work_amount(m_tail_size); + // TODO: if m_tail_size is greater than the inner loop increment, + // the handlers of the inner loop must be reset with the new tail size + inner_loop_end->set_increment(std::min(inner_loop_increment, m_tail_size)); + inner_loop_end->set_finalization_offsets(inner_finalization_offsets); + const auto inner_loop_begin_it = std::find(begin, it, linear_ir.get_expr_by_node(inner_loop_begin)); + const auto inner_loop_end_it = std::next(it); + OPENVINO_ASSERT(inner_loop_begin_it != it, "LoopBegin has not been found!"); + const auto& last_iter_handlers = inner_loop_info->get_handlers().get_last_iter_handelrs(); + last_iter_handlers.run(linear_ir, std::next(inner_loop_begin_it), inner_loop_end_it); + modified = true; + } + return modified; +} + +std::shared_ptr TransformInnerSplitLoop::merge(const std::shared_ptr& other) { + const auto merged_pass = std::make_shared(m_tail_size); + if (other == nullptr) + return merged_pass; + const auto casted_pass = ov::as_type_ptr(other); + if (!casted_pass || m_tail_size != casted_pass->m_tail_size) + return nullptr; + return merged_pass; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov + diff --git a/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp index 3f9de12a5a0523..3c5b5f3060f7a8 100644 --- a/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp @@ -14,11 +14,11 @@
namespace snippets { namespace lowered { namespace pass { -bool LoadMoveBroadcastToBroadcastLoad::run(LinearIR& linear_ir) { +bool LoadMoveBroadcastToBroadcastLoad::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::LoadMoveBroadcastToBroadcastLoad") bool modified = false; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& expr = *expr_it; const auto& op = expr->get_node(); // Match on MoveBroadcast because MoveBroadcast is rare node in bodies diff --git a/src/common/snippets/src/lowered/pass/mark_loops.cpp b/src/common/snippets/src/lowered/pass/mark_loops.cpp index 05d38e111927c4..82b65eb3e8ee91 100644 --- a/src/common/snippets/src/lowered/pass/mark_loops.cpp +++ b/src/common/snippets/src/lowered/pass/mark_loops.cpp @@ -14,13 +14,10 @@ namespace snippets { namespace lowered { namespace pass { -MarkLoops::MarkLoops(size_t vector_size) : Pass(), m_vector_size(vector_size) {} +MarkLoops::MarkLoops(size_t vector_size) : RangedPass(), m_vector_size(vector_size) {} -bool MarkLoops::run(LinearIR& linear_ir) { +bool MarkLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::MarkLoops") - if (linear_ir.empty()) - return false; - const auto& lowering_config = linear_ir.get_config(); const auto& loop_manager = linear_ir.get_loop_manager(); auto loop_depth = lowering_config.m_loop_depth; @@ -41,7 +38,7 @@ bool MarkLoops::run(LinearIR& linear_ir) { lhs_desc->get_shape() != rhs_desc->get_shape(); }; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto expr = *expr_it; const auto& node = expr->get_node(); if (is_not_start_point(node)) @@ -55,7 +52,7 @@ bool MarkLoops::run(LinearIR& linear_ir) { const auto& prev_expr = *loop_end_pos; loop_end_pos++; // If iterator is the last, we should finish Loop - if (loop_end_pos == linear_ir.end()) + if (loop_end_pos == end) break; // If iterator is the last, we should finish Loop diff --git a/src/common/snippets/src/lowered/pass/normalize_bufer_ids.cpp b/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp similarity index 80% rename from src/common/snippets/src/lowered/pass/normalize_bufer_ids.cpp rename to src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp index 7e99302743a0b3..76ef3562760daa 100644 --- a/src/common/snippets/src/lowered/pass/normalize_bufer_ids.cpp +++ b/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp @@ -13,12 +13,13 @@ namespace snippets { namespace lowered { namespace pass { -bool NormalizeBufferIDs::run(LinearIR& linear_ir) { +bool NormalizeBufferIDs::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::NormalizeBufferIDs"); // [ original Buffer ID -> normalized ] std::map buffer_ids; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto op = expr->get_node(); if (const auto buffer = ov::as_type_ptr(op)) { const auto buffer_id = buffer->get_id(); diff --git a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp 
b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp index 317eb32f7ab1fe..b8391964ef783d 100644 --- a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp +++ b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp @@ -13,13 +13,11 @@ namespace snippets { namespace lowered { namespace pass { -bool OptimizeLoopSingleEvaluation::run(LinearIR& linear_ir) { +bool OptimizeLoopSingleEvaluation::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::OptimizeLoopSingleEvaluation") - if (linear_ir.empty()) - return false; - bool is_modified = false; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; if (auto loop_end = ov::as_type_ptr(expr->get_node())) { // *1* solo vector/tail loop + empty outer loop // => skip increments (both counter & ptr) : set evaluate_once flag diff --git a/src/common/snippets/src/lowered/pass/pass.cpp b/src/common/snippets/src/lowered/pass/pass.cpp index 70a05fc30be147..db13c90ae97673 100644 --- a/src/common/snippets/src/lowered/pass/pass.cpp +++ b/src/common/snippets/src/lowered/pass/pass.cpp @@ -16,23 +16,33 @@ PassPipeline::PassPipeline(const std::shared_ptr& pass_config) : m_p OPENVINO_ASSERT(m_pass_config != nullptr, "PassConfig is not initialized!"); } -void PassPipeline::register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass) { +void PassPipeline::register_pass(const snippets::pass::PassPosition& position, const std::shared_ptr& pass) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline cannot register empty pass!"); m_passes.insert(position.get_insert_position(m_passes), pass); } -void PassPipeline::register_pass(const std::shared_ptr& pass) { +void PassPipeline::register_pass(const std::shared_ptr& pass) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline cannot register empty pass!"); m_passes.push_back(pass); } void PassPipeline::run(LinearIR& linear_ir) const { + run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); +} + +void PassPipeline::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) const { for (const auto& pass : m_passes) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline has empty pass!"); if (m_pass_config->is_disabled(pass->get_type_info())) { continue; } - pass->run(linear_ir); + if (auto lir_pass = std::dynamic_pointer_cast(pass)) { + lir_pass->run(linear_ir); + } else if (auto ranged_pass = std::dynamic_pointer_cast(pass)) { + ranged_pass->run(linear_ir, begin, end); + } else { + OPENVINO_THROW("Unexpected pass (", pass->get_type_info(), ") is registered in PassPipeline"); + } } } @@ -41,6 +51,29 @@ void PassPipeline::register_positioned_passes(const std::vector> lhs_passes_map; + for (const auto& pass : lhs_passes) { + lhs_passes_map[pass->get_type_info()] = pass; + } + OPENVINO_ASSERT(lhs_passes_map.size() == lhs_passes.size(), "The pass pipeline must not contain several passes with equal type info"); + + PassPipeline merged_pipeline; + for (const auto& rhs_pass : rhs.get_passes()) { + const auto lhs_pass = rhs_pass->merge(lhs_passes_map[rhs_pass->get_type_info()]); + OPENVINO_ASSERT(lhs_pass, "2 passes with type info ", rhs_pass->get_type_info(), " can't be merged."); + merged_pipeline.register_pass(lhs_pass); + lhs_passes_map.erase(rhs_pass->get_type_info()); + } + + for (const auto& rest_pass : lhs_passes_map) { + 
merged_pipeline.register_pass(rest_pass.second); + } + return merged_pipeline; +} + } // namespace pass } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/src/lowered/pass/pass_config.cpp b/src/common/snippets/src/lowered/pass/pass_config.cpp index ae73f88c55805a..6d4888e81c7420 100644 --- a/src/common/snippets/src/lowered/pass/pass_config.cpp +++ b/src/common/snippets/src/lowered/pass/pass_config.cpp @@ -28,6 +28,14 @@ bool PassConfig::is_enabled(const DiscreteTypeInfo& type_info) const { return m_enabled.count(type_info); } +bool operator==(const PassConfig& lhs, const PassConfig& rhs) { + return lhs.m_disabled == rhs.m_disabled && lhs.m_enabled == rhs.m_enabled; +} + +bool operator!=(const PassConfig& lhs, const PassConfig& rhs) { + return !(lhs == rhs); +} + } // namespace pass } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/src/lowered/pass/propagate_layout.cpp b/src/common/snippets/src/lowered/pass/propagate_layout.cpp index aea3cf99858622..8023516e159ba3 100644 --- a/src/common/snippets/src/lowered/pass/propagate_layout.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_layout.cpp @@ -14,12 +14,10 @@ namespace snippets { namespace lowered { namespace pass { -bool PropagateLayout::run(LinearIR& linear_ir) { +bool PropagateLayout::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::PropagateLayout") - if (linear_ir.empty()) - return false; - - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto io_expr = std::dynamic_pointer_cast(expr); if (!io_expr) continue; diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp new file mode 100644 index 00000000000000..cd06f6d163c479 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -0,0 +1,161 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/propagate_subtensors.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +namespace { +void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, + const LinearIR::LoopManager::LoopInfoPtr& loop_info, + LinearIR::container::const_iterator begin, + LinearIR::container::const_iterator end, + bool most_outer_loop, + const size_t new_dim_value = SIZE_MAX) { + OPENVINO_ASSERT(snippets::utils::implication(most_outer_loop, new_dim_value != SIZE_MAX), + "if the updated subtensor propagation was called for the outer loop, new_dim_value must not be equal to default value"); + std::map original_shapes; + // First step: set new dim value to the corresponding entry_points' dimensions + if (most_outer_loop) { + for (const auto& port : loop_info->get_entry_points()) { + const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; + if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { + const auto& expr = port.expr_port->get_expr(); + const auto& desc = port.expr_port->get_descriptor_ptr(); + auto subtensor = desc->get_subtensor(); + if (port.dim_idx < subtensor.size()) { + 
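The pipeline-merge logic above keys the left pipeline's passes by type info, merges every right-hand pass with its same-type counterpart (possibly nullptr), then appends the left-only leftovers. An illustrative sketch under simplified assumptions: `std::string` stands in for DiscreteTypeInfo and `MergeablePass` for the real pass base:

```cpp
#include <map>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct MergeablePass {
    virtual ~MergeablePass() = default;
    virtual std::string type() const = 0;
    // Returns the merged pass, or nullptr if the two passes are incompatible.
    virtual std::shared_ptr<MergeablePass> merge(const std::shared_ptr<MergeablePass>& other) = 0;
};

using Pipeline = std::vector<std::shared_ptr<MergeablePass>>;

Pipeline merge_pipelines(const Pipeline& lhs, const Pipeline& rhs) {
    std::map<std::string, std::shared_ptr<MergeablePass>> lhs_by_type;
    for (const auto& p : lhs)
        lhs_by_type[p->type()] = p;  // the real code asserts type infos are unique

    Pipeline merged;
    for (const auto& r : rhs) {
        auto m = r->merge(lhs_by_type[r->type()]);  // counterpart may be nullptr
        if (!m)
            throw std::runtime_error("passes of type " + r->type() + " can't be merged");
        merged.push_back(m);
        lhs_by_type.erase(r->type());
    }
    for (const auto& rest : lhs_by_type)  // lhs-only passes are kept as-is
        merged.push_back(rest.second);
    return merged;
}
```

This is what lets per-loop handler pipelines (e.g. from two fused loops) be combined without duplicating same-type passes.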
*(subtensor.rbegin() + port.dim_idx) = new_dim_value; + desc->set_subtensor(subtensor); + } + + const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + const auto& parent_shape = parent_desc->get_shape(); + if (original_shapes.find(parent_desc) == original_shapes.end()) { + original_shapes[parent_desc] = parent_shape; + } + auto new_shape = parent_shape; + new_shape[*(desc->get_layout().rbegin() + port.dim_idx)] = new_dim_value; + parent_desc->set_shape(new_shape); + } + } + } + + auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) { + const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; + if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { + const auto desc = port.expr_port->get_descriptor_ptr(); + const auto expr = port.expr_port->get_expr(); + const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + + const auto& parent_shape = parent_desc->get_shape(); + const auto& desc_subtensor = desc->get_subtensor(); + if (port.dim_idx < desc_subtensor.size()) { + if (original_shapes.find(parent_desc) == original_shapes.end()) { + original_shapes[parent_desc] = parent_shape; + } + auto new_shape = parent_shape; + new_shape[*(desc->get_layout().rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx); + parent_desc->set_shape(new_shape); + } + } + }; + + auto update_subtensors = [](const std::vector& descs, bool is_input) { + for (const auto& desc : descs) { + const auto& subtensor = desc->get_subtensor(); + if (!subtensor.empty()) { + auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout()) + : snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout()); + const size_t subtensor_start = planar_dims.size() - subtensor.size(); + VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end()); + for (size_t i = 0; i < new_subtensor.size(); ++i) { + new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]); + } + desc->set_subtensor(new_subtensor); + } + } + }; + + auto shape_inference_end_it = end; + const bool loop_by_last_dim = loop_info->get_dim_idx() == 0; + // Subtensors are updated using shape inference infrastructure: + // For inner loops propagation function is called recursively + for (auto expr_it = begin; expr_it != end; expr_it++) { + const auto expr = *expr_it; + if (ov::is_type(expr->get_node())) + continue; + if (auto loop_begin = ov::as_type_ptr(expr->get_node())) { + const auto loop_end = loop_begin->get_loop_end(); + const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); + const auto inner_begin = std::next(expr_it); + const auto inner_end = linear_ir.find_after(inner_begin, linear_ir.get_expr_by_node(loop_end)); + + // The corresponding shapes of inner loops entry points must be updated using existing subtensor values + if (!most_outer_loop) { + for (const auto& port : loop_info->get_entry_points()) + update_only_dim_idx_with_subtensor_value(port); + } + propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end, false); + expr_it = inner_end; + continue; + } + if ((ov::is_type(expr_it->get()->get_node()) || + ov::is_type(expr_it->get()->get_node())) && + loop_by_last_dim) { + // WA: we have to break subtensor propagation if we try to propagate new last dim through Broadcast nodes + 
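Throughout this new file, loop dimensions are counted from the innermost axis, so subtensors and layouts are addressed via `rbegin() + dim_idx`. A tiny standalone illustration of that convention (names are hypothetical):

```cpp
#include <cstddef>
#include <vector>

// dim_idx == 0 addresses the last (innermost) element, 1 the one before it, ...
void set_dim_from_inner(std::vector<size_t>& dims, size_t dim_idx, size_t value) {
    *(dims.rbegin() + dim_idx) = value;
}

int main() {
    std::vector<size_t> subtensor{16, 32};  // {rows, cols} of the processed block
    set_dim_from_inner(subtensor, 0, 7);    // write a tail size into the innermost dim
    // subtensor is now {16, 7}
    return subtensor[1] == 7 ? 0 : 1;
}
```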
// which broadcasts the last dim to its original dimension value anyway
+            // This workaround might be avoided if blocked shapes are used for tail size propagation
+            shape_inference_end_it = expr_it;
+            break;
+        }
+        expr->updateShapes();
+        update_subtensors(expr->get_input_port_descriptors(), true);
+        update_subtensors(expr->get_output_port_descriptors(), false);
+    }
+
+    // After subtensor propagation, the original shapes must be restored
+    for (const auto& elem : original_shapes)
+        elem.first->set_shape(elem.second);
+    for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++)
+        (*expr_it)->updateShapes();
+}
+}  // namespace
+
+UpdateSubtensors::UpdateSubtensors(size_t tail_size) : RangedPass(), m_tail_size(tail_size) {}
+
+bool UpdateSubtensors::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    const auto& last_expr = *end;
+    const auto last_node = last_expr->get_node();
+    const auto loop_end = ov::as_type_ptr(last_node);
+    OPENVINO_ASSERT(loop_end, "the last operation in range must be LoopEnd");
+
+    const auto& loop_manager = linear_ir.get_loop_manager();
+    const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
+    propagate_updated_subtensor_through_loop(linear_ir, loop_info, begin, end, true, m_tail_size);
+    return true;
+}
+
+std::shared_ptr UpdateSubtensors::merge(const std::shared_ptr& other) {
+    const auto merged_pass = std::make_shared(m_tail_size);
+    if (other == nullptr)
+        return merged_pass;
+    const auto casted_pass = ov::as_type_ptr(other);
+    if (!casted_pass || m_tail_size != casted_pass->m_tail_size)
+        return nullptr;
+    return merged_pass;
+}
+
+}  // namespace pass
+}  // namespace lowered
+}  // namespace snippets
+}  // namespace ov
+
diff --git a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
index 2ec613495e9a13..7497eb19c82cb0 100644
--- a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
@@ -19,9 +19,12 @@
 namespace snippets {
 namespace lowered {
 namespace pass {
 
+using LoopInfo = LinearIR::LoopManager::LoopInfo;
+using HandlerType = LoopInfo::SpecificIterationHandlers::HandlerType;
+
 SoftmaxDecomposition::SoftmaxDecomposition(size_t vector_size) : m_vector_size{vector_size} {}
 
-bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
+bool SoftmaxDecomposition::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SoftmaxDecompositionLowered")
     bool modified = false;
     const auto& loop_manager = linear_ir.get_loop_manager();
@@ -29,7 +32,7 @@
     auto match_softmax = ov::pass::pattern::wrap_type();
     auto matcher = std::make_shared(match_softmax, "SoftmaxDecompositionLowered");
 
-    for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
+    for (auto expr_it = begin; expr_it != end; expr_it++) {
         const auto& op = (*expr_it)->get_node();
         if (matcher->match(op)) {
             const auto& pm = matcher->get_pattern_map();
@@ -40,6 +43,7 @@
             const auto& output_connector = softmax_expr->get_output_port_connector(0);
             const auto tensor_out = softmax_expr->get_output_port_descriptor(0)->get_shape();
             const auto inner_work_amount = *(tensor_out.rbegin());
+            const auto inner_increment = std::min(inner_work_amount, m_vector_size);
 
             // Float constant
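The `update_subtensors` lambda above clamps each subtensor against the trailing planar dimensions of the current shape, so a subtensor never claims more data than the shape provides. A self-contained sketch of that clamping step (`clamp_subtensor` is an illustrative name, not the real helper):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<size_t> clamp_subtensor(const std::vector<size_t>& planar_dims,
                                    const std::vector<size_t>& subtensor) {
    // Take the trailing part of the shape that the subtensor covers...
    const size_t start = planar_dims.size() - subtensor.size();
    std::vector<size_t> result(planar_dims.begin() + start, planar_dims.end());
    // ...and clamp each subtensor value to it.
    for (size_t i = 0; i < result.size(); ++i)
        result[i] = std::min(result[i], subtensor[i]);
    return result;
}

int main() {
    const std::vector<size_t> planar{1, 8, 5};
    const std::vector<size_t> sub{16, 16};
    const auto clamped = clamp_subtensor(planar, sub);  // {8, 5}
    return (clamped[0] == 8 && clamped[1] == 5) ? 0 : 1;
}
```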
values in byte representation const auto float_min_constant = uint32_t(0xff7fffff); @@ -58,15 +62,21 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { // Init value of vector buffer for ReduceMax is -FLOAT_MIN. const auto fill_max = push_node(std::make_shared(vector_buffer_max.second, 0, float_min_constant)); // ReduceMax loop - const auto& max = push_node(std::make_shared(softmax->get_input_source_output(0), fill_max.second)); + const auto fill_max_tail = push_node(std::make_shared(softmax->get_input_source_output(0), inner_increment, float_min_constant)); + + const auto& max = push_node(std::make_shared(fill_max_tail.second, fill_max.second)); const auto horizon_max = push_node(std::make_shared(max.second)); // Markup of ReduceMax Loop - loop_manager->mark_loop(max.first, horizon_max.first, inner_work_amount, m_vector_size, 0, - std::vector{(*max.first)->get_input_port(0), - (*max.first)->get_input_port(1)}, - std::vector{(*max.first)->get_output_port(0)}); + const auto reduce_max_loop_id = loop_manager->mark_loop(fill_max_tail.first, horizon_max.first, inner_work_amount, inner_increment, 0, + std::vector{(*fill_max_tail.first)->get_input_port(0), + (*max.first)->get_input_port(1)}, + std::vector{(*max.first)->get_output_port(0)}); + const auto tail_size = inner_work_amount % inner_increment; + if (tail_size != 0) { + loop_manager->get_loop_info(reduce_max_loop_id)->register_handler(tail_size); + } const auto broadcast_horizon_max = push_node(std::make_shared(horizon_max.second, broadcasted_dim)); const auto vector_buffer_sum = push_node(std::make_shared()); // Init value of vector buffer for ReduceSum is zero. @@ -75,37 +85,40 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { // Sub + Exp + ReduceSum Loop const auto sub = push_node(std::make_shared(softmax->get_input_source_output(0), broadcast_horizon_max.second)); const auto exp = push_node(std::make_shared(sub.second)); - const auto sum = push_node(std::make_shared(exp.second, fill_sum.second)); + const auto fill_sum_tail = push_node(std::make_shared(exp.second, inner_increment, zero_constant)); + const auto sum = push_node(std::make_shared(fill_sum_tail.second, fill_sum.second)); const auto horizon_sum = push_node(std::make_shared(sum.second)); - // Markup of ReduceMax Loop - loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, m_vector_size, 0, - std::vector{(*sub.first)->get_input_port(0), - (*sub.first)->get_input_port(1), - (*sum.first)->get_input_port(1)}, - std::vector{(*exp.first)->get_output_port(0), - (*sum.first)->get_output_port(0)}); + // Markup of ReduceSum Loop + const auto reduce_sum_loop_id = loop_manager->mark_loop(sub.first, horizon_sum.first, inner_work_amount, inner_increment, 0, + std::vector{(*sub.first)->get_input_port(0), + (*sub.first)->get_input_port(1), + (*sum.first)->get_input_port(1)}, + std::vector{(*fill_sum_tail.first)->get_output_port(0), + (*sum.first)->get_output_port(0)}); + if (tail_size != 0) { + loop_manager->get_loop_info(reduce_sum_loop_id)->register_handler(tail_size); + } // Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop const auto pow = push_node(std::make_shared(horizon_sum.second, -1.f)); const auto broadcast_pow = push_node(std::make_shared(pow.second, broadcasted_dim)); // Mul (pseudo-Divide loop) - const auto mul = push_node(std::make_shared(exp.second, broadcast_pow.second)); + const auto mul = push_node(std::make_shared(fill_sum_tail.second, broadcast_pow.second)); // Transfer original 
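The decomposition now clamps the loop increment to the work amount and derives the tail that a last-iteration handler must cover; a handler is registered only when that tail is non-zero. The arithmetic, as a tiny standalone example with illustrative values:

```cpp
#include <algorithm>
#include <cstddef>

int main() {
    const size_t vector_size = 8;          // SIMD lanes processed per iteration
    const size_t inner_work_amount = 100;  // innermost dimension size
    const size_t inner_increment = std::min(inner_work_amount, vector_size);  // 8
    const size_t tail_size = inner_work_amount % inner_increment;             // 100 % 8 == 4
    // tail_size != 0  => a LAST_ITER handler (adjusting the Fill offsets) is
    //                    registered on the loop;
    // tail_size == 0  => the main loop body suffices.
    return tail_size == 4 ? 0 : 1;
}
```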
ExpressionPorts - replace_input_port_connectors({ max.first->get()->get_input_port(0), sub.first->get()->get_input_port(0) }, input_connector); + replace_input_port_connectors({ fill_max_tail.first->get()->get_input_port(0), sub.first->get()->get_input_port(0) }, input_connector); replace_input_port_connectors(output_connector->get_consumers(), (*mul.first)->get_output_port_connector(0)); // Markup of Mul Loop - loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0, - std::vector{(*mul.first)->get_input_port(0), - (*mul.first)->get_input_port(1)}, + loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, inner_increment, 0, + std::vector{(*mul.first)->get_input_port(0), (*mul.first)->get_input_port(1)}, std::vector{(*mul.first)->get_output_port(0)}); // Update Loop info for outer loops - const auto entry_points = std::vector{(*max.first)->get_input_port(0), + const auto entry_points = std::vector{(*fill_max_tail.first)->get_input_port(0), (*sub.first)->get_input_port(0)}; const auto exit_points = std::vector{(*mul.first)->get_output_port(0)}; for (auto loop_id : softmax_loop_ids) { @@ -113,16 +126,6 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { } expr_it = linear_ir.erase(expr_it); // Remove Softmax - - /* =========================================== */ - - /* ============= Runtime Info ================ */ - - // For tail loop we should fill input of Max by float min and - // input of Sum by zero to avoid math incorrect calculations - // TODO [111383]: It should be covered via general pipeline (for example, via analyze in InsertTailLoop?) - max.second->input(0).get_rt_info()["set_fill"] = float_min_constant; - sum.second->input(0).get_rt_info()["set_fill"] = zero_constant; modified = true; } } diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp index 02df0868f4c607..70b9b0ff0d72f8 100644 --- a/src/common/snippets/src/lowered/pass/split_loops.cpp +++ b/src/common/snippets/src/lowered/pass/split_loops.cpp @@ -18,23 +18,24 @@ using LoopManager = LinearIR::LoopManager; using LoopInfo = LoopManager::LoopInfo; using LoopInfoPtr = LoopManager::LoopInfoPtr; -SplitLoops::SplitLoops() : Pass() {} +SplitLoops::SplitLoops() : RangedPass() {} -bool SplitLoops::can_be_split(const LoopInfoPtr& current, const LoopInfoPtr& parent) { - const auto current_dim_idx = current->get_dim_idx(); - const auto parent_dim_idx = parent->get_dim_idx(); +bool SplitLoops::can_be_split(const LoopInfoPtr& loop_to_split, const LoopInfoPtr& loop_to_fuse) { + const auto current_dim_idx = loop_to_split->get_dim_idx(); + const auto parent_dim_idx = loop_to_fuse->get_dim_idx(); + const auto& handlers = loop_to_split->get_handlers(); const bool equal_dim_idxes = current_dim_idx != LoopInfo::UNDEFINED_DIM_IDX && current_dim_idx == parent_dim_idx; - return current->get_work_amount() == parent->get_work_amount() && current->get_increment() != parent->get_increment() && equal_dim_idxes; + const bool only_main_body = handlers.get_first_iter_handelrs().empty() && handlers.get_last_iter_handelrs().empty(); + return loop_to_split->get_work_amount() == loop_to_fuse->get_work_amount() && + loop_to_split->get_increment() != loop_to_fuse->get_increment() && equal_dim_idxes && only_main_body; } -bool SplitLoops::run(LinearIR& linear_ir) { +bool SplitLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, 
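For reference, the dataflow this decomposition builds corresponds to the following scalar softmax, including the PowerStatic(-1) + Multiply replacement for the expensive division; a plain C++ sketch, not the generated kernel:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax_row(const std::vector<float>& x) {
    const float max_v = *std::max_element(x.begin(), x.end());  // ReduceMax loop
    std::vector<float> y(x.size());
    float sum = 0.f;
    for (size_t i = 0; i < x.size(); ++i) {                     // Sub + Exp + ReduceSum loop
        y[i] = std::exp(x[i] - max_v);
        sum += y[i];
    }
    const float inv_sum = std::pow(sum, -1.f);  // PowerStatic(-1), computed once outside the loop
    for (auto& v : y)                           // Mul (pseudo-Divide) loop
        v *= inv_sum;
    return y;
}
```

The Fill-based tail handling replaces the old rt_info "set_fill" annotations removed above: the tail lanes of the ReduceMax input are padded with -FLT_MAX and those of the ReduceSum input with zero, so partial vectors don't corrupt the reductions.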
"Snippets::SplitLoops") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); bool loop_was_split = false; - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto& loop_ids = expr->get_loop_ids(); if (loop_ids.empty()) continue; @@ -59,12 +60,12 @@ bool SplitLoops::run(LinearIR& linear_ir) { continue; const auto parent_loop = loop_manager->get_loop_info(parent_loop_id); - if (can_be_split(loop, parent_loop)) { + const bool split_parent = parent_loop->get_increment() < loop->get_increment(); + const auto& loop_to_split = split_parent ? parent_loop : loop; + const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; + const auto& loop_to_fuse = !split_parent ? parent_loop : loop; + if (can_be_split(loop_to_split, loop_to_fuse)) { loop_was_split = true; - const bool split_parent = parent_loop->get_increment() < loop->get_increment(); - const auto& loop_to_split = split_parent ? parent_loop : loop; - const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; - const auto& loop_to_fuse = !split_parent ? parent_loop : loop; loop_to_split->set_work_amount(loop_to_fuse->get_increment()); const auto loop_bounds = LoopManager::get_loop_bounds(linear_ir, loop_to_split_id, @@ -77,7 +78,15 @@ bool SplitLoops::run(LinearIR& linear_ir) { loop_to_split->get_dim_idx(), loop_to_split->get_entry_points(), loop_to_split->get_exit_points()); - loop_manager->get_loop_info(split_loop_id)->set_outer_splited_loop(true); + const auto& new_loop_info = loop_manager->get_loop_info(split_loop_id); + const auto work_amount = loop_to_fuse->get_work_amount(); + const auto increment = loop_to_fuse->get_increment(); + const auto tail_size = work_amount % increment; + auto new_handlers = loop_to_split->get_handlers(); + if (tail_size != 0) { + new_handlers.register_handler(tail_size); + } + new_loop_info->set_handlers(new_handlers); break; } } @@ -86,7 +95,7 @@ bool SplitLoops::run(LinearIR& linear_ir) { // FuseLoops pass is explicitly run here in order to avoid unnecessary computations // in case if loops are not split but FuseLoops is registered in pass manager after SplitLoops if (loop_was_split) - FuseLoops().run(linear_ir); + FuseLoops().run(linear_ir, begin, end); return loop_was_split; } } // namespace pass diff --git a/src/common/snippets/src/lowered/pass/validate_loops.cpp b/src/common/snippets/src/lowered/pass/validate_loops.cpp index 99698a6b4329bd..43afdc12e63551 100644 --- a/src/common/snippets/src/lowered/pass/validate_loops.cpp +++ b/src/common/snippets/src/lowered/pass/validate_loops.cpp @@ -63,8 +63,6 @@ bool ValidateLoops::run(LinearIR& linear_ir) { "Incorrect Loop ID configuration: the Loops with splitted dimension should be successively nested"); OPENVINO_ASSERT(loop_manager->get_loop_info(loop_ids[i - 1])->get_increment() == loop_manager->get_loop_info(id)->get_work_amount(), "Incorrect Loop ID configuration: the Loops with splitted dimension should be successively nested"); - OPENVINO_ASSERT(loop_manager->get_loop_info(loop_ids[i - 1])->get_outer_splited_loop(), - "Incorrect Loop ID configuration: the outer Loop with splitted dimension should have `outer_splited_loop=True`"); } dim_indexes.push_back(dim_idx); } diff --git a/src/common/snippets/src/lowered/pass/validate_shapes.cpp b/src/common/snippets/src/lowered/pass/validate_shapes.cpp index 8d12004313e0bf..8e90cc723576fa 100644 --- a/src/common/snippets/src/lowered/pass/validate_shapes.cpp +++ 
b/src/common/snippets/src/lowered/pass/validate_shapes.cpp @@ -13,10 +13,11 @@ namespace snippets { namespace lowered { namespace pass { -bool ValidateShapes::run(LinearIR& linear_ir) { +bool ValidateShapes::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ValidateShapes") - for (const auto& expr : linear_ir) { + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; const auto num_inputs = expr->get_input_count(); const auto& port_connectors = expr->get_input_port_connectors(); const auto& port_descriptors = expr->get_input_port_descriptors(); diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 3b23da523d53ca..5ff96826a74f71 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -473,7 +473,7 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const std::shared_ptrgenerate(linear_ir, lowering_result, compile_params); diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp index 455c261cec5109..209ecb4592368a 100644 --- a/src/common/snippets/tests/src/lowered/pass/loop.cpp +++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp @@ -11,7 +11,8 @@ #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_load_store.hpp" #include "snippets/lowered/pass/insert_loops.hpp" -#include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/insert_specific_iterations.hpp" +#include "snippets/lowered/pass/iter_handler.hpp" #include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" #include "snippets/lowered/pass/validate_loops.hpp" #include "snippets/shape_inference/shape_inference.hpp" @@ -38,7 +39,7 @@ static void init_linear_ir(const std::vector& in_shapes, Linea const auto in_shape0 = in_shapes[0].get_shape(); const auto in_shape1 = in_shapes[1].get_shape(); const auto inner_wa = std::max(*in_shape0.rbegin(), *in_shape1.rbegin()); - const auto inner_inc = vector_size; + const auto inner_inc = std::min(vector_size, inner_wa); const auto blocked_wa = block_size; const auto blocked_inc = 1; const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1)); @@ -46,7 +47,12 @@ static void init_linear_ir(const std::vector& in_shapes, Linea loop_manager->mark_loop(expr_it, std::next(expr_it), inner_wa, inner_inc, 0, loop_entry_points, loop_exit_points); loop_manager->mark_loop(expr_it, std::next(expr_it), blocked_wa, blocked_inc, 1, loop_entry_points, loop_exit_points); const auto loop_id = loop_manager->mark_loop(expr_it, std::next(expr_it), outer_wa, outer_inc, 1, loop_entry_points, loop_exit_points); - loop_manager->get_loop_info(loop_id)->set_outer_splited_loop(true); + const auto& outer_loop_info = loop_manager->get_loop_info(loop_id); + const auto outer_tail_size = outer_wa % outer_inc; + if (outer_tail_size != 0) { + outer_loop_info->register_handler(outer_tail_size); + } } static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr& config) { @@ -55,7 +61,7 @@ static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(); + pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(); pipeline.run(linear_ir); @@ -84,7 +90,7 @@ 
TEST(Snippets_TailProcessingTransformation, BlockedWOTail_OriginalPtrShifts) { auto config = std::make_shared(); config->disable(); - config->disable(); + config->disable(); config->disable(); apply_transformations(linear_ir, config); @@ -104,7 +110,7 @@ TEST(Snippets_TailProcessingTransformation, BlockedWOTail_CleanUpPtrShifts) { init_linear_ir({inputShape0, inputShape1}, linear_ir, 4); auto config = std::make_shared(); - config->disable(); + config->disable(); config->disable(); apply_transformations(linear_ir, config); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp index b32a78cdbe8e5f..1c05100317ae5f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp @@ -59,10 +59,19 @@ void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector Vmm src_vmm = Vmm(in[0]); Vmm dst_vmm = Vmm(out[0]); - if (is_full_reg()) + const size_t supported_et_size = 4; + const auto register_capacity = (src_vmm.getBit() / 8) / supported_et_size; + if (offset == register_capacity) { + // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be removed from the LIR + // TODO: when inplace is supported, remove such Fill ops from the LIR and remove this logic. + // Ticket: 126270 + if (src_vmm.getIdx() != dst_vmm.getIdx()) + h->uni_vmovups(dst_vmm, src_vmm); + } else if (is_full_reg()) { fill_full(dst_vmm); - else + } else { fill_tail(src_vmm, dst_vmm); + } } template diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp index 1ea2418f995463..f5bfa19a7dcf66 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp @@ -32,19 +32,19 @@ class BrgemmCPU : public snippets::op::Brgemm { BrgemmCPU(const Output& A, const Output& B, const Type type, const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_c = 0, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Output& scratch, const Type type, const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_scratch = 0, const size_t offset_c = 0, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Type type, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_c, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU(const Output& A, const Output& B, const Output& scratch, 
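The fill-emitter workaround above compares the Fill offset with the register's capacity in 4-byte elements; when they are equal there is nothing left to fill, so the emitter degenerates to a plain register move. The capacity arithmetic as compile-time checks (assuming the usual XMM/YMM/ZMM widths):

```cpp
#include <cstddef>

constexpr size_t element_size = 4;  // the emitter currently supports 4-byte types only
constexpr size_t register_capacity_in_elements(size_t register_bits) {
    return (register_bits / 8) / element_size;
}
static_assert(register_capacity_in_elements(128) == 4,  "XMM holds 4 floats");
static_assert(register_capacity_in_elements(256) == 8,  "YMM holds 8 floats");
static_assert(register_capacity_in_elements(512) == 16, "ZMM holds 16 floats");
```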
const Type type, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_scratch, const PortDescriptor& desc_c, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}, - const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 0.f); + const size_t blk_size_m = 0, const size_t blk_size_k = 0, const size_t blk_size_n = 0, const float beta = 1.f); BrgemmCPU() = default; void validate_and_infer_types() override; @@ -83,7 +83,7 @@ class BrgemmCPU : public snippets::op::Brgemm { size_t m_M_blk = 0; size_t m_K_blk = 0; size_t m_N_blk = 0; - float m_beta = 0.f; + float m_beta = 1.f; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index 5a6236d1916b13..16e6f897af5691 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -4,25 +4,25 @@ #include "brgemm_blocking.hpp" -#include "openvino/pass/pattern/matcher.hpp" -#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "cpu_iter_handlers.hpp" #include "snippets/itt.hpp" -#include "snippets/utils.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/pass.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" - namespace ov { namespace intel_cpu { namespace pass { using LinearIR = snippets::lowered::LinearIR; using LoopPort = LinearIR::LoopManager::LoopPort; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; +using LoopInfo = LinearIR::LoopManager::LoopInfo; +using namespace ov::snippets::lowered::pass; -BrgemmBlocking::BrgemmBlocking() : Pass() {} +BrgemmBlocking::BrgemmBlocking() : RangedPass() {} void BrgemmBlocking::move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it) { const auto& brgemm_expr = brgemm_it->get(); @@ -36,11 +36,8 @@ void BrgemmBlocking::move_new_memory_buffer(snippets::lowered::LinearIR& linear_ } } -bool BrgemmBlocking::run(LinearIR& linear_ir) { +bool BrgemmBlocking::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmBlocking") - if (linear_ir.empty()) - return false; - const auto& loop_manager = linear_ir.get_loop_manager(); auto blocking_loop_exists = [&](const ExpressionPtr& brgemm_expr, const std::shared_ptr& brgemm) { auto check_port = [&](const LoopPort& p) { @@ -59,7 +56,7 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { }; bool modified = false; - for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& brgemm_expr = *expr_it; const auto brgemm = ov::as_type_ptr(brgemm_expr->get_node()); if (!brgemm || blocking_loop_exists(brgemm_expr, brgemm)) @@ -83,22 +80,22 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_m >= m) { *(in_0_subtensor.rbegin() + 1) = m; *(out_subtensor.rbegin() + 1) = m; - } else { - *(in_0_subtensor.rbegin() + 1) = block_size_m; - *(out_subtensor.rbegin() + 1) = block_size_m; - - auto 
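The default beta flips from 0.f to 1.f because, under K-blocking, only the first K-block may overwrite the accumulator; every later block must accumulate into it. The standard GEMM beta semantics (C = A*B + beta*C), as a plain reference loop rather than the Brgemm kernel itself:

```cpp
#include <cstddef>
#include <vector>

// beta == 0.f overwrites c; beta == 1.f accumulates into it.
void gemm_reference(const std::vector<float>& a, const std::vector<float>& b,
                    std::vector<float>& c, size_t M, size_t N, size_t K, float beta) {
    for (size_t m = 0; m < M; ++m)
        for (size_t n = 0; n < N; ++n) {
            float acc = 0.f;
            for (size_t k = 0; k < K; ++k)
                acc += a[m * K + k] * b[k * N + n];
            c[m * N + n] = beta * c[m * N + n] + acc;
        }
}
```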
loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true), - LoopPort(brgemm_expr->get_input_port(1), false)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), false); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; - loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits); + return; } + + *(in_0_subtensor.rbegin() + 1) = block_size_m; + *(out_subtensor.rbegin() + 1) = block_size_m; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true), + LoopPort(brgemm_expr->get_input_port(1), false)}; + if (brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), false); + } else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); + } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; + loop_manager->mark_loop(loop_begin_it, loop_end_it, m, block_size_m, 1, entries, exits); }; auto apply_n_blocking = [&]() { @@ -107,22 +104,22 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_n >= n) { *in_1_subtensor.rbegin() = n; *out_subtensor.rbegin() = n; - } else { - *in_1_subtensor.rbegin() = block_size_n; - *out_subtensor.rbegin() = block_size_n; - - auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), false), - LoopPort(brgemm_expr->get_input_port(1), true)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), true); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; - loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits); + return; + } + + *in_1_subtensor.rbegin() = block_size_n; + *out_subtensor.rbegin() = block_size_n; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), false), + LoopPort(brgemm_expr->get_input_port(1), true)}; + if (brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), true); + } else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; + loop_manager->mark_loop(loop_begin_it, loop_end_it, n, block_size_n, 0, entries, exits); }; auto apply_k_blocking = [&]() { @@ -132,59 +129,25 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { if (block_size_k >= k) { *in_0_subtensor.rbegin() = k; *(in_1_subtensor.rbegin() + 1) = k; - } else { - *in_0_subtensor.rbegin() = block_size_k; - *(in_1_subtensor.rbegin() + 1) = block_size_k; - - auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 0), - LoopPort(brgemm_expr->get_input_port(1), true, 1)}; - if (brgemm->is_with_compensations()) { - entries.emplace_back(brgemm_expr->get_input_port(2), false, 1); - } else if (brgemm->is_amx()) { - move_new_memory_buffer(linear_ir, expr_it); - loop_begin_it = std::prev(expr_it); - } - std::vector 
exits{LoopPort(brgemm_expr->get_output_port(0), false)}; - auto loop_id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits); - const auto loop_info = loop_manager->get_loop_info(loop_id); - - auto first_iter_handler = [](LinearIR& linear_ir, LinearIR::constExprIt loop_end_it) { - const auto loop_end = ov::as_type_ptr(loop_end_it->get()->get_node()); - OPENVINO_ASSERT(loop_end, "First loop iteraton handler must be called on LoopEnd expression"); - const auto loop_id = loop_end->get_id(); - const auto& loop_manager = linear_ir.get_loop_manager(); - const auto& loop_info = loop_manager->get_loop_info(loop_id); - const auto work_amount = loop_info->get_work_amount(); - const auto increment = loop_info->get_increment(); - if (work_amount <= increment) - return false; - - const auto loop_begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_end->get_loop_begin())); - const auto new_loop_begin_pos = snippets::lowered::pass::InsertTailLoop::insert_copy_loop(linear_ir, loop_id, loop_begin_it); - const auto new_loop_begin = ov::as_type_ptr(new_loop_begin_pos->get()->get_node()); - OPENVINO_ASSERT(new_loop_begin, "Cloned Loop does not contain LoopBegin op at the expected place."); - const auto firt_iter_loop_end = new_loop_begin->get_loop_end(); - auto first_iter_loop_info = loop_manager->get_loop_info(firt_iter_loop_end->get_id()); - firt_iter_loop_end->set_work_amount(increment); - first_iter_loop_info->set_work_amount(increment); - firt_iter_loop_end->set_finalization_offsets(std::vector(loop_end->get_finalization_offsets().size(), 0)); - - const auto new_work_amount = work_amount - increment; - loop_info->set_work_amount(new_work_amount); - loop_end->set_work_amount(new_work_amount); - - // Update original body's Brgemms with new beta parameter - for (auto expr_it = loop_begin_it; expr_it != loop_end_it; ++expr_it) { - const auto& expr_node = expr_it->get()->get_node(); - if (const auto brgemm = ov::as_type_ptr(expr_node)) { - brgemm->set_beta(1.f); - } - } - return true; - }; - loop_info->set_first_iter_handler(first_iter_handler); + brgemm->set_beta(0.f); + return; + } + + *in_0_subtensor.rbegin() = block_size_k; + *(in_1_subtensor.rbegin() + 1) = block_size_k; + auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it); + std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 0), + LoopPort(brgemm_expr->get_input_port(1), true, 1)}; + if (brgemm->is_with_compensations()) { + entries.emplace_back(brgemm_expr->get_input_port(2), false, 1); + } else if (brgemm->is_amx()) { + move_new_memory_buffer(linear_ir, expr_it); + loop_begin_it = std::prev(expr_it); } + std::vector exits{LoopPort(brgemm_expr->get_output_port(0), false)}; + const auto id = loop_manager->mark_loop(loop_begin_it, loop_end_it, k, block_size_k, entries, exits); + const auto loop_info = loop_manager->get_loop_info(id); + loop_info->register_handler(0.f); }; apply_k_blocking(); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp index 81ae47aa3c6948..483a2c5ba53100 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp @@ -16,11 +16,13 @@ namespace pass { * @ingroup snippets */ -class BrgemmBlocking : public snippets::lowered::pass::Pass { +class BrgemmBlocking : public 
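The removed `first_iter_handler` lambda cloned the loop body by hand and patched beta on the copied Brgemms; the replacement registers SetBrgemmBeta(0.f) on the loop info (presumably as a FIRST_ITER handler), and the generic specific-iterations machinery materializes the peeled body. The control flow both versions aim for, sketched with plain loops and placeholder comments:

```cpp
#include <cstddef>

void k_blocked_brgemm(size_t K, size_t block_k) {
    size_t k = 0;
    if (k < K) {
        // Peeled first K-block: beta = 0, the accumulator is overwritten.
        // brgemm(..., /*beta=*/0.f) over [0, min(block_k, K))
        k += block_k;
    }
    for (; k < K; k += block_k) {
        // Remaining K-blocks: beta = 1, partial products accumulate.
        // brgemm(..., /*beta=*/1.f) over [k, min(k + block_k, K))
    }
}
```

When no K-blocking is applied (`block_size_k >= k` above), the pass simply sets beta to 0.f on the single Brgemm, since there is nothing to accumulate across blocks.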
snippets::lowered::pass::RangedPass {
 public:
     OPENVINO_RTTI("BrgemmBlocking", "Pass")
     BrgemmBlocking();
-    bool run(snippets::lowered::LinearIR& linear_ir) override;
+    bool run(snippets::lowered::LinearIR& linear_ir,
+             snippets::lowered::LinearIR::constExprIt begin,
+             snippets::lowered::LinearIR::constExprIt end) override;
 
 private:
     static void move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it);
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
new file mode 100644
index 00000000000000..382ee78d8be59e
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "cpu_iter_handlers.hpp"
+
+#include "snippets/lowered/loop_manager.hpp"
+#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+using LinearIR = snippets::lowered::LinearIR;
+using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
+
+SetBrgemmBeta::SetBrgemmBeta(float beta) : snippets::lowered::pass::RangedPass(), m_beta(beta) {}
+
+bool SetBrgemmBeta::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
+    for (auto expr_it = begin; expr_it != end; ++expr_it) {
+        const auto& expr = expr_it->get();
+        if (const auto brgemm = ov::as_type_ptr(expr->get_node())) {
+            brgemm->set_beta(m_beta);
+        }
+    }
+    return true;
+}
+
+std::shared_ptr SetBrgemmBeta::merge(const std::shared_ptr& other) {
+    const auto merged_pass = std::make_shared(m_beta);
+    if (other == nullptr)
+        return merged_pass;
+    const auto casted_pass = ov::as_type_ptr(other);
+    if (!casted_pass || m_beta != casted_pass->m_beta)
+        return nullptr;
+    return merged_pass;
+}
+}  // namespace pass
+}  // namespace intel_cpu
+}  // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp
new file mode 100644
index 00000000000000..5da97e29796f70
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/cpu_iter_handlers.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/pass/iter_handler.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+/**
+ * @interface SetBrgemmBeta
+ * @brief The pass updates all BrgemmCPU nodes with a new beta value
+ * @param m_beta - beta which must be set
+ * @ingroup snippets
+ */
+class SetBrgemmBeta : public snippets::lowered::pass::RangedPass {
+public:
+    SetBrgemmBeta(float beta);
+    OPENVINO_RTTI("SetBrgemmBeta", "RangedPass")
+    bool run(snippets::lowered::LinearIR& linear_ir,
+             snippets::lowered::LinearIR::constExprIt begin,
+             snippets::lowered::LinearIR::constExprIt end) override;
+    std::shared_ptr merge(const std::shared_ptr& other) override;
+
+private:
+    float m_beta;
+};
+}  // namespace pass
+}  // namespace intel_cpu
+}  // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp
b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp index 165f9626014290..722ead2258a3ba 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp @@ -92,12 +92,14 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_store_convert(snippets::low return true; } -bool ov::intel_cpu::pass::FuseLoadStoreConvert::run(snippets::lowered::LinearIR& linear_ir) { +bool ov::intel_cpu::pass::FuseLoadStoreConvert::run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::FuseLoadStoreConvert") bool modified = false; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { + for (auto expr_it = begin; expr_it != end; expr_it++) { const auto& expr = *expr_it; const auto& convert = expr->get_node(); if (!ov::is_type(convert)) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp index 0ff16116fa9002..32d862bc8d8356 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp @@ -18,11 +18,13 @@ namespace pass { * Fuse Store and ConvertTruncation into one op StoreConvertTruncation * @ingroup snippets */ -class FuseLoadStoreConvert: public snippets::lowered::pass::Pass { +class FuseLoadStoreConvert: public snippets::lowered::pass::RangedPass { public: FuseLoadStoreConvert() = default; - OPENVINO_RTTI("FuseLoadStoreConvert", "Pass"); - bool run(snippets::lowered::LinearIR& linear_ir) override; + OPENVINO_RTTI("FuseLoadStoreConvert", "RangedPass"); + bool run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) override; private: bool fuse_load_convert(snippets::lowered::LinearIR& linear_ir, diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp index 0f14f9a7dc5d8a..68fdda2f7f83df 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp @@ -10,7 +10,9 @@ #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" -bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::LinearIR& linear_ir) { +bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SetBrgemmCopyBBuffersShape") auto get_buffer_from_output = [](const snippets::lowered::ExpressionPtr& expr, const size_t out_idx) { @@ -22,7 +24,8 @@ bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::Lin }; bool modified = false; - for (const auto& expr : linear_ir) { + for (auto expr_it 
= begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; if (auto copy_b = ov::as_type_ptr(expr->get_node())) { const auto buffer = get_buffer_from_output(expr, 0); const auto& out_desc = expr->get_output_port_descriptor(0); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp index c7eec92700a16a..81c4629907e0d7 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp @@ -17,11 +17,13 @@ namespace pass { * Ticket: 113744 * @ingroup snippets */ -class SetBrgemmCopyBBuffersShape: public snippets::lowered::pass::Pass { +class SetBrgemmCopyBBuffersShape: public snippets::lowered::pass::RangedPass { public: SetBrgemmCopyBBuffersShape() = default; OPENVINO_RTTI("SetBrgemmCopyBBuffersShape", "Pass"); - bool run(snippets::lowered::LinearIR& linear_ir) override; + bool run(snippets::lowered::LinearIR& linear_ir, + snippets::lowered::LinearIR::constExprIt begin, + snippets::lowered::LinearIR::constExprIt end) override; }; } // namespace pass diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp index dc25378528199c..abecd3c954e0cb 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -21,7 +21,9 @@ std::vector> input_shapes{ {{1, 1, 32, 23}, {1, 1, 23, 68}}, {{1, 16, 384, 64}, {1, 16, 64, 384}}, {{1, 1, 100, 700}, {1, 1, 700, 100}}, + {{1, 1, 100, 1024}, {1, 1, 1024, 100}}, {{1, 1, 100, 2500}, {1, 1, 2500, 100}}, + {{1, 1, 100, 4500}, {1, 1, 4500, 100}}, }; static inline std::vector> quantized_precisions() {