diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 7540c950e32253..e88c638c5205d7 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -120,10 +120,20 @@ class Generator { public: // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission. bool m_save_lowered_code = false; + // True if we can optimize tails for single evaluation during code generation + // More details and optimization examples can be found in the generate() method + // For example, tails with Buffer ops don't support single evaluation optimizations + // because of that we should always reset memory pointer using finalization offsets + // after data storing to Buffer + bool m_optimize_single_evaluation = true; + // True if we should check runtime info for nodes to call specific needed transformations + bool m_need_fill_tail_register = false; }; /** * @brief virtual method any specific implementation should implement * @param m model in canonical for for table-based code generation + * @param config config with transformation and optimization parameters + * @param compile_params parameters for generated code * @return pointer to generated code */ code generate(std::shared_ptr& m, const GeneratorConfig& config, const void* compile_params = nullptr); diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp index 83471c04d0553a..2746d974a06400 100644 --- a/src/common/snippets/include/snippets/op/brgemm.hpp +++ b/src/common/snippets/include/snippets/op/brgemm.hpp @@ -19,13 +19,27 @@ namespace op { class Brgemm : public ngraph::op::v0::MatMul { public: OPENVINO_OP("Brgemm", "SnippetsOpset", ngraph::op::v0::MatMul); - Brgemm(const Output& A, const Output& B); + Brgemm(const Output& A, const Output& B, const size_t offset_a = 0lu, const size_t offset_b = 
0lu, const size_t offset_c = 0lu); Brgemm() = default; + bool visit_attributes(AttributeVisitor& visitor) override; void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; bool has_evaluate() const override { return false; } + + size_t get_offset_a() const { return m_offset_a; } + size_t get_offset_b() const { return m_offset_b; } + size_t get_offset_c() const { return m_offset_c; } + + void set_offset_a(const size_t offset) { m_offset_a = offset; } + void set_offset_b(const size_t offset) { m_offset_b = offset; } + void set_offset_c(const size_t offset) { m_offset_c = offset; } + +private: + size_t m_offset_a = 0lu; // offset for first input + size_t m_offset_b = 0lu; // offset for second input + size_t m_offset_c = 0lu; // offset for output }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/broadcastload.hpp b/src/common/snippets/include/snippets/op/broadcastload.hpp index 851c0ca8c3ea7d..1247dbd95810f5 100644 --- a/src/common/snippets/include/snippets/op/broadcastload.hpp +++ b/src/common/snippets/include/snippets/op/broadcastload.hpp @@ -21,14 +21,18 @@ class BroadcastLoad : public BroadcastMove { public: OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ngraph::snippets::op::BroadcastMove); - BroadcastLoad(const Output& x, ov::PartialShape output_shape); + BroadcastLoad(const Output& x, ov::PartialShape output_shape, size_t offset = 0lu); BroadcastLoad() = default; - bool visit_attributes(AttributeVisitor& visitor) override; + size_t get_offset() const { return m_offset; } + void set_offset(const size_t offset) { m_offset = offset; } + bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - void validate_and_infer_types() override; + +private: + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/buffer.hpp 
b/src/common/snippets/include/snippets/op/buffer.hpp new file mode 100644 index 00000000000000..e5113f218c3257 --- /dev/null +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Buffer + * @brief The operation is for intermediate data storage + * - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank]. + * It's needed to allocate needed memory size that depends on Tile rank, for example. + * Default value is -1 (full shape) + * Notes: + * - All buffers in a graph have the same memory pointer. So if we have a few buffers, + * each the corresponding MemoryAccess op for Buffer should have offset for common memory pointer of this Buffer + * - Buffer should be a single consumer for operation output port + * @ingroup snippets + */ +class Buffer : public ngraph::op::Op { +public: + OPENVINO_OP("Buffer", "SnippetsOpset"); + BWDCMP_RTTI_DECLARATION; + + Buffer(const Output& x, const int32_t allocation_rank = -1); + Buffer() = default; + + int32_t get_allocation_rank() const { return m_allocation_rank; } + void set_allocation_rank(int32_t rank) { m_allocation_rank = rank; } + + size_t get_byte_size() const; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + int32_t m_allocation_rank = -1; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp new file mode 100644 index 00000000000000..85b95ec3799d31 --- /dev/null +++ b/src/common/snippets/include/snippets/op/fill.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2022 
Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Fill + * @brief Generated in Tail Loop vector representation in code generation step for cases when we should + * refill registers by special values. + * For example, for cases with ReduceMax or ReduceSum in Softmax + * Where: + * - offset - starting element index where filling is performed while beginning of input data is untouched + * - fill_value - hexadecimal filling value + * @ingroup snippets + */ +class Fill : public ngraph::op::Op { +public: + OPENVINO_OP("Fill", "SnippetsOpset"); + + Fill(const Output& x, const size_t offset, const uint32_t fill_value = 0x0); + Fill() = default; + + size_t get_offset() const { return m_offset; } + uint32_t get_fill_value() const { return m_fill_value; } + + void set_offset(const size_t offset) { m_offset = offset; } + void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; } + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +protected: + size_t m_offset = 0lu; + uint32_t m_fill_value = 0x0; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_max.hpp b/src/common/snippets/include/snippets/op/horizon_max.hpp new file mode 100644 index 00000000000000..d26c4a8c9e58c6 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_max.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonMax + * @brief The operation calculates a horizon maximum of a vector register + * @ingroup snippets + */ +class HorizonMax : public 
ngraph::op::Op { +public: + OPENVINO_OP("HorizonMax", "SnippetsOpset"); + + HorizonMax(const Output& x); + HorizonMax() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_sum.hpp b/src/common/snippets/include/snippets/op/horizon_sum.hpp new file mode 100644 index 00000000000000..2dc25374bc0f70 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_sum.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonSum + * @brief The operation calculates a horizon sum of a vector register + * @ingroup snippets + */ +class HorizonSum : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonSum", "SnippetsOpset"); + + HorizonSum(const Output& x); + HorizonSum() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index fc500e038f6375..157f25f74c7e34 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -13,16 +13,16 @@ namespace op { /** * @interface Load - * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading - * where number of elements to load is determined by "count" - * Default value is "1" - to load one element 
+ * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data loading + * where number of elements to load is determined by "count" (Default value is "1" - to load one element) + * and memory offset for loading is determined by "offset" (Default value is "0" - to load starting from the first element) * @ingroup snippets */ class Load : public MemoryAccess { public: OPENVINO_OP("Load", "SnippetsOpset"); - Load(const Output& x, const size_t count = 1lu); + Load(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Load() = default; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; @@ -38,7 +38,7 @@ class Load : public MemoryAccess { class LoadReshape : public Load { public: OPENVINO_OP("LoadReshape", "SnippetsOpset"); - LoadReshape(const Output& x, size_t count = 1lu, std::vector order = {}); + LoadReshape(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; diff --git a/src/common/snippets/include/snippets/op/memory_access.hpp b/src/common/snippets/include/snippets/op/memory_access.hpp index 5971c5cc5ce744..f1b2d8ebb2f00d 100644 --- a/src/common/snippets/include/snippets/op/memory_access.hpp +++ b/src/common/snippets/include/snippets/op/memory_access.hpp @@ -24,14 +24,17 @@ class MemoryAccess : public ngraph::op::Op { OPENVINO_OP("MemoryAccess", "SnippetsOpset"); size_t get_count() const; - void set_count(size_t count); + size_t get_offset() const; + void set_count(const size_t count); + void set_offset(const size_t offset); bool visit_attributes(AttributeVisitor& visitor) override; void validate_and_infer_types() override; protected: - explicit MemoryAccess(const Output& x, size_t count = 1lu); + explicit MemoryAccess(const Output& 
x, size_t count = 1lu, size_t offset = 0lu); MemoryAccess() = default; size_t m_count = 0lu; + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index b83a4fdcec2b18..48c7466b924cff 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ b/src/common/snippets/include/snippets/op/store.hpp @@ -13,16 +13,16 @@ namespace op { /** * @interface Store - * @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing - * where number of elements to store is determined by "count" - * Default value is "1" - to store one element + * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data storing + * where number of elements to store is determined by "count" (Default value is "1" - to store one element) + * and memory offset for storing is determined by "offset" (Default value is "0" - to store starting at start memory ptr) * @ingroup snippets */ class Store : public MemoryAccess { public: OPENVINO_OP("Store", "SnippetsOpset"); - Store(const Output& x, const size_t count = 1lu); + Store(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Store() = default; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index 31975978695c5f..6774afef1b3f6c 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -89,21 +89,13 @@ class Subgraph : public ngraph::op::Op { return m_generator; } - size_t get_non_scalar_constants_count() const { - return m_non_scalar_constants_count; - } - - bool is_quantized() const { - return config.m_is_quantized; - } - - bool has_type_relaxed_ops() const { - return 
config.m_has_type_relaxed_ops; - } - - bool has_domain_sensitive_ops() const { - return config.m_has_domain_sensitive_ops; - } + // Return common memory size for all buffers in body. Should be called only after tileRank setting + size_t get_buffer_scratchpad_size() const; + size_t get_virtual_port_count() const { return m_virtual_port_count; } + bool is_buffer_needed() const { return m_buffer_needed; } + bool is_quantized() const { return config.m_is_quantized; } + bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; } + bool has_domain_sensitive_ops() const { return config.m_has_domain_sensitive_ops; } snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt, const void* compile_params = nullptr); @@ -117,8 +109,9 @@ class Subgraph : public ngraph::op::Op { // plugin sets generator for a snippet to some specific generator. // it's going to be replaced with Jitters table later void set_generator(std::shared_ptr generator); - void set_non_scalar_constants_count(const size_t count); void set_tile_rank(size_t newRank) {tileRank = newRank;} + void set_virtual_port_count(const size_t count); + void set_buffer_needed(const bool need); void print() const; void print_statistics(bool verbose); @@ -133,11 +126,14 @@ class Subgraph : public ngraph::op::Op { void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes); void convert_to_snippet_dialect(); void init_config(); - // Count of potentional non-scalar Consants that will be created after some tranformations - // At the moment it's relevant only for FakeQuantize decomposition - // NOTE: To avoid overheads in each calcution of this count (for example, in validate_and_type_infer()), + // Count of Subgraph virtual ports: + // - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition) + // Need 
Buffer op or not + // - Buffers. All Buffers are considered as one common additional virtual port. So we cannot summarize them as potential non-scalar Constants + // NOTE: To avoid overheads in each calculation of this count (for example, in validate_and_type_infer()), // we should MANUALLY calculate it where it needed. - size_t m_non_scalar_constants_count = 0; + size_t m_virtual_port_count = 0; + bool m_buffer_needed = false; Shape exec_domain = {}; std::shared_ptr m_body = nullptr; std::shared_ptr m_generator = nullptr; @@ -162,11 +158,12 @@ class Subgraph : public ngraph::op::Op { // True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes // because TypeRelaxed::copy_with_new_inputs() isn't save-thread method bool m_has_type_relaxed_ops = false; - // True if we should check runtime info for nodes to call specific needed transformations - bool m_need_fill_tail_register = false; // True if body has operations that don't support plugin-side domain optimizations // (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing) bool m_has_domain_sensitive_ops = false; + // True if we should go through whole body to check for where loops should be explicitly inserted. 
+ // Otherwise, we insert Loops on Parameters and Results - for example, it's optimized out for subgraph with only Eltwise ops + bool m_explicit_loop_insertion = false; } config; }; diff --git a/src/common/snippets/include/snippets/op/vector_buffer.hpp b/src/common/snippets/include/snippets/op/vector_buffer.hpp new file mode 100644 index 00000000000000..9d93e4c01577bf --- /dev/null +++ b/src/common/snippets/include/snippets/op/vector_buffer.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface VectorBuffer + * @brief The operation is for intermediate data storage in vector register + * @ingroup snippets + */ +class VectorBuffer : public ngraph::op::Op { +public: + OPENVINO_OP("VectorBuffer", "SnippetsOpset"); + + VectorBuffer(const ov::element::Type element_type = ov::element::f32); + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + ov::element::Type m_element_type; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_buffer.hpp b/src/common/snippets/include/snippets/pass/insert_buffer.hpp new file mode 100644 index 00000000000000..a7fe4f00208fef --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_buffer.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertBuffer + * @brief The pass inserts Buffers on Inputs and Outputs of special operations [Softmax, Transpose] if it's needed + * @param allocation_rank - rank of shape for Buffer memory allocation: 
shape[shape_rank - normalize(m_allocation_rank) : shape_rank]. + * It's needed to allocate needed memory size that depends on Tile rank, for example. + * Default value is -1 (full shape) + * @ingroup snippets + */ +class InsertBuffer: public ngraph::pass::MatcherPass { +public: + InsertBuffer(const int32_t allocation_rank = -1); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_load_store.hpp b/src/common/snippets/include/snippets/pass/insert_load_store.hpp index dc1bf6b3e68717..6935c7495cc38f 100644 --- a/src/common/snippets/include/snippets/pass/insert_load_store.hpp +++ b/src/common/snippets/include/snippets/pass/insert_load_store.hpp @@ -13,7 +13,7 @@ namespace pass { /** * @interface InsertLoad - * @brief Inserts explicit load instruction after each parameter. + * @brief Inserts explicit load instruction after each parameter and buffer. * The pass is used to convert model to a canonical form for code generation * @ingroup snippets */ @@ -24,7 +24,7 @@ class InsertLoad: public ngraph::pass::MatcherPass { /** * @interface InsertStore - * @brief Inserts explicit store instruction before each result. + * @brief Inserts explicit store instruction before each result and buffer. 
* The pass is used to convert model to a canonical form for code generation * @ingroup snippets */ diff --git a/src/common/snippets/include/snippets/pass/insert_loops.hpp b/src/common/snippets/include/snippets/pass/insert_loops.hpp index 5b4fe1e3c24fde..57046789167ad5 100644 --- a/src/common/snippets/include/snippets/pass/insert_loops.hpp +++ b/src/common/snippets/include/snippets/pass/insert_loops.hpp @@ -17,18 +17,25 @@ namespace pass { * @param master_shape - shape used to determine loop work amounts * @param loop_depth - the number of last master_shape dimensions processed by loops (aka tileRank - obsolete), could be 1 or 2 * @param vector_size - the number of entities processed on one iteration of vector loop + @param single_loop_body - true, if we can just insert LoopBegin on inputs and LoopEnd on outputs, otherwise + the pass goes all over the body analyzing where LoopBegin and LoopEnd should be inserted: + synchronization nodes are MatMul, Buffer and other already existing Loops. 
* @ingroup snippets */ class InsertLoops: public ngraph::pass::FunctionPass { public: OPENVINO_RTTI("InsertLoops", "0"); - InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size); + InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool is_optimized = true); bool run_on_model(const std::shared_ptr& m) override; + static std::vector calculate_inner_apply_increments(const ov::PartialShape& master, const std::vector& shapes); + static std::vector calculate_outer_apply_increments(const std::vector& shapes); + static std::vector calculate_finalization_offsets(const ov::PartialShape& master, const std::vector& shapes); private: ov::PartialShape m_master_shape; size_t m_loop_depth; size_t m_vector_size; + bool m_single_loop_body; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/propagate_buffer_offset.hpp b/src/common/snippets/include/snippets/pass/propagate_buffer_offset.hpp new file mode 100644 index 00000000000000..b73a53f2faf795 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/propagate_buffer_offset.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface PropagateBufferOffset + * @brief All buffers in body have one common memory pointer. To correct work with them each buffer has own offset for common memory ptr + * The pass consistently set offset in the corresponding for Buffer MemoryAccess nodes: Load, Store, MatMul. 
+ * @ingroup snippets + */ +class PropagateBufferOffset: public ngraph::pass::MatcherPass { +public: + PropagateBufferOffset(); + +private: + size_t current_offset = 0lu; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/reset_buffer.hpp b/src/common/snippets/include/snippets/pass/reset_buffer.hpp new file mode 100644 index 00000000000000..599b533e3ebf1e --- /dev/null +++ b/src/common/snippets/include/snippets/pass/reset_buffer.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface ResetBufferState + * @brief If there is Buffer between loops we should reset Buffer pointer after first loop execution (data storing) using finalization offsets + * to have correct buffer data pointer for data loading in the next loop where data was stored in previous loop + * @ingroup snippets + */ +class ResetBufferState: public ngraph::pass::MatcherPass { +public: + ResetBufferState(); + + static int64_t calculate_required_finalization_offsets(const size_t inner_master_work_amount, const size_t inner_target_work_amount); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000000..b640ab35b0bbbc --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxDecomposition + * @brief The pass decomposes Softmax into explicit Snippets dialects + * Note: + * - At the moment
Snippets supports Softmax only in MHA pattern where there are Buffer ops before and after Softmax. + * Also Snippets support Loops with Buffer ops on inputs and outputs if Buffer have the same buffer byte size + * because of work with ptr increment. So we have to set Tile rank as buffer allocation rank even if rank 1 is enough + * @ingroup snippets + */ +class SoftmaxDecomposition: public ngraph::pass::MatcherPass { +public: + SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank = -1); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp b/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp new file mode 100644 index 00000000000000..7522f411669dc3 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxReshapeElimination + * @brief The pass removes Reshape operations around Softmax if possible + * @ingroup snippets + */ +class SoftmaxReshapeElimination: public ngraph::pass::MatcherPass { +public: + SoftmaxReshapeElimination(); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index 20ce6444682b82..0f091abe956381 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -9,8 +9,12 @@ #include "op/broadcastload.hpp" #include "op/broadcastmove.hpp" +#include "op/buffer.hpp" #include "op/convert_saturation.hpp" #include "op/convert_truncation.hpp" +#include "op/horizon_max.hpp" +#include "op/horizon_sum.hpp" +#include 
"op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" #include "op/nop.hpp" @@ -19,6 +23,7 @@ #include "op/store.hpp" #include "op/loop.hpp" #include "op/brgemm.hpp" +#include "op/vector_buffer.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index 1d08a786922bfb..770722501eb674 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -29,6 +29,9 @@ ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector get_node_output_layout(const std::shared_ptr& node); std::vector get_node_output_layout(const Node* node); +inline ov::Dimension get_inner_dim(const ov::PartialShape &shape) { return *(shape.rbegin()); } +inline ov::Dimension get_outer_dim(const ov::PartialShape &shape) { return *(shape.rbegin() + 1); } + } // namespace utils } // namespace snippets } // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 3d0060b3805925..3859479e85c110 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -40,6 +40,53 @@ auto getRegisters(const std::shared_ptr &n) -> RegInfo { return std::make_pair(rin, rout); } +auto tail_transformations(NodeVector& tail, const size_t tail_size, const ngraph::snippets::Generator::GeneratorConfig& config) -> void { + NodeVector updated_tile; + auto insertFill = [tail_size](const ov::Input& input) -> std::shared_ptr { + auto copyRegInfo = [](const ov::descriptor::Tensor& from, ov::descriptor::Tensor& to) -> void { + auto rt = from.get_rt_info(); + auto reginfo = rt.find("reginfo"); + if (reginfo != rt.end()) { + to.get_rt_info()["reginfo"] = reginfo->second; + } + }; + std::shared_ptr fill = nullptr; + auto& rt = input.get_rt_info(); + auto fill_rt = rt.find("set_fill"); + if (fill_rt != rt.end()) { + const auto 
fill_value = fill_rt->second.as(); + fill = std::make_shared(input.get_source_output(), tail_size, fill_value); + input.get_node()->set_argument(input.get_index(), fill); + // we should explicitly copy reg info because we insert Fill after assign register + copyRegInfo(fill->get_input_tensor(0), fill->get_output_tensor(0)); + } + return fill; + }; + + for (auto& op : tail) { + // We should fill vector regs by float_min and zero to have + // correct math calculations for ReduceMax and ReduceSum in scalar case. + // Note: We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop, + // so they are missed in + if (config.m_need_fill_tail_register && + (ov::is_type(op) || + ov::is_type(op))) { + for (auto i = 0; i < op->inputs().size(); ++i) { + if (auto fill = insertFill(op->input(i))) { + updated_tile.push_back(fill); + } + } + } else if (const auto memory_access = std::dynamic_pointer_cast(op)) { + if (memory_access->get_count() != 1) { + memory_access->set_count(tail_size); + } + } + updated_tile.push_back(op); + } + + tail = std::move(updated_tile); +} + ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& m, const GeneratorConfig& config, const void* compile_params) { @@ -107,8 +154,12 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrset_finalization_offsets(std::vector(tail_finalization_offsets.size(), 0)); - // force ptr increments if there is tail - optimize_single_evaluation(vector_loop_end, need_tail); + + if (config.m_optimize_single_evaluation) { + // force ptr increments if there is tail + optimize_single_evaluation(vector_loop_end, need_tail); + } + lower_ops(vector_loop); } OV_ITT_TASK_NEXT(GENERATE, "::TailLoop") @@ -118,14 +169,7 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& n){ - const auto& memory_access = std::dynamic_pointer_cast(n); - if (memory_access && memory_access->get_count() != 1) { - memory_access->set_count(tail_size); - } - 
return n; - }); + tail_transformations(tail_loop, tail_size, config); tail_loop_end = ov::as_type_ptr(*tail_loop.rbegin()); tail_loop_end->set_finalization_offsets(tail_finalization_offsets); tail_loop_end->set_increment(tail_size); @@ -133,8 +177,12 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrupdate_ptr_increments(static_cast(tail_size)); tail_loop_end->set_work_amount(tail_size); tail_loop_end->has_outer_loop = vector_loop_end->has_outer_loop; - // tail loop is always executed once - optimize_single_evaluation(tail_loop_end); + + if (config.m_optimize_single_evaluation) { + // tail loop is always executed once + optimize_single_evaluation(tail_loop_end); + } + lower_ops(tail_loop); } } else { @@ -173,4 +221,4 @@ std::shared_ptr Generator::get_target_machine() const { } }// namespace snippets -}// namespace ngraph \ No newline at end of file +}// namespace ngraph diff --git a/src/common/snippets/src/op/brgemm.cpp b/src/common/snippets/src/op/brgemm.cpp index e48b599b96a22b..7bf999cb15e423 100644 --- a/src/common/snippets/src/op/brgemm.cpp +++ b/src/common/snippets/src/op/brgemm.cpp @@ -13,12 +13,21 @@ namespace ngraph { namespace snippets { namespace op { -Brgemm::Brgemm(const Output& A, const Output& B) : MatMul() { +Brgemm::Brgemm(const Output& A, const Output& B, const size_t offset_a, const size_t offset_b, const size_t offset_c) + : MatMul(), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) { set_arguments({A, B}); set_output_size(1); constructor_validate_and_infer_types(); } +bool Brgemm::visit_attributes(AttributeVisitor& visitor) { + MatMul::visit_attributes(visitor); + visitor.on_attribute("offset_a", m_offset_a); + visitor.on_attribute("offset_b", m_offset_b); + visitor.on_attribute("offset_c", m_offset_c); + return true; +} + void Brgemm::validate_and_infer_types() { INTERNAL_OP_SCOPE(Brgemm_validate_and_infer_types); element::Type result_et; @@ -47,7 +56,7 @@ void Brgemm::validate_and_infer_types() { 
std::shared_ptr Brgemm::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Brgemm_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), new_args.at(1));; + return std::make_shared(new_args.at(0), new_args.at(1), m_offset_a, m_offset_b, m_offset_c); } } // namespace op diff --git a/src/common/snippets/src/op/broadcastload.cpp b/src/common/snippets/src/op/broadcastload.cpp index 04ba89f48775e4..927b47f94498bc 100644 --- a/src/common/snippets/src/op/broadcastload.cpp +++ b/src/common/snippets/src/op/broadcastload.cpp @@ -11,18 +11,21 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, ov::PartialShape shape) : BroadcastMove(x, std::move(shape)) { +snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, ov::PartialShape shape, size_t offset) + : BroadcastMove(x, std::move(shape)), m_offset(offset) { constructor_validate_and_infer_types(); } bool snippets::op::BroadcastLoad::visit_attributes(AttributeVisitor& visitor) { + BroadcastMove::visit_attributes(visitor); + visitor.on_attribute("offset", m_offset); return true; } std::shared_ptr snippets::op::BroadcastLoad::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastLoad); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), output_shape); + return std::make_shared(new_args.at(0), output_shape, m_offset); } void snippets::op::BroadcastLoad::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp new file mode 100644 index 00000000000000..df1a45ac643636 --- /dev/null +++ b/src/common/snippets/src/op/buffer.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include + +using namespace std; +using namespace ngraph; + 
+BWDCMP_RTTI_DEFINITION(ngraph::snippets::op::Buffer); + +auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { + return allocation_rank < 0 ? allocation_rank + shape_rank : allocation_rank; +} + +snippets::op::Buffer::Buffer(const Output& x, const int32_t allocation_rank) : Op({x}), m_allocation_rank(allocation_rank) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Buffer::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Buffer_visit_attributes); + visitor.on_attribute("allocation_rank", m_allocation_rank); + return true; +} + +std::shared_ptr snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + auto new_buffer = std::make_shared(new_args.at(0), m_allocation_rank); + return new_buffer; +} + +void snippets::op::Buffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); + const auto shape_rank = get_input_partial_shape(0).rank(); + if (shape_rank.is_static()) { + const auto normalized_rank = normalize_rank(m_allocation_rank, shape_rank.get_length()); + NGRAPH_CHECK(normalized_rank >= 0 && normalized_rank <= shape_rank.get_length(), + "Buffer has incorrect allocation rank: " + std::to_string(m_allocation_rank)); + } + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +size_t ngraph::snippets::op::Buffer::get_byte_size() const { + const auto pshape = get_input_partial_shape(0); + NGRAPH_CHECK(pshape.is_static(), "Buffer should have static shapes for memory allocation"); + const auto shape = pshape.get_shape(); + const auto normalized_rank = normalize_rank(m_allocation_rank, shape.size()); + return ngraph::shape_size(shape.rbegin(), shape.rbegin() + normalized_rank) * get_element_type().size(); +} diff --git a/src/common/snippets/src/op/fill.cpp b/src/common/snippets/src/op/fill.cpp new file mode 100644 index 
00000000000000..ac93a501aad5ce --- /dev/null +++ b/src/common/snippets/src/op/fill.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/fill.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::Fill::Fill(const Output& x, const size_t offset, const uint32_t fill_value) + : Op({x}), m_offset(offset), m_fill_value(fill_value) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Fill::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Fill_visit_attributes); + visitor.on_attribute("offset", m_offset); + visitor.on_attribute("fill_value", m_fill_value); + return true; +} + +std::shared_ptr snippets::op::Fill::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Fill_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_offset, m_fill_value); +} + +void snippets::op::Fill::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Fill_validate_and_infer_types); + const auto in_type = get_input_element_type(0); + NGRAPH_CHECK(in_type.size() == 4, "Fill operation supports only element types with 4 byte size but got:" + std::to_string(in_type.size())); + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + diff --git a/src/common/snippets/src/op/horizon_max.cpp b/src/common/snippets/src/op/horizon_max.cpp new file mode 100644 index 00000000000000..37e6e3f3c55daa --- /dev/null +++ b/src/common/snippets/src/op/horizon_max.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_max.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonMax::HorizonMax(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonMax::clone_with_new_inputs(const 
OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonMax_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0)); +} + +void snippets::op::HorizonMax::validate_and_infer_types() { + INTERNAL_OP_SCOPE(HorizonMax_validate_and_infer_types); + auto new_shape = get_input_partial_shape(0); + if (!ov::is_scalar(new_shape)) { + new_shape[new_shape.size() - 1] = 1lu; + } + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/horizon_sum.cpp b/src/common/snippets/src/op/horizon_sum.cpp new file mode 100644 index 00000000000000..fa791dec2342f3 --- /dev/null +++ b/src/common/snippets/src/op/horizon_sum.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_sum.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonSum::HorizonSum(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonSum::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonSum_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0)); +} + +void snippets::op::HorizonSum::validate_and_infer_types() { + INTERNAL_OP_SCOPE(HorizonSum_validate_and_infer_types); + auto new_shape = get_input_partial_shape(0); + if (!ov::is_scalar(new_shape)) { + new_shape[new_shape.size() - 1] = 1lu; + } + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index d998afef73b101..f7637fbc7962a5 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -12,19 +12,19 @@ namespace ngraph { namespace snippets { namespace op { -Load::Load(const Output& x, const size_t count) : MemoryAccess({x}, count) { +Load::Load(const Output& x, const size_t 
count, const size_t offset) : MemoryAccess({x}, count, offset) { constructor_validate_and_infer_types(); } std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } -LoadReshape::LoadReshape(const Output& x, const size_t count, std::vector order) - : Load(x, count), m_order(std::move(order)) { +LoadReshape::LoadReshape(const Output& x, const size_t count, const size_t offset, std::vector order) + : Load(x, count, offset), m_order(std::move(order)) { const auto& in_shape = x.get_partial_shape(); NGRAPH_CHECK(in_shape.is_static(), "LoadReshape supports only static input shapes"); const auto in_shape_size = in_shape.size(); @@ -45,7 +45,7 @@ void snippets::op::LoadReshape::validate_and_infer_types() { } bool snippets::op::LoadReshape::visit_attributes(AttributeVisitor& visitor) { - visitor.on_attribute("count", m_count); + Load::visit_attributes(visitor); visitor.on_attribute("order", m_order); return true; } @@ -53,9 +53,9 @@ bool snippets::op::LoadReshape::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadReshape); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count, m_order); + return std::make_shared(new_args.at(0), m_count, m_offset, m_order); } }// namespace op }// namespace snippets -}// namespace ngraph \ No newline at end of file +}// namespace ngraph diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp index 79f6b63a4be691..2530ea77b6352b 100644 --- a/src/common/snippets/src/op/memory_access.cpp +++ b/src/common/snippets/src/op/memory_access.cpp @@ -12,11 +12,11 @@ namespace ngraph { namespace snippets { namespace op { -MemoryAccess::MemoryAccess(const 
Output& x, const size_t count) : Op({x}), m_count(count) { -} +MemoryAccess::MemoryAccess(const Output& x, const size_t count, const size_t offset) : Op({x}), m_count(count), m_offset(offset) {} bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) { visitor.on_attribute("count", m_count); + visitor.on_attribute("offset", m_offset); return true; } @@ -24,10 +24,18 @@ size_t MemoryAccess::get_count() const { return m_count; } +size_t MemoryAccess::get_offset() const { + return m_offset; +} + void MemoryAccess::set_count(const size_t count) { m_count = count; } +void MemoryAccess::set_offset(const size_t offset) { + m_offset = offset; +} + void MemoryAccess::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); } diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index 69e1e1643b769b..90750de6b65fec 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -12,16 +12,15 @@ namespace ngraph { namespace snippets { namespace op { -Store::Store(const Output& x, const size_t count) : MemoryAccess({x}, count) { +snippets::op::Store::Store(const Output& x, const size_t count, const size_t offset) : MemoryAccess({x}, count, offset) { constructor_validate_and_infer_types(); } - -std::shared_ptr Store::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(Store); +std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Store_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } } // namespace op } // namespace snippets -} // namespace ngraph \ No newline at end of file +} // namespace ngraph diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 933e05b89fca7c..79d2825eaae089 100644 --- 
a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -20,6 +20,10 @@ #include "snippets/pass/align_element_type.hpp" #include "snippets/pass/matmul_to_brgemm.hpp" #include "snippets/pass/fuse_transpose_brgemm.hpp" +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/propagate_buffer_offset.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/pass/insert_buffer.hpp" #include "snippets/utils.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -41,8 +45,12 @@ void snippets::op::Subgraph::set_generator(std::shared_ptr(op); - config.m_need_fill_tail_register = config.m_need_fill_tail_register || - ov::is_type(op) || - ov::is_type(op); config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast(op); config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || @@ -60,11 +65,13 @@ void snippets::op::Subgraph::init_config() { has_type_relaxed_ops() || snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type); config.m_has_domain_sensitive_ops = config.m_has_domain_sensitive_ops || - ov::is_type(op) || - ov::is_type(op) || - ov::is_type(op) || - ov::is_type(op); + ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op); } + // Domain sensitive ops are decomposed with explicit Loops. 
So, we should explicitly insert Loops in Subgraph if it contains these ops + config.m_explicit_loop_insertion = config.m_has_domain_sensitive_ops; } snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) @@ -184,9 +191,17 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptrget_friendly_name(), body_results, body_parameters); auto subgraph = build_subgraph(node, subgraph_inputs, body); + bool need_buffer = false; + size_t hidden_data_count = 0lu; if (auto fq_node = ov::as_type_ptr(node)) { - subgraph->set_non_scalar_constants_count(utils::get_non_scalar_constant_count_for_fq(fq_node)); + hidden_data_count += utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops that requires Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + need_buffer |= true; } + subgraph->set_virtual_port_count(hidden_data_count); + subgraph->set_buffer_needed(need_buffer); for (size_t i = 0; i < body->get_parameters().size(); i++) { body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); @@ -329,6 +344,17 @@ ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& return master_shape; } +size_t snippets::op::Subgraph::get_buffer_scratchpad_size() const { + size_t buffer_size = 0; + const auto ops = m_body->get_ops(); + for (const auto& op : ops) { + if (const auto buffer = ov::as_type_ptr(op)) { + buffer_size += buffer->get_byte_size(); + } + } + return buffer_size; +} + void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { // We should insert Convert before Results to set original output element type if needed @@ -384,13 +410,18 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { return p->get_partial_shape().rbegin()->is_dynamic(); }); ngraph::pass::Manager manager; - manager.register_pass(); - manager.register_pass(); manager.register_pass(); manager.register_pass(); + 
manager.register_pass(tileRank); + manager.register_pass(count, tileRank); manager.register_pass(); + manager.register_pass(); + manager.register_pass(); manager.register_pass(count); manager.register_pass(count); + // After transformations above MemoryAccess operations won't be changed (not removed or added) except for [Load + MoveBroadcast = LoadBroadcast] + // so we can calculate offsets for each Buffer in body and propagate them to the corresponding MemoryAccess nodes + manager.register_pass(); // todo: presently dynamic pipeline is activated even if the last two dimension are static // In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example) // should be passed as run-time args, so it's a mixed mode: kernel is shape-aware, but some additional runtime args are required @@ -422,12 +453,13 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { manager.get_pass_config()-> set_callback(skip_matching_domain); } - // todo: get_lanes() assumes fp32. Could there be any int8 issues? 
// Note that InsertLoops requires validate_and_infer_types afterwards, so add it manually if // automatic validation will be disabled in the pass manager - if (!has_domain_sensitive_ops()) - manager.register_pass(master_shape, tileRank, - m_generator->get_target_machine()->get_lanes()); + manager.register_pass(master_shape, tileRank, + m_generator->get_target_machine()->get_lanes(), !config.m_explicit_loop_insertion); + if (config.m_has_domain_sensitive_ops) { + manager.register_pass(); + } } manager.run_passes(m_body); } @@ -462,8 +494,13 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, opt.run_passes(m_body); snippets::pass::AssignRegisters().run_on_model(m_body); + const auto ops = m_body->get_ops(); ngraph::snippets::Generator::GeneratorConfig generatorConfig; generatorConfig.m_save_lowered_code = config.m_has_domain_sensitive_ops; + generatorConfig.m_need_fill_tail_register = config.m_has_domain_sensitive_ops; + generatorConfig.m_optimize_single_evaluation = std::none_of(ops.begin(), ops.end(), [](const std::shared_ptr& op) { + return ov::is_type(op); + }); // actual code emission ngraph::snippets::code ptr = m_generator->generate(m_body, generatorConfig, compile_params); diff --git a/src/common/snippets/src/op/vector_buffer.cpp b/src/common/snippets/src/op/vector_buffer.cpp new file mode 100644 index 00000000000000..1be69a6d9ad678 --- /dev/null +++ b/src/common/snippets/src/op/vector_buffer.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/vector_buffer.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::VectorBuffer::VectorBuffer(const ov::element::Type element_type) : Op(), m_element_type(std::move(element_type)) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::VectorBuffer::clone_with_new_inputs(const OutputVector& new_args) const { + 
INTERNAL_OP_SCOPE(VectorBuffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(m_element_type); +} + +void snippets::op::VectorBuffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(VectorBuffer_validate_and_infer_types); + set_output_type(0, m_element_type, Shape{1lu}); +} diff --git a/src/common/snippets/src/pass/align_element_type.cpp b/src/common/snippets/src/pass/align_element_type.cpp index f2cf4ce5c47de6..fa45c0b5754eba 100644 --- a/src/common/snippets/src/pass/align_element_type.cpp +++ b/src/common/snippets/src/pass/align_element_type.cpp @@ -20,7 +20,8 @@ inline auto is_in_op(const std::shared_ptr& n) -> bool { || ov::is_type(n); } -// At the moment Subgraph supports only Eltwise, Convert and FQ (which is decomposed into Eltwises and Convert) +// At the moment Subgraph supports only Eltwise, Convert, FQ (which is decomposed into Eltwises and Convert) and +// Softmax (which is decompsed into Eltwises as well) // And only Eltwises supports execution only in "exec_type". 
So we can check op type from the opposite // NOTE: This check is only for executable which isn't Parameter/Constant/Result inline auto op_supports_only_exec_type(const std::shared_ptr& n) -> bool { diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index dd40f6640a3a10..04cbadf5a608cd 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -7,6 +7,10 @@ #include "snippets/snippets_isa.hpp" #include +namespace { +static constexpr size_t reg_count = 16lu; +} // namespace + bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr& f) { RUN_ON_MODEL_SCOPE(AssignRegisters); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") @@ -24,7 +28,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op)) + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) return gpr2gpr; else if (std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)) @@ -41,22 +46,57 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr size_t counter_gpr = 0; std::map regs_vec, regs_gpr; // Define a set of immune tensors that will be ignored by auto reg allocation => their reg allocation is done manually - // todo: presently it hold only gpr's. 
If you need to manually assign vec's, implement reg_type or create a second map - std::map manually_assigned_regs; + std::map manually_assigned_gprs, manually_assigned_vecs; const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX; const auto num_parameters = f->get_parameters().size(); + const auto num_results = f->get_results().size(); + auto accumulator_reg = 0lu; for (const auto& op : ops) { if (const auto& param = ov::as_type_ptr(op)) { - manually_assigned_regs[op->output(0).get_tensor_ptr()] = + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = static_cast(f->get_parameter_index(param)); } else if (const auto& result = ov::as_type_ptr(op)) { // here we use the fact that Result input & output tensors are identical by construction - manually_assigned_regs[op->output(0).get_tensor_ptr()] = + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = static_cast(f->get_result_index(result) + num_parameters); + } else if (const auto& buffer = ov::as_type_ptr(op)) { + // All buffers have one common data pointer + manually_assigned_gprs[op->input(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + } else if (ov::is_type(op) || ov::is_type(op)) { + // Only in SoftmaxDecomposition ReduceMax and ReduceSum use HorizonMax/HorizonSum and VectorBuffer. 
+ // We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator + // TODO [96351]: We should rewrite accumulator pattern using another way + const auto input = op->get_input_node_shared_ptr(0); // input - it's accumulator math op: Add or Max + for (size_t i = 0; i < input->get_input_size(); ++i) { + if (ov::is_type(input->get_input_node_shared_ptr(i))) { + manually_assigned_vecs[input->input(i).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + + manually_assigned_vecs[input->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + manually_assigned_vecs[op->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + + // If there is Broadcast, it should have the same register as Horizon op + // because it's a result of the accumulator as well + for (auto& out : op->output(0).get_target_inputs()) { + const auto child = out.get_node()->shared_from_this(); + if (ov::is_type(child)) { + manually_assigned_vecs[child->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + accumulator_reg++; } } - auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG, &manually_assigned_regs] (const std::shared_ptr& op, + auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr& op, decltype(regs_vec)& reg_map, + const std::map& manually_assigned_regs, size_t& counter) { for (const auto& output : op->outputs()) { const auto& t = output.get_tensor_ptr(); @@ -71,11 +111,11 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr switch (t_op.first) { case vec2vec: case gpr2vec: - enumerate_out_tensors(t_op.second, regs_vec, counter_vec); + enumerate_out_tensors(t_op.second, regs_vec, manually_assigned_vecs, counter_vec); break; case gpr2gpr: case vec2gpr: - enumerate_out_tensors(t_op.second, regs_gpr, counter_gpr); + enumerate_out_tensors(t_op.second, regs_gpr, manually_assigned_gprs, counter_gpr); break; } } @@ -96,7 +136,7 @@ bool 
ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr } return result; }; - for (int i = 0; i < typed_ops.size(); i++) { + for (size_t i = 0; i < typed_ops.size(); i++) { const auto& t_op = typed_ops[i]; std::vector used_tensors, defined_tensors; for (const auto& in : t_op.second->inputs()) @@ -239,15 +279,18 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr }; // todo: vec_/gpr_pool are hardware-specific and should be provided by a backend, e.g. overloaded generator std::set vec_pool; - for (Reg i = 0; i < 16; i++) + for (Reg i = 0; i < reg_count; i++) vec_pool.insert(i); - auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); - std::set gpr_pool(std::move(vec_pool)); - for (const auto& t_reg : manually_assigned_regs) + std::set gpr_pool(vec_pool); + for (const auto& t_reg : manually_assigned_vecs) + vec_pool.erase(t_reg.second); + for (const auto& t_reg : manually_assigned_gprs) gpr_pool.erase(t_reg.second); + auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); auto unique2reused_map_gpr = linescan_assign_registers(live_intervals_gpr, gpr_pool); - std::map assigned_regs(std::move(manually_assigned_regs)); + std::map assigned_regs(std::move(manually_assigned_gprs)); + assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end()); auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map& unique_regs, const std::map& unique2reused) { for (const auto& reg : unique_regs) { diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 4501eb0797467d..6534d6c1eff077 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -122,8 +122,28 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { || ov::is_type(n) || ov::is_type(n); }; - return 
is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n) || - is_supported_transpose(n) || is_supported_fq_op(n) || is_supported_matmul(n); + + auto is_supported_softmax = [](const std::shared_ptr &n) -> bool { + if (n->get_input_size() != 1 || n->get_input_partial_shape(0).rank().is_dynamic()) + return false; + int64_t axis = -1; + const auto rank = n->get_input_partial_shape(0).rank(); + if (const auto softmax_v8 = ngraph::as_type_ptr(n)) { + axis = ngraph::normalize_axis(n->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(n)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + return axis >= 0 && axis == (rank.get_length() - 1); + }; + + return is_supported_fq_op(n) + || is_supported_unary_eltwise_op(n) + || is_supported_binary_eltwise_op(n) + || is_supported_transpose(n) + || is_supported_softmax(n) + || is_supported_matmul(n); } auto has_supported_in_out(const std::shared_ptr &n) -> bool { @@ -503,18 +523,24 @@ TokenizeSnippets::TokenizeSnippets() { // than the actual number of Constants during tokenization. // To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation) // we should calculate potentional number of non-scalar Constants that will be moved up from body. 
- size_t hidden_non_scalar_constant_count = 0; + size_t hidden_data_count = 0; + bool need_buffer = false; if (const auto fq_node = ov::as_type_ptr(node)) { - hidden_non_scalar_constant_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + hidden_data_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops require a Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + need_buffer |= true; } ResultVector body_results; std::vector>> subgraph_result_inputs; for (auto subgraph : input_subgraphs) { - // we should summurize non-scalar Constants count from all input subgraphs - // because we will collapse them with our node and we should get total count of non-scalar Constants - hidden_non_scalar_constant_count += ov::as_type_ptr(subgraph)->get_non_scalar_constants_count(); + // we should summurize additional needed data count (non-scalar Constants and Buffers) from all input subgraphs + // because we will collapse them with our node and we should get total count + hidden_data_count += ov::as_type_ptr(subgraph)->get_virtual_port_count(); + need_buffer |= ov::as_type_ptr(subgraph)->is_buffer_needed(); for (auto output : subgraph->outputs()) { bool first_side_consumer = true; @@ -555,13 +581,13 @@ TokenizeSnippets::TokenizeSnippets() { } // todo: move this plugin-specific constraint to the plugin callback - if (body_parameters.size() + body_results.size() + hidden_non_scalar_constant_count > 12) { + if (body_parameters.size() + body_results.size() + hidden_data_count + static_cast(need_buffer) > 12) { const std::string message_reset = "new subgraph is created. 
Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + "buffers."; const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + "buffers."; return abort_with_strategy(message_reset, message_abort); } @@ -596,7 +622,8 @@ TokenizeSnippets::TokenizeSnippets() { act_body1->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); } subgraph->get_rt_info()["originalLayersNames"] = fusedNames; - subgraph->set_non_scalar_constants_count(hidden_non_scalar_constant_count); + subgraph->set_virtual_port_count(hidden_data_count); + subgraph->set_buffer_needed(need_buffer); remark(1) << "Replacement (merge) done for: " << subgraph->get_friendly_name() diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index c81ec235bb7ea8..be78a136cc71ae 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -10,6 +10,7 @@ #include "transformations/utils/utils.hpp" #include "snippets/pass/fq_decomposition.hpp" +#include "snippets/pass/softmax_reshape_elimination.hpp" #include "snippets/op/subgraph.hpp" #include "snippets/itt.hpp" @@ -33,6 +34,10 @@ void ConvertConstantsToParameters(const std::shared_ptrget_shape()) != 1ul)) continue; + const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); 
+ if (ov::is_type(child) || ov::is_type(child)) + continue; + auto parameter = std::make_shared(constant->get_element_type(), constant->output(0).get_partial_shape()); parameter->set_friendly_name(constant->get_friendly_name()); ngraph::copy_runtime_info(constant, parameter); @@ -69,6 +74,7 @@ CommonOptimizations::CommonOptimizations() { if (is_quantized) { manager.register_pass(); } + manager.register_pass(); manager.run_passes(body); // At the moment only non-scalar Constants of FakeQuantize can be inside Subgraph diff --git a/src/common/snippets/src/pass/insert_buffer.cpp b/src/common/snippets/src/pass/insert_buffer.cpp new file mode 100644 index 00000000000000..3cc7ca90921464 --- /dev/null +++ b/src/common/snippets/src/pass/insert_buffer.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include + +ngraph::snippets::pass::InsertBuffer::InsertBuffer(const int32_t allocation_rank) { + MATCHER_SCOPE(InsertBuffer); + // The list of operations that require Buffers on their Inputs and Outputs + const auto pattern = ngraph::pattern::wrap_type(); + + register_matcher(std::make_shared(pattern, matcher_name), + [this, allocation_rank](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertBuffer") + auto root = m.get_match_root(); + bool rewritten = false; + + // check if already has Buffer, Parameter or Constant as an input + for (const auto& input : root->inputs()) { + const auto input_node = input.get_source_output().get_node()->shared_from_this(); + if (!ov::is_type(input_node) && + !ov::is_type(input_node) && + !ov::is_type(input_node)) { + const auto buffer = std::make_shared(input_node, allocation_rank); + ngraph::copy_runtime_info(root, buffer); + root->set_argument(input.get_index(), buffer); + 
rewritten |= true; + } + if (ov::is_type(input.get_source_output().get_node_shared_ptr()) && + input.get_source_output().get_target_inputs().size() != 1) { + throw ngraph::ngraph_error( + "If Buffer is a input for operation output, this Buffer should be a single consumer for this port"); + } + } + + // check if already has Buffer or outputs is Result + for (const auto& output : root->outputs()) { + const auto target_inputs = output.get_target_inputs(); + if (target_inputs.size() > 1) { + for (const auto& consumer : target_inputs) { + const auto output_node = consumer.get_node()->shared_from_this(); + if (ov::is_type(output_node)) { + // If some of children from one common port are different Buffers, + // we should remove them to insert one common Buffer on one common port + replace_output_update_name(output_node->output(0), output_node->input_value(0)); + } else if (ov::is_type(output_node)) { + // TODO: At this moment operation which is should be wrapped by Buffers doesn't support several childs where one of them is Result + // because Result and Buffer from one root port should have the same register. 
It's not supported at the moment + // For example, + // Buffer + // | + // Softmax + // / \ + // Buffer Result + throw ngraph::ngraph_error( + "Operation which is should be wrapped by Buffers has few children from one output port where one of them is Result"); + } + } + } + + const auto buffer = std::make_shared(output, allocation_rank); + for (const auto& consumer : output.get_target_inputs()) { + const auto output_node = consumer.get_node()->shared_from_this(); + if (output_node != buffer && + !ov::is_type(output_node) && + !ov::is_type(output_node)) { + consumer.replace_source_output(buffer); + rewritten |= true; + } + } + + const auto new_target_inputs = output.get_target_inputs(); + const auto has_buffer_on_output = std::any_of(new_target_inputs.begin(), new_target_inputs.end(), [](const ov::Input& consumer) { + const auto child = consumer.get_node()->shared_from_this(); + // We check for count of target inputs of Buffer output because + // we created Buffer op with root input previously for the next possible insertions + // Thus, if Buffer wasn't inserted, this op doesn't have target inputs on output + return ov::is_type(child) && child->output(0).get_target_inputs().size() > 0; + }); + if (has_buffer_on_output && new_target_inputs.size() != 1) { + throw ngraph::ngraph_error( + "If Buffer is a input for operation output, this Buffer should be a single consumer for this port"); + } + } + return rewritten; + }); +} diff --git a/src/common/snippets/src/pass/insert_load_store.cpp b/src/common/snippets/src/pass/insert_load_store.cpp index d22d094fdd207c..efa0d6396c63fd 100644 --- a/src/common/snippets/src/pass/insert_load_store.cpp +++ b/src/common/snippets/src/pass/insert_load_store.cpp @@ -15,7 +15,7 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { MATCHER_SCOPE(InsertLoad); register_matcher(std::make_shared( - ngraph::pattern::wrap_type(), matcher_name), + ngraph::pattern::wrap_type(), matcher_name), [this, count](ngraph::pattern::Matcher 
&m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad") auto root = m.get_match_root(); @@ -57,7 +57,7 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { MATCHER_SCOPE(InsertStore); register_matcher(std::make_shared( - ngraph::pattern::wrap_type(), matcher_name), + ngraph::pattern::wrap_type(), matcher_name), [this, count](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore") auto root = m.get_match_root(); @@ -73,7 +73,7 @@ ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { } } - auto store = std::make_shared (root->input_value(0), count); + auto store = std::make_shared(root->input_value(0), count); ngraph::copy_runtime_info(root, store); root->set_argument(0, store); return true; diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp index bbe93ab95413d6..fa82acc3b53401 100644 --- a/src/common/snippets/src/pass/insert_loops.cpp +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -5,26 +5,222 @@ #include #include "snippets/pass/insert_loops.hpp" #include "snippets/pass/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include -ngraph::snippets::pass::InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size) -: m_master_shape(std::move(master_shape)), m_loop_depth(loop_depth), m_vector_size(vector_size) { +namespace ngraph { +namespace snippets { +namespace pass { + +InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool single_loop_body) + : m_master_shape(std::move(master_shape)), m_loop_depth(loop_depth), m_vector_size(vector_size), m_single_loop_body(single_loop_body) { if (m_master_shape.size() < m_loop_depth) throw ngraph_error("InsertLoops can't insert loops: 
master shape rank is too small"); } -bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr &model) { +std::vector InsertLoops::calculate_inner_apply_increments(const ov::PartialShape& master, + const std::vector& shapes) { + // Inner Loop applies increments if a dimension is not broadcasted + std::vector apply_increments; + apply_increments.reserve(shapes.size()); + std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments), + [=](const ov::PartialShape& ps) { return utils::get_inner_dim(ps) != 1 && utils::get_inner_dim(master) != 1; }); + return apply_increments; +} +std::vector InsertLoops::calculate_outer_apply_increments(const std::vector& shapes) { + // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) + std::vector apply_increments; + apply_increments.reserve(shapes.size()); + std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments), + [=](const ov::PartialShape& ps) { return utils::get_outer_dim(ps) != 1 && utils::get_inner_dim(ps) == 1; }); + return apply_increments; +} +std::vector InsertLoops::calculate_finalization_offsets(const ov::PartialShape& master, + const std::vector& shapes) { + const auto inner_work_amount = utils::get_inner_dim(master).get_length(); + std::vector inner_finalization_offsets(shapes.size(), 0); + std::transform(shapes.begin(), shapes.end(), inner_finalization_offsets.begin(), + [=](const ov::PartialShape& ps) { + return utils::get_outer_dim(ps) == 1 && utils::get_inner_dim(ps) != 1 ? 
-inner_work_amount : 0; + }); + return inner_finalization_offsets; +} + +void insert_loops_explicitly(const ov::NodeVector& ops, const ov::PartialShape& master_shape, const size_t vector_size) { + ov::NodeVector body; + ov::NodeVector body_remainder; + ov::OutputVector body_parameters; + std::vector> body_results; + + // check for potential parameters for new Loop + auto add_body_parameters = [](const std::shared_ptr& op, ov::OutputVector& body_parameters) { + for (const auto& input : op->inputs()) { + auto parent = input.get_source_output().get_node_shared_ptr(); + if (ov::is_type(parent) || + ov::is_type(parent) || + ov::is_type(parent) || + ov::is_type(parent)) { + body_parameters.push_back(input.get_source_output()); + } + } + }; + + // check for potential results for new Loop + auto add_body_results = [](const std::shared_ptr& op, std::vector>& body_results) { + for (const auto& output : op->outputs()) { + for (const auto& target_input : output.get_target_inputs()) { + auto child = target_input.get_node(); + if (ov::is_type(child) || + ov::is_type(child) || + ov::is_type(child) || + ov::is_type(child)) { + body_results.push_back(target_input); + } + } + } + }; + + // check for potential missing body ops for new loop + std::function& op, ov::NodeVector& body)> add_missing_body_ops; + add_missing_body_ops = [&](const std::shared_ptr& op, ov::NodeVector& body) { + if (body_remainder.size()) { + for (const auto& input : op->inputs()) { + auto parent = input.get_source_output().get_node_shared_ptr(); + auto iter = std::find(body_remainder.begin(), body_remainder.end(), parent); + if (iter != body_remainder.end()) { + *std::back_inserter(body) = std::move(*iter); + add_missing_body_ops(parent, body); + add_body_parameters(parent, body_parameters); + add_body_results(op, body_results); + } + } + } + }; + + auto wrap_body_by_loop = [&](const ov::NodeVector& body, const ov::OutputVector& body_parameters, const std::vector>& body_results) { + 
NGRAPH_CHECK(!body_parameters.empty(), "The count of parameters for loop should be more than zero to create loop"); + NGRAPH_CHECK(!body_results.empty(), "The count of results for loop should be more than zero to create loop"); + std::vector body_shapes; + const auto count_io = body_parameters.size() + body_results.size(); + body_shapes.reserve(count_io); + std::transform(body_parameters.begin(), body_parameters.end(), std::back_inserter(body_shapes), + [](const ov::Output& out) { return out.get_partial_shape(); }); + std::transform(body_results.begin(), body_results.end(), std::back_inserter(body_shapes), + [](const ov::Input& in) { return in.get_partial_shape(); }); + + auto body_master_shape = body_shapes.front(); + for (const auto& shape : body_shapes) { + NGRAPH_CHECK(PartialShape::broadcast_merge_into(body_master_shape, shape, ::ngraph::op::AutoBroadcastType::NUMPY), + "Loop input and output must be numpy broadcastable"); + } + const auto inner_work_amount = utils::get_inner_dim(body_master_shape).get_length(); + const auto outer_work_amount = utils::get_outer_dim(body_master_shape).get_length(); + + auto apply_increments = InsertLoops::calculate_inner_apply_increments(master_shape, body_shapes); + std::vector inner_finalization_offsets(body_shapes.size(), 0); + if (outer_work_amount > 1) { + inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(master_shape, body_shapes); + } + + const auto& inner_loop_begin = op::insertLoopBeginAfterOutputs(body_parameters); + const auto& inner_loop_end = op::insertLoopEndBeforeInputs( + body_results, inner_loop_begin, inner_work_amount, vector_size, + apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). 
Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. + for (const auto& n : body) { + if (auto c = std::dynamic_pointer_cast(n)) { + c->add_control_dependency(inner_loop_begin); + } + } + + if (outer_work_amount > 1) { + std::vector apply_increments = InsertLoops::calculate_outer_apply_increments(body_shapes); + std::vector outer_finalization_offsets(body_shapes.size(), 0); + const auto& outer_loop_begin = op::insertLoopBegin(body_parameters); + op::insertLoopEnd(body_results, outer_loop_begin, outer_work_amount, 1lu, + apply_increments, outer_finalization_offsets); + } + }; + + auto op_is_outside_loop = [](const std::shared_ptr& op) -> bool { + if (ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op)) + return true; + auto& rt = op->get_rt_info(); + auto outside_rt = rt.find("outside_loop"); + bool is_outside = false; + // If rt info isn't setted it means that op should be inside loop by default + if (outside_rt != rt.end()) { + is_outside = outside_rt->second.as(); + } + return is_outside; + }; + + for (auto iter = ops.begin(); iter < ops.end(); iter++) { + const auto op = *iter; + // Need to check for that op should be inside or outside loop + if (op_is_outside_loop(op)) { + continue; + } + + // If we meet loopBegin or Brgemm, it means that all previous nodes from ordered body + // should be in one body. 
It's like stop signal + const auto& loop_begin = ov::as_type_ptr(op); + const auto& brgemm = ov::as_type_ptr(op); + if (loop_begin || brgemm) { + if (!body.empty()) { + if (!body_results.empty()) { + wrap_body_by_loop(body, body_parameters, body_results); + } else { + // If there aren't body results, it means that the current body ops are inputs of the next some operations in ordered_ops + // So this set of the current body ops is part of the future body loop. + // We should save them to add in body ops in the future + std::move(body.begin(), body.end(), std::back_inserter(body_remainder)); + } + } + + // we should skip the next existing Loop body + if (loop_begin) { + const auto &loop_end = loop_begin->get_loop_end(); + iter = std::find(iter, ops.end(), loop_end); + } + + // clear loop body to create the next + body.clear(); + body_parameters.clear(); + body_results.clear(); + } else { + add_missing_body_ops(op, body); + add_body_parameters(op, body_parameters); + add_body_results(op, body_results); + + body.push_back(op); + } + } + + if (!body.empty()) { + wrap_body_by_loop(body, body_parameters, body_results); + } +} + +bool InsertLoops::run_on_model(const std::shared_ptr &model) { RUN_ON_FUNCTION_SCOPE(InsertLoops); if (m_master_shape.is_dynamic()) throw ngraph_error("InsertLoops doesn't support dynamic shapes yet"); - const auto inner_dim = m_master_shape.size() - 1; - // Note: outer_dim will not be used if m_master_shape.size() < 2 - const auto outer_dim = m_loop_depth == 2 ? m_master_shape.size() - 2 : -1; - const auto inner_work_amount = m_master_shape[inner_dim].get_length(); - const auto outer_work_amount = m_loop_depth == 2 ? m_master_shape[outer_dim].get_length() : 1; + const auto inner_work_amount = utils::get_inner_dim(m_master_shape).get_length(); + const auto outer_work_amount = m_loop_depth == 2 ? 
utils::get_outer_dim(m_master_shape).get_length() : 1; + auto ops = model->get_ordered_ops(); ParameterVector commonParams = model->get_parameters(); // Note that topological sort parses node arguments in reversed order, but results are added - in direct order // So ve need to pass the reversed results to LoopEnd to keep the original traversal order in topological sorter @@ -48,50 +244,42 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr 0) { - std::vector apply_increments; - apply_increments.reserve(ioShapes.size()); - // Inner Loop applies increments if a dimension is not broadcasted - std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments), - [=](const PartialShape& ps) { - return ps[inner_dim] != 1 && m_master_shape[inner_dim] != 1; - }); - std::vector inner_finalization_offsets(ioShapes.size(), 0); - if (outer_work_amount > 1) { - // We need to step back if an outer dim is broadcasted, while the corresponding lower one is not - std::transform(ioShapes.begin(), ioShapes.end(), inner_finalization_offsets.begin(), - [=](const PartialShape& ps) { - return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? -inner_work_amount : 0; - }); - } - const auto& inner_loop_begin = op::insertLoopBegin(commonParams); - const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount, - m_vector_size, apply_increments, inner_finalization_offsets); - // set internal flag to enable scalar vs vector loop optimizations - inner_loop_end->has_outer_loop = outer_work_amount > 1; - // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in - // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called - // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg - // assigned to scalar will get corrupted inside the loop body). 
To avoid such cases, we add control dependency - // on LoopBegin to guarantee that the constants are executed inside the Loop. - for (const auto& n : model->get_ordered_ops()) { - if (auto c = std::dynamic_pointer_cast(n)) - c->add_control_dependency(inner_loop_begin); - else if (n == inner_loop_begin) - break; - } - } + if (m_single_loop_body) { + const auto apply_increments = InsertLoops::calculate_inner_apply_increments(m_master_shape, ioShapes); + std::vector inner_finalization_offsets(ioShapes.size(), 0); + if (outer_work_amount > 1) { + inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(m_master_shape, ioShapes); + } + const auto& inner_loop_begin = op::insertLoopBegin(commonParams); + const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount, + m_vector_size, apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. 
+ for (const auto& n : model->get_ordered_ops()) { + if (auto c = std::dynamic_pointer_cast(n)) + c->add_control_dependency(inner_loop_begin); + else if (n == inner_loop_begin) + break; + } - if (outer_work_amount > 1) { - std::vector apply_increments; - apply_increments.reserve(ioShapes.size()); - // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) - std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments), - [=](const PartialShape& ps) { - return ps[outer_dim] != 1 && ps[inner_dim] == 1; - }); - const auto& outer_loop_begin = op::insertLoopBegin(commonParams); - insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1, apply_increments); + if (outer_work_amount > 1) { + std::vector apply_increments = InsertLoops::calculate_outer_apply_increments(ioShapes); + const auto& outer_loop_begin = op::insertLoopBegin(commonParams); + op::insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1lu, apply_increments); + } + } else { + insert_loops_explicitly(ops, m_master_shape, m_vector_size); + } } return true; -} \ No newline at end of file +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 499be69e67f062..49baad1ea29aed 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -7,6 +7,7 @@ #include "snippets/pass/insert_movebroadcast.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include #include @@ -30,6 +31,10 @@ std::shared_ptr broadcast_node_last_dim(const ngraph::Output(broadcasted_node, broadcasted_shape); + // BroadcastMove should be immediately executed after its input op (input op is node with output which should be broadcasted). 
+ // For example, to execute Broadcast outside of a Loop We transfer control dependents and copy rt info + broadcasted_node->add_node_control_dependents(value.get_node_shared_ptr()); + ov::copy_runtime_info(value.get_node_shared_ptr(), broadcasted_node); } return broadcasted_node; @@ -64,23 +69,22 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { return false; } - auto is_scalar_constant = [](const ov::Output& v){ - if (auto constant = ov::as_type_ptr(v.get_node_shared_ptr())) { - if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) { - return true; - } - } - return false; + auto is_ignored_node = [](const ov::Output& v){ + // We don't need to insert BroadcastMove after the following operations: + // - Scalar has emitter with explicit broadcasting + // - VectorBuffer has scalar output shape to avoid broadcast conflicts and manually shape insertion. + return utils::is_scalar_constant(v.get_node_shared_ptr()) || + ov::is_type(v.get_node_shared_ptr()); }; std::vector input_shapes; - std::vector ignore_as_scalar; + std::vector is_ignored; for (const auto& val : values) { input_shapes.emplace_back(val.get_partial_shape()); - ignore_as_scalar.push_back(is_scalar_constant(val)); + is_ignored.push_back(is_ignored_node(val)); // Do not insert MoveBroadcast if any of the last dims is dynamic, // since we don't know if we really need it. In these cases, broadcasting will be performed // by outer Loop based on runtime shapes. 
- if (!ignore_as_scalar.back() && !input_shapes.back().rbegin()->is_static()) + if (!is_ignored.back() && !input_shapes.back().rbegin()->is_static()) return false; } @@ -89,7 +93,7 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { ngraph::OutputVector broadcasted_inputs; for (size_t i = 0; i < values.size(); ++i) { - if (ignore_as_scalar[i]) { + if (is_ignored[i]) { broadcasted_inputs.push_back(values[i]); } else { auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]); diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp index f3765e471971a2..9945724c83e88d 100644 --- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp @@ -37,7 +37,7 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro auto inshape = root->input(0).get_partial_shape(); auto outshape = root->output(0).get_partial_shape(); - auto broadcastload = std::make_shared(param, outshape); + auto broadcastload = std::make_shared(param, outshape, ov::as_type_ptr(input)->get_offset()); ngraph::copy_runtime_info(root, broadcastload); ngraph::replace_node(root, broadcastload); diff --git a/src/common/snippets/src/pass/propagate_buffer_offset.cpp b/src/common/snippets/src/pass/propagate_buffer_offset.cpp new file mode 100644 index 00000000000000..965b31f14fdb3f --- /dev/null +++ b/src/common/snippets/src/pass/propagate_buffer_offset.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/pass/propagate_buffer_offset.hpp" +#include "snippets/op/subgraph.hpp" + + +ngraph::snippets::pass::PropagateBufferOffset::PropagateBufferOffset() { + MATCHER_SCOPE(PropagateBufferOffset); + + auto m_buffer = 
ngraph::pattern::wrap_type(); + + auto callback = [&](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::PropagateBufferOffset") + auto root = m.get_match_root(); + const auto buffer = ov::as_type_ptr(root); + + // If Buffer has offset We set this offset in the next Load and Store ops + // to correctly read and write data because all buffers have the one register + // Also if user sets offset to a Buffer It means that the Buffer has the corresponding Load and Store ops + + // Propagate to up: in Store. Buffer can have only one Store + { + auto parent = buffer->get_input_node_shared_ptr(0); + auto idx = buffer->input(0).get_source_output().get_index(); + while (std::dynamic_pointer_cast(parent)) { + const auto source_output = parent->input_value(idx); + parent = source_output.get_node_shared_ptr(); + idx = source_output.get_index(); + } + if (auto store = std::dynamic_pointer_cast(parent)) { + store->set_offset(current_offset); + } else if (const auto brgemm = std::dynamic_pointer_cast(parent)) { + // Brgemm encapsulates work with loading and storing of data + brgemm->set_offset_c(current_offset); + } else { + throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Store op for offset propagation"); + } + } + + // Propagate to down: in Load. Buffer can have several Load and Loops after himself. 
We should go through all target inputs + { + std::function&)> propagate_down; + propagate_down = [&](const Input& target_input) { + const auto child = target_input.get_node()->shared_from_this(); + if (std::dynamic_pointer_cast(child)) { + const auto index = target_input.get_index(); + for (const auto loop_target_output : child->output(index).get_target_inputs()) { + propagate_down(loop_target_output); + } + } else if (const auto load = std::dynamic_pointer_cast(child)) { + load->set_offset(current_offset); + } else if (const auto brgemm = std::dynamic_pointer_cast(child)) { + // Brgemm encapsulates work with loading and storing of data + if (target_input.get_index() == 0) { + brgemm->set_offset_a(current_offset); + } else { + brgemm->set_offset_b(current_offset); + } + } else { + throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Load op for offset propagation"); + } + }; + + for (const auto target_output : buffer->output(0).get_target_inputs()) { + propagate_down(target_output); + } + } + + current_offset += buffer->get_byte_size(); + return true; + }; + + auto m = std::make_shared(m_buffer, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/reset_buffer.cpp b/src/common/snippets/src/pass/reset_buffer.cpp new file mode 100644 index 00000000000000..aba83f6a450efd --- /dev/null +++ b/src/common/snippets/src/pass/reset_buffer.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/op/subgraph.hpp" + + +namespace { +void normalize_ptr_and_offsets(const ov::NodeVector &io, std::vector &ptr_increments, std::vector &finalization_offsets) { + bool there_is_buffer = false; + // Iterations are from end because before we correct finalization offsets for Loop outputs (io = inputs + outputs) + for (int i = 
static_cast(io.size()) - 1; i >= 0; --i) { + if (ov::is_type(io[i])) { + if (there_is_buffer) { + ptr_increments[i] = 0; + finalization_offsets[i] = 0; + } else { + there_is_buffer = true; + } + } + } +} +} // namespace + +int64_t ngraph::snippets::pass::ResetBufferState::calculate_required_finalization_offsets(const size_t back_step, const size_t target_work_amount) { + return target_work_amount != 1 ? -static_cast(back_step) : 0; +} + +ngraph::snippets::pass::ResetBufferState::ResetBufferState() { + MATCHER_SCOPE(ResetBufferState); + + // Match on LoopEnd is enough at the moment because Buffer op may be only after MatMul and LoopEnd, but + // MatMul doesn't change Buffer memory pointer after execution + auto m_loop_end = ngraph::pattern::wrap_type(); + + auto callback = [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ResetBufferState") + auto& pattern_to_output = m.get_pattern_value_map(); + + const auto loop_end = ngraph::as_type_ptr(pattern_to_output.at(m_loop_end).get_node_shared_ptr()); + const auto loop_begin = loop_end->get_loop_begin(); + + const auto i_size = loop_begin->get_input_size(); + const auto o_size = loop_end->get_output_size(); + const auto count_io = i_size + o_size; + std::vector body_shapes(count_io); + ov::NodeVector io(count_io); + for (size_t i = 0; i < i_size; ++i) { + body_shapes[i] = loop_begin->input_value(i).get_partial_shape(); + io[i] = loop_begin->input_value(i).get_node_shared_ptr(); + auto port_idx = loop_begin->input_value(i).get_index(); + while (std::dynamic_pointer_cast(io[i])) { + const auto source_output = io[i]->input_value(port_idx); + io[i] = source_output.get_node_shared_ptr(); + port_idx = source_output.get_index(); + } + } + for (size_t i = 0; i < o_size; ++i) { + body_shapes[i_size + i] = loop_end->output(i).get_partial_shape(); + // check for first target input is enough for Buffer searching because operations can have only single Buffer per each 
output port as op + auto consumer = *loop_end->output(i).get_target_inputs().begin(); + auto port_idx = consumer.get_index(); + io[i_size + i] = consumer.get_node()->shared_from_this(); + while (std::dynamic_pointer_cast(io[i_size + i])) { + auto consumer = *io[i_size + i]->output(port_idx).get_target_inputs().begin(); + port_idx = consumer.get_index(); + io[i_size + i] = consumer.get_node()->shared_from_this(); + } + } + + auto ptr_increments = loop_end->get_ptr_increments(); + auto finalization_offsets = loop_end->get_finalization_offsets(); + + // If after Loop there is immediately Buffer, we should reset the Buffer ptr for the next calculations + for (size_t i = 0; i < o_size; ++i) { + const auto result_shape = body_shapes[i_size + i].get_shape(); + // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op + const auto consumer = loop_end->output(i).get_target_inputs().begin()->get_node(); + if (ov::is_type(consumer)) { + // To calculate finalization offset we should know index of nesting Loop + auto loop_index = 0lu; + auto loop = loop_end->input_value(i).get_node_shared_ptr(); + auto port_idx = loop_end->input_value(i).get_index(); + while (std::dynamic_pointer_cast(loop)) { + const auto source_output = loop->input_value(port_idx); + loop = source_output.get_node_shared_ptr(); + port_idx = source_output.get_index(); + loop_index++; + } + + const auto work_amount = std::accumulate(result_shape.rbegin(), result_shape.rbegin() + loop_index + 1, 1, std::multiplies()); + finalization_offsets[i_size + i] = + calculate_required_finalization_offsets(work_amount, *(result_shape.rbegin() + loop_index)); + } + } + + // If there are several Buffers on I/O we should remember that all Buffer have the register, + // so we should update ptr for only one Buffer + normalize_ptr_and_offsets(io, ptr_increments, finalization_offsets); + loop_end->set_finalization_offsets(finalization_offsets); + 
loop_end->set_ptr_increments(ptr_increments); + + return true; + }; + + auto m = std::make_shared(m_loop_end, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000000..daf811c97fc7ea --- /dev/null +++ b/src/common/snippets/src/pass/softmax_decomposition.cpp @@ -0,0 +1,213 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/remarks.hpp" +#include + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/pass/insert_loops.hpp" +#include "snippets/pass/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include +#include +#include + + +ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank) { + MATCHER_SCOPE(SoftmaxDecomposition); + + auto m_softmax = ngraph::pattern::wrap_type(); + + auto callback = [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxDecomposition") + auto root = m.get_match_root(); + const auto master_pshape = root->get_input_partial_shape(0); + const auto rank = master_pshape.rank(); + if (rank.is_dynamic() || master_pshape.is_dynamic()) + return false; + + int64_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(root)) { + axis = ngraph::normalize_axis(root->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(root)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + const auto shape_rank = rank.get_length(); + if (axis != shape_rank - 1) + return false; + + const auto data = root->get_input_node_shared_ptr(0); + + const auto master_shape = master_pshape.get_shape(); + const auto dimension = shape_rank - 1; + 
const auto work_amount = master_shape[dimension]; + const auto increment = vector_size; + const auto inner_dim = shape_rank - 1; + const auto inner_master_work_amount = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? static_cast(shape_rank - 2) : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + + /* ====== ReduceMax decomposition ====== */ + + // We have to have fake edge Data -> Loop[ReduceMax] -> Loop[Sub + Exp + ReduceSum] because ReduceMax is + // accumulator which finds maximum of elements and save it to vector register. Loop works only with GPR (data) but ReduceMax Loop + // doesn't save maximum to data. Seems like, LoopEnd shouldn't have outputs: + // Data + // VectorBuffer LoopBegin \ + // \ Load \ | + // Maximum / | + // / LoopEnd | + // HorizonMax / + // \ LoopBegin[Sub + Exp + ReduceSum] + // But nGraph doesn't allow to have 0 outputs for Node (at least 1 output). + // Thus, we propagate data through Loop[ReduceMax] using fake edge because of that Loop[ReduceMax] has two inputs "Data" + // Data + // VectorBuffer LoopBegin + // \ Load | \ + // Maximum | / + // / LoopEnd + // HorizonMax | + // \ LoopBegin[Sub + Exp + ReduceSum] + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + auto apply_increments_max = + InsertLoops::calculate_inner_apply_increments(master_shape, {data->get_shape(), data->get_shape(), data->get_shape()}); + // Input of softmax is Input and Output of this loop, which isn't used inside (it's just to have one output in Loop at least) + // So we shouldn't increment pointer after each loop iteration + apply_increments_max[1] = false; + apply_increments_max[2] = false; + // we should always reset data ptr after this loop 
because in the next Loop this ptr is used + const auto finalization_offsets_max = + std::vector{ ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, data->get_shape()[inner_dim]), 0, 0 }; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + const auto horizon_max = std::make_shared(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + auto apply_increments_sum = + InsertLoops::calculate_inner_apply_increments(master_shape, {load_sub->get_shape(), store_exp->get_shape()}); + std::vector finalization_offsets_sum(2, 0); + if (has_outer_loop) { + finalization_offsets_sum = + InsertLoops::calculate_finalization_offsets(master_shape, {load_sub->get_shape(), store_exp->get_shape()}); + } + // we should always reset buffer ptr after loop because in the next Loop this buffer ptr is used + finalization_offsets_sum[1] = ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, store_exp->get_shape()[inner_dim]); + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + + const auto horizon_sum = std::make_shared(sum); + const auto buffer_exp = std::make_shared(loop_sum_end->output(0), buffer_allocation_rank); + + /* 
=========================================== */ + + /* ================== Div ==================== */ + + // Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop + const auto pow = std::make_shared(horizon_sum, + ngraph::op::Constant::create(ov::element::f32, ngraph::Shape{}, {-1})); + + const auto loop_div_begin = op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + auto apply_increments_div = + InsertLoops::calculate_inner_apply_increments(master_shape, {load_div->get_shape(), store_div->get_shape()}); + std::vector finalization_offsets_div(2, 0); + if (has_outer_loop) { + finalization_offsets_div = + InsertLoops::calculate_finalization_offsets(master_shape, {load_div->get_shape(), store_div->get_shape()}); + } + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + + /* =========================================== */ + + /* ========== Control dependency ============= */ + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + horizon_max->add_control_dependency(loop_max_end); + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_end->add_control_dependency(sum); + horizon_sum->add_control_dependency(loop_sum_end); + loop_div_begin->add_control_dependency(pow); + + /* =========================================== */ + + /* ============= Runtime Info ================ */ + + // For tail loop we should fill input of Max by float min and + // input of Sum by zero to avoid math incorrect calculations + max->input(0).get_rt_info()["set_fill"] = uint32_t(0xff7fffff); + 
sum->input(0).get_rt_info()["set_fill"] = uint32_t(0x00000000); + + // These nodes should be executed outside loops + ov::NodeVector ops_outside_loop = { vector_buffer_max, horizon_max, vector_buffer_sum, horizon_sum, pow, buffer_exp }; + for (const auto& op : ops_outside_loop) { + op->get_rt_info()["outside_loop"] = true; + } + + ngraph::copy_runtime_info(root, + {vector_buffer_max, loop_max_begin, load_max, max, horizon_max, loop_max_end, + vector_buffer_sum, loop_sum_begin, load_sub, sub, exp, sum, store_exp, horizon_sum, loop_sum_end, buffer_exp, pow, + loop_div_begin, load_div, mul, store_div, loop_div_end}); + + /* =========================================== */ + + ngraph::replace_node(root, loop_div_end); + + /* ============== Outer loop ================= */ + if (has_outer_loop) { + std::vector apply_increments = + InsertLoops::calculate_outer_apply_increments({root->get_input_shape(0), root->get_output_shape(0)}); + const auto softmax_parameters = + std::vector>{loop_max_begin->input(0).get_source_output()}; + const auto output_set = loop_div_end->output(0).get_target_inputs(); + const auto softmax_results = std::vector>{output_set.begin(), output_set.end()}; + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(softmax_parameters); + const auto outer_loop_end = ngraph::snippets::op::insertLoopEndBeforeInputs( + softmax_results, outer_loop_begin, master_shape[outer_dim], 1, apply_increments); + + vector_buffer_max->add_control_dependency(outer_loop_begin); + + ngraph::copy_runtime_info(root, {outer_loop_begin, outer_loop_end}); + } + /* =========================================== */ + + return true; + }; + + auto m = std::make_shared(m_softmax, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/softmax_reshape_elimination.cpp b/src/common/snippets/src/pass/softmax_reshape_elimination.cpp new file mode 100644 index 00000000000000..f770f4e80668cd --- /dev/null +++ 
b/src/common/snippets/src/pass/softmax_reshape_elimination.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/softmax_reshape_elimination.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include + +ngraph::snippets::pass::SoftmaxReshapeElimination::SoftmaxReshapeElimination() { + MATCHER_SCOPE(SoftmaxReshapeElimination); + const auto m_reshape0 = pattern::wrap_type(pattern::has_static_shape()); + const auto m_softmax = pattern::wrap_type({m_reshape0}); + const auto m_reshape1 = pattern::wrap_type({m_softmax, pattern::wrap_type()}); + + register_matcher(std::make_shared(m_reshape1, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxReshapeElimination") + auto& pattern_to_output = m.get_pattern_value_map(); + auto reshape0 = pattern_to_output[m_reshape0].get_node_shared_ptr(); + auto softmax = pattern_to_output[m_softmax].get_node_shared_ptr(); + auto reshape1 = pattern_to_output[m_reshape1].get_node_shared_ptr(); + + const auto input_shape = reshape0->get_input_partial_shape(0); + const auto output_shape = reshape1->get_output_partial_shape(0); + if (input_shape.is_dynamic() || output_shape.is_dynamic() || input_shape.get_shape() != output_shape.get_shape()) + return false; + + const auto softmax_rank = softmax->get_input_partial_shape(0).rank(); + int64_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(softmax)) { + axis = ngraph::normalize_axis(softmax->get_friendly_name(), softmax_v8->get_axis(), softmax_rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(softmax)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + // Supports only last axis + if (axis != softmax_rank.get_length() - 1) + return false; + + // Dimensions by reduction axis should be equal + if 
(input_shape.get_shape().back() != softmax->get_input_shape(0).back()) + return false; + + // Eliminate Reshape before Softmax + reshape0->output(0).replace(reshape0->input_value(0)); + copy_runtime_info({reshape0->input_value(0).get_node_shared_ptr(), reshape0->output(0).get_node_shared_ptr()}, + reshape0->input_value(0).get_node_shared_ptr()); + + // Eliminate Reshape after Softmax with name saving + replace_output_update_name(reshape1->output(0), reshape1->input_value(0)); + + // update axis + const auto new_axis = input_shape.rank().get_length() - 1; + if (auto softmax_v8 = ngraph::as_type_ptr(softmax)) { + softmax_v8->set_axis(new_axis); + } else if (auto softmax_v1 = ngraph::as_type_ptr(softmax)) { + softmax_v1->set_axis(new_axis); + } + + return true; + }); +} diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index db9b00bf5b8f2a..5dc6960b2fd71a 100644 --- a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -60,7 +60,7 @@ ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() { auto loop_C_begin = std::make_shared(OutputVector{loop_W_begin->output(0)}); // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation // fix this in future and develop a more consistent shape propagation approach. 
- auto load = std::make_shared(loop_C_begin->output(0), 1, access_pattern); + auto load = std::make_shared(loop_C_begin->output(0), 1, 0, access_pattern); auto store = std::make_shared(load, 1); const std::vector ptr_increments_C {size_H * size_W, 1}; const std::vector finalization_offsets_C {1 - size_H * size_W * size_C, 0}; diff --git a/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp index 20c2fa1b272958..8b886ef9876b06 100644 --- a/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp +++ b/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp @@ -16,6 +16,7 @@ namespace snippets { typedef std::tuple< std::vector, // Input shapes + PartialShape, // Master shape size_t // Transpose position > fuseTransposeBrgemmParams; diff --git a/src/common/snippets/tests/include/pass/softmax_decomposition.hpp b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000000..3943bd641bf8bb --- /dev/null +++ b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + Shape, // Input shape 0 + int // Axis +> SoftmaxParams; + +typedef std::tuple< + Shape, // Input shape 0 + Shape, // Input shape 1 + int // Axis +> AddSoftmaxParams; + +class SoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +class AddSoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + 
std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index ef5b74a08b910d..110c2052bd8399 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -21,7 +21,12 @@ DummyTargetMachine::DummyTargetMachine() { jitters[op::v1::Add::get_type_info_static()] = dummy_functor; jitters[op::v1::Subtract::get_type_info_static()] = dummy_functor; jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; - jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; + jitters[op::v1::Divide::get_type_info_static()] = dummy_functor; + jitters[op::v1::Maximum::get_type_info_static()] = dummy_functor; + jitters[op::v0::Exp::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::PowerStatic::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor; @@ -33,6 +38,9 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Brgemm::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = dummy_functor; } void LoweringTests::SetUp() { diff --git a/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp 
b/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp index a3f60e4656abc1..1962bb610db3a3 100644 --- a/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp +++ b/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp @@ -14,11 +14,13 @@ namespace snippets { std::string FuseTransposeBrgemmTests::getTestCaseName(testing::TestParamInfo obj) { std::vector input_shapes(2); + PartialShape master_shape; size_t transpose_position; - std::tie(input_shapes, transpose_position) = obj.param; + std::tie(input_shapes, master_shape, transpose_position) = obj.param; std::ostringstream result; result << "IS[0]=" << CommonTestUtils::partialShape2str({input_shapes[0]}) << "_"; result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_"; + result << "MS=" << CommonTestUtils::partialShape2str({master_shape}) << "_"; result << "Pos=" << transpose_position << "_"; return result.str(); } @@ -27,7 +29,7 @@ void FuseTransposeBrgemmTests::SetUp() { LoweringTests::SetUp(); std::vector input_shapes(2); size_t transpose_position; - std::tie(input_shapes, transpose_position) = this->GetParam(); + std::tie(input_shapes, master_shape, transpose_position) = this->GetParam(); snippets_function = std::make_shared(input_shapes, transpose_position); } @@ -41,9 +43,9 @@ TEST_P(FuseTransposeBrgemmTests, FuseTransposeMatmul) { namespace FuseTransposeBrgemmTestsInstantiation { using ov::Shape; std::vector test_params{ - {{{1, 49, 2, 23}, {2, 2, 23, 39}}, 0}, - {{{1, 2, 49, 23}, {2, 23, 1, 39}}, 1}, - {{{1, 2, 49, 23}, {2, 2, 23, 39}}, 2}, + {{{1, 49, 2, 23}, {2, 2, 23, 39}}, {2, 2, 49, 23}, 0}, + {{{1, 2, 49, 23}, {2, 23, 1, 39}}, {2, 2, 49, 39}, 1}, + {{{1, 2, 49, 23}, {2, 2, 23, 39}}, {2, 2, 49, 39}, 2}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_FuseTransposeMatMul, FuseTransposeBrgemmTests, diff --git a/src/common/snippets/tests/src/pass/softmax_decomposition.cpp b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp new file mode 100644 index 
00000000000000..91a95608926ac6 --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp @@ -0,0 +1,124 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "pass/softmax_decomposition.hpp" +#include "common_test_utils/common_utils.hpp" +#include "subgraph_softmax.hpp" +#include "subgraph_lowered.hpp" + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/insert_load_store.hpp" +#include "snippets/pass/insert_movebroadcast.hpp" +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/pass/propagate_buffer_offset.hpp" +#include "snippets/pass/convert_power_to_powerstatic.hpp" + + +namespace ov { +namespace test { +namespace snippets { + +std::string SoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + Shape inputShape; + int axis; + std::tie(inputShape, axis) = obj.param; + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void SoftmaxTests::SetUp() { + LoweringTests::SetUp(); + + const size_t count = 10; + manager.register_pass(count); + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + Shape inputShape; + int axis; + std::tie(inputShape, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape}, axis); + master_shape = inputShape; +} + +std::string AddSoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + Shape inputShape0, inputShape1; + int axis; + std::tie(inputShape0, inputShape1, axis) = obj.param; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShape0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShape1) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void AddSoftmaxTests::SetUp() { + LoweringTests::SetUp(); + + const size_t count = 10; + 
manager.register_pass(); + manager.register_pass(count); + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + manager.register_pass(); + Shape inputShape0, inputShape1; + int axis; + std::tie(inputShape0, inputShape1, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape0, inputShape1}, axis); + + ov::PartialShape master_pshape(inputShape0); + ov::PartialShape::broadcast_merge_into(master_pshape, inputShape1, op::AutoBroadcastType::NUMPY); + master_shape = master_pshape.get_shape(); +} + +TEST_P(SoftmaxTests, SoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->get_body(); + function_ref = snippets_function->getLowered(); +} + +TEST_P(AddSoftmaxTests, AddSoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->get_body(); + function_ref = snippets_function->getLowered(); +} + +namespace SoftmaxTestsInstantiation { +std::vector inputShape{{12, 4, 12, 12, 127}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SoftmaxDecomposition, SoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1)), + SoftmaxTests::getTestCaseName); + +} // namespace SoftmaxTestsInstantiation + +namespace AddSoftmaxTestsInstantiation { +std::vector inputShape0{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}}; +std::vector inputShape1{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmaxDecomposition, AddSoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape0), + ::testing::ValuesIn(inputShape1), + 
::testing::Values(-1)), + AddSoftmaxTests::getTestCaseName); + +} // namespace AddSoftmaxTestsInstantiation + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp b/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp new file mode 100644 index 00000000000000..f8b51924a025ae --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include + +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST_F(TransformationTestsF, SoftmaxV1ReshapeElimination) { + { + auto data = std::make_shared(element::f32, Shape{2, 3, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{6, 240}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, 1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{2, 3, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + function = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } + { + auto data = std::make_shared(element::f32, Shape{2, 3, 240}); + auto softmax_v1 = std::make_shared(data, 2); + function_ref = std::make_shared(NodeVector{softmax_v1}, ParameterVector{data}); + } +} + +TEST_F(TransformationTestsF, SoftmaxV8ReshapeElimination) { + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{680, 240}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = 
std::make_shared(softmax_v1, shape1, false); + function = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto softmax_v1 = std::make_shared(data, 3); + function_ref = std::make_shared(NodeVector{softmax_v1}, ParameterVector{data}); + } +} + +TEST_F(TransformationTestsF, SoftmaxReshapeElimination_IncorrectReshape) { + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{2, 81600}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + function = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } +} diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index 1438fc286ce4e4..f3b3f30af25a7c 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -46,6 +46,8 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // data movement jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(VectorBufferEmitter); // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); @@ -123,6 +125,10 @@ 
ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // jitters[ngraph::opset1::Selu::get_type_info_static()] = CREATE_EMITTER(); // not supported jitters[ngraph::op::v0::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v0_emitter); jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = CREATE_EMITTER(FillEmitter); + + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = CREATE_EMITTER(HorizonMaxEmitter); + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = CREATE_EMITTER(HorizonSumEmitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = CREATE_EMITTER(LoopBeginEmitter); diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 327e6acd258438..e2eb93ecf4d7b1 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -128,10 +128,13 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: } return layout; }; + const auto& ops = model->get_ordered_ops(); auto params = model->get_parameters(); auto results = model->get_results(); num_inputs = params.size(); num_outputs = results.size(); + is_buffer_needed = std::any_of(ops.begin(), ops.end(), + [](const std::shared_ptr& node) { return ov::is_type(node); } ); NodeVector io_nodes; std::copy(params.begin(), params.end(), std::back_inserter(io_nodes)); std::copy(results.begin(), results.end(), std::back_inserter(io_nodes)); @@ -210,15 +213,15 @@ void KernelEmitter::validate_arguments(const std::vector &in, IE_THROW() << "KernelEmitter got invalid number of inputs. 
Expected 0, got " << in.size(); if (!out.empty()) IE_THROW() << "KernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); - const auto num_params = num_inputs + num_outputs; + const auto num_params = num_inputs + num_outputs + static_cast(is_buffer_needed); // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount if (data_ptr_regs_idx.size() != num_params) IE_THROW() << "KernelEmitter: number of inputs and outputs is inconsisnent with the number of allocated registers" << num_params << " data_ptr_regs_idx.size() = " << data_ptr_regs_idx.size(); } -void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, - const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { +void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, bool is_buffer_needed, + const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter const size_t offset_rank = jcp.master_shape.size() - 1; //const size_t tile_rank = jcp.tile_rank; @@ -277,7 +280,13 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, return reg != reg_indexes_idx && reg != reg_const_params_idx; }); const bool last_iter_explicitly = spare_corruptable_gpr == gp_regs_pool.end(); - Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs.back() : Reg64(static_cast(*spare_corruptable_gpr)); + Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs[num_params - 1] : Reg64(static_cast(*spare_corruptable_gpr)); + // Vector "data_ptr_regs" is sorted by abstract regs. 
+ // It means that the vector contains the physical registers in order [src, .., src, dst, .., dst, buffer] + // So we can initialize buffer register firstly as last value of vector "data_ptr_regs" + if (is_buffer_needed) { + h->mov(data_ptr_regs[num_params], h->ptr[reg_const_params + GET_OFF(buffer_scratchpad_ptr)]); + } size_t i = 0; for (; i < num_params - last_iter_explicitly; i++) { if (i < num_inputs) @@ -310,7 +319,7 @@ void KernelEmitter::emit_impl(const std::vector& in, std::vector data_ptr_regs; transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); - init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); + init_data_pointers(num_inputs, num_inputs + num_outputs, is_buffer_needed, reg_indexes, reg_const_params, data_ptr_regs); for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; @@ -535,7 +544,9 @@ StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::c if (src_prc != dst_prc) IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } @@ -562,7 +573,7 @@ void StoreEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreEmitter::emit_data() const { @@ -574,7 +585,12 @@ LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu if (src_prc != dst_prc) IE_THROW() << "LoadEmitter supports only equal input and 
output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = std::dynamic_pointer_cast(n)->get_count(); + const auto load = std::dynamic_pointer_cast(n); + if (!load) + IE_THROW() << "LoadEmitter expects Load snippets op"; + + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -601,7 +617,7 @@ void LoadEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadEmitter::emit_data() const { @@ -611,8 +627,13 @@ void LoadEmitter::emit_data() const { BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { if (src_prc != dst_prc) - IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + const auto broadcast_load = std::dynamic_pointer_cast(n); + if (!broadcast_load) + IE_THROW() << "BroadcastLoadEmitter expects BroadcastLoad snippets op"; + + byte_offset = broadcast_load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; } @@ -642,16 +663,18 @@ void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::ve // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, // key point here is not to add post-increment, it might be fixed by some other approach in future switch (src_prc.size()) { - case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg]); 
break; - case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg]); break; - case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg]); break; + case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + byte_offset]); break; + case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + byte_offset]); break; + case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + byte_offset]); break; default: assert(!"unsupported data type"); } } LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto load = ov::as_type_ptr(n); + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -676,7 +699,7 @@ template void LoadConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadConvertEmitter::emit_data() const { @@ -685,7 +708,9 @@ void LoadConvertEmitter::emit_data() const { StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; if (ov::is_type(n)) { @@ -715,7 +740,7 @@ template void StoreConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + 
store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreConvertEmitter::emit_data() const { @@ -814,6 +839,10 @@ BrgemmEmitter::BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: } } } + + load_offset_a = brgemm_node->get_offset_a(); + load_offset_b = brgemm_node->get_offset_b(); + store_offset_c = brgemm_node->get_offset_c(); } void BrgemmEmitter::initBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, bool use_amx) const { @@ -854,8 +883,9 @@ void BrgemmEmitter::emit_impl(const std::vector& in, } template void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, int bs, - Reg64 addr_A, Reg64 addr_B, - const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch) const { + Reg64 addr_A, Reg64 addr_B, + const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch, + const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const { using Vmm = typename dnnl::impl::utils::conditional3::type; size_t gpr_size = 8; Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, @@ -905,8 +935,14 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, in // todo: Windows ABI : requires different num of arguments passed in regs and on the stack. Need to align. 
h->mov(abi_param1, reinterpret_cast(brgKernel)); h->mov(abi_param2, bs); - h->uni_vmovq(abi_param3, Xmm(0)); - h->uni_vmovq(abi_param4, Xmm(1)); + + const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t bytes_offset) { + h->uni_vmovq(reg, xmm); + if (bytes_offset) h->add(reg, bytes_offset); + }; + data_ptr(Xmm(0), abi_param3, in0_kernel_offset); + data_ptr(Xmm(1), abi_param4, in1_kernel_offset); + size_t num_args_passed_on_stack = 1; #ifdef _WIN32 num_args_passed_on_stack = 3; @@ -915,14 +951,15 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, in h->mov(h->qword[h->rsp], reinterpret_cast(scratch)); h->mov(h->qword[h->rsp + gpr_size], reinterpret_cast(batch)); h->mov(h->qword[h->rsp + 2 * gpr_size], Xmm(2)); + if (out0_kernel_offset) h->add(h->qword[h->rsp + 2 * gpr_size], out0_kernel_offset); #else h->mov(abi_param5, reinterpret_cast(batch)); - h->uni_vmovq(abi_param6, Xmm(2)); + data_ptr(Xmm(2), abi_param6, out0_kernel_offset); h->sub(h->rsp, gpr_size); h->mov(h->qword[h->rsp], reinterpret_cast(scratch)); #endif - // align stack on 16-byte as ABI requires - // note that RBX must not be changed by the callee + // align stack on 16-byte as ABI requires + // note that RBX must not be changed by the callee h->mov(h->rbx, h->rsp); h->and_(h->rbx, 0xf); h->sub(h->rsp, h->rbx); @@ -975,32 +1012,205 @@ void BrgemmEmitter::emit_isa(const std::vector &in, const std::vectoradd(input_0, in0_offset); - if (in1_offset != 0) - h->add(input_1, in1_offset); - if (out0_offset != 0) - h->add(output_0, out0_offset); + const size_t in0_offset = load_offset_a + (k * K0_step0 + mb * M_blk * brgemmCtx.LDA) * io_data_size[0]; + const size_t in1_offset = load_offset_b + (k * K0_step1 + n * N0_step0) * io_data_size[1]; + const size_t out0_offset = store_offset_c + (n * N0_step1 + mb * M_blk * brgemmCtx.LDC) * io_data_size[2]; + emit_brgemm_kernel_call(brgKernels0[getBrgIdx(mIdx, k, n)].get(), 1, input_0, input_1, nullptr, output_0, - nullptr); - if 
(in0_offset != 0) - h->sub(input_0, in0_offset); - if (in1_offset != 0) - h->sub(input_1, in1_offset); - if (out0_offset != 0) - h->sub(output_0, out0_offset); + nullptr, + in0_offset, + in1_offset, + out0_offset); } } } } } + +HorizonMaxEmitter::HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void HorizonMaxEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonMax emitter doesn't support " << host_isa_; + } +} + +template +void HorizonMaxEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + // Let the first value be the max + h->mov(aux_reg, h->ptr[h->rsp]); + h->vmovq(dst_xmm, aux_reg); + for (size_t i = 1; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vmaxps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +HorizonSumEmitter::HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, 
emitter_in_out_map::vec_to_vec) {} + +void HorizonSumEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonSum emitter doesn't support " << host_isa_; + } +} + +template +void HorizonSumEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vpxor(dst_xmm, dst_xmm, dst_xmm); + for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vaddps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +VectorBufferEmitter::VectorBufferEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void VectorBufferEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } 
else { + IE_THROW() << "Zero emitter doesn't support " << host_isa_; + } +} + +template +void VectorBufferEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm vmm = Vmm(out[0]); + h->uni_vpxor(vmm, vmm, vmm); +} + +FillEmitter::FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + const auto fill = ov::as_type_ptr(n); + if (fill->get_element_type().size() != 4) { + IE_THROW() << "Fill emitter supports only 4 Byte element types but gets: " << fill->get_element_type(); + } + + offset = fill->get_offset(); + fill_value = fill->get_fill_value(); + prepare_table(); +} + +size_t FillEmitter::aux_gprs_count() const { + // + 1 reg for temp reg for mask in avx512 + return one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core) ? 2 : 1; +} + +void FillEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Fill emitter doesn't support " << host_isa_; + } +} + +template +void FillEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Vmm dst_vmm = Vmm(out[0]); + + if (one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core)) { + uint64_t tail_mask = 1; + tail_mask = ~((tail_mask << offset) - tail_mask); + h->mov(Reg64(aux_gpr_idxs[0]), tail_mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vblendmps(dst_vmm | k_mask, src_vmm, table_val("value")); + } else if 
(one_of(host_isa_, dnnl::impl::cpu::x64::avx2, dnnl::impl::cpu::x64::sse41)) { + uint8 imm = 1; + imm = ~((imm << offset) - imm); // shift load_num bit + if (host_isa_ == dnnl::impl::cpu::x64::sse41 && src_vmm.getIdx() != dst_vmm.getIdx()) { + h->uni_vmovups(dst_vmm, src_vmm); + src_vmm = Vmm(dst_vmm.getIdx()); + } + h->uni_vblendps(dst_vmm, src_vmm, table_val("value"), imm); + } +} + +void FillEmitter::register_table_entries() { + push_arg_entry_of("value", fill_value, true); +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index c559f2421f0235..0a2373002ef434 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -33,6 +33,7 @@ namespace intel_cpu { struct jit_snippets_call_args { const void *src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; void *dst_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; + void *buffer_scratchpad_ptr = nullptr; }; struct jit_snippets_compile_args { @@ -94,12 +95,13 @@ class KernelEmitter : public jit_container_emitter { const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector&) const; + void init_data_pointers(size_t, size_t, bool, const Reg64&, const Reg64&, const std::vector&) const; jit_snippets_compile_args jcp; std::vector gp_regs_pool; size_t num_inputs; size_t num_outputs; + bool is_buffer_needed; // Vector of indices (lenght = input tensor rank) per every input and output that describes in which order // corresponding tensor dimensions are accessed (default: consecutive dense, e.g. 0,1,2,3 for 4D tensor). // Needed to calc i/o offsets. 
@@ -254,6 +256,9 @@ class MemoryEmitter : public jit_emitter { protected: Precision src_prc; Precision dst_prc; + + size_t count = 0; + size_t byte_offset = 0; }; class StoreEmitter : public MemoryEmitter { @@ -274,7 +279,6 @@ class StoreEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; @@ -296,7 +300,6 @@ class LoadEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -335,7 +338,6 @@ class LoadConvertEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -357,7 +359,6 @@ class StoreConvertEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; @@ -391,8 +392,9 @@ class BrgemmEmitter : public jit_emitter { size_t getBrgIdx(size_t mIdx, size_t kIdx, size_t nIdx) const; template void emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, int bs, - Reg64 addr_A, Reg64 addr_B, - const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch) const; + Reg64 addr_A, Reg64 addr_B, + const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch, + const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const; static constexpr size_t BRGEMM_KERNELS_NUM = 8; static constexpr size_t matmulOptimalM = 32; @@ -403,6 +405,94 @@ class BrgemmEmitter : public jit_emitter { size_t K, K_blk, K_tail; size_t N, N_blk, N_tail; size_t brg0VnniFactor; + + size_t load_offset_a = 0lu; + size_t load_offset_b = 0lu; + size_t store_offset_c = 0lu; +}; + +class HorizonMaxEmitter : public jit_emitter { +public: + HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t 
aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class HorizonSumEmitter : public jit_emitter { +public: + HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class VectorBufferEmitter : public jit_emitter { +public: + VectorBufferEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 0;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class FillEmitter : public jit_emitter { +public: + FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override; + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const 
ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + void register_table_entries() override; + + size_t offset = 0; + uint32_t fill_value = 0x0; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index d2a8f5381c9174..11d04459606ad3 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -345,6 +345,8 @@ void Snippet::createPrimitive() { jcp.master_shape = masterShape; jcp.tile_rank = tileRank; generate(&jcp); + buffer_scratchpad_size = snippet->get_buffer_scratchpad_size(); + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); } std::vector Snippet::shapeInfer() const { @@ -468,28 +470,6 @@ bool Snippet::needPrepareParams() const { return inputShapesModified() || !schedule.ptr; } -void Snippet::updateSrcDstPtrs(jit_snippets_call_args& call_args) const { - for (size_t i = 0; i < srcMemPtrs.size(); i++) - call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; - - for (size_t i = 0; i < dstMemPtrs.size(); i++) - call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; -} - -void Snippet::execute(dnnl::stream strm) { - if (schedule.ptr == nullptr) { - IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; - } - jit_snippets_call_args call_args; - updateSrcDstPtrs(call_args); - - if (tensorRank == rank6D) { - schedule_6d(call_args); - } else { - schedule_nt(call_args); - } -} - bool Snippet::canBeInPlace() const { if (isDynamic || getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { return false; @@ -543,19 +523,49 @@ void Snippet::generate(const jit_snippets_compile_args* jcp) { schedule = snippet->generate(optManager, reinterpret_cast(jcp)); } -void Snippet::schedule_6d(const 
jit_snippets_call_args& call_args) const { +void Snippet::update_ptrs(jit_snippets_call_args& call_args) { + for (size_t i = 0; i < srcMemPtrs.size(); i++) + call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; + + for (size_t i = 0; i < dstMemPtrs.size(); i++) + call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; + + if (buffer_scratchpad_size > 0) { + call_args.buffer_scratchpad_ptr = + reinterpret_cast(buffer_scratchpad.data()) + parallel_get_thread_num() * buffer_scratchpad_size; + } +} + +void Snippet::execute(dnnl::stream strm) { + if (schedule.ptr == nullptr) { + IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; + } + if (tensorRank == rank6D) { + schedule_6d(); + } else { + schedule_nt(); + } +} + +void Snippet::schedule_6d() { const auto& dom = exec_domain; // < N, C, H, W > < 1, 1, N, C*H*W> parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { int64_t indexes[] = {d0, d1, d2, d3, d4}; + jit_snippets_call_args call_args; + update_ptrs(call_args); + schedule.get_callable()(indexes, &call_args); }); } -void Snippet::schedule_nt(const jit_snippets_call_args& call_args) const { +void Snippet::schedule_nt() { const auto& work_size = exec_domain; parallel_nt(0, [&](const int ithr, const int nthr) { + jit_snippets_call_args call_args; + update_ptrs(call_args); + size_t start = 0, end = 0; splitter(harnessWorkAmount, nthr, ithr, start, end); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9be60ddc8f4c98..62f7fe1539d2b3 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -63,10 +63,10 @@ class Snippet : public Node { bool optimizeExecDomain(std::vector&, std::vector&, VectorDims&, size_t&) const; void generate(const jit_snippets_compile_args*); - void 
updateSrcDstPtrs(jit_snippets_call_args&) const; + inline void update_ptrs(jit_snippets_call_args&); // Evaluates generated snippet using parallel backend - void schedule_6d(const jit_snippets_call_args& const_args) const; - void schedule_nt(const jit_snippets_call_args& const_args) const; + void schedule_6d(); + void schedule_nt(); // Original subgraph node std::shared_ptr original_snippet; @@ -107,6 +107,10 @@ class Snippet : public Node { std::vector start_offset_in = {}; std::vector start_offset_out = {}; + + // Buffer scratchpad + std::vector buffer_scratchpad = {}; + size_t buffer_scratchpad_size = 0; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp index 2db9fd9f010de8..021b3f6c1293ec 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp @@ -42,12 +42,12 @@ ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() { std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_saturation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_truncation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. 
Supports only fusing Load and ConvertTruncation or ConvertSaturation ops"); @@ -91,12 +91,12 @@ ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() { std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_saturation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_truncation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops"); diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp index 731c0cb1e1b24a..675c214ed7ae2b 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::LoadConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return 
std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::LoadConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp index 572cbf00f521d4..1b1b8988c16784 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp @@ -20,7 +20,7 @@ class LoadConvertSaturation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class LoadConvertTruncation : public ngraph::snippets::op::Load { public: 
OPENVINO_OP("LoadConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp index e58b5bc678d1f8..6a4180c54299c5 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::StoreConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { 
+intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::StoreConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp index d0c4a947433b7c..3697af21540915 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp @@ -20,7 +20,7 @@ class StoreConvertSaturation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class StoreConvertTruncation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, 
const size_t count = 1lu, const size_t offset = 0lu); StoreConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp new file mode 100644 index 00000000000000..a35587aed7887d --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 16}, + ov::Shape{1, 32}, + ov::Shape{1, 1}, + ov::Shape{1, 9}, + ov::Shape{1, 17}, + ov::Shape{1, 19}, + ov::Shape{1, 49}, + ov::Shape{1, 50}, + ov::Shape{5, 16}, + ov::Shape{5, 32}, + ov::Shape{5, 1}, + ov::Shape{5, 9}, + ov::Shape{5, 17}, + ov::Shape{5, 19}, + ov::Shape{5, 49}, + ov::Shape{5, 50}, + ov::Shape{1, 3, 128, 128}, + ov::Shape{1, 3, 128, 129}, + ov::Shape{1, 3, 128, 130}, + ov::Shape{1, 3, 128, 1}, + ov::Shape{1, 3, 128, 9}, + ov::Shape{1, 3, 128, 16}, + ov::Shape{1, 3, 128, 17}, + ov::Shape{1, 3, 128, 20}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Softmax, Softmax, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1), + ::testing::Values(2), // Subgraph + Sin + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Softmax::getTestCaseName); + +const std::vector> inputShapesPair = { + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 16, 35}}, + std::pair{ov::Shape{1, 5, 16, 1}, ov::Shape{1, 5, 16, 35}}, + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 1, 1}}, + std::pair{ov::Shape{1, 5, 16, 1}, ov::Shape{1, 5, 16, 1}}, + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 1, 35}}, + 
std::pair{ov::Shape{1, 5, 1, 35}, ov::Shape{1, 5, 1, 35}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmax, AddSoftmax, + ::testing::Combine( + ::testing::ValuesIn(inputShapesPair), + ::testing::Values(-1), + ::testing::Values(3), // Subgraph + Sin * 2 + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSoftmax::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp new file mode 100644 index 00000000000000..76dbb58f5b4644 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/transpose_softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 128, 3, 16}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmax, TransposeSoftmax, + ::testing::Combine( + ::testing::Values(inputShape), + ::testing::Values(std::vector{0, 2, 3, 1}), + ::testing::Values(-1), + ::testing::Values(2), // Subgraph + Sin + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeSoftmax::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmaxEltwise, TransposeSoftmaxEltwise, + ::testing::Combine( + ::testing::Values(inputShape), + ::testing::Values(std::vector{0, 2, 3, 1}), + ::testing::Values(-1), + ::testing::Values(2), // Subgraph + Sin + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeSoftmax::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // 
namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp new file mode 100644 index 00000000000000..ca3f77e43197eb --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input 0 Shape + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> SoftmaxParams; + +typedef std::tuple< + std::pair, // Input Shapes + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddSoftmaxParams; + +class Softmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class AddSoftmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp new file mode 100644 index 00000000000000..952b7528a00375 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { 
+namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input shapes + std::vector, // Transpose Order + int64_t, // Softmax Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeSoftmaxParams; + + +class TransposeSoftmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class TransposeSoftmaxEltwise : public TransposeSoftmax { +protected: + void SetUp() override; +}; + + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/src/snippets/softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp new file mode 100644 index 00000000000000..be0fc59ef3c50a --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/softmax.hpp" +#include "subgraph_softmax.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string Softmax::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Softmax::SetUp() { + 
ov::Shape inputShape; + int axis; + std::tie(inputShape, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape, }}}); + + auto f = ov::test::snippets::SinhSoftmaxFunction({inputShape}, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +std::string AddSoftmax::getTestCaseName(testing::TestParamInfo obj) { + std::pair inputShapes; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes.first) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes.second) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void AddSoftmax::SetUp() { + std::pair inputShapes; + int axis; + std::tie(inputShapes, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShapes.first, }}, {{}, {inputShapes.second, }}}); + + auto f = ov::test::snippets::SinhAddSoftmaxFunction({inputShapes.first, inputShapes.second}, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(Softmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(AddSoftmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // 
namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp new file mode 100644 index 00000000000000..ae6ca2e6790201 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/transpose_softmax.hpp" +#include "subgraph_softmax.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string TransposeSoftmax::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShapes; + std::vector order; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, order, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + for (size_t i = 0; i < inputShapes.size(); ++i) + result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "TO=" << CommonTestUtils::vec2str(order) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TransposeSoftmax::SetUp() { + std::vector inputShapes; + std::vector order; + int64_t axis; + std::tie(inputShapes, order, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::TransposeSoftmaxFunction(inputDynamicShapes, order, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + 
configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void TransposeSoftmaxEltwise::SetUp() { + std::vector inputShapes; + std::vector order; + int64_t axis; + std::tie(inputShapes, order, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::TransposeSoftmaxEltwiseFunction(inputDynamicShapes, order, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(TransposeSoftmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(TransposeSoftmaxEltwise, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp index 7218f192a8dbcf..57756d8c734bfe 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp @@ -9,6 +9,7 @@ #include "subgraph_simple.hpp" #include "subgraph_converts.hpp" #include "subgraph_matmul.hpp" +#include "subgraph_softmax.hpp" /* This file provides lowered representations (after the generate() was called) for some simple functions. * This is required to test snippets lowering and optimization passes. 
All the functions are expected to be direct @@ -57,6 +58,21 @@ class Transpose0213MatMulSinhLoweredFunction : public Transpose0213MatMulSinhFun explicit Transpose0213MatMulSinhLoweredFunction(const std::vector& inputShapes, size_t position = 0) : Transpose0213MatMulSinhFunction(inputShapes, position, false) { } +protected: + std::shared_ptr initLowered() const override; +}; + +class SoftmaxLoweredFunction : public SoftmaxFunction { +public: + explicit SoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : SoftmaxFunction(inputShapes, axis) {} + +protected: + std::shared_ptr initLowered() const override; +}; + +class AddSoftmaxLoweredFunction : public AddSoftmaxFunction { +public: + explicit AddSoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : AddSoftmaxFunction(inputShapes, axis) {} protected: std::shared_ptr initLowered() const override; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp new file mode 100644 index 00000000000000..6250a0d8eba128 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp @@ -0,0 +1,77 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "./snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +class SoftmaxFunction : public SnippetsFunctionBase { +public: + explicit SoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class SinhSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit SinhSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + 
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class AddSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit AddSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class SinhAddSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit SinhAddSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class TransposeSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit TransposeSoftmaxFunction(const std::vector& inputShapes, const std::vector& order, const int64_t axis) + : SnippetsFunctionBase(inputShapes), m_order(order), m_axis(axis) { + NGRAPH_CHECK(input_shapes.size() > 0, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + + std::vector m_order; + int64_t m_axis; +}; + +class TransposeSoftmaxEltwiseFunction : public TransposeSoftmaxFunction { +public: + explicit TransposeSoftmaxEltwiseFunction(const std::vector& inputShapes, const std::vector& order, const int64_t axis) + : TransposeSoftmaxFunction(inputShapes, order, axis) {} +protected: + std::shared_ptr initOriginal() const override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index 86d07b912f9ea2..6de4c8b0d5f32d 100644 --- 
a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -108,21 +108,296 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons std::shared_ptr Transpose0213MatMulSinhLoweredFunction::initLowered() const { ParameterVector data{std::make_shared(precision, input_shapes[0]), - std::make_shared(precision, input_shapes[1])}; + std::make_shared(precision, input_shapes[1])}; std::vector layout{0, 2, 1, 3}; // Note: validity of transpose_position values is checked in Transpose0213MatMulSinhFunction constructor if (transpose_position <= 1) { - auto& rt_info = data[transpose_position]->get_rt_info(); + auto &rt_info = data[transpose_position]->get_rt_info(); rt_info["Layout"] = layout; } auto matmul = std::make_shared(data[0], data[1]); if (transpose_position == 2) { - auto& rt_info = matmul->get_rt_info(); + auto &rt_info = matmul->get_rt_info(); rt_info["Layout"] = layout; matmul->validate_and_infer_types(); } return std::make_shared(NodeVector{matmul}, data); } + +std::shared_ptr SoftmaxLoweredFunction::initLowered() const { + auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape()}); + + const auto data = input_params.front(); + + const auto master_shape = input_shapes[0].get_shape(); + const auto shape_rank = master_shape.size(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = 10; + const auto inner_dim = shape_rank - 1; + const auto inner_master_wa = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? 
shape_rank - 2 : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + const bool is_scalar = work_amount == 1; + + /* ====== ReduceMax decomposition ====== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + // we don't insert Fill here after load_max to verify because in generate() call Fill op is inserted only on vector representation + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + std::vector apply_increments_max(3, false); + std::vector finalization_offsets_max(3, 0); + apply_increments_max[0] = data->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_max[0] = data->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + std::shared_ptr horizon_max = std::make_shared(max); + horizon_max->add_control_dependency(loop_max_end); + const auto prev_horizon_max = horizon_max; + if (!is_scalar) { + horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0)); + } + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + // we don't insert Fill here after Exp to verify because in generate() call Fill op is inserted only on 
vector representation + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + std::vector apply_increments_sum(2, false); + std::vector finalization_offsets_sum(2, 0); + apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + loop_sum_end->add_control_dependency(sum); + + const auto horizon_sum = std::make_shared(sum); + horizon_sum->add_control_dependency(loop_sum_end); + + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_begin->add_control_dependency(prev_horizon_max); + + /* =========================================== */ + + /* ================== Div ==================== */ + + std::shared_ptr pow = std::make_shared(horizon_sum, -1); + const auto prev_pow = pow; + if (!is_scalar) { + pow = std::make_shared(pow, horizon_sum->get_input_partial_shape(0)); + } + + const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + std::vector apply_increments_div(2, false); + std::vector finalization_offsets_div(2, 0); + apply_increments_div[0] = load_div->get_shape()[inner_dim] != 
1 && inner_master_wa != 1; + apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + + loop_div_begin->add_control_dependency(pow); + loop_div_begin->add_control_dependency(prev_pow); + + /* =========================================== */ + + const auto result = std::make_shared(loop_div_end); + if (has_outer_loop) { + const auto need_increment = input_shapes[0].get_shape()[outer_dim] != 1 && input_shapes[0].get_shape()[inner_dim] == 1; + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + const auto outer_loop_end = insertLoopEnd(NodeVector{result}, outer_loop_begin, 1, 1, std::vector{need_increment, need_increment}); + vector_buffer_max->add_control_dependency(outer_loop_begin); + } + + return std::make_shared(ResultVector{result}, input_params); +} +std::shared_ptr AddSoftmaxLoweredFunction::initLowered() const { + auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape(), input_shapes[1].get_shape()}); + + auto master_pshape = input_shapes[0]; + ov::PartialShape::broadcast_merge_into(master_pshape, input_shapes[1], op::AutoBroadcastType::NUMPY); + const auto master_shape = master_pshape.get_shape(); + const auto shape_rank = master_shape.size(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = 10; + const auto inner_dim = shape_rank - 1; + const auto inner_master_wa = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? 
shape_rank - 2 : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + const bool is_scalar = work_amount == 1; + + /* ================== Add ==================== */ + + const auto loop_add_begin = ngraph::snippets::op::insertLoopBegin(input_params); + + std::shared_ptr load0 = std::make_shared(loop_add_begin->output(0), increment); + if (!is_scalar && input_shapes[0].get_shape().back() == 1) { + auto new_shape = input_shapes[0].get_shape(); + new_shape[new_shape.size() - 1] = static_cast(inner_master_wa); + load0 = std::make_shared(loop_add_begin->output(0), new_shape); + } + std::shared_ptr load1 = std::make_shared(loop_add_begin->output(1), increment); + if (!is_scalar && input_shapes[1].get_shape().back() == 1) { + auto new_shape = input_shapes[1].get_shape(); + new_shape[new_shape.size() - 1] = static_cast(inner_master_wa); + load1 = std::make_shared(loop_add_begin->output(1), new_shape); + } + const auto add = std::make_shared(load0, load1); + const auto store = std::make_shared(add, increment); + + std::vector apply_increments_add(3, false); + std::vector finalization_offsets_add(3, 0); + apply_increments_add[0] = input_shapes[0].get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_add[1] = input_shapes[1].get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_add[2] = master_shape[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_add[0] = input_shapes[0].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_add[1] = input_shapes[1].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_add[2] = master_shape[inner_dim] != 1 ? 
-inner_master_wa : 0; + auto loop_add_end = std::make_shared(ngraph::OutputVector{store, loop_add_begin->output(2)}, + work_amount, increment, apply_increments_add, finalization_offsets_add); + + /* =========================================== */ + + const auto buffer_add = std::make_shared(loop_add_end->output(0)); + + /* ====== ReduceMax decomposition ====== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_add, buffer_add}); + + // we don't insert Fill here after load_max to verify because in generate() call Fill op is inserted only on vector representation + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + std::vector apply_increments_max(3, false); + std::vector finalization_offsets_max(3, 0); + apply_increments_max[0] = master_shape[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_max[0] = master_shape[outer_dim] == 1 && master_shape[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + std::shared_ptr horizon_max = std::make_shared(max); + horizon_max->add_control_dependency(loop_max_end); + const auto prev_horizon_max = horizon_max; + if (!is_scalar) { + horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0)); + } + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + // we don't insert Fill here after exp to verify because in generate() call Fill op is inserted only on vector representation + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + std::vector apply_increments_sum(2, false); + std::vector finalization_offsets_sum(2, 0); + apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + loop_sum_end->add_control_dependency(sum); + + const auto horizon_sum = std::make_shared(sum); + horizon_sum->add_control_dependency(loop_sum_end); + + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_begin->add_control_dependency(prev_horizon_max); + + /* =========================================== */ + + /* ================== Div ==================== */ + + std::shared_ptr pow = std::make_shared(horizon_sum, -1); + const auto prev_pow = pow; + if (!is_scalar) { + pow = std::make_shared(pow, horizon_sum->get_input_partial_shape(0)); + } + + const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + std::vector apply_increments_div(2, false); + std::vector finalization_offsets_div(2, 0); + apply_increments_div[0] = load_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + loop_div_begin->add_control_dependency(pow); + loop_div_begin->add_control_dependency(prev_pow); + + /* =========================================== */ + + const auto result = std::make_shared(loop_div_end); + if (has_outer_loop) { + const auto need_increment0 = input_shapes[0].get_shape()[outer_dim] != 1 && input_shapes[0].get_shape()[inner_dim] == 1; + const auto need_increment1 = input_shapes[1].get_shape()[outer_dim] != 1 && input_shapes[1].get_shape()[inner_dim] == 1; + const auto need_increment2 = master_shape[outer_dim] != 1 && master_shape[inner_dim] == 1; + const auto outer_loop_add_begin = ngraph::snippets::op::insertLoopBegin(input_params); + const auto outer_loop_add_end = + insertLoopEnd(NodeVector{buffer_add}, outer_loop_add_begin, 1, 1, std::vector{need_increment0, need_increment1, need_increment2}); + + const auto need_increment = master_shape[outer_dim] != 1 && master_shape[inner_dim] == 1; + const auto outer_loop_begin = ngraph::snippets::op::insertLoopBegin(NodeVector{buffer_add}); + const auto outer_loop_end = insertLoopEnd(NodeVector{result}, outer_loop_begin, 1, 1, std::vector{need_increment, need_increment}); + vector_buffer_max->add_control_dependency(outer_loop_begin); + } + + return std::make_shared(ResultVector{result}, input_params); +} } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp new file mode 100644 index 00000000000000..aba0301993dd06 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include 
"subgraph_softmax.hpp" +#include "common_test_utils/data_utils.hpp" +#include +#include "ngraph_functions/builders.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::shared_ptr SoftmaxFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto softmax = std::make_shared(data, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data}); +} + +std::shared_ptr SinhSoftmaxFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto sinh = std::make_shared(data); + auto softmax = std::make_shared(sinh, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data}); +} + +std::shared_ptr AddSoftmaxFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto add = std::make_shared(data0, data1); + auto softmax = std::make_shared(add, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data0, data1}); +} + +std::shared_ptr SinhAddSoftmaxFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto sinh0 = std::make_shared(data0); + auto sinh1 = std::make_shared(data1); + auto add = std::make_shared(sinh0, sinh1); + auto softmax = std::make_shared(add, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data0, data1}); +} + +std::shared_ptr TransposeSoftmaxFunction::initOriginal() const { + const auto transpose0Param = std::make_shared(precision, input_shapes[0]); + const auto sinh0 = std::make_shared(transpose0Param); + const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{m_order.size()}, m_order); + const auto transpose2 = std::make_shared(sinh0, transpose0Const); + const auto softMax = std::make_shared(transpose2, m_axis); + return std::make_shared(ov::NodeVector{softMax}, 
ov::ParameterVector {transpose0Param}, "softmax_transpose"); +} + +std::shared_ptr TransposeSoftmaxEltwiseFunction::initOriginal() const { + const auto transpose0Param = std::make_shared(precision, input_shapes[0]); + const auto sinh0 = std::make_shared(transpose0Param); + const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{m_order.size()}, + m_order); + const auto transpose2 = std::make_shared(sinh0, transpose0Const); + const auto mulConst = ngraph::builder::makeConstant(ngraph::element::f32, transpose2->get_shape(), + std::vector{}, true); + const auto mul = std::make_shared(transpose2, mulConst); + const auto softMax = std::make_shared(mul, m_axis); + const auto hswish = std::make_shared(softMax); + return std::make_shared(ov::NodeVector{hswish}, ov::ParameterVector{transpose0Param}, + "softmax_transpose"); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file