diff --git a/src/bindings/python/tests_compatibility/test_onnx/test_backend.py b/src/bindings/python/tests_compatibility/test_onnx/test_backend.py index 87f53223c2d672..c1ad04a6fe44a5 100644 --- a/src/bindings/python/tests_compatibility/test_onnx/test_backend.py +++ b/src/bindings/python/tests_compatibility/test_onnx/test_backend.py @@ -565,9 +565,7 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None ), ( xfail_issue_99955, - "OnnxBackendNodeModelTest.test_group_normalization_epsilon_cpu", "OnnxBackendNodeModelTest.test_group_normalization_epsilon_expanded_cpu", - "OnnxBackendNodeModelTest.test_group_normalization_example_cpu", "OnnxBackendNodeModelTest.test_group_normalization_example_expanded_cpu", ), ( diff --git a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp index 551ef1907037ab..2c74867d8436d6 100644 --- a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp +++ b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp @@ -65,6 +65,16 @@ class PortDescriptor { VectorDims m_subtensor_shape{}; /// \brief The corresponding abstract/physical register size_t m_reg = 0; + + /// Notes: + /// - `m_tensor_shape` is a dense shape that is controlled by the expression outputs. + /// It means that the data written by the expression outputs should be read using this shape by the next expression inputs. + /// - `m_layout` is the order of data reading or writing by MemoryAccess ops. Note that only MemoryAccess ops may have `m_layout`. + /// For other expressions this order parameter is simply ignored for now. + /// If it's an input port of a MemoryAccess expression: + /// - `m_layout` shows how the data should be read (by which strides) using `m_tensor_shape`. + /// If it's an output port of a MemoryAccess expression: + /// - `m_layout` shows how the data should be written (by which strides) to get `m_tensor_shape`.
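+ ///
+ /// Editor's illustration (hypothetical values): for an input port of a MemoryAccess expression with
+ /// `m_tensor_shape` = {1, 16, 32, 64} and `m_layout` = {0, 2, 1, 3}, the data is read in the planar
+ /// shape {1, 32, 16, 64}, i.e. `planar_shape[i]` = `m_tensor_shape[m_layout[i]]`.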
}; class PortDescriptorUtils { diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index b17031e2a67d1c..b642bbd7a23ccb 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -10,7 +10,7 @@ #include #include "openvino/op/op.hpp" #include "openvino/core/rt_info.hpp" -#include "snippets/pass_manager.hpp" +#include "snippets/pass/manager.hpp" #include "snippets/shape_inference/shape_inference.hpp" #include "snippets/lowered/pass/pass.hpp" diff --git a/src/common/snippets/include/snippets/pass/common_optimizations.hpp b/src/common/snippets/include/snippets/pass/common_optimizations.hpp index 30ec301eb92c43..aba1ef9fb919df 100644 --- a/src/common/snippets/include/snippets/pass/common_optimizations.hpp +++ b/src/common/snippets/include/snippets/pass/common_optimizations.hpp @@ -5,7 +5,6 @@ #pragma once #include "openvino/pass/graph_rewrite.hpp" -#include "snippets/op/subgraph.hpp" #include "snippets/pass/tokenization.hpp" namespace ov { @@ -13,22 +12,15 @@ namespace snippets { namespace pass { class CommonOptimizations : public ov::pass::MatcherPass { + class SubgraphPass; + class SubgraphManager; + friend class ExtractConstants; + friend class ExtractUnsupportedTransposes; + friend class SplitDimensionM; + public: OPENVINO_RTTI("CommonOptimizations", "0"); CommonOptimizations(const SnippetsTokenization::Config& config = {}); - - // Returns True if parallelism work amount can be increased using SplitDimensionM optimization - static bool CanOptimizeParallelWA(const std::shared_ptr& node, size_t concurrency); - -private: - // Move up Constants which aren't scalars from body to Subgraph and replace them with Parameters inside body - void ExtractConstants(const std::shared_ptr& subgraph); - // Move up unsupported Transposes on Parameter outputs from body - void ExtractUnsupportedTransposes(const std::shared_ptr& subgraph); - // Insert Reshape nodes after and before Parameters and Results in Subgraphs with MatMul inside - // to split dimension M for MatMuls to increase work amount for parallelism - // Note: works only with 3D MHA patterns - void SplitDimensionM(const std::shared_ptr& subgraph, size_t concurrency); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/extract_constants.hpp b/src/common/snippets/include/snippets/pass/extract_constants.hpp new file mode 100644 index 00000000000000..17db3101c95138 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/extract_constants.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "subgraph_pass.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface ExtractConstants + * @brief Moves up Constants which aren't scalars outside of the Subgraph's body and replaces them with Parameters inside body + * @ingroup snippets + */ +class ExtractConstants: public CommonOptimizations::SubgraphPass { +public: + OPENVINO_RTTI("ExtractConstants", "0"); + ExtractConstants() = default; + + bool run_on_subgraph(const std::shared_ptr& subgraph) override; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/extract_unsupported_transposes.hpp b/src/common/snippets/include/snippets/pass/extract_unsupported_transposes.hpp new file mode 100644 index 00000000000000..48b1c2fed88ad1 --- /dev/null +++ 
b/src/common/snippets/include/snippets/pass/extract_unsupported_transposes.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "subgraph_pass.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface ExtractUnsupportedTransposes + * @brief Moves up unsupported Transposes on Parameter outputs from body + * @ingroup snippets + */ +class ExtractUnsupportedTransposes: public CommonOptimizations::SubgraphPass { +public: + OPENVINO_RTTI("ExtractUnsupportedTransposes", "0"); + ExtractUnsupportedTransposes() = default; + + bool run_on_subgraph(const std::shared_ptr& subgraph) override; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp index 69266fc90ffc62..faf320a8d8c7e7 100--- --- a/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp +++ b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp @@ -18,18 +18,17 @@ namespace pass { /** * @interface FuseTransposeBrgemm * @brief Fuses Transpose with Brgemm node, fusing on both Brgemm inputs and output is supported. Applicable to - * Transposes that don't change the position of the last dimension (since Brgemm supports strided rows i/o), - * but only 0213 Transpose is currently supported. + * Transposes that don't change the position of the last dimension (since Brgemm supports strided rows i/o). + * Any Transpose order whose last index is equal to [rank - 1] is supported; it means that the last dimension isn't moved. * @ingroup snippets */ class FuseTransposeBrgemm: public ov::pass::MatcherPass { public: OPENVINO_RTTI("FuseTransposeBrgemm", "0"); FuseTransposeBrgemm(); - static const std::set> supported_cases; -private: - static bool is_supported_transpose(const Output& transpose_port); + static bool is_supported_transpose(const Output& transpose_out); + static bool is_supported_transpose_order(const std::vector& order); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass_manager.hpp b/src/common/snippets/include/snippets/pass/manager.hpp similarity index 99% rename from src/common/snippets/include/snippets/pass_manager.hpp rename to src/common/snippets/include/snippets/pass/manager.hpp index 04d6ad57c9a6e0..d83a102acec313 100644 --- a/src/common/snippets/include/snippets/pass_manager.hpp +++ b/src/common/snippets/include/snippets/pass/manager.hpp @@ -3,15 +3,18 @@ // #pragma once + #include "openvino/pass/manager.hpp" #include "openvino/pass/pass.hpp" #include "openvino/pass/validate.hpp" + #include namespace ov { namespace snippets { namespace pass { + /** * @brief Manager is like ov::pass::Manager, but allows to insert new passes at arbitrary places in the pipeline * @ingroup snippets diff --git a/src/common/snippets/include/snippets/pass/mha_tokenization.hpp b/src/common/snippets/include/snippets/pass/mha_tokenization.hpp index acd887b0f4a2a0..f5d637f1abb15a 100644 --- a/src/common/snippets/include/snippets/pass/mha_tokenization.hpp +++ b/src/common/snippets/include/snippets/pass/mha_tokenization.hpp @@ -43,6 +43,9 @@ class TokenizeMHASnippets: public ov::pass::MatcherPass { public: OPENVINO_RTTI("TokenizeMHASnippets", "0"); TokenizeMHASnippets(const SnippetsTokenization::Config& config = {}); + + static std::vector get_fusion_transpose_order(size_t rank); + static std::vector get_decomposed_transpose_order(size_t
rank); static bool is_matmul0_supported(const std::shared_ptr& matmul); }; diff --git a/src/common/snippets/include/snippets/pass/split_dimension_m.hpp b/src/common/snippets/include/snippets/pass/split_dimension_m.hpp new file mode 100644 index 00000000000000..b57841a5e9cf0f --- /dev/null +++ b/src/common/snippets/include/snippets/pass/split_dimension_m.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "subgraph_pass.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface SplitDimensionM + * @brief Inserts Reshape nodes before inputs and after outputs of Subgraphs with MatMul inside + * to split dimension M for MatMuls. It allows increasing the work amount for parallelism + * @ingroup snippets + */ +class SplitDimensionM: public CommonOptimizations::SubgraphPass { +public: + OPENVINO_RTTI("SplitDimensionM", "0"); + SplitDimensionM(size_t concurrency) : m_concurrency(concurrency) {} + + bool run_on_subgraph(const std::shared_ptr& subgraph) override; + + // Returns True if the MatMul node is supported by this optimization + static bool is_supported_matmul(const std::shared_ptr& node); + // Returns True if parallelism work amount (concurrency) can be increased by this optimization + static bool can_be_optimized(const std::shared_ptr& node, size_t concurrency); + +private: + static std::shared_ptr get_matmul(const std::shared_ptr& subgraph); + static std::pair get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); + static bool split(const ov::Shape& shape, size_t optimal_parallelism_work_amount, size_t& batch_m_dim, size_t& new_m_dim); + + void reshape_subgraph(const std::shared_ptr& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim); + + size_t m_concurrency; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/subgraph_manager.hpp b/src/common/snippets/include/snippets/pass/subgraph_manager.hpp new file mode 100644 index 00000000000000..2aeea775987352 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/subgraph_manager.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "snippets/pass/common_optimizations.hpp" + +#include "snippets/pass/subgraph_pass.hpp" +#include "snippets/op/subgraph.hpp" + +namespace ov { +namespace snippets { +namespace pass { +/** + * @brief Manager class that manages transformation passes (SubgraphPasses) on Subgraph ops. + * See the SubgraphPass description for more details. + * It's a lightweight version of the ov::pass::Manager implementation whose purpose is to change only the Subgraph as a separate node in the model. + * @ingroup snippets + */ +class CommonOptimizations::SubgraphManager { +public: + SubgraphManager() = default; + + /// @brief Registers the given transformation class type in the execution list + /// @return shared_ptr to the transformation instance + template + std::shared_ptr register_pass(Args&&...
args) { + static_assert(std::is_base_of::value, "pass not derived from SubgraphPass base"); + auto pass = std::make_shared(std::forward(args)...); + m_pass_list.push_back(std::static_pointer_cast(pass)); + return pass; + } + + /// @brief Runs registered transformations on a given subgraph + /// @param subgraph Input subgraph + /// @return Returns true if the subgraph was changed by transformations, false otherwise. + bool run_passes(std::shared_ptr subgraph); + +protected: + std::vector> m_pass_list; +}; +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/subgraph_pass.hpp b/src/common/snippets/include/snippets/pass/subgraph_pass.hpp new file mode 100644 index 00000000000000..c8d65f0bc536bc --- /dev/null +++ b/src/common/snippets/include/snippets/pass/subgraph_pass.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "snippets/pass/common_optimizations.hpp" + + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @brief Base class for Subgraph passes. + * A pass runs on a `Subgraph` op, which allows users to transform the + * `Subgraph` as a node and the `body` of this `Subgraph` as a model at the same time. + * These passes may change the `Subgraph` as a node, its `body`, and other ops around the `Subgraph` in the model. + * To avoid unsafe changes to other ops in the model, SubgraphPass is not derived from ov::Pass, + * so it is never registered on an ov::Model + * @ingroup snippets + */ +class CommonOptimizations::SubgraphPass { +public: + SubgraphPass() = default; + virtual ~SubgraphPass() = default; + + virtual bool run_on_subgraph(const std::shared_ptr& subgraph) = 0; + + void set_name(const std::string& name) { m_name = name; } + std::string get_name() const { return m_name; } + + using type_info_t = DiscreteTypeInfo; + virtual const type_info_t& get_type_info() const = 0; + +private: + std::string m_name; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/tokenization.hpp b/src/common/snippets/include/snippets/pass/tokenization.hpp index 9b070fb13c3445..a222bd72ef4f54 100644 --- a/src/common/snippets/include/snippets/pass/tokenization.hpp +++ b/src/common/snippets/include/snippets/pass/tokenization.hpp @@ -51,6 +51,11 @@ class EnumerateNodes : public ov::pass::ModelPass { * 2. MHA tokenization * 3. Common tokenization * 4. Some common transformations for Subgraphs. For example, FakeQuantize decomposition + * Naming policy: + * - During tokenization, the new Subgraph op takes the name of the last tokenized op. + * This is needed to preserve the output names of the model when the tokenized op was right before a model Result. + * - If some transformation (for example, SplitDimensionM) inserts a new op after a Subgraph, + * the new op should take this Subgraph's name to preserve the output name. The Subgraph name is updated with the suffix "_original".
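+ *   For example (editor's illustration, the name is hypothetical): when SplitDimensionM inserts a Reshape after a Subgraph named "MHA",
+ *   the Reshape takes the friendly name "MHA" (preserving the model output name) and the Subgraph is renamed to "MHA_original".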
* @ingroup snippets */ class SnippetsTokenization : public ov::pass::ModelPass { @@ -61,9 +66,9 @@ class SnippetsTokenization : public ov::pass::ModelPass { * @ingroup snippets */ struct Config { - Config(size_t concurrency = 1, bool split_m_dimension = true, bool enable_transpose_on_output = true) + Config(size_t concurrency = 1, bool split_m_dimension = true, bool enable_transpose_on_output = true, std::set mha_transpose_ranks = {3, 4}) : concurrency(concurrency), split_m_dimension(split_m_dimension), - mha_token_enable_transpose_on_output(enable_transpose_on_output) {} + mha_token_enable_transpose_on_output(enable_transpose_on_output), mha_supported_transpose_ranks(std::move(mha_transpose_ranks)) {} size_t concurrency = 1; // True if "SplitDimensionM" optimization is enabled. Otherwise, it's disabled. @@ -72,6 +77,10 @@ class SnippetsTokenization : public ov::pass::ModelPass { // Otherwise, it may be fused into Subgraph if possible // TODO [111813]: Remove please when the ticket 111813 is implemented bool mha_token_enable_transpose_on_output = true; + // Set of supported Transpose shape ranks for tokenization in the MHATokenization pass. + // Note that in general Snippets support Transposes of any rank. + // But at the moment Transpose is used only in the MHA pattern, where 3D and 4D tensors are supported. + std::set mha_supported_transpose_ranks = { 3, 4 }; }; OPENVINO_RTTI("SnippetsTokenization", "0"); diff --git a/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp b/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp index 013a538172ac7e..e9bd1506b93c60 100644 --- a/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp +++ b/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp @@ -20,7 +20,9 @@ class TransposeDecomposition: public ov::pass::MatcherPass { public: OPENVINO_RTTI("TransposeDecomposition", "0"); TransposeDecomposition(); - static const std::set> supported_cases; + + static bool is_supported_transpose(const Output& transpose_out); + static bool is_supported_transpose_order(const std::vector& order); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index d10930125e0ed0..c77eecd8bb15b0 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -25,12 +25,6 @@ inline auto is_scalar_constant(const std::shared_ptr& source_output_no return ov::is_type(source_output_node) && ov::shape_size(source_output_node->get_shape()) == 1; } -ov::PartialShape get_planar_pshape(const Input& out); -ov::PartialShape get_planar_pshape(const Output& out); -ov::PartialShape get_planar_pshape(const ov::PartialShape& shape, const std::vector& layout); -VectorDims pshape_to_vdims(const PartialShape&); -ov::PartialShape vdims_to_pshape(const VectorDims&); - inline auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { return allocation_rank < 0 ? allocation_rank + static_cast(shape_rank) + 1 : allocation_rank; } @@ -55,11 +49,87 @@ constexpr inline bool implication(bool cause, bool cond) { return !cause || !!cond; } -VectorDims get_planar_vdims(const VectorDims& shape, const std::vector& layout); -VectorDims get_planar_vdims(const snippets::lowered::PortDescriptorPtr& port_desc); +template +inline T div_up(const T a, const U b) { + return static_cast((a + b - 1) / b); +} + +/* ----- Shape `getters` ----- */ +/** + * @brief Returns a dense shape after applying the order.
+ * It means that the shape dimensions will be reordered in accordance with order indices to produce planar shape + * @param shape preordered (original) partial shape + * @param order order + * @return reordered partial shape: `planar_shape[i]` = `shape[order[i]]` + * Example, shape = [16, 2, 32, 64], order = [2, 0, 1, 3] + * planar_shape = [32, 16, 2, 64] + */ +ov::PartialShape get_planar_pshape(const ov::PartialShape& shape, const std::vector& order); +/** + * @brief Returns original shape before applying the order. + * It means that the shape dimensions have been already reordered in accordance with order indices to produce planar shape + * @param shape planar (ordered) partial shape + * @param order order + * @return preordered partial shape `preordered_shape` (the shape before applying the order) such that `shape[i]` = `preordered_shape[order[i]]` + * Example, shape = [16, 2, 32, 64], order = [2, 0, 1, 3] + * preordered_shape = [2, 32, 16, 64] + */ +ov::PartialShape get_preordered_pshape(const ov::PartialShape& shape, const std::vector& order); +/** + * @brief Returns a dense shape of node input. + * It means that the node input shape dimensions will be reordered in accordance with order indices to produce planar shape + * @param in input of node + * @return new reordered partial shape: `planar_shape[i]` = `shape[order[i]]` + */ +ov::PartialShape get_planar_pshape(const Input& in); +/** + * @brief Returns original shape of node output before applying the order. + * It means that the preordered output shape dimensions have been already reordered in accordance with order indices to produce planar shape + * @param out output of node + * @return preordered partial shape `preordered_shape` (the shape before applying the order) such that `planar_shape[i]` = `preordered_shape[order[i]]` + */ +ov::PartialShape get_preordered_pshape(const Output& out); +/** + * @brief Returns a dense shape after applying the order. + * It means that the shape dimensions will be reordered in accordance with order indices to produce planar shape + * @param shape preordered (original) shape + * @param order order + * @return reordered shape: `planar_shape[i]` = `shape[order[i]]` + * Example, shape = [16, 2, 32, 64], order = [2, 0, 1, 3] + * planar_shape = [32, 16, 2, 64] + */ +VectorDims get_planar_vdims(const VectorDims& shape, const std::vector& order); +/** + * @brief Returns original shape before applying the order. + * It means that the preordered shape dimensions have been already reordered in accordance with order indices to produce planar shape + * @param shape planar (ordered) shape + * @param order order + * @return preordered shape `preordered_shape` (the shape before applying the order) such that `shape[i]` = `preordered_shape[order[i]]` + * Example, shape = [16, 2, 32, 64], order = [2, 0, 1, 3] + * preordered_shape = [2, 32, 16, 64] + */ +VectorDims get_preordered_vdims(const VectorDims& shape, const std::vector& order); +/** + * @brief Returns a dense shape of expression input port. + * It means that the input shape dimensions will be reordered in accordance with order indices to produce planar shape + * @param expr_port input expression port + * @return new reordered shape: `planar_shape[i]` = `shape[order[i]]` + */ VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port); +/** + * @brief Returns original shape before applying the order of expression output port.
+ * It means that the preordered output shape dimensions have been already reordered in accordance with order indices to produce planar shape + * @param expr_port output expression port + * @return preordered shape `preordered_shape` (the shape before applying the order) such that `planar_shape[i]` = `preordered_shape[order[i]]` + */ +VectorDims get_preordered_vdims(const snippets::lowered::ExpressionPort& expr_port); + bool is_dynamic_vdims(const VectorDims& shape); +VectorDims pshape_to_vdims(const PartialShape&); +ov::PartialShape vdims_to_pshape(const VectorDims&); +/* --------------------------- */ + } // namespace utils } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index adf3894f71b8b7..4e1f730db6c428 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -365,10 +365,10 @@ VectorDims LinearIR::get_master_shape() const { } // Note: Snippets would benefit from a more generic master_shape calculation approach. // It will be implemented in the scope of ROI propagation activity (ticket 120505) - const auto& result_parent = out_exprs[0]->get_input_port_connector(0)->get_source().get_expr(); + const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source(); if (!m_config.m_enable_domain_optimization && out_exprs.size() == 1 && - ov::is_type(result_parent->get_node())) { - master_shape = utils::get_planar_vdims(out_exprs[0]->get_input_port_descriptor(0)); + ov::is_type(source.get_expr()->get_node())) { + master_shape = utils::get_preordered_vdims(source); } else { for (const auto& oe : out_exprs) { const auto& port_desc = oe->get_input_port_descriptor(0); diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index 2bef20bb54e9d5..da8da2c2376f1f 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -181,9 +181,8 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, std::vector loop_subtensor; std::vector loop_tensor(loop_depth, 1); for (const auto& exit_point : loop_exit_points) { - const auto& desc = exit_point.get_descriptor_ptr(); - const auto shape = utils::get_planar_vdims(desc); - auto subtensor = desc->get_subtensor(); + const auto shape = utils::get_preordered_vdims(exit_point); + auto subtensor = exit_point.get_descriptor_ptr()->get_subtensor(); if (subtensor.empty()) { subtensor.resize(loop_depth, 1); subtensor[subtensor.size() - 1] = vector_size; diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index 47a77df23401e2..8128ea0253d2a7 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -16,7 +16,7 @@ namespace pass { using LoopPort = LinearIR::LoopManager::LoopPort; namespace { -int64_t get_dim_stride(size_t dim, const std::vector& layout, const std::vector& shape) { +int64_t get_input_stride(size_t dim, const std::vector& layout, const VectorDims& shape) { int64_t stride = 1; for (int i = static_cast(layout.size()) - 1; i >= 0; i--) { if (layout[i] == dim) { @@ -26,6 +26,13 @@ } return stride; } +int64_t get_output_stride(size_t dim, const VectorDims& shape) { + int64_t stride = 1; + for (size_t i = dim + 1; i < shape.size(); ++i) { + stride *= static_cast(shape[i]); + } + return stride; +} } // namespace
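+// Editor's sketch (hypothetical values): for an already written (planar) output shape {2, 3, 4},
+// get_output_stride(0, shape) == 3 * 4 == 12 and get_output_stride(1, shape) == 4,
+// i.e. the usual row-major strides of the output data.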
InitLoops::InitLoops() : Pass() {} @@ -42,7 +49,8 @@ void InitLoops::init_ptr_increments(std::vector& loop_inputs, std::vec const auto& dim = *(layout.rbegin() + dim_idx); // If relevant dim is not broadcasted, then ptr_increment is the dim stride in the new layout if (!(shape[dim] == 1 && work_amount != 1)) { - loop_input.ptr_increment = get_dim_stride(dim, source.get_descriptor_ptr()->get_layout(), shape); + // The input layout shows in which order and with which strides the data should be read + loop_input.ptr_increment = get_input_stride(dim, source.get_descriptor_ptr()->get_layout(), shape); } } } @@ -54,15 +62,12 @@ const auto loop_ids = port->get_expr()->get_loop_ids(); const auto& layout = port->get_descriptor_ptr()->get_layout(); const auto& shape = port->get_descriptor_ptr()->get_shape(); - const auto& dim = *(layout.rbegin() + dim_idx); - // Ticket: 113106 - // WA: the current logic doesn't support the case with transposed output shape for brgemm layer - but for all existing cases planar layout can be used - std::vector planar(layout.size()); - std::iota(planar.begin(), planar.end(), 0); + const auto original_dim = layout.size() - 1 - dim_idx; + const auto& dim = std::distance(layout.cbegin(), std::find(layout.cbegin(), layout.cend(), original_dim)); // If relevant dim is not broadcasted, then ptr_increment is the dim stride in the new layout if (!(shape[dim] == 1 && work_amount != 1)) { - loop_output.ptr_increment = get_dim_stride(dim, planar, shape); + // The output layout shows in which order and with which strides the data has already been written + loop_output.ptr_increment = get_output_stride(dim, shape); } } } diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index da5ffc11c3169d..aefaca42f4094e 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -37,7 +37,7 @@ ov::Shape compute_allocation_shape(const LinearIR::LoopManagerPtr& loop_manager, const std::vector& parent_loop_ids, const ExpressionPort& expr_port, const int allocation_rank) { - const auto& planar_shape = utils::get_planar_vdims(expr_port); + const auto planar_shape = utils::get_preordered_vdims(expr_port); const size_t rank = allocation_rank >= 0 ? std::min(static_cast(allocation_rank), planar_shape.size()) : planar_shape.size(); ov::Shape allocation_shape(rank); diff --git a/src/common/snippets/src/lowered/pass/optimize_domain.cpp b/src/common/snippets/src/lowered/pass/optimize_domain.cpp index f2d2fd43baf96c..09dadc77efe6e5 100644 --- a/src/common/snippets/src/lowered/pass/optimize_domain.cpp +++ b/src/common/snippets/src/lowered/pass/optimize_domain.cpp @@ -98,7 +98,7 @@ bool OptimizeDomain::run(snippets::lowered::LinearIR& linear_ir) { const ExpressionPtr& shape_producing_expr = blocked_input_shapes ?
first_consumer : io_expr; - const auto& shape = utils::get_planar_vdims(shape_producing_expr->get_output_port_descriptor(0)); + const auto& shape = utils::get_preordered_vdims(shape_producing_expr->get_output_port(0)); OPENVINO_ASSERT(std::none_of(shape.begin(), shape.end(), [](size_t d) {return d == snippets::IShapeInferSnippets::DYNAMIC_DIMENSION; }), "OptimizeDomain pass does not support dynamic shapes"); diff --git a/src/common/snippets/src/op/brgemm.cpp b/src/common/snippets/src/op/brgemm.cpp index 5cce5d85c13a82..6ea77e447c449b 100644 --- a/src/common/snippets/src/op/brgemm.cpp +++ b/src/common/snippets/src/op/brgemm.cpp @@ -114,7 +114,7 @@ ov::element::Type Brgemm::get_output_type() const { std::vector Brgemm::get_planar_input_shapes(const std::vector>& inputs) const { OPENVINO_ASSERT(inputs.size() == 2, "Brgemm::get_planar_input_shapes() expects 2 inputs"); - return {utils::get_planar_pshape(inputs[0]), utils::get_planar_pshape(inputs[1]) }; + return { utils::get_planar_pshape(inputs[0]), utils::get_planar_pshape(inputs[1]) }; } ov::PartialShape Brgemm::get_planar_output_shape(const ov::PartialShape& output_shape) const { diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index 868ed4294e6dab..065372f7a76747 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -79,7 +79,6 @@ IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector @@ -329,7 +329,7 @@ VectorDims Subgraph::infer_master_shape() { const auto& res_input = res->input(0); OPENVINO_ASSERT(res_input.get_partial_shape().is_static(), "Result have dynamic shape in static pipeline"); // We need to account to the shape's layout stored in Output rt_info - const auto& planar_shape = utils::get_planar_pshape(res_input.get_source_output()); + const auto& planar_shape = utils::get_preordered_pshape(res_input.get_source_output()); output_dims.emplace_back(planar_shape.get_shape()); } } diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 6ed1054adac40c..7ce3d658e56a58 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -79,8 +79,8 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { const auto& order = as_type_ptr(n->get_input_node_shared_ptr(1)); if (order) { const auto order_value = order->cast_vector(); - return (TransposeDecomposition::supported_cases.count(order_value) != 0) || - (is_brgemm_case && FuseTransposeBrgemm::supported_cases.count(order_value) != 0); + return (TransposeDecomposition::is_supported_transpose_order(order_value)) || + (is_brgemm_case && FuseTransposeBrgemm::is_supported_transpose_order(order_value)); } } return false; diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index 609496cd0265e5..1e10d2dc6dfe6e 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -11,6 +11,10 @@ #include "snippets/pass/fuse_transpose_brgemm.hpp" #include "snippets/pass/transform_convert.hpp" #include "snippets/pass/validate.hpp" +#include "snippets/pass/split_dimension_m.hpp" +#include "snippets/pass/extract_constants.hpp" +#include "snippets/pass/extract_unsupported_transposes.hpp" +#include "snippets/pass/subgraph_manager.hpp" #include "snippets/op/subgraph.hpp" #include "snippets/itt.hpp" @@ -21,343 +25,9 @@ namespace ov { namespace 
snippets { namespace pass { -namespace { -size_t get_lcm(size_t a, size_t b) { - std::function get_gcd; - get_gcd = [&get_gcd](size_t a, size_t b) { - if (b == 0) - return a; - return get_gcd(b, a % b); - }; - return a / get_gcd(a, b) * b; -} - -bool is_supported_matmul_for_split_dim_m_optimization(const std::shared_ptr& node) { - const auto matmul = ov::as_type_ptr(node); - return matmul && !matmul->get_transpose_a() && !matmul->is_dynamic() && node->get_shape().size() == 3; // It's needed only for 3D MHA patterns -} -} // namespace - -bool CommonOptimizations::CanOptimizeParallelWA(const std::shared_ptr& node, size_t concurrency) { - if (!is_supported_matmul_for_split_dim_m_optimization(node)) - return false; - const auto mm_shape = node->get_shape(); - const auto current_parallel_work_amount = - std::accumulate(mm_shape.rbegin() + 2, mm_shape.rend(), size_t(1), std::multiplies()); - const auto dim_M = *(mm_shape.rbegin() + 1); - return (current_parallel_work_amount < concurrency) && - (current_parallel_work_amount * dim_M >= concurrency); -} - -void CommonOptimizations::SplitDimensionM(const std::shared_ptr& subgraph, size_t concurrency) { - // To increase parallelism work in 3D cases for MHA pattern, - // we split 1st dimension (starting from 0th) into 2 new dimensions to get 4D Shapes where - // - 0th and 1st dimensions are used in parallel scheduling, - // - 2nd and 3rd dimensions are used in kernel - // Note: 3D Patterns don't contain Transpose inside so the reshaping is valid - - // It's needed only for MHA patterns. Need to add support for common patterns - if (!subgraph->has_domain_sensitive_ops()) - return; - - const auto& body = subgraph->body_ptr(); - const auto& parameters = body->get_parameters(); - // [107806]: If count of Parameters isn't equal to Subgraph inputs (it's possible case in general), - // we cannot garantee correct extraction since we don't have correct connections between body I/O and Subgraph I/O. 
- OPENVINO_ASSERT(parameters.size() == subgraph->input_values().size(), - "Failed to extract unsupported transposes: the count of Parameters isn't equal to Subgraph inputs"); - - // Need to find MatMul0 and check output shape - const auto& ops = body->get_ordered_ops(); - const auto mm_it = std::find_if(ops.begin(), ops.end(), - [](const std::shared_ptr& node){ return ov::is_type(node); }); - if (mm_it == ops.end()) - return; - - const auto matmul0 = *mm_it; - if (!is_supported_matmul_for_split_dim_m_optimization(matmul0)) - return; - - auto get_dim_M = [](const ov::Shape& shape) { - return *(shape.rbegin() + 1); - }; - - const auto mm_shape = matmul0->get_shape(); - const auto m_dim = get_dim_M(mm_shape); // M - const auto batch_dim = - std::accumulate(mm_shape.rbegin() + 2, mm_shape.rend(), size_t(1), std::multiplies()); // B (batch) - - // We skip optimization if the current batch is optimal for concurrency - const auto optimal_parallelism_work_amount = concurrency; - if (batch_dim % optimal_parallelism_work_amount == 0) - return; - - size_t batch_m_dim = 1; - size_t new_m_dim = m_dim; - - auto is_optimized = [&](size_t batch_m_dim) { - return batch_m_dim > 1; - }; - - // [ First Step ] - // Need to find optimized dimension splitting: [b1..bk, m, n] -> [b1..bk, batch_m_dim, new_m_dim, n] - // The work amount for parallelism should be divided by max thread count in ideal case - // that all threads have the same full work amount (avoid of thread downtime) - // If it's impossible, we select such values so that as many threads as possible have work (see [ Second Step ]) - // For example, there are 16 threads and shape [6, 512, 32] - // LCM(6, 16) = 48 <- ideal work amount for parallelism - // new_shape [6, 48 / 6, 512 / (48 / 6), 32 ] => [6, 8, 64, 32] - // Each thread has parallelism_work_amount = 6 * 8 / nthrs = 3 - const auto lcm = get_lcm(batch_dim, optimal_parallelism_work_amount); // LCM(b, nthrs) - const auto batch_dim_multiplier = lcm / batch_dim; // LCM(b, nthrs) / b - const auto needed_new_dim = m_dim / batch_dim_multiplier; // m / (LCM(b, nthrs) / b) - needed factors of dimension m - if (batch_dim_multiplier * needed_new_dim == m_dim && is_optimized(batch_dim_multiplier)) { - batch_m_dim = batch_dim_multiplier; - new_m_dim = needed_new_dim; - } else { - // [ Second Step ] - // If we couldn't optimally split on the previous step, try the second step. - // The algorithm finds the more optimal parallelism work amount [batch_dim * batch_m_dim], - // where batch_m_dim is divisor of dimension M. - // The optimal parallelism work amount means the case when as many threads as possible have work - // For example, there are 8 threads and shape [5, 384, 32] - // 768 = [2 x 192] = [3 x 128] = [4 x 96] = [6 x 64] - // - [5, 2, 192, 32] - WA = 10 = 8 + 2 (6 threads calculates once and 2 threads twice) - // - [5, 3, 128, 32] - WA = 15 = 8 + 7 (all threads have 2 kernel except one thread) <- the most optimal case - // - [5, 4, 96, 32] - WA = 20 = 8 x 2 + 4 - // - [5, 6, 64, 32] - WA = 30 = 8 x 3 + 6 - // The most optimal and possible case is [5, 3, 128, 32] - almost all threads executes kernel twice - // Heuristic value for a quick exit from the algorithm. 
- // The value shows the number of threads in percentages that perform the most equal work - const auto optimal_thread_num_percent = 0.8; - size_t optimal_remainder = 1; - auto get_remainder = [batch_dim, optimal_parallelism_work_amount](const size_t potential_batch_dim) { - return (batch_dim * potential_batch_dim) % optimal_parallelism_work_amount; - }; - - auto update_optimal_params = [&](size_t divisor_0, size_t divisor_1) { - const auto remainder = batch_dim * divisor_0 % optimal_parallelism_work_amount; - if (remainder > optimal_remainder || remainder == 0) { - optimal_remainder = remainder; - batch_m_dim = divisor_0; - new_m_dim = divisor_1; - } - }; - - // Firstly we have shape [batch, 1, m_dim, smth]. - // So at the beginning we have parallel_work_amount = batch x 1 - optimal_remainder = get_remainder(1); - const auto root = std::sqrt(m_dim) + 1; - for (size_t divisor_0 = 2; divisor_0 < root; ++divisor_0) { - const size_t divisor_1 = m_dim / divisor_0; - if (divisor_0 * divisor_1 != m_dim) - continue; - - update_optimal_params(divisor_0, divisor_1); - update_optimal_params(divisor_1, divisor_0); - if ((static_cast(optimal_remainder) / static_cast(optimal_parallelism_work_amount) > optimal_thread_num_percent) || - (optimal_remainder == 0)) { - break; - } - } - } - - OPENVINO_ASSERT(batch_m_dim * new_m_dim == m_dim, "Incorrect dimension M splitting!"); - // nothing to split - if (!is_optimized(batch_m_dim)) - return; - - /***** Reshape insertion *****/ - - // There are two Parameter variants: - // - Parameter on branches for Second input of MatMul - the shape should be only unsqueezed (add just 1) - // - Other Parameters (on First input of MatMuls and between) - the shape should be splitted on M dimension - - bool updated = false; - std::set> reshaped_params; - - auto insert_reshape = [&](const std::shared_ptr& param, const ov::Shape& new_shape) { - const auto index = std::distance(parameters.begin(), std::find(parameters.begin(), parameters.end(), param)); - const auto shape_const = std::make_shared(ov::element::i32, ov::Shape{new_shape.size()}, new_shape); - const auto reshape = std::make_shared(subgraph->input_value(index), shape_const, false); - subgraph->input(index).replace_source_output(reshape); - param->set_partial_shape(new_shape); - reshaped_params.insert(param); - updated = true; - }; - - auto get_updated_shape = [&](const ov::Shape& shape, bool split_m_dim) { - const auto current_m_dim = get_dim_M(shape); - OPENVINO_ASSERT(!split_m_dim || current_m_dim == 1 || current_m_dim == m_dim, "Incorrect shape for splitting!"); - ov::Shape new_shape = shape; - if ((split_m_dim && current_m_dim == 1) || !split_m_dim) { - new_shape.insert((new_shape.rbegin() + 2).base(), 1); - } else { - new_shape.insert((new_shape.rbegin() + 2).base(), batch_m_dim); - *(new_shape.rbegin() + 1) = new_m_dim; - } - OPENVINO_ASSERT(ov::shape_size(new_shape) == ov::shape_size(shape), "Incorrect shape splitting!"); - return new_shape; - }; - - auto reshape_parameter = [&](const std::shared_ptr& node, bool split_m_dim = true) { - const auto param = ov::as_type_ptr(node); - if (!param || reshaped_params.count(param) > 0) - return; - insert_reshape(param, get_updated_shape(param->get_partial_shape().get_shape(), split_m_dim)); - }; - - auto update_matmul_second_branch = [&](const std::shared_ptr& node) { - auto parent = node->get_input_node_shared_ptr(1); - while (!ov::is_type(parent)) { - if (parent->get_input_size() > 1) { - for (const auto& input_source : parent->input_values()) { - 
reshape_parameter(input_source.get_node_shared_ptr(), false); - } - } - - // [107731]: It's covered my MHA tokenization - parent = parent->get_input_node_shared_ptr(0); - } - reshape_parameter(parent, false); - }; - - // Firstly, Unsqueeze parameters on second branches of MatMuls - for (const auto& op : ops) { - if (ov::is_type(op)) { - update_matmul_second_branch(op); - } - } - - // Secondly, Update All M dimensions for remaining parameters - for (const auto& param : parameters) { - if (reshaped_params.count(param) == 0) - reshape_parameter(param, true); - } - - // Return the previous shape on outputs - for (size_t i = 0; i < subgraph->get_output_size() && updated; ++i) { - const auto output_shape = subgraph->get_output_shape(i); - if (is_scalar(output_shape)) - continue; - - const auto& target_inputs = subgraph->get_output_target_inputs(i); - const auto shape_const = std::make_shared(ov::element::i32, ov::Shape{output_shape.size()}, output_shape); - const auto reshape = std::make_shared(subgraph->output(i), shape_const, false); - // Save output name - const auto original_output = body->get_results()[i]->get_input_node_shared_ptr(0); - const auto original_name = original_output->get_friendly_name(); - reshape->set_friendly_name(original_name); - original_output->set_friendly_name(original_name + "_original"); - - for (const auto& input : target_inputs) { - input.replace_source_output(reshape); - // Result input tensor name was changed, the name has to be restored - if (ov::is_type(input.get_node())) { - input.get_tensor_ptr()->add_names(subgraph->output(i).get_tensor_ptr()->get_names()); - } - } - subgraph->output(i).get_tensor_ptr()->set_names({}); - updated = true; - } - subgraph->set_friendly_name(subgraph->get_friendly_name() + "_original"); - - // Need to update inner Shapes and Softmax Axis - if (updated) { - for (const auto &op : ops) { - if (const auto softmax_v8 = ov::as_type_ptr(op)) { - softmax_v8->set_axis(-1); - } else if (const auto softmax_v1 = ov::as_type_ptr(op)) { - softmax_v1->set_axis(softmax_v1->get_output_partial_shape(0).size()); // since new_shape.size() = old_shape.size() + 1 - } else if (const auto broadcast = ov::as_type_ptr(op)) { - // Broadcast is tokenized only between MatMuls -> Split M dimension - const auto shape_const = ov::as_type_ptr(broadcast->input_value(1).get_node_shared_ptr()); - OPENVINO_ASSERT(shape_const, "SplitDimensionM expects Broadcast with Constant output shape"); - const auto new_shape = get_updated_shape(shape_const->cast_vector(), true); - broadcast->set_argument(1, std::make_shared(shape_const->get_element_type(), ov::Shape{new_shape.size()}, new_shape)); - } - } - subgraph->validate_and_infer_types(); - } -} - -void CommonOptimizations::ExtractConstants(const std::shared_ptr& subgraph) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractConstants"); - auto body = subgraph->body_ptr(); - - ParameterVector new_parameters; - OutputVector new_external_inputs = subgraph->input_values(); - - for (auto& op : body->get_ops()) { - auto constant = ov::as_type_ptr(op); - if (!constant || ov::shape_size(constant->get_shape()) == 1ul) - continue; - - const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); - if (op::Subgraph::constant_input_should_be_inside_body(child)) - continue; - - auto parameter = std::make_shared(constant->get_element_type(), constant->output(0).get_partial_shape()); - parameter->set_friendly_name(constant->get_friendly_name()); - 
ov::copy_runtime_info(constant, parameter); - constant->output(0).replace(parameter->output(0)); - - new_external_inputs.push_back(constant); - new_parameters.push_back(parameter); - } - - if (new_parameters.size() != 0) { - body->add_parameters(new_parameters); - body->validate_nodes_and_infer_types(); - subgraph->set_arguments(new_external_inputs); - } -} - -void CommonOptimizations::ExtractUnsupportedTransposes(const std::shared_ptr& subgraph) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractUnsupportedTransposes"); - const auto& body = subgraph->body_ptr(); - const auto parameters = body->get_parameters(); - // [107806]: If count of Parameters isn't equal to Subgraph inputs, - // we cannot guarantee correct extraction since we don't have correct connections between body I/O and Subgraph I/O. - OPENVINO_ASSERT(parameters.size() == subgraph->input_values().size(), - "Failed to extract unsupported transposes: the count of Parameters isn't equal to Subgraph inputs"); - - bool updated = false; - for (size_t i = 0; i < parameters.size(); ++i) { - const auto& parameter = parameters[i]; - const auto& consumers = parameter->get_output_target_inputs(0); - if (consumers.size() != 1) - continue; - - const auto transpose = ov::as_type_ptr(consumers.begin()->get_node()->shared_from_this()); - if (!transpose) - continue; - - const auto& order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); - if (!order) - continue; - - const auto order_value = order->cast_vector(); - const auto transpose_child = *(transpose->get_output_target_inputs(0).begin()); - const auto is_brgemm_case = ov::is_type(transpose_child.get_node()->shared_from_this()); - // If Transpose is supported (can be decomposed or fused into Brgemm), skip - if ((is_brgemm_case && FuseTransposeBrgemm::supported_cases.count(order_value) != 0) || - (TransposeDecomposition::supported_cases.count(order_value) != 0)) - continue; - - // If the transpose isn't supported - we have to extract it from Subgraph - transpose->set_argument(0, subgraph->input_value(i)); - subgraph->set_argument(i, transpose); - transpose_child.replace_source_output(parameter); - // Update shape - parameter->set_partial_shape(transpose->get_output_partial_shape(0)); - updated = true; - } - - if (updated) { - subgraph->validate_and_infer_types(); - } -} +#define REGISTER_SNIPPETS_PASS(manager, pass, enabled, ...) \ + if (enabled) \ + manager.register_pass(__VA_ARGS__); CommonOptimizations::CommonOptimizations(const SnippetsTokenization::Config& config) { MATCHER_SCOPE(CommonOptimizations); @@ -371,29 +41,24 @@ CommonOptimizations::CommonOptimizations(const SnippetsTokenization::Config& con const auto& body = subgraph->body_ptr(); const auto is_quantized = subgraph->is_quantized(); + const auto is_domain_sensitive = subgraph->has_domain_sensitive_ops(); // Firstly, we should transform all original Converts inside body to ConvertTruncation to save original behavior. // Then if Subgraph contains FakeQuantize we enable specific transformation for quantized subgraphs. 
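    // Editor's note: REGISTER_SNIPPETS_PASS(manager, pass, enabled, ...) defined above expands to
    // `if (enabled) manager.register_pass<pass>(__VA_ARGS__);` (the `pass` argument is the pass type),
    // so each pass below is registered only when its enabling condition holds.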
ov::pass::Manager manager(get_pass_config()); - manager.register_pass(); - manager.register_pass(); - if (is_quantized) { - manager.register_pass(); - } - manager.register_pass(); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::TransformConvertToConvertTruncation, true); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::ExplicitTransposeMatMulInputs, is_domain_sensitive); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::CommonFakeQuantizeDecomposition, is_quantized); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::SoftmaxReshapeElimination, is_domain_sensitive); manager.run_passes(body); + ov::snippets::pass::CommonOptimizations::SubgraphManager subgraph_manager; // At the moment only non-scalar Constants of FakeQuantize can be inside Subgraph // so we can enable ExtractConstants pass for quantized models - if (is_quantized) { - ExtractConstants(subgraph); - } - // Extract unsupported Transposes from body - if (subgraph->has_domain_sensitive_ops()) { - ExtractUnsupportedTransposes(subgraph); - if (config.split_m_dimension) - SplitDimensionM(subgraph, config.concurrency); - } + REGISTER_SNIPPETS_PASS(subgraph_manager, ov::snippets::pass::ExtractConstants, is_quantized); + REGISTER_SNIPPETS_PASS(subgraph_manager, ov::snippets::pass::ExtractUnsupportedTransposes, is_domain_sensitive); + REGISTER_SNIPPETS_PASS(subgraph_manager, ov::snippets::pass::SplitDimensionM, is_domain_sensitive && config.split_m_dimension, config.concurrency); + subgraph_manager.run_passes(subgraph); // Validate the body after all common optimizations ov::snippets::pass::Validate(get_pass_config()).run_on_model(body); diff --git a/src/common/snippets/src/pass/extract_constants.cpp b/src/common/snippets/src/pass/extract_constants.cpp new file mode 100644 index 00000000000000..54a2a56cd27cf5 --- /dev/null +++ b/src/common/snippets/src/pass/extract_constants.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/extract_constants.hpp" + +#include "openvino/opsets/opset1.hpp" +#include "snippets/itt.hpp" + + +bool ov::snippets::pass::ExtractConstants::run_on_subgraph(const std::shared_ptr& subgraph) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractConstants"); + auto body = subgraph->body_ptr(); + + ParameterVector new_parameters; + OutputVector new_external_inputs = subgraph->input_values(); + + for (auto& op : body->get_ops()) { + auto constant = ov::as_type_ptr(op); + if (!constant || ov::shape_size(constant->get_shape()) == 1ul) + continue; + + const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + if (ov::snippets::op::Subgraph::constant_input_should_be_inside_body(child)) + continue; + + auto parameter = std::make_shared(constant->get_element_type(), constant->get_shape()); + ov::replace_output_update_name(constant->output(0), parameter->output(0)); + + new_external_inputs.push_back(constant); + new_parameters.push_back(parameter); + } + + if (new_parameters.size() != 0) { + body->add_parameters(new_parameters); + body->validate_nodes_and_infer_types(); + subgraph->set_arguments(new_external_inputs); + return true; + } + + return false; +} diff --git a/src/common/snippets/src/pass/extract_unsupported_transposes.cpp b/src/common/snippets/src/pass/extract_unsupported_transposes.cpp new file mode 100644 index 00000000000000..4cc87b3810c1ae --- /dev/null +++ b/src/common/snippets/src/pass/extract_unsupported_transposes.cpp @@ -0,0 +1,57 @@ 
+// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/extract_unsupported_transposes.hpp" + +#include "openvino/opsets/opset1.hpp" +#include "snippets/pass/mha_tokenization.hpp" +#include "snippets/itt.hpp" + + +bool ov::snippets::pass::ExtractUnsupportedTransposes::run_on_subgraph(const std::shared_ptr& subgraph) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractUnsupportedTransposes"); + const auto& body = subgraph->body_ptr(); + const auto parameters = body->get_parameters(); + // [107806]: If count of Parameters isn't equal to Subgraph inputs, + // we cannot guarantee correct extraction since we don't have correct connections between body I/O and Subgraph I/O. + OPENVINO_ASSERT(parameters.size() == subgraph->input_values().size(), + "Failed to extract unsupported transposes: the count of Parameters isn't equal to Subgraph inputs"); + + bool updated = false; + for (size_t i = 0; i < parameters.size(); ++i) { + const auto& parameter = parameters[i]; + const auto& consumers = parameter->get_output_target_inputs(0); + if (consumers.size() != 1) + continue; + + const auto transpose = ov::as_type_ptr(consumers.begin()->get_node()->shared_from_this()); + if (!transpose) + continue; + + const auto& order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + OPENVINO_ASSERT(order, "ExtractUnsupportedTransposes expects Transposes with constant order"); + + const auto order_value = order->cast_vector(); + const auto transpose_child = *(transpose->get_output_target_inputs(0).begin()); + const auto is_brgemm_case = ov::is_type(transpose_child.get_node()->shared_from_this()); + // If Transpose is supported (can be decomposed or fused into Brgemm), skip + // [116568]: It should be covered by TransposeDecomposition::is_supported or FuseTransposeBrgemm::is_supported + if ((is_brgemm_case && TokenizeMHASnippets::get_fusion_transpose_order(order_value.size()) == order_value) || + (TokenizeMHASnippets::get_decomposed_transpose_order(order_value.size()) == order_value)) + continue; + + // If the transpose isn't supported - we have to extract it from Subgraph + transpose->set_argument(0, subgraph->input_value(i)); + subgraph->set_argument(i, transpose); + transpose_child.replace_source_output(parameter); + parameter->set_partial_shape(transpose->get_output_partial_shape(0)); + updated = true; + } + + if (updated) { + subgraph->validate_and_infer_types(); + } + + return updated; +} diff --git a/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp index 24a4141916e189..4492c1f7466505 100644 --- a/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp +++ b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp @@ -17,24 +17,19 @@ namespace ov { namespace snippets { namespace pass { -const std::set> FuseTransposeBrgemm::supported_cases = {{0, 2, 1, 3}}; - -bool FuseTransposeBrgemm::is_supported_transpose(const Output& transpose_port) { - const auto transpose_node = transpose_port.get_node_shared_ptr(); - // it's safe to do so because of the patterns we used. 
alternatively we can do it through pattern_values_map - const auto& constant = as_type_ptr(transpose_node->get_input_node_shared_ptr(1)); - // if Transpose in and out layout is not empty => something was already fused on this port - auto default_layout = std::vector(transpose_port.get_shape().size()); - std::iota(default_layout.begin(), default_layout.end(), 0);// NCHW layout by default - if (lowered::PortDescriptorUtils::get_port_descriptor_ptr(transpose_port)->get_layout() != default_layout || - lowered::PortDescriptorUtils::get_port_descriptor_ptr(transpose_node->input_value(0))->get_layout() != default_layout) +bool FuseTransposeBrgemm::is_supported_transpose(const Output& transpose_out) { + const auto transpose = ov::as_type_ptr(transpose_out.get_node_shared_ptr()); + if (!transpose) return false; - const auto& transpose_order = constant->cast_vector(); - // todo: this limitation is due to the fact that offsets are calculated in Kernel, and the only way - // to calc them non-default way is to set Parameter rt_info field. This limitation can be removed if - // the rt_info is properly propagated to the corresponding parameter - return is_type(transpose_node->get_input_node_shared_ptr(0)) && - supported_cases.count(transpose_order) != 0; + const auto order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + if (!order) + return false; + return is_supported_transpose_order(order->cast_vector()); +} + +bool FuseTransposeBrgemm::is_supported_transpose_order(const std::vector& order) { + const auto size = order.size(); + return order.size() > 0 && order.back() == (static_cast(size) - 1); } FuseTransposeBrgemm::FuseTransposeBrgemm() { @@ -51,7 +46,7 @@ FuseTransposeBrgemm::FuseTransposeBrgemm() { // Pattern 2: Transpose on output of MatMul auto brgemm_out = ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), ov::pass::pattern::any_input()}); - auto transpose2 = ov::pass::pattern::wrap_type({brgemm_out, constant}); + auto transpose2 = ov::pass::pattern::wrap_type({brgemm_out, constant}, is_supported_transpose); auto brgemm_or_transpose = std::make_shared(OutputVector{brgemm_in0, brgemm_in1, transpose2}); diff --git a/src/common/snippets/src/pass_manager.cpp b/src/common/snippets/src/pass/manager.cpp similarity index 97% rename from src/common/snippets/src/pass_manager.cpp rename to src/common/snippets/src/pass/manager.cpp index bc9237c1ec8ab1..af59a99e348e5e 100644 --- a/src/common/snippets/src/pass_manager.cpp +++ b/src/common/snippets/src/pass/manager.cpp @@ -2,7 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/pass_manager.hpp" +#include "snippets/pass/manager.hpp" + namespace ov { namespace snippets { @@ -77,5 +78,5 @@ std::shared_ptr Manager::insert_pass_instance(const PassPosit } } // namespace pass -}// namespace snippets -}// namespace ov +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index e9f939e8d72d75..67957c286a9e66 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -18,11 +18,11 @@ namespace { -auto is_supported_tensor(const ov::descriptor::Tensor& t) -> bool { +bool is_supported_tensor(const ov::descriptor::Tensor& t) { return t.get_partial_shape().is_static() && ov::snippets::utils::one_of(t.get_shape().size(), 3lu, 4lu); } -auto is_supported_intermediate_op(const std::shared_ptr& node) -> bool { +bool is_supported_intermediate_op(const std::shared_ptr& 
node) { const auto is_intermediate_op = [](const std::shared_ptr& node) { return ov::is_type(node) || ov::is_type(node) || @@ -32,22 +32,23 @@ auto is_supported_intermediate_op(const std::shared_ptr& node) -> bool return is_intermediate_op(node) && ov::snippets::pass::TokenizeSnippets::AppropriateForSubgraph(node); } -auto is_valid_transpose(const std::shared_ptr& node, std::vector expected_order) -> bool { - auto valid_transpose_order = [expected_order](const std::shared_ptr& node) -> bool { +bool is_valid_transpose(const std::shared_ptr& node, const std::set& supported_ranks, std::vector expected_order) { + auto is_valid_transpose_order = [expected_order, supported_ranks](const std::shared_ptr& node) -> bool { const auto transpose_pattern = ov::as_type_ptr(node); if (!transpose_pattern) return false; - return transpose_pattern->cast_vector() == expected_order; + const auto existing_order = transpose_pattern->cast_vector(); + return existing_order == expected_order && supported_ranks.count(existing_order.size()) != 0; }; auto is_supported_transpose_tensor = [](const ov::descriptor::Tensor& t) { return is_supported_tensor(t) && ov::snippets::pass::TokenizeSnippets::get_supported_element_types().count(t.get_element_type()) != 0; }; - return node && node->get_output_target_inputs(0).size() == 1 && node->get_shape().size() == 4 && - valid_transpose_order(node->get_input_node_shared_ptr(1)) && is_supported_transpose_tensor(node->get_input_tensor(0)); + return node && node->get_output_target_inputs(0).size() == 1 && is_valid_transpose_order(node->get_input_node_shared_ptr(1)) && + is_supported_transpose_tensor(node->get_input_tensor(0)); } -auto tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVector& ordered_ops) -> void { +void tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVector& ordered_ops) { // We can tokenize a Broadcast op only when the output shape of its child doesn't depend on the Broadcast shape in any dimension except the last one. // Snippets remove the Broadcast op and insert BroadcastMove if the last dimensions before and after Broadcast differ. // Otherwise, the original shape may be lost.
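For illustration, a minimal self-contained sketch (not part of the patch; an integer order type is assumed) of the check that the reworked `is_valid_transpose` above performs: the existing order must match the expected permutation, and its rank must be in the explicitly supported set, so a matching order of an unsupported rank is now rejected.

#include <cstdint>
#include <set>
#include <vector>

// Sketch: mirrors the new order check in is_valid_transpose.
bool order_is_acceptable(const std::vector<int64_t>& existing_order,
                         const std::vector<int64_t>& expected_order,
                         const std::set<size_t>& supported_ranks) {
    return existing_order == expected_order && supported_ranks.count(existing_order.size()) != 0;
}

// e.g. with supported_ranks = {3, 4} (matching the 3D/4D tensors supported here):
//   order_is_acceptable({0, 2, 1, 3}, {0, 2, 1, 3}, {3, 4}) -> true
//   order_is_acceptable({1, 0},       {1, 0},       {3, 4}) -> false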
@@ -95,9 +96,7 @@ auto tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVect } } -auto tokenize_reshape_around_softmax(std::shared_ptr& interm_op, - std::shared_ptr& reshape, - ov::NodeVector& ordered_ops) -> bool { +bool tokenize_reshape_around_softmax(std::shared_ptr& interm_op, std::shared_ptr& reshape, ov::NodeVector& ordered_ops) { reshape = ov::as_type_ptr(interm_op); if (reshape) { const auto in_shape = reshape->get_input_shape(0); @@ -110,7 +109,7 @@ auto tokenize_reshape_around_softmax(std::shared_ptr& interm_op, return true; } -auto get_potential_body_params(const std::shared_ptr& op) -> size_t { +size_t get_potential_body_params(const std::shared_ptr& op) { size_t count = 0; for (size_t i = 1; i < op->get_input_size(); ++i) { const auto input = op->input_value(i); @@ -125,8 +124,8 @@ auto get_potential_body_params(const std::shared_ptr& op) -> size_t { return count; } -auto update_intermediate_supported_ops(std::shared_ptr& interm_op, ov::NodeVector& ordered_ops, - size_t& hidden_virtual_ports_count, size_t& potential_body_params_count) -> bool { +bool update_intermediate_supported_ops(std::shared_ptr& interm_op, ov::NodeVector& ordered_ops, + size_t& hidden_virtual_ports_count, size_t& potential_body_params_count) { while (is_supported_intermediate_op(interm_op)) { // All supported intermediate ops have only one output port if (interm_op->get_output_target_inputs(0).size() != 1) @@ -176,8 +175,26 @@ auto update_intermediate_supported_ops(std::shared_ptr& interm_op, ov: } return true; } + +std::vector get_rank_equivalent_order(std::vector default_order, size_t rank) { + OPENVINO_ASSERT(rank > 2, "Incorrect order rank for Transpose tokenization"); + auto order = std::vector(rank); + std::iota(order.begin(), order.end(), 0); + const auto diff = static_cast(rank - default_order.size()); + for (size_t i = 0; i < default_order.size(); ++i) { + order[diff + i] = default_order[i] + diff; + } + return order; +} } // namespace +std::vector ov::snippets::pass::TokenizeMHASnippets::get_fusion_transpose_order(size_t rank) { + return get_rank_equivalent_order({1, 0, 2}, rank); +} +std::vector ov::snippets::pass::TokenizeMHASnippets::get_decomposed_transpose_order(size_t rank) { + return get_rank_equivalent_order({1, 2, 0}, rank); +} + bool ov::snippets::pass::TokenizeMHASnippets::is_matmul0_supported(const std::shared_ptr& matmul) { if (!matmul || matmul->get_output_target_inputs(0).size() != 1 || matmul->get_transpose_a() || !is_supported_tensor(matmul->get_input_tensor(0)) || !is_supported_tensor(matmul->get_input_tensor(1))) @@ -257,6 +274,8 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken ordered_ops.push_back(matmul0); + const auto pattern_rank = matmul0->get_output_partial_shape(0).size(); + auto interm_op = matmul0->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); // Add supported operations which are between MatMul0 and Softmax to ordered_ops if (!update_intermediate_supported_ops(interm_op, ordered_ops, hidden_virtual_ports_count, potential_body_params_count)) @@ -368,12 +387,12 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken } auto tokenize_transpose = [&](const std::shared_ptr& transpose, - bool is_input_transposed, std::vector order, + bool is_input_transposed, std::vector order, const ov::NodeVector::const_iterator& pos) { // If the Transpose has a valid order for the Transpose fusing (ExplicitTransposeMatMulInputs pass call), tokenize it. // Otherwise, skip the Transpose.
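// Illustration (not part of the patch): with pattern_rank == 4 the helpers above resolve to
// the previously hardcoded orders, get_fusion_transpose_order(4) == {0, 2, 1, 3} and
// get_decomposed_transpose_order(4) == {0, 2, 3, 1}; with pattern_rank == 3 they yield
// the base orders {1, 0, 2} and {1, 2, 0}.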
if (!is_input_transposed) { - if (is_valid_transpose(transpose, order)) { + if (is_valid_transpose(transpose, config.mha_supported_transpose_ranks, order)) { ordered_ops.insert(pos, transpose); } return; @@ -383,7 +402,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken if (rank < 2) return; std::swap(transposed_order[rank - 1], transposed_order[rank - 2]); - if (is_valid_transpose(transpose, transposed_order)) { + if (is_valid_transpose(transpose, config.mha_supported_transpose_ranks, transposed_order)) { ordered_ops.insert(pos, transpose); } }; @@ -391,9 +410,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken const auto transpose1 = ov::as_type_ptr(parent); const auto transpose0 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); - tokenize_transpose(transpose1, is_transposed_b_0, {0, 2, 3, 1}, ordered_ops.begin()); - tokenize_transpose(transpose0, matmul0->get_transpose_a(), {0, 2, 1, 3}, ordered_ops.begin()); - tokenize_transpose(transpose2, matmul1->get_transpose_b(), {0, 2, 1, 3}, ordered_ops.end()); + tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin()); + tokenize_transpose(transpose0, matmul0->get_transpose_a(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin()); + tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end()); ordered_ops.push_back(matmul1); bool are_ops_after_matmul1 = false; @@ -427,7 +446,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // Transpose3 if (!are_ops_after_matmul1) { auto transpose3 = config.mha_token_enable_transpose_on_output ? 
ov::as_type_ptr(child) : nullptr; - if (is_valid_transpose(transpose3, {0, 2, 1, 3}) && + if (is_valid_transpose(transpose3, config.mha_supported_transpose_ranks, get_fusion_transpose_order(pattern_rank)) && transpose3->get_input_element_type(0) == matmul1_out_type) { // To avoid Convert between MatMul1 and Transpose3 ordered_ops.push_back(transpose3); } diff --git a/src/common/snippets/src/pass/split_dimension_m.cpp b/src/common/snippets/src/pass/split_dimension_m.cpp new file mode 100644 index 00000000000000..671a12bffa34d2 --- /dev/null +++ b/src/common/snippets/src/pass/split_dimension_m.cpp @@ -0,0 +1,275 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/split_dimension_m.hpp" + +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace { +size_t get_dim_M(const ov::Shape& shape) { + return *(shape.rbegin() + 1); +} +bool is_prime_number(size_t value) { + if (ov::snippets::utils::one_of(value, 2lu, 3lu)) return true; + if (value == 1 || value % 2 == 0 || value % 3 == 0) return false; + const auto root = std::sqrt(value) + 1; + for (size_t divisor = 5; divisor < root; divisor += 6) { + if ((value % divisor == 0) || (value % (divisor + 2) == 0)) + return false; + } + return true; +} +} // namespace + +bool ov::snippets::pass::SplitDimensionM::is_supported_matmul(const std::shared_ptr& node) { + const auto matmul = ov::as_type_ptr(node); + return matmul && !matmul->get_transpose_a() && !matmul->is_dynamic(); +} + +std::pair ov::snippets::pass::SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, + size_t optimal_parallelism_work_amount) { + std::pair splited = { 1, m_dim }; + + const size_t lower_bound = optimal_parallelism_work_amount / batch_dim; + if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) { + splited.first = lower_bound; + splited.second = m_dim / lower_bound; + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); + return splited; + } + + const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim); + for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) { + size_t divisor_1 = m_dim / divisor_0; + if (divisor_1 * divisor_0 == m_dim) { + splited.first = divisor_0; + splited.second = divisor_1; + break; + } + } + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); + return splited; +} + +bool ov::snippets::pass::SplitDimensionM::can_be_optimized(const std::shared_ptr& node, size_t concurrency) { + if (!is_supported_matmul(node)) + return false; + size_t batch_m_dim, new_m_dim; + return split(node->get_shape(), concurrency, batch_m_dim, new_m_dim); +} + +std::shared_ptr ov::snippets::pass::SplitDimensionM::get_matmul(const std::shared_ptr& subgraph) { + const auto& body = subgraph->body_ptr(); + const auto& parameters = body->get_parameters(); + // [107806]: If count of Parameters isn't equal to Subgraph inputs (a possible case, in general), + // we cannot guarantee correct extraction since we don't have correct connections between body I/O and Subgraph I/O.
+ OPENVINO_ASSERT(parameters.size() == subgraph->input_values().size(), + "Failed to get MatMul0: the count of Parameters isn't equal to Subgraph inputs"); + + // Need to find MatMul0 and check output shape + const auto& ops = body->get_ordered_ops(); + const auto mm_it = std::find_if(ops.cbegin(), ops.cend(), + [](const std::shared_ptr& node){ return ov::is_type(node); }); + if (mm_it == ops.end()) + return nullptr; + + const auto matmul0 = *mm_it; + return is_supported_matmul(matmul0) ? ov::as_type_ptr(matmul0) : nullptr; +} + +bool ov::snippets::pass::SplitDimensionM::split(const ov::Shape& shape, size_t optimal_parallelism_work_amount, size_t& batch_m_dim, size_t& new_m_dim) { + const auto batch_dim = + std::accumulate(shape.rbegin() + 2, shape.rend(), size_t(1), std::multiplies()); // B (batch) + const auto m_dim = get_dim_M(shape); // M + if (is_prime_number(m_dim)) + return false; + + auto is_optimized = [&](size_t batch_dim) { + return batch_dim >= optimal_parallelism_work_amount; + }; + + // We skip optimization if the current batch is optimal for concurrency + if (is_optimized(batch_dim)) + return false; + + std::tie(batch_m_dim, new_m_dim) = get_splited_dimensions(batch_dim, m_dim, optimal_parallelism_work_amount); + return is_optimized(batch_dim * batch_m_dim); +} + +void ov::snippets::pass::SplitDimensionM::reshape_subgraph(const std::shared_ptr& subgraph, + const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim) { + const auto& body = subgraph->body_ptr(); + const auto& parameters = body->get_parameters(); + const auto& results = body->get_results(); + const auto ops = body->get_ordered_ops(); + const auto m_dim = get_dim_M(shape); + + // There are two Parameter variants: + // - Parameters on branches for the second input of a MatMul - the shape should only be unsqueezed (just insert a 1) + // - Other Parameters (on the first input of MatMuls and between them) - the shape should be split on the M dimension + + std::set> reshaped_params; + + auto insert_reshape = [&](const std::shared_ptr& param, const ov::Shape& new_shape) { + const auto index = std::distance(parameters.begin(), std::find(parameters.begin(), parameters.end(), param)); + const auto shape_const = std::make_shared(ov::element::i32, ov::Shape{new_shape.size()}, new_shape); + const auto reshape = std::make_shared(subgraph->input_value(index), shape_const, false); + subgraph->input(index).replace_source_output(reshape); + param->set_partial_shape(new_shape); + reshaped_params.insert(param); + }; + + auto get_updated_shape = [&](const ov::Shape& shape, size_t m_index, bool split_m_dim) { + const auto current_m_dim = shape[m_index]; + OPENVINO_ASSERT(!split_m_dim || current_m_dim == 1 || current_m_dim == m_dim, "Incorrect shape for splitting!"); + ov::Shape new_shape = shape; + if ((split_m_dim && current_m_dim == 1) || !split_m_dim) { + new_shape.insert(new_shape.begin() + m_index, 1); + } else { + new_shape[m_index] = new_m_dim; + new_shape.insert(new_shape.begin() + m_index, batch_m_dim); + } + OPENVINO_ASSERT(ov::shape_size(new_shape) == ov::shape_size(shape), "Incorrect shape splitting!"); + return new_shape; + }; + + auto get_updated_order = [](const std::vector& order, int m_index) { + std::vector new_order(order.size() + 1, 0); + size_t shift_idx = 0; + for (size_t i = 0; i < order.size(); ++i) { + if (order[i] < m_index) { + new_order[i + shift_idx] = order[i]; + } else if (order[i] == m_index) { + new_order[i + shift_idx++] = order[i]; + new_order[i + shift_idx] = order[i] + 1; + } else { + new_order[i + 
shift_idx] = order[i] + 1; + } + } + return new_order; + }; + + auto reshape_transpose = [&](const std::shared_ptr& transpose, bool is_input) -> size_t { + const auto order_constant = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + OPENVINO_ASSERT(order_constant != nullptr, "Transpose must have Constant order"); + const auto order = order_constant->cast_vector(); + const auto m_index = is_input ? order[order.size() - 2] : order.size() - 2; // Index of M dimension in the previous order + const auto new_order = get_updated_order(order, static_cast(m_index)); + transpose->set_argument(1, std::make_shared(order_constant->get_element_type(), ov::Shape{new_order.size()}, new_order)); + return m_index; + }; + + auto reshape_parameter = [&](const std::shared_ptr& node, bool split_m_dim = true) { + const auto param = ov::as_type_ptr(node); + if (!param || reshaped_params.count(param) > 0) + return; + + const auto shape = param->get_partial_shape().get_shape(); + const auto consumers = param->get_output_target_inputs(0); + const auto shared_consumer = consumers.begin()->get_node()->shared_from_this(); + auto m_index = shape.size() - 2; + if (ov::is_type(shared_consumer)) { + m_index = reshape_transpose(shared_consumer, true); + } + insert_reshape(param, get_updated_shape(shape, m_index, split_m_dim)); + }; + + auto update_matmul_second_branch = [&](const std::shared_ptr& node) { + auto parent = node->get_input_node_shared_ptr(1); + while (!ov::is_type(parent)) { + if (parent->get_input_size() > 1) { + for (const auto& input_source : parent->input_values()) { + reshape_parameter(input_source.get_node_shared_ptr(), false); + } + } + + // [107731]: It's covered by MHA tokenization + parent = parent->get_input_node_shared_ptr(0); + } + reshape_parameter(parent, false); + }; + + // First, unsqueeze Parameters on the second input branches of MatMuls + for (const auto& op : ops) { + if (const auto matmul = ov::as_type_ptr(op)) { + update_matmul_second_branch(matmul); + } else if (const auto softmax_v8 = ov::as_type_ptr(op)) { + softmax_v8->set_axis(-1); + } else if (const auto softmax_v1 = ov::as_type_ptr(op)) { + softmax_v1->set_axis(softmax_v1->get_output_partial_shape(0).size()); // since new_shape.size() = old_shape.size() + 1 + } else if (const auto broadcast = ov::as_type_ptr(op)) { + // Broadcast is tokenized only between MatMuls -> Split M dimension + const auto shape_const = ov::as_type_ptr(broadcast->input_value(1).get_node_shared_ptr()); + OPENVINO_ASSERT(shape_const, "SplitDimensionM expects Broadcast with Constant output shape"); + const auto new_shape = get_updated_shape(shape_const->cast_vector(), broadcast->get_output_shape(0).size() - 2, true); + broadcast->set_argument(1, std::make_shared(shape_const->get_element_type(), ov::Shape{new_shape.size()}, new_shape)); + } + } + + // Second, update the M dimensions of the remaining Parameters + for (const auto& param : parameters) { + if (reshaped_params.count(param) == 0) + reshape_parameter(param, true); + } + + // Update Transpose order on Result + for (const auto& res : results) { + const auto parent = res->get_input_node_shared_ptr(0); + if (ov::is_type(parent)) { + reshape_transpose(parent, false); + } + } + + // Restore the previous shape on outputs + for (size_t i = 0; i < subgraph->get_output_size(); ++i) { + const auto output_shape = subgraph->get_output_shape(i); + if (is_scalar(output_shape)) + continue; + + const auto& target_inputs = subgraph->get_output_target_inputs(i); + const auto shape_const = 
std::make_shared(ov::element::i32, ov::Shape{output_shape.size()}, output_shape); + const auto reshape = std::make_shared(subgraph->output(i), shape_const, false); + // Save output name + const auto original_output = body->get_results()[i]->get_input_node_shared_ptr(0); + const auto original_name = original_output->get_friendly_name(); + reshape->set_friendly_name(original_name); + original_output->set_friendly_name(original_name + "_original"); + + for (const auto& input : target_inputs) { + input.replace_source_output(reshape); + // Result input tensor name was changed, the name has to be restored + if (ov::is_type(input.get_node())) { + input.get_tensor_ptr()->add_names(subgraph->output(i).get_tensor_ptr()->get_names()); + } + } + subgraph->output(i).get_tensor_ptr()->set_names({}); + } + subgraph->set_friendly_name(subgraph->get_friendly_name() + "_original"); + // Need to update inner Shapes and Softmax Axis + subgraph->validate_and_infer_types(); +} + +bool ov::snippets::pass::SplitDimensionM::run_on_subgraph(const std::shared_ptr& subgraph) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SplitDimensionM"); + // To increase the amount of parallel work in the MHA pattern, + // we split the 1st dimension (counting from 0) into 2 new dimensions to get 4D shapes where + // - the 0th and 1st dimensions are used in parallel scheduling, + // - the 2nd and 3rd dimensions are used in the kernel + + // For now it's needed only for MHA patterns; support for common patterns still needs to be added + if (!subgraph->has_domain_sensitive_ops()) + return false; + + if (const auto matmul0 = get_matmul(subgraph)) { + const auto mm_shape = matmul0->get_shape(); + size_t batch_m_dim, new_m_dim; + if (!split(mm_shape, m_concurrency, batch_m_dim, new_m_dim)) + return false; + + reshape_subgraph(subgraph, mm_shape, batch_m_dim, new_m_dim); + return true; + } + return false; +} diff --git a/src/common/snippets/src/pass/subgraph_manager.cpp b/src/common/snippets/src/pass/subgraph_manager.cpp new file mode 100644 index 00000000000000..860a2b15c359fd --- /dev/null +++ b/src/common/snippets/src/pass/subgraph_manager.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/subgraph_manager.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +bool CommonOptimizations::SubgraphManager::run_passes(std::shared_ptr subgraph) { + bool updated = false; + for (const auto& pass : m_pass_list) { + updated = pass->run_on_subgraph(subgraph) || updated; + } + return updated; +} + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index bb581105a7523a..fe7cf7a702b09f 100644 --- a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -14,7 +14,22 @@ namespace snippets { namespace pass { using namespace lowered; -const std::set> TransposeDecomposition::supported_cases = {{0, 2, 3, 1}}; +bool TransposeDecomposition::is_supported_transpose(const Output& transpose_out) { + const auto transpose = ov::as_type_ptr(transpose_out.get_node_shared_ptr()); + if (!transpose) + return false; + const auto order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + if (!order) + return false; + return is_supported_transpose_order(order->cast_vector()); +} + +bool TransposeDecomposition::is_supported_transpose_order(const std::vector& order) { + const auto 
size = order.size(); + if (size > 0) + return order.back() != static_cast(size - 1); + return true; +} TransposeDecomposition::TransposeDecomposition() { MATCHER_SCOPE(TransposeDecomposition); @@ -37,7 +52,7 @@ TransposeDecomposition::TransposeDecomposition() { return false; auto order_value = order->cast_vector(); - if (supported_cases.count(order_value) == 0) + if (!is_supported_transpose_order(order_value)) return false; // number of elements that can be processed on every iteration. For 0,1,2,3 -> 0,2,3,1 we can guarantee only scalar access diff --git a/src/common/snippets/src/utils.cpp b/src/common/snippets/src/utils.cpp index 242391b908dc03..2bd5423babb805 100644 --- a/src/common/snippets/src/utils.cpp +++ b/src/common/snippets/src/utils.cpp @@ -12,6 +12,37 @@ namespace ov { namespace snippets { namespace utils { +namespace { +template +void ordered_shape(const Shape& shape, const std::vector& layout, bool is_forward, Shape& reordered_shape) { + for (size_t i = 0; i < layout.size(); i++) { + OPENVINO_ASSERT(layout[i] < shape.size(), "layout index is greater than the shape size"); + const auto src_idx = is_forward ? layout[i] : i; + const auto dst_idx = is_forward ? i : layout[i]; + reordered_shape[dst_idx] = shape[src_idx]; + } +} + +// Note: +// - If `is_forward` is true, the result is `shape` reordered by `layout` (the planar form) +// - If `is_forward` is false, the result is the original shape to which the `layout` was applied (the preordered form) +ov::PartialShape get_pshape(const ov::PartialShape& shape, const std::vector& layout, bool is_forward) { + if (layout.empty()) + return shape; + ov::PartialShape reordered_shape(std::vector(layout.size())); + if (shape.rank().is_dynamic()) + OPENVINO_THROW("get_pshape can't be called for outputs with dynamic rank"); + const size_t rank = shape.rank().get_length(); + if (layout.size() > rank) + OPENVINO_THROW("Layout rank can't be larger than tensor rank"); + // Note that it can be smaller though, for example tensor shape can be prepended with 1 for scheduling purposes + if (std::any_of(layout.begin(), layout.end(), [=](size_t x) {return x >= rank;})) + OPENVINO_THROW("Invalid layout detected: all layout indexes must be smaller than the tensor rank"); + ordered_shape(shape, layout, is_forward, reordered_shape); + return reordered_shape; +} +} // namespace + auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t { std::vector cl, ch, isc, ish, osc, osh; const bool status = ov::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(fq, cl, ch, isc, ish, osc, osh); @@ -70,23 +101,46 @@ auto get_non_scalar_constant_count_for_fq(const std::shared_ptr -ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector& layout) { - if (layout.empty()) - return shape; - std::vector reordered_shape(layout.size()); - if (shape.rank().is_dynamic()) - OPENVINO_THROW("get_reordered_planar_shape can't be called for outputs with dynamic rank"); - const size_t rank = shape.rank().get_length(); - if (layout.size() > rank) - OPENVINO_THROW("Layout rank can't be larger than tensor rank"); - // Note that it can be smaller though, for example tensor shape can be prepended with 1 for scheduling purposes - if (std::any_of(layout.begin(), layout.end(), [=](size_t x) {return x >= rank;})) - OPENVINO_THROW("Invalid layout detected: all layout indexes must be smaller than the tensor rank"); - for (size_t i = 0; i < layout.size(); i++) - reordered_shape[i] = shape[layout[i]]; +ov::PartialShape get_planar_pshape(const ov::PartialShape& shape, const std::vector& order) { + return get_pshape(shape, order, true); +} 
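A small usage sketch of the two directions (illustrative values; `get_preordered_pshape` is the `is_forward = false` wrapper defined next):

// Sketch: layout {0, 2, 3, 1} as a permutation of a 4D shape.
//   get_planar_pshape({1, 64, 16, 8}, {0, 2, 3, 1})     -> {1, 16, 8, 64}  (shape as read through the layout)
//   get_preordered_pshape({1, 16, 8, 64}, {0, 2, 3, 1}) -> {1, 64, 16, 8}  (original shape the layout was applied to)
// i.e. for permutation layouts the backward direction inverts the forward one.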
+ov::PartialShape get_preordered_pshape(const ov::PartialShape& shape, const std::vector& order) { + return get_pshape(shape, order, false); +} + +ov::PartialShape get_planar_pshape(const Input& in) { + const auto& port = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(in); + return get_planar_pshape(ov::Shape{port->get_shape()}, port->get_layout()); +} +ov::PartialShape get_preordered_pshape(const Output& out) { + const auto& port = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(out); + return get_preordered_pshape(ov::Shape{port->get_shape()}, port->get_layout()); +} + +VectorDims get_planar_vdims(const VectorDims& shape, const std::vector& order) { + VectorDims reordered_shape(order.size()); + ordered_shape(shape, order, true, reordered_shape); + return reordered_shape; +} +VectorDims get_preordered_vdims(const VectorDims& shape, const std::vector& order) { + VectorDims reordered_shape(order.size()); + ordered_shape(shape, order, false, reordered_shape); return reordered_shape; } +VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port) { + OPENVINO_ASSERT(expr_port.get_type() == snippets::lowered::ExpressionPort::Type::Input, "get_planar_vdims expects Expression Input port"); + return get_planar_vdims(expr_port.get_descriptor_ptr()->get_shape(), expr_port.get_descriptor_ptr()->get_layout()); +} +VectorDims get_preordered_vdims(const snippets::lowered::ExpressionPort& expr_port) { + OPENVINO_ASSERT(expr_port.get_type() == snippets::lowered::ExpressionPort::Type::Output, "get_preordered_vdims expects Expression Output port"); + return get_preordered_vdims(expr_port.get_descriptor_ptr()->get_shape(), expr_port.get_descriptor_ptr()->get_layout()); +} + +bool is_dynamic_vdims(const VectorDims& shape) { + return std::any_of(shape.cbegin(), shape.cend(), [](size_t v){ return v == IShapeInferSnippets::DYNAMIC_DIMENSION; }); +} + VectorDims pshape_to_vdims(const PartialShape& pshape) { VectorDims result; result.reserve(pshape.size()); @@ -106,37 +160,6 @@ ov::PartialShape vdims_to_pshape(const VectorDims& vdims) { return result; } -ov::PartialShape get_planar_pshape(const Input& in) { - const auto& port = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(in); - return utils::get_planar_pshape(ov::Shape{port->get_shape()}, port->get_layout()); -} - -ov::PartialShape get_planar_pshape(const Output& out) { - const auto& port = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(out); - return utils::get_planar_pshape(ov::Shape{port->get_shape()}, port->get_layout()); -} - -VectorDims get_planar_vdims(const VectorDims& shape, const std::vector& layout) { - VectorDims reordered_shape(shape.size()); - for (size_t i = 0; i < layout.size(); i++) { - OPENVINO_ASSERT(layout[i] < shape.size(), "get_planar_vdims: layout index is greater than the shape size"); - reordered_shape[i] = shape[layout[i]]; - } - return reordered_shape; -} - -VectorDims get_planar_vdims(const snippets::lowered::PortDescriptorPtr& port_desc) { - return get_planar_vdims(port_desc->get_shape(), port_desc->get_layout()); -} - -VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port) { - return get_planar_vdims(expr_port.get_descriptor_ptr()); -} - -bool is_dynamic_vdims(const VectorDims& shape) { - return std::any_of(shape.cbegin(), shape.cend(), [](size_t v){ return v == IShapeInferSnippets::DYNAMIC_DIMENSION; }); -} - } // namespace utils } // namespace snippets } // namespace ov diff --git 
a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp index f2c872f725b7d6..379a8f16cec4f0 100644 --- a/src/common/snippets/tests/include/lowering_utils.hpp +++ b/src/common/snippets/tests/include/lowering_utils.hpp @@ -6,7 +6,7 @@ #include #include "snippets/op/subgraph.hpp" #include "snippets_helpers.hpp" -#include "snippets/pass_manager.hpp" +#include "snippets/pass/manager.hpp" #include "snippets/shape_inference/shape_inference.hpp" namespace ov { diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 4fa525ba5d1f0c..49087d4ffcf675 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -31,8 +31,7 @@ void TokenizeMHASnippetsTests::run() { disable_rt_info_check(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D) { const auto &f = MHAFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); model = f.getOriginal(); @@ -40,6 +39,14 @@ TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA) { run(); } +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D) { + const auto &f = MHAFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_MatMul0_Transpose) { GTEST_SKIP(); const auto &f = MHAMatMul0TransposeFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, @@ -80,10 +87,54 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_fusion) { run(); } -TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_SplitM) { +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { + const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + false); + model = f.getOriginal(); + model_ref = f.getReference(); + config.concurrency = 24; + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { + const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + true); + model = f.getOriginal(); + model_ref = f.getReference(); + config.concurrency = 16; + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { + const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + false); + model = f.getOriginal(); + model_ref = f.getReference(); + config.concurrency = 60; + 
run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { + const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + true); + model = f.getOriginal(); + model_ref = f.getReference(); + config.concurrency = 60; + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHAWOTranspose_SplitM) { const auto& f = MHAWOTransposeSplitMFunction(std::vector{{10, 9216, 128}, {10, 128, 9216}, {10, 9216, 128}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{10, 9, 1024, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); + std::vector{{10, 3, 3072, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); model = f.getOriginal(); model_ref = f.getReference(); config.concurrency = 18; @@ -93,7 +144,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_SplitM_AlmostAllThreads) { const auto& f = MHAWOTransposeSplitMFunction(std::vector{{5, 30, 32}, {5, 32, 30}, {5, 30, 32}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{5, 6, 5, 32}, {5, 1, 32, 30}, {5, 1, 30, 32}, {5, 30, 32}}); + std::vector{{5, 10, 3, 32}, {5, 1, 32, 30}, {5, 1, 30, 32}, {5, 30, 32}}); model = f.getOriginal(); model_ref = f.getReference(); config.concurrency = 32; diff --git a/src/core/include/openvino/op/equal.hpp b/src/core/include/openvino/op/equal.hpp index 8148f62d2ba44b..a66e00d4be96d9 100644 --- a/src/core/include/openvino/op/equal.hpp +++ b/src/core/include/openvino/op/equal.hpp @@ -41,12 +41,9 @@ class OPENVINO_API Equal : public util::BinaryElementwiseComparison { const Output& arg1, const AutoBroadcastSpec& auto_broadcast = AutoBroadcastSpec(AutoBroadcastType::NUMPY)); - bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const override; - OPENVINO_SUPPRESS_DEPRECATED_END + bool evaluate(TensorVector& outputs, const TensorVector& inputs) const override; bool evaluate_upper(TensorVector& outputs) const override; bool evaluate_lower(TensorVector& outputs) const override; bool has_evaluate() const override; diff --git a/src/core/reference/include/openvino/reference/equal.hpp b/src/core/reference/include/openvino/reference/equal.hpp index c81d47c23d18ff..5e75b110eb996c 100644 --- a/src/core/reference/include/openvino/reference/equal.hpp +++ b/src/core/reference/include/openvino/reference/equal.hpp @@ -4,44 +4,68 @@ #pragma once -#if defined(__GNUC__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wfloat-equal" -#endif +#include +#include -#include - -#include "openvino/core/shape.hpp" -#include "openvino/op/util/attr_types.hpp" #include "openvino/reference/autobroadcast_binop.hpp" +#include "openvino/reference/utils/type_util.hpp" namespace ov { namespace reference { +namespace func { +template +bool equal(const T lhs, const T rhs) { + return lhs == rhs; +} +} // namespace func + template +void equal(const T* arg0, const T* arg1, char* out, size_t count) { + std::transform(arg0, std::next(arg0, count), arg1, out, std::equal_to()); +} + +/** + * @brief 
Reference implementation of binary elementwise Equal operator. + * + * Used for integral types, with a custom `equal` function (to reduce binary size). + * + * @param arg0 Pointer to input 0 data. + * @param arg1 Pointer to input 1 data. + * @param out Pointer to output data. + * @param arg0_shape Input 0 shape. + * @param arg1_shape Input 1 shape. + * @param broadcast_spec Broadcast specification mode. + */ +template ::value>::type* = nullptr> void equal(const T* arg0, const T* arg1, - char* out, - size_t count) // TODO: using char for bool, is this right? -{ - for (size_t i = 0; i < count; i++) { - out[i] = arg0[i] == arg1[i]; - } + U* out, + const Shape& arg0_shape, + const Shape& arg1_shape, + const op::AutoBroadcastSpec& broadcast_spec) { + autobroadcast_binop(arg0, arg1, out, arg0_shape, arg1_shape, broadcast_spec, func::equal); } -template +/** + * @brief Reference implementation of binary elementwise Equal operator. + * + * Used for floating-point types (to avoid the compiler warning about comparing floating-point values with `==`). + * + * @param arg0 Pointer to input 0 data. + * @param arg1 Pointer to input 1 data. + * @param out Pointer to output data. + * @param arg0_shape Input 0 shape. + * @param arg1_shape Input 1 shape. + * @param broadcast_spec Broadcast specification mode. + */ +template ()>::type* = nullptr> void equal(const T* arg0, const T* arg1, U* out, const Shape& arg0_shape, const Shape& arg1_shape, const op::AutoBroadcastSpec& broadcast_spec) { - autobroadcast_binop(arg0, arg1, out, arg0_shape, arg1_shape, broadcast_spec, [](T x, T y) -> U { - return static_cast(x == y); - }); + autobroadcast_binop(arg0, arg1, out, arg0_shape, arg1_shape, broadcast_spec, std::equal_to()); } } // namespace reference } // namespace ov - -#if defined(__GNUC__) -# pragma GCC diagnostic pop -#endif diff --git a/src/core/src/op/equal.cpp b/src/core/src/op/equal.cpp index e4adf5d0e4ce53..7f23b8970e204a 100644 --- a/src/core/src/op/equal.cpp +++ b/src/core/src/op/equal.cpp @@ -2,183 +2,160 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ngraph/op/equal.hpp" +#include "openvino/op/equal.hpp" #include "bound_evaluate.hpp" +#include "element_visitor.hpp" #include "itt.hpp" -#include "ngraph/op/constant.hpp" -#include "ngraph/op/less_eq.hpp" -#include "ngraph/runtime/host_tensor.hpp" -#include "openvino/op/ops.hpp" +#include "openvino/op/less_eq.hpp" +#include "openvino/op/logical_and.hpp" +#include "openvino/op/logical_or.hpp" #include "openvino/reference/equal.hpp" +#include "utils.hpp" -using namespace std; -using namespace ngraph; - -OPENVINO_SUPPRESS_DEPRECATED_START +namespace ov { +namespace op { namespace equal { namespace { -template -bool evaluate(const HostTensorPtr& arg0, - const HostTensorPtr& arg1, - const HostTensorPtr& out, - const op::AutoBroadcastSpec& broadcast_spec) { - ov::reference::equal(arg0->get_data_ptr(), - arg1->get_data_ptr(), - out->get_data_ptr(), - arg0->get_shape(), - arg1->get_shape(), - broadcast_spec); - return true; -} - -bool evaluate_equal(const HostTensorPtr& arg0, - const HostTensorPtr& arg1, - const HostTensorPtr& out, - const op::AutoBroadcastSpec& broadcast_spec) { - bool rc = true; - out->set_broadcast(broadcast_spec, arg0, arg1, element::boolean); - switch (arg0->get_element_type()) { - OPENVINO_TYPE_CASE(evaluate_equal, boolean, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i4, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i8, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i16, arg0, arg1, out, 
broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i32, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i64, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u4, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u8, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u16, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u32, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u64, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, bf16, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, f16, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, f32, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, f64, arg0, arg1, out, broadcast_spec); - default: - rc = false; - break; - } - return rc; -} -ov::Tensor equal_tensor(const ov::Tensor& lhs, const ov::Tensor& rhs) { - auto equal = op::v1::Equal(std::make_shared(lhs.get_element_type(), lhs.get_shape()), - std::make_shared(rhs.get_element_type(), rhs.get_shape()), - op::AutoBroadcastType::NUMPY); - auto outs = ov::TensorVector{{equal.get_output_element_type(0), equal.get_output_shape(0)}}; - equal.evaluate(outs, ov::TensorVector{lhs, rhs}); +Tensor less_equal_tensor(const Tensor& lhs, const Tensor& rhs) { + const auto less_eq = v1::LessEqual(); + auto outs = TensorVector{{element::boolean, Shape{}}}; + less_eq.evaluate(outs, {lhs, rhs}); return outs.front(); } -ov::Tensor less_equal_tensor(const ov::Tensor& lhs, const ov::Tensor& rhs) { - auto equal = op::v1::LessEqual(std::make_shared(lhs.get_element_type(), lhs.get_shape()), - std::make_shared(rhs.get_element_type(), rhs.get_shape()), - op::AutoBroadcastType::NUMPY); - auto outs = ov::TensorVector{{equal.get_output_element_type(0), equal.get_output_shape(0)}}; - equal.evaluate(outs, ov::TensorVector{lhs, rhs}); +Tensor and_tensor(const Tensor& lhs, const Tensor& rhs) { + const auto logical_and = v1::LogicalAnd(); + auto outs = TensorVector{{element::boolean, Shape{}}}; + logical_and.evaluate(outs, {lhs, rhs}); return outs.front(); } -ov::Tensor and_tensor(const ov::Tensor& lhs, const ov::Tensor& rhs) { - auto logical_and = - ov::op::v1::LogicalAnd(std::make_shared(lhs.get_element_type(), lhs.get_shape()), - std::make_shared(rhs.get_element_type(), rhs.get_shape()), - op::AutoBroadcastType::NUMPY); - auto outs = ov::TensorVector{{logical_and.get_output_element_type(0), logical_and.get_output_shape(0)}}; - logical_and.evaluate(outs, ov::TensorVector{lhs, rhs}); +Tensor or_tensor(const Tensor& lhs, const Tensor& rhs) { + const auto logical_or = v1::LogicalOr(); + auto outs = TensorVector{{element::boolean, Shape{}}}; + logical_or.evaluate(outs, {lhs, rhs}); return outs.front(); } -ov::Tensor or_tensor(const ov::Tensor& lhs, const ov::Tensor& rhs) { - auto logical_or = - ov::op::v1::LogicalOr(std::make_shared(lhs.get_element_type(), lhs.get_shape()), - std::make_shared(rhs.get_element_type(), rhs.get_shape()), - op::AutoBroadcastType::NUMPY); - auto outs = ov::TensorVector{{logical_or.get_output_element_type(0), logical_or.get_output_shape(0)}}; - logical_or.evaluate(outs, ov::TensorVector{lhs, rhs}); - return outs.front(); -} +void all_equal(const TensorVector& tensors, TensorVector& outputs) { + auto& output = outputs[0]; + auto eq_result = TensorVector{{output.get_element_type(), output.get_shape()}}; -void all_equal(const ov::TensorVector tensors, ov::Tensor& output_value) { - 
OPENVINO_ASSERT(tensors.size() >= 2, "Unexpected number of tensors in all_equal helper"); - auto& tensor = tensors[0]; - output_value = equal_tensor(tensor, tensors[1]); - for (size_t i = 2; i < tensors.size(); ++i) { - output_value = and_tensor(output_value, equal_tensor(tensor, tensors[i])); + auto t_iter = tensors.begin() + 2; + auto eq_inputs = TensorVector(tensors.begin(), t_iter); + + const auto eq = v1::Equal(); + eq.evaluate(outputs, eq_inputs); + for (; t_iter != tensors.end(); ++t_iter) { + eq_inputs[1] = *t_iter; + eq.evaluate(eq_result, eq_inputs); + output = and_tensor(output, eq_result[0]); } } -ov::Tensor within_interval(const ov::Tensor& lower, const ov::Tensor& upper, const ov::Tensor& subject_to_check) { - auto lower_check = less_equal_tensor(lower, subject_to_check); - auto upper_check = less_equal_tensor(subject_to_check, upper); +Tensor within_interval(const Tensor& lower, const Tensor& upper, const Tensor& subject_to_check) { + const auto lower_check = less_equal_tensor(lower, subject_to_check); + const auto upper_check = less_equal_tensor(subject_to_check, upper); return and_tensor(lower_check, upper_check); } - } // namespace + +struct Evaluate : public element::NoAction { + using element::NoAction::visit; + + template > + static result_type visit(const Tensor& arg0, + const Tensor& arg1, + Tensor& out, + const Shape& shape0, + const Shape& shape1, + const op::AutoBroadcastSpec& broadcast_spec) { + reference::equal(arg0.data(), + arg1.data(), + out.data>(), + shape0, + shape1, + broadcast_spec); + return true; + } +}; } // namespace equal //------------------------------- v1 ------------------------------------------- -op::v1::Equal::Equal(const Output& arg0, const Output& arg1, const AutoBroadcastSpec& auto_broadcast) +namespace v1 { +Equal::Equal(const Output& arg0, const Output& arg1, const AutoBroadcastSpec& auto_broadcast) : BinaryElementwiseComparison(arg0, arg1, auto_broadcast) { constructor_validate_and_infer_types(); } -shared_ptr op::v1::Equal::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Equal::clone_with_new_inputs(const OutputVector& new_args) const { OV_OP_SCOPE(v1_Equal_clone_with_new_inputs); check_new_args_count(this, new_args); - return make_shared(new_args.at(0), new_args.at(1), this->get_autob()); + return std::make_shared(new_args.at(0), new_args.at(1), get_autob()); } -bool op::v1::Equal::evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const { +bool Equal::evaluate(TensorVector& outputs, const TensorVector& inputs) const { OV_OP_SCOPE(v1_Equal_evaluate); - return equal::evaluate_equal(inputs[0], inputs[1], outputs[0], get_autob()); + + outputs[0].set_shape(ov::op::infer_broadcast_shape(this, inputs)); + using namespace ov::element; + return IfTypeOf::apply( + inputs[0].get_element_type(), + inputs[0], + inputs[1], + outputs[0], + inputs[0].get_shape(), + inputs[1].get_shape(), + get_autob()); } -bool op::v1::Equal::evaluate_lower(ov::TensorVector& output_values) const { +bool Equal::evaluate_lower(TensorVector& output_values) const { if (get_input_tensor(0).has_and_set_bound() && get_input_tensor(1).has_and_set_bound()) return default_upper_bound_evaluator(this, output_values); // ll == lu == rl == ru -> {true} // else -> {false} const auto &lhs = get_input_tensor(0), &rhs = get_input_tensor(1); - auto lhs_lower = lhs.get_lower_value(), lhs_upper = lhs.get_upper_value(); - auto rhs_lower = rhs.get_lower_value(), rhs_upper = rhs.get_upper_value(); - equal::all_equal({lhs_lower, lhs_upper, 
rhs_lower, rhs_upper}, output_values[0]); + const auto &lhs_lower = lhs.get_lower_value(), &lhs_upper = lhs.get_upper_value(); + const auto &rhs_lower = rhs.get_lower_value(), &rhs_upper = rhs.get_upper_value(); + equal::all_equal({lhs_lower, lhs_upper, rhs_lower, rhs_upper}, output_values); return true; } -bool op::v1::Equal::evaluate_upper(ov::TensorVector& output_values) const { +bool Equal::evaluate_upper(TensorVector& output_values) const { const auto &lhs = get_input_tensor(0), &rhs = get_input_tensor(1); - auto lhs_lower = lhs.get_lower_value(), lhs_upper = lhs.get_upper_value(); - auto rhs_lower = rhs.get_lower_value(), rhs_upper = rhs.get_upper_value(); + const auto &lhs_lower = lhs.get_lower_value(), &lhs_upper = lhs.get_upper_value(); + const auto &rhs_lower = rhs.get_lower_value(), &rhs_upper = rhs.get_upper_value(); // check for intersection: // ll <= rl <= lu or ll <= ru <= lu - auto rl_check = equal::within_interval(lhs_lower, lhs_upper, rhs_lower); - auto ru_check = equal::within_interval(lhs_lower, lhs_upper, rhs_upper); + const auto rl_check = equal::within_interval(lhs_lower, lhs_upper, rhs_lower); + const auto ru_check = equal::within_interval(lhs_lower, lhs_upper, rhs_upper); output_values[0] = equal::or_tensor(rl_check, ru_check); return true; } -bool op::v1::Equal::has_evaluate() const { +bool Equal::has_evaluate() const { OV_OP_SCOPE(v1_Equal_has_evaluate); switch (get_input_element_type(0)) { - case ngraph::element::boolean: - case ngraph::element::i8: - case ngraph::element::u8: - case ngraph::element::i32: - case ngraph::element::i64: - case ngraph::element::u32: - case ngraph::element::u64: - case ngraph::element::f16: - case ngraph::element::f32: + case element::boolean: + case element::bf16: + case element::f16: + case element::f32: + case element::f64: + case element::i8: + case element::i16: + case element::i32: + case element::i64: + case element::u8: + case element::u16: + case element::u32: + case element::u64: return true; default: - break; + return false; } - return false; -} - -bool op::v1::Equal::visit_attributes(AttributeVisitor& visitor) { - OV_OP_SCOPE(v1_Equal_visit_attributes); - BinaryElementwiseComparison::visit_attributes(visitor); - return true; } +} // namespace v1 +} // namespace op +} // namespace ov diff --git a/src/core/src/runtime/itensor.cpp b/src/core/src/runtime/itensor.cpp index 6d966566c65610..2b3a6d49b84947 100644 --- a/src/core/src/runtime/itensor.cpp +++ b/src/core/src/runtime/itensor.cpp @@ -25,9 +25,10 @@ size_t ITensor::get_byte_size() const { } bool ITensor::is_continuous() const { - if (get_element_type().bitwidth() < 8) + if ((get_element_type().bitwidth() < 8) || get_size() == 0) { // OpenVINO doesn't support strides for lp types return true; + } const auto& shape = get_shape(); const auto& type = get_element_type(); std::vector strides(shape.size()); diff --git a/src/core/tests/tensor.cpp b/src/core/tests/tensor.cpp index 3fb0c259fc0c0d..361e45e8a570ce 100644 --- a/src/core/tests/tensor.cpp +++ b/src/core/tests/tensor.cpp @@ -52,3 +52,13 @@ TEST(tensor, wrap_tensor_with_unspecified_type_from_host_tensor) { // !tensor means that the tensor is not initialized EXPECT_EQ(!tensor, true); } + +TEST(tensor, create_tensor_with_zero_dims_check_stride) { + ov::Shape shape = {0, 0, 0, 0}; + auto tensor = ov::Tensor(element::f32, shape); + EXPECT_EQ(!!tensor, true); + auto stride = tensor.get_strides(); + EXPECT_EQ(stride.size(), shape.size()); + EXPECT_EQ(stride.back(), 0); + EXPECT_EQ(tensor.is_continuous(), true); +} diff --git 
a/src/core/tests/type_prop/broadcast.cpp b/src/core/tests/type_prop/broadcast.cpp index 023f8fa9505f0a..04f93d08b798e6 100644 --- a/src/core/tests/type_prop/broadcast.cpp +++ b/src/core/tests/type_prop/broadcast.cpp @@ -9,7 +9,6 @@ #include "common_test_utils/test_assertions.hpp" #include "common_test_utils/type_prop.hpp" #include "openvino/core/dimension_tracker.hpp" -#include "openvino/core/validation_util.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/equal.hpp" @@ -18,6 +17,7 @@ #include "openvino/op/shape_of.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/op/util/attr_types.hpp" +#include "validation_util.hpp" using namespace std; using namespace testing; @@ -1303,24 +1303,22 @@ TEST(type_prop, broadcast_v3_bidirectional_tricky_partial_value_case_and_equal_p auto broadcast_a = make_shared(a, select, "BIDIRECTIONAL"); const auto out_shape = broadcast_a->get_output_partial_shape(0); - OPENVINO_SUPPRESS_DEPRECATED_START EXPECT_EQ(out_shape, expected_shape); { - auto constant = ov::get_constant_from_source(equal->output(0)); - EXPECT_TRUE(constant != nullptr); + auto constant = ov::util::get_constant_from_source(equal->output(0)); + ASSERT_TRUE(constant != nullptr); std::vector expected{false, false, false}, calculated = constant->get_vector(); EXPECT_EQ(calculated, expected); } { equal = make_shared(concat, ov::op::v0::Constant::create(ov::element::i64, {3}, {5, 1, 4})); - EXPECT_TRUE(ov::get_constant_from_source(equal->output(0)) == nullptr); + EXPECT_TRUE(ov::util::get_constant_from_source(equal->output(0)) == nullptr); } { equal = make_shared(concat, ov::op::v0::Constant::create(ov::element::i64, {3}, {11, 1, 4})); - auto constant = ov::get_constant_from_source(equal->output(0)); - EXPECT_TRUE(constant != nullptr); + auto constant = ov::util::get_constant_from_source(equal->output(0)); + ASSERT_TRUE(constant != nullptr); std::vector expected{false, true, true}, calculated = constant->get_vector(); EXPECT_EQ(calculated, expected); } - OPENVINO_SUPPRESS_DEPRECATED_END } diff --git a/src/frontends/onnx/frontend/src/op/group_normalization.cpp b/src/frontends/onnx/frontend/src/op/group_normalization.cpp new file mode 100644 index 00000000000000..d0f32a89ae4515 --- /dev/null +++ b/src/frontends/onnx/frontend/src/op/group_normalization.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "op/group_normalization.hpp" + +#include "default_opset.hpp" + +OPENVINO_SUPPRESS_DEPRECATED_START +namespace ngraph { +namespace onnx_import { +namespace op { +namespace set_1 { +OutputVector group_normalization(const Node& node) { + const auto data = node.get_ng_inputs().at(0); // Shape [N, C, ...] 
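// Illustration (not part of the patch): with C = 6 channels and num_groups = 3, the steps
// below take scale/bias from shape [3] to [3, 1] (unsqueeze), then to [3, C/G] = [3, 2]
// (bidirectional broadcast), and finally to [C] = [6] (reshape) - the per-channel form
// that the OpenVINO GroupNormalization op consumed here expects.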
+ auto scale = node.get_ng_inputs().at(1); // Shape [num_groups] + auto bias = node.get_ng_inputs().at(2); // Shape [num_groups] + + auto eps = node.get_attribute_value("epsilon", 1e-05f); + auto num_groups = node.get_attribute_value("num_groups"); + + auto zero = default_opset::Constant::create(element::i64, Shape{1}, {0}); + auto one = default_opset::Constant::create(element::i64, Shape{1}, {1}); + auto c_dim = std::make_shared(std::make_shared(data), one, zero); + auto g_dim = default_opset::Constant::create(element::i64, Shape{1}, {num_groups}); + + auto c_g_div = std::make_shared(c_dim, g_dim); + + // Adjust scale and bias shape, [G] -> [G, C/G] -> [C] + scale = std::make_shared(scale, one); + auto broadcast_scale = + std::make_shared(scale, c_g_div, ov::op::BroadcastType::BIDIRECTIONAL); + auto c_scale = std::make_shared(broadcast_scale, c_dim, false); + + bias = std::make_shared(bias, one); + auto broadcast_bias = + std::make_shared(bias, c_g_div, ov::op::BroadcastType::BIDIRECTIONAL); + auto c_bias = std::make_shared(broadcast_bias, c_dim, false); + + return {std::make_shared(data, c_scale, c_bias, num_groups, eps)}; +} +} // namespace set_1 +} // namespace op +} // namespace onnx_import +} // namespace ngraph +OPENVINO_SUPPRESS_DEPRECATED_END diff --git a/src/frontends/onnx/frontend/src/op/group_normalization.hpp b/src/frontends/onnx/frontend/src/op/group_normalization.hpp new file mode 100644 index 00000000000000..fbd38d3667d4dd --- /dev/null +++ b/src/frontends/onnx/frontend/src/op/group_normalization.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/deprecated.hpp" +OPENVINO_SUPPRESS_DEPRECATED_START + +#include "ngraph/node.hpp" +#include "onnx_import/core/node.hpp" + +namespace ngraph { +namespace onnx_import { +namespace op { +namespace set_1 { +OutputVector group_normalization(const Node& node); + +} // namespace set_1 +} // namespace op +} // namespace onnx_import +} // namespace ngraph +OPENVINO_SUPPRESS_DEPRECATED_END diff --git a/src/frontends/onnx/frontend/src/ops_bridge.cpp b/src/frontends/onnx/frontend/src/ops_bridge.cpp index c4d9a50c4ca637..31ca0b20836de5 100644 --- a/src/frontends/onnx/frontend/src/ops_bridge.cpp +++ b/src/frontends/onnx/frontend/src/ops_bridge.cpp @@ -75,6 +75,7 @@ #include "op/global_max_pool.hpp" #include "op/greater.hpp" #include "op/grid_sample.hpp" +#include "op/group_normalization.hpp" #include "op/gru.hpp" #include "op/hammingwindow.hpp" #include "op/hannwindow.hpp" @@ -395,6 +396,7 @@ OperatorsBridge::OperatorsBridge() { REGISTER_OPERATOR("GlobalMaxPool", 1, global_max_pool); REGISTER_OPERATOR("Greater", 1, greater); REGISTER_OPERATOR("GridSample", 1, grid_sample); + REGISTER_OPERATOR("GroupNormalization", 1, group_normalization); REGISTER_OPERATOR("GRU", 1, gru); REGISTER_OPERATOR("HannWindow", 1, hannwindow); REGISTER_OPERATOR("HammingWindow", 1, hammingwindow); diff --git a/src/frontends/onnx/tests/models/group_normalization_2grp.prototxt b/src/frontends/onnx/tests/models/group_normalization_2grp.prototxt new file mode 100644 index 00000000000000..978ab918a2c521 --- /dev/null +++ b/src/frontends/onnx/tests/models/group_normalization_2grp.prototxt @@ -0,0 +1,91 @@ +ir_version: 8 +producer_name: "onnx-frontend-test" +graph { + node { + input: "x" + input: "scale" + input: "bias" + output: "Y" + op_type: "GroupNormalization" + attribute { + name: "num_groups" + i: 2 + type: INT + } + } + name: "test-model-group-normalization" + input { 
+ name: "x" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + } + } + } + } + input { + name: "scale" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "bias" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + } + } + } + } +} +opset_import { + domain: "" + version: 18 +} diff --git a/src/frontends/onnx/tests/models/group_normalization_3grp.prototxt b/src/frontends/onnx/tests/models/group_normalization_3grp.prototxt new file mode 100644 index 00000000000000..1711e41bd5d48f --- /dev/null +++ b/src/frontends/onnx/tests/models/group_normalization_3grp.prototxt @@ -0,0 +1,91 @@ +ir_version: 9 +opset_import { + domain: "" + version: 18 +} +producer_name: "onnx-frontend-test" +graph { + node { + input: "x" + input: "scale" + input: "bias" + output: "Y" + op_type: "GroupNormalization" + attribute { + name: "num_groups" + type: INT + i: 3 + } + } + name: "test-model-group-normalization" + input { + name: "x" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 6 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "scale" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + input { + name: "bias" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 6 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} diff --git a/src/frontends/onnx/tests/models/group_normalization_custom_eps.prototxt b/src/frontends/onnx/tests/models/group_normalization_custom_eps.prototxt new file mode 100644 index 00000000000000..083b5d8ecf5d0e --- /dev/null +++ b/src/frontends/onnx/tests/models/group_normalization_custom_eps.prototxt @@ -0,0 +1,96 @@ +ir_version: 9 +opset_import { + domain: "" + version: 18 +} +producer_name: "onnx-frontend-test" +graph { + node { + input: "x" + input: "scale" + input: "bias" + output: "Y" + op_type: "GroupNormalization" + attribute { + name: "epsilon" + type: FLOAT + f: 0.0001 + } + attribute { + name: "num_groups" + type: INT + i: 3 + } + } + name: "test-model-group-normalization" + input { + name: "x" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 6 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "scale" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + input { + name: "bias" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 6 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} diff --git a/src/frontends/onnx/tests/onnx_import.in.cpp b/src/frontends/onnx/tests/onnx_import.in.cpp index 361805e45cf0d4..2ac1dc6d464567 100644 --- a/src/frontends/onnx/tests/onnx_import.in.cpp +++ b/src/frontends/onnx/tests/onnx_import.in.cpp @@ -6884,3 +6884,81 @@ OPENVINO_TEST(${BACKEND_NAME}, 
onnx_model_hannwindow_symmetric) { test_case.run_with_tolerance_as_fp(0.01f); } } + +OPENVINO_TEST(${BACKEND_NAME}, onnx_group_normalization_3grp_default_eps) { + auto function = onnx_import::import_onnx_model(file_util::path_join(ov::test::utils::getExecutableDirectory(), + SERIALIZED_ZOO, + "onnx/group_normalization_3grp.onnx")); + + auto test_case = ov::test::TestCase(function, s_device); + test_case.add_input( + {-0.2261407f, -1.8793484f, -0.37692875f, 0.8860143f, 0.05993791f, -0.7634332f, 0.61080337f, 0.09776749f, + 0.5835062f, -0.32338685f, -0.23485906f, -0.04752525f, 2.4905143f, -0.11199934f, -0.20539412f, -2.4455426f, + -0.5437323f, 0.51794696f, -0.44127423f, 0.09666952f, -0.09539367f, -1.962784f, 0.25065672f, 1.5909688f, + 0.927671f, -0.46812922f, 0.2925484f, -1.1766007f, 0.7675745f, -0.94145614f, 1.1552521f, 1.6375796f, + 0.0198675f, -0.45938072f, 0.43037328f, 0.37999842f, -0.45021877f, -0.84925014f, 1.6790043f, -1.0172538f, + 0.0493111f, -0.53391f, -0.08101435f, 0.14738432f, -0.58910686f, 0.51673824f, -1.7001126f, -1.888597f}); + test_case.add_input({2.4556813f, 0.12371606f, 1.5681714f}); + test_case.add_input({0.79260737f, -0.74518913f, 1.370796f}); + + test_case.add_expected_output( + Shape{2, 6, 2, 2}, + {0.70938545f, -4.3442307f, 0.24844825f, 4.109082f, 1.5838864f, -0.93303996f, 3.267802f, 1.6995258f, + -0.6843487f, -0.7732928f, -0.76461035f, -0.7462375f, -0.49731785f, -0.75256085f, -0.7617206f, -0.9814244f, + 0.5922366f, 2.3495553f, 0.76182777f, 1.652246f, 1.3343381f, -1.7566144f, 1.9071295f, 4.1256485f, + 2.4563973f, -1.0979934f, 0.8390641f, -2.9021082f, 2.0487132f, -2.3033152f, 3.03593f, 4.2641716f, + -0.73710674f, -0.80988204f, -0.6747702f, -0.6824198f, -0.8084908f, -0.86908495f, -0.48516175f, -0.8945968f, + 2.4475086f, 1.3245938f, 2.1965842f, 2.6363354f, 1.2183195f, 3.3474774f, -0.92077446f, -1.2836761f}); + + test_case.run_with_tolerance_as_fp(0.000001f); +} + +OPENVINO_TEST(${BACKEND_NAME}, onnx_group_normalization_3grp_custom_eps) { + auto function = onnx_import::import_onnx_model(file_util::path_join(ov::test::utils::getExecutableDirectory(), + SERIALIZED_ZOO, + "onnx/group_normalization_custom_eps.onnx")); + + auto test_case = ov::test::TestCase(function, s_device); + test_case.add_input( + {1.8079232f, -0.2892469f, 2.0915377f, -1.8837914f, 0.25869793f, 0.80542284f, 2.9528935f, 0.16081251f, + 0.10507602f, -1.7271832f, -1.0217364f, -1.1528395f, -0.69146425f, -2.4292548f, -0.4232518f, 0.33357796f, + -1.4946569f, -0.08947915f, -0.7962127f, 1.3765403f, -0.1947846f, 0.30173305f, 0.08576944f, 0.8134404f, + 0.62960416f, -1.0745901f, -0.27037576f, -0.3607608f, 0.14347585f, 1.4590056f, -1.1309915f, 0.88850766f, + 0.5367185f, -0.7771955f, 0.81048864f, 0.45839247f, 1.0398412f, -0.21019235f, -1.037122f, -0.36852306f, + 2.7608335f, 0.3126114f, 0.336343f, 0.76919895f, 0.58595645f, 0.71894723f, -1.2922621f, -0.542859f}); + test_case.add_input({-0.05215209f, -0.5643389f, -0.6959881f}); + test_case.add_input({1.4327786f, 0.01641126f, -1.471873f}); + + test_case.add_expected_output( + Shape{2, 6, 2, 2}, + {1.3937842f, 1.4702199f, 1.3834473f, 1.5283363f, 1.4502488f, 1.4303224f, 1.3520534f, 1.4538165f, + -0.628196f, 0.5758153f, 0.11225323f, 0.19840352f, -0.10477467f, 1.0371594f, -0.281022f, -0.77834874f, + -0.22489226f, -1.3969909f, -0.8074844f, -2.6198394f, -1.3091526f, -1.7233121f, -1.5431708f, -2.1501417f, + 1.3968898f, 1.4998344f, 1.4512546f, 1.4567144f, 1.4262552f, 1.3467885f, 1.5032414f, 1.3812504f, + -0.36344206f, 0.6759755f, -0.58001745f, -0.30147952f, -0.7614548f, 
0.22742787f, 0.8815994f, 0.35268092f, + -2.9372354f, -1.3806448f, -1.3957335f, -1.6709452f, -1.5544388f, -1.6389949f, -0.36025894f, -0.83673286f}); + + test_case.run_with_tolerance_as_fp(0.000001f); +} + +OPENVINO_TEST(${BACKEND_NAME}, onnx_group_normalization_2grp_custom_eps) { + auto function = onnx_import::import_onnx_model(file_util::path_join(ov::test::utils::getExecutableDirectory(), + SERIALIZED_ZOO, + "onnx/group_normalization_2grp.onnx")); + + auto test_case = ov::test::TestCase(function, s_device); + test_case.add_input({-0.424049f, 1.7215315f, 1.429421f, 0.52730036f, 2.0628972f, -0.15856522f, + 2.274094f, -1.9989003f, -1.7827071f, -0.87104136f, -1.2995626f, 0.16800839f, + 0.5934625f, 1.553442f, -0.5482905f, 0.6079124f, 0.3598974f, -0.15221423f, + 1.1135519f, -1.2533926f, -1.019778f, -1.9142767f, -1.2984604f, 0.5587884f}); + test_case.add_input({-1.4678609f, -1.8223071f}); + test_case.add_input({1.1155374f, -0.6101201f}); + + test_case.add_expected_output( + Shape{1, 4, 2, 3}, + {1.694167f, -0.51719165f, -0.21612573f, 0.71365166f, -0.86902285f, 1.4205441f, -1.0866947f, 3.3172996f, + 3.0944781f, 2.154863f, 2.5965219f, 1.0839586f, -1.8562672f, -3.540983f, 0.14745194f, -1.8816261f, + -1.4463723f, -0.547642f, -2.768998f, 1.3848708f, 0.97488886f, 2.5446892f, 1.4639623f, -1.7954159f}); + + test_case.run_with_tolerance_as_fp(0.000001f); +} diff --git a/src/frontends/onnx/tests/tests_python/test_backend.py b/src/frontends/onnx/tests/tests_python/test_backend.py index 779444658d1e28..a027f703ba29ce 100644 --- a/src/frontends/onnx/tests/tests_python/test_backend.py +++ b/src/frontends/onnx/tests/tests_python/test_backend.py @@ -437,9 +437,7 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None ), ( xfail_issue_99955, - "OnnxBackendNodeModelTest.test_group_normalization_epsilon_cpu", "OnnxBackendNodeModelTest.test_group_normalization_epsilon_expanded_cpu", - "OnnxBackendNodeModelTest.test_group_normalization_example_cpu", "OnnxBackendNodeModelTest.test_group_normalization_example_expanded_cpu", ), ( diff --git a/src/inference/src/dev/make_tensor.cpp b/src/inference/src/dev/make_tensor.cpp index 1d23c62f86d957..2c0f33b352bcf6 100644 --- a/src/inference/src/dev/make_tensor.cpp +++ b/src/inference/src/dev/make_tensor.cpp @@ -77,7 +77,7 @@ class ViewTensor : public ITensor { auto& shape = get_shape(); if (m_strides.empty() && !shape.empty()) { m_strides.resize(shape.size()); - m_strides.back() = m_element_type.size(); + m_strides.back() = shape.back() == 0 ? 
0 : m_element_type.size(); std::transform(shape.crbegin(), shape.crend() - 1, m_strides.rbegin(),
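For intuition, the stride rule above can be modeled in isolation. The following sketch is illustrative only (the helper name `byte_strides` and the test values are not part of the patch); it mirrors the `ViewTensor` logic, assuming row-major dense packing, so that a zero-sized innermost dimension now yields an all-zero stride vector instead of a stale element size in the last slot. The same guard is applied to the template plugin's `VectorTensorImpl` further down in this patch.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <vector>

    // Row-major byte strides, computed back-to-front as in ViewTensor::get_strides().
    static std::vector<size_t> byte_strides(const std::vector<size_t>& shape, size_t element_size) {
        std::vector<size_t> strides;
        if (shape.empty())
            return strides;
        strides.resize(shape.size());
        // The fix: an empty innermost dimension propagates a zero stride.
        strides.back() = shape.back() == 0 ? 0 : element_size;
        // strides[i] = strides[i + 1] * shape[i + 1], walking from the last dimension.
        std::transform(shape.crbegin(), shape.crend() - 1, strides.rbegin(), strides.rbegin() + 1,
                       std::multiplies<size_t>());
        return strides;
    }

    int main() {
        assert((byte_strides({2, 3, 4}, 4) == std::vector<size_t>{48, 16, 4}));
        assert((byte_strides({2, 3, 0}, 4) == std::vector<size_t>{0, 0, 0}));  // empty tensor
        return 0;
    }
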
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp index 072c3f7edcf60b..6e75dc8794ec6b 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp @@ -7,6 +7,7 @@ #include #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include "snippets/lowered/expression.hpp" #include "snippets/lowered/port_connector.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" @@ -229,7 +230,7 @@ void KernelEmitter::init_data_pointers(const Xbyak::Reg64& reg_indexes, const Xb // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter const size_t offset_rank = master_shape.size() - 1; std::vector> data_offsets(num_params, std::vector{}); - auto offset_calculation = [=](const std::vector& shape, const std::vector& layout, const size_t data_size) { + auto offset_calculation = [=](const std::vector& shape, const std::vector& layout, const size_t data_size, bool is_input) { // Strides represent distance between consecutive elements of corresponding dimension. // If a dim size == 1, then the next dim starts immediately and the stride is 0 // case 1: @@ -248,8 +249,11 @@ void KernelEmitter::init_data_pointers(const Xbyak::Reg64& reg_indexes, const Xb // Note: this is an extra copy, but let's keep it for clarity if (!layout.empty()) { std::vector reordered_strides(strides.size()); - for (size_t i = 0; i < layout.size(); i++) - reordered_strides[i] = strides[layout[i]]; + for (size_t i = 0; i < layout.size(); i++) { + const auto& src_idx = is_input ? layout[i] : i; + const auto& dst_idx = is_input ? i : layout[i]; + reordered_strides[dst_idx] = strides[src_idx]; + } strides = std::move(reordered_strides); } // the last stride is ignored, since the entire last dim is processed by kernel @@ -261,7 +265,7 @@ void KernelEmitter::init_data_pointers(const Xbyak::Reg64& reg_indexes, const Xb return strides; }; for (size_t i = 0; i < num_params; i++) { - data_offsets[i] = offset_calculation(io_shapes[i], io_data_layouts[i], io_data_sizes[i]); + data_offsets[i] = offset_calculation(io_shapes[i], io_data_layouts[i], io_data_sizes[i], i < num_inputs); } // master_shape size must be valid in both static and dynamic cases std::function&, Reg64)> init_ptr_with_offset; @@ -718,6 +722,33 @@ size_t BrgemmEmitter::getBrgIdx(size_t kIdx, size_t nIdx) { return kIdx * BRGEMM_N_KERNEL_NUM + nIdx; } +size_t BrgemmEmitter::get_in_leading_dim(const VectorDims& shape, const std::vector& layout) { + // The input shape is in the original (pre-transpose) order, so the data must be read according to the layout + // Example: + // Original shape (shape) = [1, 49, 2, 23] + // Layout (transpose order) = [2, 0, 1, 3] + // Transposed shape = [2, 1, 49, 23] + // The leading dimension is the stride of shape[layout[2]] = shape[1], i.e. 2 x 23 + OPENVINO_ASSERT(layout.back() == layout.size() - 1 && layout.size() == shape.size(), + "BrgemmEmitter detected invalid layout values: check that this shape + layout combination is schedulable"); + const auto idx = layout[layout.size() - 2]; // `1` in example + return std::accumulate(shape.cbegin() + idx + 1, shape.end(), 1, std::multiplies()); +}
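The input/output asymmetry here is the same one introduced in offset_calculation above: input strides are gathered through the layout (reordered[i] = strides[layout[i]]), while output strides are scattered (reordered[layout[i]] = strides[i]). A standalone sketch, with hypothetical free functions `in_leading_dim`/`out_leading_dim` reproducing the arithmetic of get_in_leading_dim and of the get_out_leading_dim variant that follows, using the values from the comments:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <iterator>
    #include <numeric>
    #include <vector>

    // Input: shape is pre-transpose; the row dimension of the transposed view is
    // layout[rank - 2], and its stride is the product of the original dims to its right.
    static size_t in_leading_dim(const std::vector<size_t>& shape, const std::vector<size_t>& layout) {
        const size_t idx = layout[layout.size() - 2];
        return std::accumulate(shape.cbegin() + idx + 1, shape.cend(), size_t(1), std::multiplies<size_t>());
    }

    // Output: shape is already transposed; locate the value rank - 2 within the
    // layout and take the product of the transposed dims to the right of that position.
    static size_t out_leading_dim(const std::vector<size_t>& shape, const std::vector<size_t>& layout) {
        const size_t idx = layout.size() - 2;
        const auto dim = std::distance(layout.cbegin(), std::find(layout.cbegin(), layout.cend(), idx));
        return std::accumulate(shape.cbegin() + dim + 1, shape.cend(), size_t(1), std::multiplies<size_t>());
    }

    int main() {
        // Examples from the comments: layout {2, 0, 1, 3} in both cases.
        assert(in_leading_dim({1, 49, 2, 23}, {2, 0, 1, 3}) == 2 * 23);
        assert(out_leading_dim({49, 2, 7, 39}, {2, 0, 1, 3}) == 2 * 7 * 39);
        return 0;
    }
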
+size_t BrgemmEmitter::get_out_leading_dim(const VectorDims& shape, const std::vector& layout) { + // The output shape is already transposed, so the data must be written back in the original order using the layout + // Example: + // Transposed output shape (shape) = [49, 2, 7, 39] + // Layout (transpose order) = [2, 0, 1, 3] + // In a planar layout, the dimension just before the leading (innermost) one with index 3 would be the one with index 2. + // Since the layout is non-planar, we look up where index 2 ended up in the transposed order: + // it is the first element of the layout, so the leading dimension equals the stride of shape[0] + OPENVINO_ASSERT(layout.back() == layout.size() - 1 && layout.size() == shape.size(), + "BrgemmEmitter detected invalid layout values: check that this shape + layout combination is schedulable"); + const auto idx = layout.size() - 2; // 2 in the example + const auto dim = std::distance(layout.cbegin(), std::find(layout.cbegin(), layout.cend(), idx)); // 0 in the example: shape[0] = 49 + return std::accumulate(shape.cbegin() + dim + 1, shape.cend(), 1, std::multiplies()); // shape[1] x shape[2] x shape[3] = 2 x 7 x 39 +} + BrgemmEmitter::BrgemmEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { m_brgCtxs.fill(brgemmCtx()); std::generate(m_brgKernels.begin(), m_brgKernels.end(), [](){ return nullptr; }); @@ -730,38 +761,33 @@ BrgemmEmitter::BrgemmEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPt std::vector leading_dimensions; std::vector> io_layouts; - auto init_scheduling_params = [&](const std::vector& layout, const ov::Shape& io_shape) { - if (layout.empty()) { - // empty value indicates a planar layout - leading_dimensions.push_back(io_shape.back()); - std::vector default_layout(io_shape.size()); - std::iota(default_layout.begin(), default_layout.end(), 0); - io_layouts.push_back(default_layout); - } else { - // The idea here is to find "2" (for 4D shapes) in the layout and multiply dimensions that are to the right - // This implies that "3" is the last layout value, otherwise this layout is not supported. - // counting from the end since shape could be prepended with ones - const int64_t num_last_dims = layout.end() - std::find(layout.begin(), layout.end(), layout.size() - 2) - 1; - if (layout.back() != layout.size() - 1 || num_last_dims < 1) - IE_THROW() << "BrgemmEmitter detected invalid layout values: check that this shape + layout combination is schedulable"; - leading_dimensions.emplace_back( - std::accumulate(io_shape.end() - num_last_dims, io_shape.end(), 1, std::multiplies())); - io_layouts.push_back(layout); - } + auto get_layout = [](const std::vector& layout, const snippets::VectorDims& io_shape) { + if (!layout.empty()) return layout; + std::vector default_layout(io_shape.size()); + std::iota(default_layout.begin(), default_layout.end(), 0); + return default_layout; };
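As a side note, the empty-layout convention handled by this lambda can be sketched on its own (`layout_or_planar` is an illustrative name, not part of the patch):

    #include <cstddef>
    #include <numeric>
    #include <vector>

    // An empty layout denotes a planar (identity) order; materialize it as 0, 1, ..., rank - 1.
    static std::vector<size_t> layout_or_planar(std::vector<size_t> layout, size_t rank) {
        if (layout.empty()) {
            layout.resize(rank);
            std::iota(layout.begin(), layout.end(), 0);
        }
        return layout;
    }

Note that when the second input is repacked by BrgemmCopyB, the code below deliberately records an empty layout and a zero leading dimension for it; LDB is then fixed up with rnd_up(m_N, brgemm_copy->get_n_block_size()).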
- std::vector> brgemm_inputs = {brgemm_node->input(0), - brgemm_copy ? brgemm_copy->input(0) : brgemm_node->input(1)}; - for (const auto& input : brgemm_inputs) { - init_scheduling_params(snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input)->get_layout(), - input.get_shape()); + auto init_in_scheduling_params = [&](const snippets::lowered::PortDescriptorPtr& input) { + io_layouts.push_back(get_layout(input->get_layout(), input->get_shape())); + leading_dimensions.push_back(get_in_leading_dim(input->get_shape(), io_layouts.back())); + }; + auto init_out_scheduling_params = [&](const snippets::lowered::PortDescriptorPtr& output) { + io_layouts.push_back(get_layout(output->get_layout(), output->get_shape())); + leading_dimensions.push_back(get_out_leading_dim(output->get_shape(), io_layouts.back())); + }; + init_in_scheduling_params(expr->get_input_port_descriptor(0)); + if (brgemm_node->is_with_data_repacking()) { + io_layouts.push_back(std::vector{}); + leading_dimensions.push_back(0); + } else { + init_in_scheduling_params(expr->get_input_port_descriptor(1)); } - init_scheduling_params(snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(brgemm_node->output(0))->get_layout(), - brgemm_node->output(0).get_shape()); + init_out_scheduling_params(expr->get_output_port_descriptor(0)); - const auto& A_shape = brgemm_node->get_input_shape(0); + const auto& A_shape = expr->get_input_port_descriptor(0)->get_shape(); const auto& A_layout = io_layouts[0]; - const auto& C_shape = brgemm_node->get_output_shape(0); + const auto& C_shape = expr->get_output_port_descriptor(0)->get_shape(); const auto& C_layout = io_layouts[2]; // We need to find the original M, N, K using the layouts and ordered shapes @@ -777,6 +803,9 @@ BrgemmEmitter::BrgemmEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPt m_M = brgemm_node->get_input_count(0); m_N = C_shape[get_ordered_idx(C_layout, C_layout.size() - 1)]; + if (brgemm_node->is_with_data_repacking()) + leading_dimensions[1] = rnd_up(m_N, brgemm_copy->get_n_block_size()); + auto brg0Prc = InferenceEngine::details::convertPrecision(brgemm_node->get_input_element_type(0)); auto brg1Prc = InferenceEngine::details::convertPrecision(brgemm_node->get_input_element_type(1)); m_brg0VnniFactor = 4 / brg0Prc.size(); @@ -827,7 +856,7 @@ BrgemmEmitter::BrgemmEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPt brgemmCtx.N = N(n); brgemmCtx.K = K(k); brgemmCtx.LDA = leading_dimensions[0]; - brgemmCtx.LDB = brgemm_node->is_with_data_repacking() ?
rnd_up(m_N, brgemm_copy->get_n_block_size()) : leading_dimensions[1]; + brgemmCtx.LDB = leading_dimensions[1]; brgemmCtx.LDC = leading_dimensions[2]; brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::IEPrecisionToDataType(brg0Prc)); brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::IEPrecisionToDataType(brg1Prc)); @@ -1219,23 +1248,14 @@ BrgemmCopyBEmitter::BrgemmCopyBEmitter(jit_generator* h, cpu_isa_t isa, const Ex if (m_with_comp) m_comp_offset = brgemm_repack->get_offset_compensations(); - const auto& layout = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(brgemm_repack->input(0))->get_layout(); - const auto& original_shape = brgemm_repack->get_input_shape(0); + const auto& in_desc = expr->get_input_port_descriptor(0); + const auto& layout = in_desc->get_layout(); + const auto& original_shape = in_desc->get_shape(); auto transposed_shape = original_shape; size_t leading_dimension = *(original_shape.rbegin()); if (!layout.empty()) { - transposed_shape.resize(layout.size(), 1); - for (size_t i = 0; i < layout.size(); ++i) { - transposed_shape[i] = original_shape[layout[i]]; - } - // The idea here is to find "2" (for 4D shapes) in the layout and multiply dimensions that are to the right - // This implies that "3" is the last layout value, otherwise this layout is not supported. - // counting from the end since shape could be prepended with ones - const int64_t num_last_dims = layout.end() - std::find(layout.begin(), layout.end(), layout.size() - 2) - 1; - if (layout.back() != layout.size() - 1 || num_last_dims < 1) - IE_THROW() << "BrgemmRepackEmitter detected invalid layout values: " << - "check that this shape + layout combination is schedulable"; - leading_dimension = std::accumulate(original_shape.end() - num_last_dims, original_shape.end(), 1, std::multiplies()); + transposed_shape = snippets::utils::get_planar_vdims(original_shape, layout); + leading_dimension = BrgemmEmitter::get_in_leading_dim(original_shape, layout); } m_N = *(transposed_shape.rbegin()); diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp index 7019fb14c6ec29..40437eb9898099 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp @@ -367,6 +367,9 @@ class BrgemmEmitter : public jit_emitter { static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); size_t aux_gprs_count() const override; + static size_t get_in_leading_dim(const VectorDims& shape, const std::vector& layout); + static size_t get_out_leading_dim(const VectorDims& shape, const std::vector& layout); + private: void validate_arguments(const std::vector &in, const std::vector &out) const override; void emit_impl(const std::vector& in, diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 96be8734ec0dce..d2dd2b0eda08ce 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -495,6 +495,8 @@ static Config::SnippetsMode getSnippetsMode(const std::mapinput(1)); const auto& brgemm_out_desc = PortDescriptorUtils::get_port_descriptor_ptr(brgemm->output(0)); - const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input_value(0)).get_shape(); - const auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input_value(1)).get_shape(); + const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input(0)).get_shape(); + const 
auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input(1)).get_shape(); const auto K = *dimsMatMulIn0.rbegin(); const auto N = *dimsMatMulIn1.rbegin(); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.cpp index df88ffa7edcd82..939998c08bd79e 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.cpp @@ -35,8 +35,8 @@ pass::SetBrgemmCPUBlockingParams::SetBrgemmCPUBlockingParams() { return false; } - const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input_value(0)).get_shape(); - const auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input_value(1)).get_shape(); + const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input(0)).get_shape(); + const auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input(1)).get_shape(); const auto K = *dimsMatMulIn0.rbegin(); const auto N = *dimsMatMulIn1.rbegin(); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index d67c5047b992e0..e979270fee3318 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -112,6 +112,7 @@ #include "snippets/pass/mha_tokenization.hpp" #include "snippets/pass/collapse_subgraph.hpp" #include "snippets/pass/common_optimizations.hpp" +#include "snippets/pass/split_dimension_m.hpp" #include "snippets/pass/extract_reshapes_from_mha.hpp" // Misc @@ -612,10 +613,14 @@ void Transformations::MainSnippets(void) { // To avoid situations when Transpose is not the only node between MatMul and Result, // the plugin disables Transpose tokenization on output tokenization_config.mha_token_enable_transpose_on_output = (inferencePrecision == ov::element::f32); - tokenization_config.concurrency = parallel_get_num_threads(); + tokenization_config.concurrency = config.streamExecutorConfig._threadsPerStream; + if (tokenization_config.concurrency == 0) + tokenization_config.concurrency = parallel_get_max_threads(); // The optimization "SplitDimensionM" depends on target machine (thread count).
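A hypothetical standalone illustration of this concurrency resolution (`resolve_concurrency` and its arguments are stand-ins for config.streamExecutorConfig._threadsPerStream and parallel_get_max_threads(); not part of the patch):

    #include <cstddef>

    // Prefer the per-stream thread count from the stream executor config; fall back
    // to the machine-wide maximum when streams are not configured (value 0).
    static size_t resolve_concurrency(int threads_per_stream, int max_threads) {
        return threads_per_stream > 0 ? static_cast<size_t>(threads_per_stream)
                                      : static_cast<size_t>(max_threads);
    }

As the rewritten is_unsupported_parallel_work_amount heuristic below shows, this value then gates MHA tokenization: a subgraph whose parallel work amount (the product of all dimensions except the two innermost) falls below it is rejected unless SplitDimensionM::can_be_optimized reports that the M dimension can be split to raise the work amount.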
// To avoid uncontrolled behavior in tests, we disabled the optimization when there is Config::SnippetsMode::IgnoreCallback tokenization_config.split_m_dimension = snippetsMode != Config::SnippetsMode::IgnoreCallback; + // [122706] Some 3D MHA Patterns have perf regressions when Transpose op is tokenized + tokenization_config.mha_supported_transpose_ranks = { 4 }; ngraph::pass::Manager snippetsManager; snippetsManager.set_per_pass_validation(false); @@ -671,15 +676,10 @@ void Transformations::MainSnippets(void) { return true; }; auto is_unsupported_parallel_work_amount = [&](const std::shared_ptr& n, const ov::Shape& shape) { - const auto parallel_work_amount = std::accumulate(shape.rbegin() + 2, shape.rend(), 1, std::multiplies()); - // Heuristic values: - // parallelism work amount - not enough work amount for parallelism - // TODO: The heuristic will be removed after parallelism support on JIT level - const auto needed_num_of_threads = 12lu; + const size_t parallel_work_amount = std::accumulate(shape.rbegin() + 2, shape.rend(), 1, std::multiplies()); const auto is_unsupported_parallel_work_amount = - parallel_get_num_threads() / 2 > parallel_work_amount && - static_cast(parallel_work_amount) < needed_num_of_threads && - !ov::snippets::pass::CommonOptimizations::CanOptimizeParallelWA(n, tokenization_config.concurrency); + parallel_work_amount < tokenization_config.concurrency && + !ov::snippets::pass::SplitDimensionM::can_be_optimized(n, tokenization_config.concurrency); return is_unsupported_parallel_work_amount; }; #endif // OPENVINO_ARCH_X86_64 diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index 8193709b479741..b05bf845538859 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -5,6 +5,7 @@ #include "snippets/mha.hpp" #include "common_test_utils/test_constants.hpp" #include "test_utils/cpu_test_utils.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "ie_plugin_config.hpp" #include "ie_system_conf.h" @@ -15,7 +16,7 @@ namespace snippets { namespace { -const std::vector> inputShapes = { +const std::vector> inputShapes_4D = { {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, @@ -23,6 +24,11 @@ const std::vector> inputShapes = { {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}, }; +const std::vector> inputShapes_3D = { + {{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + {{68, 6, 92}, {68, 6, 92}, {1, 68, 68}, { 68, 6, 92}}, +}; + static inline bool is_bf16_supported() { return InferenceEngine::with_cpu_x86_bfloat16() || InferenceEngine::with_cpu_x86_avx512_core_amx_bf16(); } @@ -40,24 +46,74 @@ static inline std::vector> precision_bf16(size_t coun return prc; } -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHA, +static std::map enable_callback() { + return std::map{ + { + InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::ENABLE + }, + }; +} + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, MHA, ::testing::Combine( - ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputShapes_4D), ::testing::ValuesIn(precision_f32(4)), 
::testing::Values(ov::element::f32), ::testing::ValuesIn({false, true}), + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpuEmptyPluginConfig)), MHA::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16, MHA, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D, MHA, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_3D), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn({false, true}), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // [122706]: Subgraph + 4 Transpose + ::testing::Values(2), // decomposed Transpose + MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpuEmptyPluginConfig)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D_SplitDimensionM, MHA, + ::testing::Combine( + ::testing::Values(std::vector{{1, 128, 2, 64}, {1, 128, 2, 64}, {1, 1, 1, 1}, {1, 128, 2, 64}}), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(4), // 4 Threads + ::testing::Values(6), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(enable_callback())), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D_SplitDimensionM, MHA, + ::testing::Combine( + ::testing::Values(std::vector{{384, 2, 64}, {384, 2, 64}, {1, 384, 384}, {384, 2, 64}}), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(4), // 4 Threads + ::testing::Values(10), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + 4 Transposes + ::testing::Values(1), // MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(enable_callback())), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D, MHA, ::testing::Combine( - ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputShapes_4D), ::testing::ValuesIn(precision_bf16(4)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({false, true}), + ::testing::Values(MHA::default_thread_count), ::testing::Values(7), // MHA + 5 Converts + 1 Transpose on output ::testing::Values(6), // MHA + 5 Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -66,10 +122,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16, MHA, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, MHA, ::testing::Combine( - ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputShapes_4D), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::bf16), ::testing::ValuesIn({false}), + ::testing::Values(MHA::default_thread_count), ::testing::Values(7), ::testing::Values(7), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -83,6 +140,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAMulAdd, MHAMulAdd, ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({false}), // Need to support True for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -104,6 +162,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHASelect, ::testing::ValuesIn(precision_f32(6)), ::testing::Values(ov::element::f32), ::testing::Values(false), // Need to 
support True for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(2), // Less + MHA ::testing::Values(2), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -125,6 +184,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeOnInputs_4D, MHAWOTranspos ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -137,6 +197,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTranspose_4D, MHAWOTranspose, ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -149,6 +210,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTranspose_3D, MHAWOTranspose, ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -161,6 +223,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeBF16_4D, MHAWOTranspose, ::testing::ValuesIn(precision_bf16(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -173,6 +236,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeBF16_3D, MHAWOTranspose, ::testing::ValuesIn(precision_bf16(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -185,6 +249,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeEnforceBF16_4D, MHAWOTrans ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::bf16), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -197,6 +262,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeEnforceBF16_3D, MHAWOTrans ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::bf16), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -205,10 +271,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeEnforceBF16_3D, MHAWOTrans INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAINT8MatMul, MHAINT8MatMul, ::testing::Combine( - 
::testing::ValuesIn(std::vector>(inputShapes.begin(), inputShapes.begin() + 2)), + ::testing::ValuesIn(std::vector>(inputShapes_4D.begin(), inputShapes_4D.begin() + 2)), ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -221,18 +288,20 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAQuantMatMul0, MHAQuantMatMul0, ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), ::testing::Values(8), // FQ on input + MHA + Transpose on output + 4 Reshapes + Deq Mul ::testing::Values(3), // FQ on input + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpuEmptyPluginConfig)), MHA::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAFQAfterMatMul, MHAFQAfterMatMul, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAFQAfterMatMul_4D, MHAFQAfterMatMul, ::testing::Combine( - ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputShapes_4D), ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), ::testing::Values(3), // MHA + Transpose on output + Deq Mul ::testing::Values(2), // MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -245,6 +314,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAFQ, MHAFQ, ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), ::testing::Values(7), // Transposex2 + Subgraphsx5 ::testing::Values(5), // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -261,6 +331,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHATransposedB, MHATransposedB, ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(2), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -282,6 +353,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWithExtractedReshape, MHAWithExtracte ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // False is not supported for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(3), // Extracted Add + Extracted Reshape + MHA ::testing::Values(2), // Extracted Add + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp index 4862bdabf03419..4212102e5698e0 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp @@ -11,11 +11,24 @@ namespace snippets { namespace { -std::vector input_shapes{{2, 3, 5, 13}, {2, 3, 2, 4}, {1, 7, 1, 4}}; -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Transpose, Transpose, 
+std::vector input_shapes_4D{{2, 3, 5, 13}, {2, 3, 2, 4}, {1, 7, 1, 4}}; +std::vector input_shapes_3D{{3, 5, 13}, {3, 2, 4}, {7, 1, 4}}; + +std::vector> orders_4D{{0, 2, 3, 1}}; +std::vector> orders_3D{{1, 2, 0}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Transpose_3D, Transpose, + ::testing::Combine( + ::testing::ValuesIn(input_shapes_3D), + ::testing::ValuesIn(orders_3D), + ::testing::Values(1), // Transpose + ::testing::Values(1), // Tokenized Transpose + ::testing::Values(ov::test::utils::DEVICE_CPU)), + Transpose::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Transpose_4D, Transpose, ::testing::Combine( - ::testing::ValuesIn(input_shapes), - ::testing::Values(std::vector {0, 2, 3, 1}), + ::testing::ValuesIn(input_shapes_4D), + ::testing::ValuesIn(orders_4D), ::testing::Values(1), // Transpose ::testing::Values(1), // Tokenized Transpose ::testing::Values(ov::test::utils::DEVICE_CPU)), @@ -25,7 +38,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMul, TransposeMul, ::testing::Combine( ::testing::Values(ov::PartialShape {2, 31, 3, 5}), ::testing::ValuesIn(std::vector{{2, 3, 5, 31}}), - ::testing::Values(std::vector {0, 2, 3, 1}), + ::testing::Values(std::vector {0, 2, 3, 1}), ::testing::Values(1), // Transpose ::testing::Values(1), // Tokenized Transpose ::testing::Values(ov::test::utils::DEVICE_CPU)), diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp index 3a760050d0159a..ced190761843de 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp @@ -10,7 +10,7 @@ #include "snippets/op/scalar.hpp" #include "lowering_utils.hpp" #include "common_test_utils/common_utils.hpp" -#include "snippets/pass_manager.hpp" +#include "snippets/pass/manager.hpp" namespace ov { namespace test { diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index 1a196ea49e8e95..a67bbf1570ff13 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -131,6 +131,7 @@ struct program { topology const& topology, const ExecutionConfig& config, std::shared_ptr task_executor, + std::shared_ptr compilation_context, bool is_internal = false, bool no_optimizations = false, bool is_body_program = false); @@ -251,6 +252,14 @@ struct program { bool is_internal = false, bool no_optimizations = false, bool is_body_program = false); + static ptr build_program(engine& engine, + const topology& topology, + const ExecutionConfig& config, + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_internal = false, + bool no_optimizations = false, + bool is_body_program = false); static ptr build_program(engine& engine, const std::set>& nodes, const ExecutionConfig& config, @@ -266,9 +275,11 @@ struct program { ImplementationsCache& get_implementations_cache() const { return *_impls_cache; } ICompilationContext& get_compilation_context() const { return *_compilation_context; } + std::shared_ptr get_compilation_context_ptr() const { return _compilation_context; } void cancel_compilation_context(); static std::shared_ptr make_task_executor(const ExecutionConfig& config); + static std::shared_ptr make_compilation_context(const ExecutionConfig& config); private: uint32_t prog_id = 0; @@ -286,8 +297,7 @@ struct 
program { bool is_body_program; std::unique_ptr _impls_cache; const size_t _impls_cache_capacity = 10000; - const int _num_async_build_threads = 1; - std::unique_ptr _compilation_context; + std::shared_ptr _compilation_context; std::map> nodes_map; std::list optimized_out; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp index 22864106fb39f5..422451d096729b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp @@ -10,6 +10,7 @@ #include "intel_gpu/plugin/custom_layer.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/execution_config.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/graph/topology.hpp" #include "intel_gpu/graph/program.hpp" @@ -75,7 +76,9 @@ class ProgramBuilder final { public: ProgramBuilder(std::shared_ptr model, cldnn::engine& engine, const ExecutionConfig& config, bool createTopologyOnly = false, bool partialBuild = false, - std::shared_ptr task_executor = nullptr, bool innerProgram = false); + std::shared_ptr task_executor = nullptr, + std::shared_ptr compilation_context = nullptr, + bool innerProgram = false); ProgramBuilder(cldnn::engine& engine, const ExecutionConfig& config); static const cldnn::primitive_id m_preProcessTag; @@ -136,6 +139,7 @@ class ProgramBuilder final { bool requires_new_shape_infer(const ov::Node& op) const; std::shared_ptr get_task_executor() const { return m_task_executor; } + std::shared_ptr get_compilation_context() const { return m_compilation_context; } private: static factories_map_t factories_map; @@ -153,6 +157,7 @@ class ProgramBuilder final { bool queryMode; std::shared_ptr m_task_executor; + std::shared_ptr m_compilation_context; void EnableQueryMode() { queryMode = true; } void DisableQueryMode() { queryMode = false; } diff --git a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp similarity index 83% rename from src/plugins/intel_gpu/src/graph/include/compilation_context.hpp rename to src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp index be8d65c6aa5ecc..f664e728680b62 100644 --- a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp @@ -4,10 +4,10 @@ #pragma once -#include "openvino/runtime/threading/cpu_streams_executor.hpp" #include #include #include "intel_gpu/graph/kernel_impl_params.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" namespace cldnn { @@ -21,7 +21,7 @@ class ICompilationContext { virtual void cancel() = 0; virtual void wait_all() = 0; - static std::unique_ptr create(ov::threading::IStreamsExecutor::Config task_executor_config); + static std::shared_ptr create(ov::threading::IStreamsExecutor::Config task_executor_config); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/compilation_context.cpp b/src/plugins/intel_gpu/src/graph/compilation_context.cpp index c1f483200c9a38..df2fad3412286b 100644 --- a/src/plugins/intel_gpu/src/graph/compilation_context.cpp +++ b/src/plugins/intel_gpu/src/graph/compilation_context.cpp @@ -2,12 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "compilation_context.hpp" #include #include #include #include #include "intel_gpu/runtime/utils.hpp" +#include 
"intel_gpu/runtime/compilation_context.hpp" + +#include "openvino/runtime/threading/cpu_streams_executor.hpp" namespace cldnn { class CompilationContext : public ICompilationContext { @@ -83,7 +85,7 @@ class CompilationContext : public ICompilationContext { std::vector> futures; }; -std::unique_ptr ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) { +std::shared_ptr ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) { return cldnn::make_unique(task_executor_config); } diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 240db96d5b4988..c8a081dadbc45f 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -13,6 +13,7 @@ #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/event.hpp" #include "intel_gpu/runtime/stream.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" @@ -34,7 +35,6 @@ #include "program_helpers.h" #include "to_string_utils.h" #include "kernels_cache.hpp" -#include "compilation_context.hpp" // TODO: Remove once we have an abstraction for kernels_cache #include "kernel_base.h" diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index a81d0bd10ad58d..58ecac8e776b39 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -25,7 +25,6 @@ #include "read_value_inst.h" #include "condition_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" -#include "compilation_context.hpp" #include "implementation_map.hpp" #include "graph_optimizer/prepare_buffer_fusing.h" @@ -36,6 +35,7 @@ #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/error_handler.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "json_object.h" #include @@ -1502,7 +1502,13 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() { ov::intel_gpu::allow_static_input_reorder(true), ov::intel_gpu::allow_new_shape_infer(true) }; - auto prog = program::build_program(get_network().get_engine(), t, subgraph_config, get_network().get_program()->get_task_executor(), true, false); + auto prog = program::build_program(get_network().get_engine(), + t, + subgraph_config, + get_network().get_program()->get_task_executor(), + get_network().get_program()->get_compilation_context_ptr(), + true, + false); _unfused_subgraph = network::allocate_network(get_network().get_stream_ptr(), prog, true, get_network().is_primary_stream()); } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 59af7125f9e4dc..dde29dc1e32504 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -8,6 +8,7 @@ #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/graph/program.hpp" #include "auto_tuner.h" @@ -17,7 +18,6 @@ #include "program_dump_graph.h" #include "sliding_window_utils.hpp" #include "program_helpers.h" -#include "compilation_context.hpp" #include "matrix_nms_inst.h" #include "roi_pooling_inst.h" @@ -145,10 +145,17 @@ std::shared_ptr 
program::make_task_executor(con return std::make_shared(task_executor_config); } +std::shared_ptr program::make_compilation_context(const ExecutionConfig& config) { + const int _num_async_build_threads = 1; + return ICompilationContext::create(make_task_executor_config(config, + "Task executor config for CompilationContext in GPU plugin", _num_async_build_threads)); +} + program::program(engine& engine_ref, topology const& topology, const ExecutionConfig& config, std::shared_ptr task_executor, + std::shared_ptr compilation_context, bool is_internal, bool no_optimizations, bool is_body_program) @@ -158,7 +165,8 @@ program::program(engine& engine_ref, _task_executor(std::move(task_executor)), processing_order(), is_internal(is_internal), - is_body_program(is_body_program) { + is_body_program(is_body_program), + _compilation_context(compilation_context) { _config.apply_user_properties(_engine.get_device_info()); init_primitives(); GPU_DEBUG_INFO << "Program config\n" << config.to_string(); @@ -214,8 +222,8 @@ void program::init_program() { _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, kernel_selector::KernelBase::get_db().get_batch_header_str())); - _compilation_context = ICompilationContext::create(make_task_executor_config(_config, - "Task executor config for CompilationContext in GPU plugin", _num_async_build_threads)); + if (!_compilation_context) + _compilation_context = program::make_compilation_context(_config); _impls_cache = cldnn::make_unique(_impls_cache_capacity); // Remove items of compilation context's internal queue when some impl is popped in kernels_cache @@ -253,7 +261,18 @@ program::ptr program::build_program(engine& engine, bool is_internal, bool no_optimizations, bool is_body_program) { - return std::make_shared(engine, topology, config, task_executor, is_internal, no_optimizations, is_body_program); + return std::make_shared(engine, topology, config, task_executor, nullptr, is_internal, no_optimizations, is_body_program); +} + +program::ptr program::build_program(engine& engine, + const topology& topology, + const ExecutionConfig& config, + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_internal, + bool no_optimizations, + bool is_body_program) { + return std::make_shared(engine, topology, config, task_executor, compilation_context, is_internal, no_optimizations, is_body_program); } program::ptr program::build_program(engine& engine, @@ -262,7 +281,7 @@ program::ptr program::build_program(engine& engine, bool is_internal, bool no_optimizations, bool is_body_program) { - return std::make_shared(engine, topology, config, nullptr, is_internal, no_optimizations, is_body_program); + return std::make_shared(engine, topology, config, nullptr, nullptr, is_internal, no_optimizations, is_body_program); } program::ptr program::build_program(engine& engine, diff --git a/src/plugins/intel_gpu/src/plugin/ops/condition.cpp b/src/plugins/intel_gpu/src/plugin/ops/condition.cpp index c25726f673a2f8..7d47d1127fe57d 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/condition.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/condition.cpp @@ -31,7 +31,7 @@ static cldnn::condition::branch gen_branch(ProgramBuilder& p, const std::shared_ config.set_property(ov::intel_gpu::max_dynamic_batch(1)); config.set_property(ov::intel_gpu::allow_new_shape_infer(op->is_dynamic())); - ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), true); + ProgramBuilder prog(internal_body, 
p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true); branch.inner_program = prog.get_compiled_program(); auto& input_map = branch.input_map; diff --git a/src/plugins/intel_gpu/src/plugin/ops/loop.cpp b/src/plugins/intel_gpu/src/plugin/ops/loop.cpp index 628b0d7c37d9aa..af93885a5d949c 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/loop.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/loop.cpp @@ -280,7 +280,7 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr& op) { ProgramBuilder::ProgramBuilder(std::shared_ptr model, cldnn::engine& engine, const ExecutionConfig& config, bool create_topology_only, bool partial_build, - std::shared_ptr task_executor, bool is_inner_program) + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_inner_program) : m_config(config) , m_engine(engine) , queryMode(false) - , m_task_executor(task_executor) { + , m_task_executor(task_executor) + , m_compilation_context(compilation_context) { if (m_task_executor == nullptr) m_task_executor = cldnn::program::make_task_executor(m_config); + if (m_compilation_context == nullptr) { + m_compilation_context = cldnn::program::make_compilation_context(m_config); + } // locate global custom kernel config // and auto-load kernels from it #ifdef _WIN32 @@ -158,7 +164,14 @@ std::shared_ptr ProgramBuilder::build(const std::vector()); } diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp index 34cc90d791a756..017ad15981cf91 100644 --- a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp @@ -7,9 +7,9 @@ #include #include #include +#include "intel_gpu/runtime/compilation_context.hpp" #include "fully_connected_inst.h" -#include "compilation_context.hpp" #include "program_wrapper.h" diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index dc23440c48af67..2f684a40f7f5ec 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -14,7 +14,7 @@ #include #include -#include "compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "fully_connected_inst.h" #include diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index a90edc00a2db98..247453944e3a4a 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -10,7 +10,7 @@ #include #include "openvino/reference/matmul.hpp" -#include "compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "gemm_inst.h" #include diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp index a13c1d1550882f..ed52f276fa5960 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp @@ -7,7 +7,7 @@ #include #include #include "openvino/reference/group_normalization.hpp" -#include 
"compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" using namespace cldnn; diff --git a/src/plugins/template/src/remote_context.cpp b/src/plugins/template/src/remote_context.cpp index 2003cdf48ed93f..51d4f7a0211d77 100644 --- a/src/plugins/template/src/remote_context.cpp +++ b/src/plugins/template/src/remote_context.cpp @@ -26,7 +26,7 @@ class VectorTensorImpl : public ov::IRemoteTensor { m_strides.clear(); if (!shape.empty()) { m_strides.resize(shape.size()); - m_strides.back() = m_element_type.size(); + m_strides.back() = shape.back() == 0 ? 0 : m_element_type.size(); std::copy(shape.rbegin(), shape.rend() - 1, m_strides.rbegin() + 1); std::partial_sum(m_strides.rbegin(), m_strides.rend(), m_strides.rbegin(), std::multiplies()); } diff --git a/src/tests/functional/plugin/shared/include/snippets/mha.hpp b/src/tests/functional/plugin/shared/include/snippets/mha.hpp index 1a922d215fa058..547fa323cf4b18 100644 --- a/src/tests/functional/plugin/shared/include/snippets/mha.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/mha.hpp @@ -16,6 +16,7 @@ typedef std::tuple< std::vector, // Input Element types ov::element::Type, // Inference precision bool, // With Multiply + size_t, // Thread count size_t, // Expected num nodes size_t, // Expected num subgraphs std::string, // Target Device @@ -27,13 +28,17 @@ class MHA : public testing::WithParamInterface, public: static std::string getTestCaseName(testing::TestParamInfo obj); + constexpr static size_t default_thread_count = 0; + protected: void SetUp() override; + void compile_model() override; void generate_inputs(const std::vector& targetInputStaticShapes) override; virtual std::shared_ptr get_subgraph(); bool m_with_mul = false; + size_t m_thread_count; std::vector m_input_types; }; diff --git a/src/tests/functional/plugin/shared/src/snippets/mha.cpp b/src/tests/functional/plugin/shared/src/snippets/mha.cpp index 3017fe55a83a44..c21a754b0ad901 100644 --- a/src/tests/functional/plugin/shared/src/snippets/mha.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/mha.cpp @@ -18,10 +18,11 @@ std::string MHA::getTestCaseName(testing::TestParamInfo elem_types; ov::element::Type prc; bool withMul; + size_t thread_count; std::string targetDevice; size_t num_nodes, num_subgraphs; std::map additionalConfig; - std::tie(inputShapes, elem_types, prc, withMul, num_nodes, num_subgraphs, targetDevice, additionalConfig) = obj.param; + std::tie(inputShapes, elem_types, prc, withMul, thread_count, num_nodes, num_subgraphs, targetDevice, additionalConfig) = obj.param; std::ostringstream result; for (size_t i = 0; i < inputShapes.size(); ++i) @@ -29,6 +30,7 @@ std::string MHA::getTestCaseName(testing::TestParamInfo inputShapes; ov::element::Type prc; std::map additionalConfig; - std::tie(inputShapes, m_input_types, prc, m_with_mul, ref_num_nodes, ref_num_subgraphs, targetDevice, additionalConfig) = this->GetParam(); + std::tie(inputShapes, m_input_types, prc, m_with_mul, m_thread_count, + ref_num_nodes, ref_num_subgraphs, targetDevice, additionalConfig) = this->GetParam(); init_input_shapes(static_partial_shapes_to_test_representation(inputShapes)); const auto subgraph_model = get_subgraph(); @@ -66,6 +69,12 @@ void MHA::SetUp() { rel_threshold = 0.05f; } +void MHA::compile_model() { + if (m_thread_count != default_thread_count) + core->set_property(targetDevice, ov::inference_num_threads(m_thread_count)); + SubgraphBaseTest::compile_model(); +} + void MHA::generate_inputs(const std::vector& targetInputStaticShapes) { 
inputs.clear(); const auto& model_inputs = function->inputs(); diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp index 0c6521dba84e95..57f7bf30e3c860 100644 --- a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp +++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp @@ -56,6 +56,19 @@ class MHAFunction : public SnippetsFunctionBase { std::vector precisions; }; +class MHASplitMFunction : public MHAFunction { +public: + explicit MHASplitMFunction(const std::vector& inputShapes, const std::vector& precisions, + const std::vector& reshapes, bool with_mul = true) + : MHAFunction(inputShapes, precisions, with_mul), reshapes(reshapes) { + OPENVINO_ASSERT(reshapes.size() == 5, "Got invalid number of Reshape shapes"); + } +protected: + std::shared_ptr initReference() const override; + + std::vector reshapes; +}; + /* Graph: * Transpose1[0,2,1,3] Constant * \ / diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index fdefcf03d9dd19..661af347dd4574 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -13,6 +13,40 @@ namespace ov { namespace test { namespace snippets { +namespace { +std::vector get_rank_equivalent_order(std::vector default_order, size_t rank) { + OPENVINO_ASSERT(rank > 2, "Incorrect rank for testing"); + auto order = std::vector(rank); + std::iota(order.begin(), order.end(), 0); + const auto diff = rank - default_order.size(); + for (size_t i = 0; i < default_order.size(); ++i) { + order[diff + i] = default_order[i] + diff; + } + return order; +} +std::vector get_fusion_order(size_t rank) { + return get_rank_equivalent_order({1, 0, 2}, rank); +} +std::vector get_decomposed_order(size_t rank) { + return get_rank_equivalent_order({1, 2, 0}, rank); +} +std::vector get_fusion_order_after_split_m(size_t rank, bool is_input) { + if (rank == 4) { + return is_input ? std::vector{2, 0, 1, 3} : std::vector{1, 2, 0, 3}; + } else if (rank == 5) { + return is_input ? 
diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp
index 0c6521dba84e95..57f7bf30e3c860 100644
--- a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp
+++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp
@@ -56,6 +56,19 @@ class MHAFunction : public SnippetsFunctionBase {
     std::vector precisions;
 };
 
+class MHASplitMFunction : public MHAFunction {
+public:
+    explicit MHASplitMFunction(const std::vector& inputShapes, const std::vector& precisions,
+                               const std::vector& reshapes, bool with_mul = true)
+        : MHAFunction(inputShapes, precisions, with_mul), reshapes(reshapes) {
+        OPENVINO_ASSERT(reshapes.size() == 5, "Got invalid number of Reshape shapes");
+    }
+protected:
+    std::shared_ptr initReference() const override;
+
+    std::vector reshapes;
+};
+
 /* Graph:
  *       Transpose1[0,2,1,3]  Constant
  *                    \       /
diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
index fdefcf03d9dd19..661af347dd4574 100644
--- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
+++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
@@ -13,6 +13,40 @@ namespace ov {
 namespace test {
 namespace snippets {
 
+namespace {
+std::vector get_rank_equivalent_order(std::vector default_order, size_t rank) {
+    OPENVINO_ASSERT(rank > 2, "Incorrect rank for testing");
+    auto order = std::vector(rank);
+    std::iota(order.begin(), order.end(), 0);
+    const auto diff = rank - default_order.size();
+    for (size_t i = 0; i < default_order.size(); ++i) {
+        order[diff + i] = default_order[i] + diff;
+    }
+    return order;
+}
+std::vector get_fusion_order(size_t rank) {
+    return get_rank_equivalent_order({1, 0, 2}, rank);
+}
+std::vector get_decomposed_order(size_t rank) {
+    return get_rank_equivalent_order({1, 2, 0}, rank);
+}
+std::vector get_fusion_order_after_split_m(size_t rank, bool is_input) {
+    if (rank == 4) {
+        return is_input ? std::vector{2, 0, 1, 3} : std::vector{1, 2, 0, 3};
+    } else if (rank == 5) {
+        return is_input ? std::vector{0, 3, 1, 2, 4} : std::vector{0, 2, 3, 1, 4};
+    }
+    OPENVINO_THROW("Incorrect rank for testing");
+}
+std::vector get_decomposed_order_after_split_m(size_t rank) {
+    if (rank == 4) {
+        return std::vector{1, 2, 3, 0};
+    } else if (rank == 5) {
+        return std::vector{0, 2, 3, 4, 1};
+    }
+    OPENVINO_THROW("Incorrect rank for testing");
+}
+}  // namespace
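The helpers above generalize the familiar 4D MHA transpose orders to any rank by embedding a 3-element permutation into an identity permutation of the target length. A standalone re-implementation for illustration (explicit types added here; the real helpers live in the anonymous namespace above):

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Embed a permutation of the last default_order.size() axes into an
// identity permutation of length rank, shifting the pattern right.
std::vector<size_t> rank_equivalent_order(std::vector<size_t> default_order, size_t rank) {
    std::vector<size_t> order(rank);
    std::iota(order.begin(), order.end(), 0);           // identity: 0, 1, ..., rank-1
    const size_t diff = rank - default_order.size();
    for (size_t i = 0; i < default_order.size(); ++i)
        order[diff + i] = default_order[i] + diff;      // re-based pattern
    return order;
}

int main() {
    // get_fusion_order uses {1, 0, 2}: rank 4 -> 0 2 1 3 (the classic MHA
    // transpose), rank 5 -> 0 1 3 2 4.
    for (size_t v : rank_equivalent_order({1, 0, 2}, 4)) std::cout << v << ' ';
    std::cout << '\n';
    // get_decomposed_order uses {1, 2, 0}: rank 4 -> 0 2 3 1.
    for (size_t v : rank_equivalent_order({1, 2, 0}, 4)) std::cout << v << ' ';
    std::cout << '\n';
}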
 
 std::shared_ptr MHAFunction::initOriginal() const {
     auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]);
@@ -21,48 +55,40 @@ std::shared_ptr MHAFunction::initOriginal() const {
     auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]);
     ngraph::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param};
 
-    std::vector constantShapes;
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1}));
-    constantShapes.push_back(ov::Shape({2}));
-    constantShapes.push_back(ov::Shape({4}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-
-    auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3});
-    auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1});
-    auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3});
-    auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3});
-
-    std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] *
-                                                 input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]),
-                                     -1};
-    auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData);
+    const auto rank = input_shapes[0].size();
+    const auto fusion_order = get_fusion_order(rank);
+    const auto decomposed_order = get_decomposed_order(rank);
 
-    std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]),
-                                     static_cast(input_shapes[0].get_shape()[2]),
-                                     static_cast(input_shapes[0].get_shape()[1]),
-                                     static_cast(input_shapes[0].get_shape()[1])};
-    auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData);
+    const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, decomposed_order);
+    const auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
 
-    float transA = false;
-    float transB = false;
     const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
     const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
     std::shared_ptr matmul_parent1 = transpose1;
     if (with_mul) {
-        std::vector mulConstData(ngraph::shape_size(constantShapes[2]));
-        auto mulConst = ngraph::builder::makeConstant(precisions[1], constantShapes[2], mulConstData, true);
+        ov::Shape shape(rank, 1);
+        shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3];
+        std::vector mulConstData(ngraph::shape_size(shape));
+        const auto mulConst = ngraph::builder::makeConstant(precisions[1], shape, mulConstData, true);
         matmul_parent1 = std::make_shared(transpose1, mulConst);
     }
-    const auto matMul0 = std::make_shared(transpose0, matmul_parent1, transA, transB);
+    const auto matMul0 = std::make_shared(transpose0, matmul_parent1);
     const auto add = std::make_shared(matMul0, addParam);
+
+    const auto interm_shape = add->get_output_shape(0);
+    const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies());
+    const auto reshape0ConstData = std::vector{ batch, -1 };
+    const auto reshape1ConstData = interm_shape;
+    const auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData);
+    const auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData);
+
     const auto reshape0 = std::make_shared(add, reshape0Const, true);
     const auto softMax = std::make_shared(reshape0, 1);
     const auto reshape1 = std::make_shared(softMax, reshape1Const, true);
     const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
-    const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB);
+    const auto matMul1 = std::make_shared(reshape1, transpose2);
     const auto transpose3 = std::make_shared(matMul1, transpose3Const);
 
     ngraph::ResultVector results{std::make_shared(transpose3)};
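initOriginal() now derives the Reshape constants from the Add output shape instead of hard-coding the 4D case: every dimension except the last collapses into one batch axis so Softmax can run on axis 1, and the second Reshape restores the original shape. The arithmetic with a concrete, hypothetical attention-score shape:

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // Rank-agnostic version of the reshape math above.
    const std::vector<int64_t> interm_shape = {1, 12, 128, 128};  // hypothetical MHA scores
    const size_t rank = interm_shape.size();

    // Product of all leading dimensions: 1 * 12 * 128 = 1536.
    const int64_t batch = std::accumulate(interm_shape.cbegin(),
                                          interm_shape.cbegin() + rank - 1,
                                          int64_t{1},
                                          std::multiplies<int64_t>());
    const std::vector<int64_t> reshape0 = {batch, -1};   // flatten to {1536, -1}
    const std::vector<int64_t> reshape1 = interm_shape;  // restore {1, 12, 128, 128}

    std::cout << "flatten to {" << reshape0[0] << ", " << reshape0[1] << "}\n";
}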
@@ -81,53 +107,36 @@ std::shared_ptr MHAFunction::initReference() const {
     auto addParam = std::make_shared(precisions[2], input_shapes[2]);
     auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]);
 
-    std::vector constantShapes;
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1}));
-    constantShapes.push_back(ov::Shape({2}));
-    constantShapes.push_back(ov::Shape({4}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-
-    auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3});
-    auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1});
-    auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3});
-    auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3});
-
     ngraph::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param};
 
-    std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] *
-                                                 input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]),
-                                     -1};
-    auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData);
+    const auto rank = input_shapes[0].size();
+    const auto fusion_order = get_fusion_order(rank);
+    const auto decomposed_order = get_decomposed_order(rank);
 
-    std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]),
-                                     static_cast(input_shapes[0].get_shape()[2]),
-                                     static_cast(input_shapes[0].get_shape()[1]),
-                                     static_cast(input_shapes[0].get_shape()[1])};
-    auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData);
+    const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, decomposed_order);
+    const auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
 
-    float transA = false;
-    float transB = false;
     const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
     const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
     std::shared_ptr matmul_parent1 = transpose1;
     if (with_mul) {
-        std::vector mulConstData(ngraph::shape_size(constantShapes[2]));
-        auto mulConst = ngraph::builder::makeConstant(precisions[1], constantShapes[2], mulConstData, true);
-        auto mulParam = std::make_shared(precisions[1], mulConst->get_shape());
+        ov::Shape shape(rank, 1);
+        shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3];
+        std::vector mulConstData(ngraph::shape_size(shape));
+        const auto mulConst = ngraph::builder::makeConstant(precisions[1], shape, mulConstData, true);
+        const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape());
         matmul_parent1 = std::make_shared(transpose1, mulParam);
         subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param};
         subgraph_inputs = {data0, data1, mulConst, data2, data3};
     }
-    const auto matMul0 = std::make_shared(transpose0, matmul_parent1, transA, transB);
+
+    const auto matMul0 = std::make_shared(transpose0, matmul_parent1);
     const auto add = std::make_shared(matMul0, addParam);
-    const auto reshape0 = std::make_shared(add, reshape0Const, true);
-    const auto softMax = std::make_shared(reshape0, 1);
-    const auto reshape1 = std::make_shared(softMax, reshape1Const, true);
+    const auto softMax = std::make_shared(add, rank - 1);
     const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
-    const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB);
+    const auto matMul1 = std::make_shared(softMax, transpose2);
     const auto transpose3 = std::make_shared(matMul1, transpose3Const);
 
     auto subgraph = std::make_shared(subgraph_inputs,
@@ -135,6 +144,70 @@ std::shared_ptr MHAFunction::initReference() const {
     return std::make_shared(NodeVector{subgraph}, ngraphParams);
 }
 
+std::shared_ptr MHASplitMFunction::initReference() const {
+    auto data0 = std::make_shared(precisions[0], input_shapes[0]);
+    auto data1 = std::make_shared(precisions[1], input_shapes[1]);
+    auto data2 = std::make_shared(precisions[2], input_shapes[2]);
+    auto data3 = std::make_shared(precisions[3], input_shapes[3]);
+    ngraph::ParameterVector ngraphParams = {data0, data1, data2, data3};
+
+    auto make_reshape = [](const std::shared_ptr& node, const ov::Shape& new_shape) {
+        auto shape_const = ngraph::builder::makeConstant(ngraph::element::i32, {new_shape.size()}, new_shape);
+        return std::make_shared(node, shape_const, true);
+    };
+
+    auto reshape0 = make_reshape(data0, reshapes[0]);
+    auto reshape1 = make_reshape(data1, reshapes[1]);
+    auto reshape2 = make_reshape(data2, reshapes[2]);
+    auto reshape3 = make_reshape(data3, reshapes[3]);
+    NodeVector subgraph_inputs = {reshape0, reshape1, reshape2, reshape3};
+
+    auto transpose0Param = std::make_shared(precisions[0], reshape0->get_shape());
+    auto transpose1Param = std::make_shared(precisions[1], reshape1->get_shape());
+    auto addParam = std::make_shared(precisions[2], reshape2->get_shape());
+    auto transpose2Param = std::make_shared(precisions[3], reshape3->get_shape());
+    ngraph::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param};
+
+    const auto rank = input_shapes[0].size() + 1;
+
+    const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true));
+    const auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, get_decomposed_order_after_split_m(rank));
+    const auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true));
+    const auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, false));
+
+    const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
+    const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
+
+    std::shared_ptr matmul_parent1 = transpose1;
+    if (with_mul) {
+        ov::Shape shape(rank - 1, 1);
+        shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4];
+        ov::Shape reshape_shape = shape;
+        reshape_shape.insert(reshape_shape.cbegin() + rank - 3, 1);
+        std::vector mulConstData(ngraph::shape_size(shape));
+        const auto mulConst = ngraph::builder::makeConstant(precisions[1], shape, mulConstData, true);
+        const auto reshape_mul = make_reshape(mulConst, reshape_shape);
+        const auto mulParam = std::make_shared(precisions[1], reshape_mul->get_shape());
+        matmul_parent1 = std::make_shared(transpose1, mulParam);
+        subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param};
+        subgraph_inputs = {reshape0, reshape1, reshape_mul, reshape2, reshape3};
+    }
+
+    const auto matMul0 = std::make_shared(transpose0, matmul_parent1);
+    const auto add = std::make_shared(matMul0, addParam);
+    const auto softMax = std::make_shared(add, rank - 1);
+    const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
+    const auto matMul1 = std::make_shared(softMax, transpose2);
+    const auto transpose3 = std::make_shared(matMul1, transpose3Const);
+
+    const auto subgraph = std::make_shared(subgraph_inputs,
+                                           std::make_shared(ov::OutputVector{transpose3},
+                                                            subgraph_params));
+
+    auto reshape4 = make_reshape(subgraph, reshapes[4]);
+    ngraph::ResultVector results{std::make_shared(reshape4)};
+    return std::make_shared(results, ngraphParams, "mha");
+}
 
 std::shared_ptr MHAMatMul0TransposeFunction::initOriginal() const {
     auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]);
diff --git a/tests/layer_tests/pytorch_tests/test_convnd.py b/tests/layer_tests/pytorch_tests/test_convnd.py
index 8b46b2992d2c07..8ea83a5aae5dc1 100644
--- a/tests/layer_tests/pytorch_tests/test_convnd.py
+++ b/tests/layer_tests/pytorch_tests/test_convnd.py
@@ -216,6 +216,7 @@ def forward(self, x, y):
 
     @pytest.mark.nightly
     @pytest.mark.precommit
+    @pytest.mark.xfail(reason="ticket 123727")
     def test_conv2d(self, ie_device, precision, ir_version):
         self._test(*self.create_model(), ie_device, precision, ir_version, freeze_model=True, dynamic_shapes=False)
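MHASplitMFunction::initReference() above is the reference pattern for the SplitDimensionM optimization: the extra Reshapes around the Subgraph split the M dimension so the batch-like portion of the MatMul grows and more parallel work becomes available. A shape-level sketch of the idea (the split factor is hypothetical and must divide M):

#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// [B, M, K] -> [B, M/split, split, K]: the first two axes together now
// carry B * (M/split) independent slices instead of B, which is what
// raises the parallel work amount for the MatMuls inside the Subgraph.
std::vector<size_t> split_m(const std::vector<size_t>& shape, size_t split) {
    assert(shape.size() == 3 && shape[1] % split == 0 && "illustration only");
    return {shape[0], shape[1] / split, split, shape[2]};
}

int main() {
    const auto s = split_m({1, 128, 64}, 32);  // -> {1, 4, 32, 64}
    for (auto d : s) std::cout << d << ' ';    // 1*4 parallel blocks instead of 1
    std::cout << '\n';
}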