From f104183ee361978f97374d8c2e762c7dcc0a697a Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Fri, 3 Jan 2025 07:54:02 +0100 Subject: [PATCH] [Snippets][CPU] Applied Vladislav & Ivan comments 3 --- .../snippets/include/snippets/op/load.hpp | 10 +++---- .../include/snippets/snippets_isa_tbl.hpp | 2 +- .../pass/mark_invariant_shape_path.cpp | 2 +- src/common/snippets/src/op/load.cpp | 28 +++++++++---------- src/common/snippets/src/op/reorder.cpp | 2 +- .../src/pass/transpose_decomposition.cpp | 4 +-- .../snippets/src/runtime_configurator.cpp | 21 ++++++++------ .../src/shape_inference/shape_inference.cpp | 2 +- .../pass/extracted_loop_invariants.cpp | 4 +-- .../snippets/cpu_runtime_configurator.cpp | 2 +- .../snippets/cpu_runtime_configurator.hpp | 2 +- .../src/emitters/snippets/repacked_input.hpp | 1 + .../emitters/snippets/x64/cpu_generator.cpp | 2 +- .../x64/kernel_executors/brgemm_copy_b.cpp | 6 ++-- .../x64/kernel_executors/brgemm_copy_b.hpp | 2 +- src/plugins/intel_cpu/src/extension.cpp | 2 +- .../src/nodes/executors/subgraph.cpp | 15 +++------- .../src/nodes/executors/subgraph.hpp | 14 ++++------ .../x64/pass/eliminate_brgemm_copy_b.cpp | 5 ++-- .../lowered/external_repacking_adjuster.cpp | 12 ++++---- .../lowered/fuse_load_store_and_convert.cpp | 2 +- .../x64/lowered/buffer_allocation.cpp | 4 +-- 22 files changed, 69 insertions(+), 75 deletions(-) diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index bca4b09fabdcbd..d0a168483bc5ce 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op { }; /** - * @interface LoadReshape + * @interface LoadReorder * @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak * shape propagation. 
We need it to keep correct shape propagation when Transpose is decomposed to * Load and Store. This is a temporary solution until tokenization of Reshape operation is supported. * @ingroup snippets */ -class LoadReshape : public Load { +class LoadReorder : public Load { public: - OPENVINO_OP("LoadReshape", "SnippetsOpset", Load); - LoadReshape(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); - LoadReshape() = default; + OPENVINO_OP("LoadReorder", "SnippetsOpset", Load); + LoadReorder(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); + LoadReorder() = default; void set_offset(size_t offset) { set_output_offset(offset, 0); } void set_count(size_t count) { set_output_count(count, 0); } diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index e9174283bf37dd..9dc416b3f7e38f 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -11,7 +11,7 @@ // SnippetS dialect OV_OP(Load, ov::snippets::op) -OV_OP(LoadReshape, ov::snippets::op) +OV_OP(LoadReorder, ov::snippets::op) OV_OP(LoopBegin, ov::snippets::op) OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) diff --git a/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp b/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp index b32056d4e32a57..16d4160f1aaeb2 100644 --- a/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp +++ b/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp @@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) { const auto& node = expr->get_node(); return ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node); } } // namespace diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index 
461fec8b1399c0..9bd1e4c7bc8706 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -41,19 +41,19 @@ std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) return std::make_shared(new_args.at(0), get_count(), get_offset()); } -LoadReshape::LoadReshape(const Output& x, const size_t count, const size_t offset, std::vector order) +LoadReorder::LoadReorder(const Output& x, const size_t count, const size_t offset, std::vector order) : Load(x, count, offset), m_order(std::move(order)) { const auto& in_shape = x.get_partial_shape(); const auto in_shape_size = in_shape.size(); - OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size"); + OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size"); OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 && - *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order"); + *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order"); const std::set unique_dims(order.begin(), order.end()); - OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements"); + OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements"); constructor_validate_and_infer_types(); } -void LoadReshape::validate_and_infer_types() { +void LoadReorder::validate_and_infer_types() { validate_memory_access_params(); const auto& old_shape = get_input_partial_shape(0); ov::PartialShape new_shape; @@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), new_shape); } -bool LoadReshape::visit_attributes(AttributeVisitor& visitor) { +bool LoadReorder::visit_attributes(AttributeVisitor& visitor) { MemoryAccess::visit_attributes(visitor); visitor.on_attribute("order", 
m_order); return true; } -std::shared_ptr LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(LoadReshape); +std::shared_ptr LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(LoadReorder); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), get_count(), get_offset(), m_order); + return std::make_shared(new_args.at(0), get_count(), get_offset(), m_order); } -LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { - const auto& loadReshape = ov::as_type_ptr(n); - OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer"); - m_order = loadReshape->m_order; +LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& loadReorder = ov::as_type_ptr(n); + OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer"); + m_order = loadReorder->m_order; } -IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector& input_shapes) { +IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector& input_shapes) { OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes"); return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success}; } diff --git a/src/common/snippets/src/op/reorder.cpp b/src/common/snippets/src/op/reorder.cpp index 95cd2375dcc2ce..43d8387a8cb2fb 100644 --- a/src/common/snippets/src/op/reorder.cpp +++ b/src/common/snippets/src/op/reorder.cpp @@ -31,7 +31,7 @@ void Reorder::custom_constructor_validate_and_infer_types(std::vector or void Reorder::validate_and_infer_types() { const auto& input_pshape = get_input_partial_shape(0); - const auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), 
"Incompatible shape and order sizes"); const auto output_pshape = utils::get_planar_pshape(get_input_partial_shape(0), order); diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index 5c29b493af5826..a433cd41377422 100644 --- a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -60,9 +60,9 @@ TransposeDecomposition::TransposeDecomposition() { const auto subtensor = std::vector{1}; const auto& layout = order->cast_vector(); - // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation + // todo: LoadReorder used here is essentially Load + an easy way to maintain correct shape propagation // fix this in future and develop a more consistent shape propagation approach. - auto load = std::make_shared(data_input, subtensor[0], 0, layout); + auto load = std::make_shared(data_input, subtensor[0], 0, layout); auto store = std::make_shared(load, subtensor[0]); PortDescriptorUtils::set_port_descriptor(load->input(0), subtensor, layout); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index c23c4e908f24bf..5527cebb63f24f 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -120,15 +120,18 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) const auto& shape_infer_seq = utils::get_first_child_shape_infer_expr_seq(param); ExpressionPtr mem_desc_expr = param; if (!shape_infer_seq.empty()) { - // If there is Reorder, we should take its desc because it affects on shape by target order - const auto& reordered_reshape_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(), - [](const ExpressionPtr& expr) { - return ov::is_type(expr->get_node()); - }); - if (reordered_reshape_it != shape_infer_seq.cend()) { - const auto& 
reshape = *reordered_reshape_it; - const auto& etype = reshape->get_node()->get_output_element_type(0); - update_io_parameters(reshape->get_input_port_descriptor(0), etype); + // [160048] Reorder, as any other ShapeInferOp, should just propagate input shape to output using target order + // without data movement. However, currently we have to save desc of input of the Reorder + // to support correct input data offsets calculations and MHAParallelWAOptimizer pass work. + // Please remove this code part when the mentioned ticket is completed. + const auto& reorder_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(), + [](const ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }); + if (reorder_it != shape_infer_seq.cend()) { + const auto& reorder = *reorder_it; + const auto& etype = reorder->get_node()->get_output_element_type(0); + update_io_parameters(reorder->get_input_port_descriptor(0), etype); continue; } diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index 62800ec1294a8b..0e3060501a87d5 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -64,7 +64,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry // Note that Result has no output PortConnectors, so the shape must be empty SHAPE_INFER_PREDEFINED(ov::op::v0::Result, EmptyShapeInfer), // - SHAPE_INFER_OP_SPECIFIC(op::LoadReshape), + SHAPE_INFER_OP_SPECIFIC(op::LoadReorder), SHAPE_INFER_OP_SPECIFIC(op::Reshape), SHAPE_INFER_OP_SPECIFIC(op::Reorder), SHAPE_INFER_OP_SPECIFIC(op::RankNormalization), diff --git a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp index ee76c5af7234d8..b9ff7bda6823ed 100644 --- 
a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp +++ b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp @@ -299,7 +299,7 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { * * Param0(32,8,1) * | - * LoadReshape with order (1,2,0) + * LoadReorder with order (1,2,0) * | * Store * | @@ -307,7 +307,7 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { */ { auto param = linear_ir->push_node(input_precision, input_shape_0); - auto load_reshape = linear_ir->push_node(param.second, 1, 0, layout); + auto load_reshape = linear_ir->push_node(param.second, 1, 0, layout); auto store = linear_ir->push_node(load_reshape.second, 1, 0); init_expr_descriptors(*load_reshape.first, {subtensor, subtensor}, {order, layout}); init_expr_descriptors(*store.first, {subtensor, subtensor}, {layout, layout}); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 43b3ea14cc148a..3ad41d707bb96b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 # include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" # include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 20ef3793f56766..425959c289b3a7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -35,7 +35,7 @@ class CPURuntimeConfig : 
public ov::snippets::RuntimeConfig { class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: - CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache = {}); + CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache); /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig diff --git a/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp index 545e29eaa6915a..61daaa859ef603 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp @@ -12,6 +12,7 @@ namespace intel_cpu { struct RepackedInputKernel { RepackedInputKernel() = default; virtual ~RepackedInputKernel() = default; + virtual void operator()(const void* args) const = 0; }; struct RepackedInput { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index c8d5ea5229fcd3..31daa32dfa144f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -180,7 +180,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::Reorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[snippets::op::LoadReshape::get_type_info_static()] = + jitters[snippets::op::LoadReorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index 30d7e13b9d7bb5..7aca5f6c6a696f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -170,9 +170,11 @@ status_t BrgemmCopyBKernel::create_kernel() { return code; } -void BrgemmCopyBKernel::operator()(const call_args* args) const { +void BrgemmCopyBKernel::operator()(const void* args) const { + const auto* call_args = reinterpret_cast(args); + OV_CPU_JIT_EMITTER_ASSERT(call_args, "Call arguments are nullptr!"); OV_CPU_JIT_EMITTER_ASSERT(ker_, "Kernel is nullptr"); - ker_(args); + ker_(call_args); } void BrgemmCopyBKernel::init_brgemm_copy_b_kernel( diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp index 16ed18e68a01bb..5ef740067f2035 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp @@ -153,7 +153,7 @@ struct BrgemmCopyBKernel : public RepackedInputKernel, public dnnl::impl::cpu::x dnnl::impl::status_t create_kernel() override; - void operator()(const call_args* args) const; + void operator()(const void* args) const override; private: void generate() override; diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index bdb5211009a22a..762568823d205f 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -170,7 +170,7 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::snippets::op::KernelStatic) \ OP_EXTENSION(ov::snippets::op::KernelDynamic) \ OP_EXTENSION(ov::snippets::op::Load) \ - OP_EXTENSION(ov::snippets::op::LoadReshape) \ + OP_EXTENSION(ov::snippets::op::LoadReorder) 
\ OP_EXTENSION(ov::snippets::op::LoopBegin) \ OP_EXTENSION(ov::snippets::op::LoopEnd) \ OP_EXTENSION(ov::snippets::op::Buffer) \ diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp index b55e5263708268..739ae56be3b4ff 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp @@ -3,11 +3,7 @@ // #include "nodes/executors/subgraph.hpp" -#if defined(OPENVINO_ARCH_ARM64) -# include "emitters/snippets/aarch64/cpu_generator.hpp" -#else -# include "emitters/snippets/x64/cpu_generator.hpp" -#endif + #include "common/primitive_hashing_utils.hpp" #include "openvino/core/parallel.hpp" @@ -102,9 +98,8 @@ void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr& domain) { init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain); } -void SubgraphBaseExecutor::parallel_for6d( - const std::function& initializer, - const std::function&, size_t)>& caller) { + +void SubgraphBaseExecutor::parallel_for6d(const initializer_functor& initializer, const call_functor& caller) { const auto& dom = m_parallel_exec_domain; parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { @@ -142,9 +137,7 @@ void SubgraphBaseExecutor::parallel_for6d( }); } -void SubgraphBaseExecutor::parallel_forNd( - const std::function& initializer, - const std::function&, size_t)>& caller) { +void SubgraphBaseExecutor::parallel_forNd(const initializer_functor& initializer, const call_functor& caller) { const auto& dom = m_parallel_exec_domain; parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp index a1762c3c72e22e..78cb56440203d2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp +++ 
b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp @@ -66,21 +66,17 @@ class SubgraphBaseExecutor { protected: virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; - virtual void parallel_for6d( - const std::function& initializer, - const std::function&, size_t)>& caller); - virtual void parallel_forNd( - const std::function& initializer, - const std::function&, size_t)>& caller); + using initializer_functor = std::function; + using call_functor = std::function&, size_t)>; + + virtual void parallel_for6d(const initializer_functor& initializer, const call_functor& caller); + virtual void parallel_forNd(const initializer_functor& initializer, const call_functor& caller); inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { if (m_buffer_scratchpad_size > 0) scratchpad_ptr = m_buffer_scratchpad->getDataAs() + ithr * m_buffer_scratchpad_size; } - using initializer_functor = std::function; - using call_functor = std::function&, size_t)>; - std::shared_ptr m_schedule; // Holds index of output used as in execution domain // it should be compatible with a schedule's work size diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index 711c33a085859e..b87a78c6b0cb40 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -41,14 +41,13 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { transformation_callback(copy_b_node)) return false; - // If there is non-empty and non-planar layout, we should insert reshape to support shape inference + // If there is non-planar layout, we should insert reshape to support shape inference if (!ov::snippets::utils::is_planar_layout(layout)) { const auto& subtensor = in_desc->get_subtensor(); const 
auto& reshape = std::make_shared(copy_b_node->input_value(0), layout); ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout); ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor); - ov::replace_node_update_name(copy_b_node, reshape); - return true; + return ov::replace_node_update_name(copy_b_node, reshape); } // If there is no layout, we can just remove BrgemmCopyB from the subgraph diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index faff40ec3a54be..add7c66d3d7ffc 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -103,12 +103,12 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin update_kernel(p.second, shape, layout, N, K, prc); } - const auto L2_cache_size = dnnl::utils::get_cache_size(2, true); - const auto fit_into_L2 = data_size < L2_cache_size; - // Heuristic: If external repacking data doesn't fit in the cache L2, + const auto cache_size = dnnl::utils::get_cache_size(1, true) + dnnl::utils::get_cache_size(2, true); + const auto fit_into_cache = data_size < cache_size; + // Heuristic: If external repacking data doesn't fit in the caches L1 and L2, // external repacking should be executed in seperate parallel section before kernel execution. - cpu_config->repacking_impl_type = - fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL : CPURuntimeConfig::RepackingImplType::SEPARATE; + cpu_config->repacking_impl_type = fit_into_cache ? 
CPURuntimeConfig::RepackingImplType::IN_PARALLEL + : CPURuntimeConfig::RepackingImplType::SEPARATE; const auto is_impl_parallel = cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL; @@ -131,7 +131,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin // Save original input offsets for input before repacking. // If the shape has not been changed, it means that we already created `RepackedInput` for this input // on previous pass call and now `cpu_config->io_data_offsets[i]` contains offsets not for original input - - // they were updated for blocked shapes/zeroed for previous initialization and we canonot use them as original + // they were updated for blocked shapes/zeroed for previous initialization and we cannot use them as original // offsets. const auto in_offsets = shape == cpu_config->latest_shapes[i] ? repacked_in.in_offsets() : cpu_config->io_data_offsets[i]; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp index 0f5a6472b741f4..0186e5b66030ca 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp @@ -22,7 +22,7 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_load_convert( const auto& load_output = input_connector->get_source(); const auto& load_expr = load_output.get_expr(); const auto load = ov::as_type_ptr(load_expr->get_node()); - if (!load || ov::is_type(load_expr->get_node()) || + if (!load || ov::is_type(load_expr->get_node()) || ov::is_type(load_expr->get_node())) return false; diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp 
b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index 9ace85b3038afa..7c425b0bca6781 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -149,7 +149,7 @@ class MHAFP32BufferAllocationTest : public BufferAllocationCPUTest { const auto parameter2 = std::make_shared(ov::element::f32, shapes[2]); const auto order = std::vector{0, 2, 3, 1}; - const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto relu0 = std::make_shared(store); const auto brgemm_cpu0 = std::make_shared(parameter0, relu0, BRGEMM_TYPE::STAND_ALONE); @@ -199,7 +199,7 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto parameter2 = std::make_shared(ov::element::bf16, shapes[2]); const auto order = std::vector{0, 2, 3, 1}; - const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto convert0 = std::make_shared(store, ov::element::f32); const auto relu0 = std::make_shared(convert0);