From f104183ee361978f97374d8c2e762c7dcc0a697a Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Fri, 3 Jan 2025 07:54:02 +0100 Subject: [PATCH] [Snippets][CPU] Applied Vladislav & Ivan comments 3 --- .../snippets/include/snippets/op/load.hpp | 10 +++---- .../include/snippets/snippets_isa_tbl.hpp | 2 +- .../pass/mark_invariant_shape_path.cpp | 2 +- src/common/snippets/src/op/load.cpp | 28 +++++++++---------- src/common/snippets/src/op/reorder.cpp | 2 +- .../src/pass/transpose_decomposition.cpp | 4 +-- .../snippets/src/runtime_configurator.cpp | 21 ++++++++------ .../src/shape_inference/shape_inference.cpp | 2 +- .../pass/extracted_loop_invariants.cpp | 4 +-- .../snippets/cpu_runtime_configurator.cpp | 2 +- .../snippets/cpu_runtime_configurator.hpp | 2 +- .../src/emitters/snippets/repacked_input.hpp | 1 + .../emitters/snippets/x64/cpu_generator.cpp | 2 +- .../x64/kernel_executors/brgemm_copy_b.cpp | 6 ++-- .../x64/kernel_executors/brgemm_copy_b.hpp | 2 +- src/plugins/intel_cpu/src/extension.cpp | 2 +- .../src/nodes/executors/subgraph.cpp | 15 +++------- .../src/nodes/executors/subgraph.hpp | 14 ++++------ .../x64/pass/eliminate_brgemm_copy_b.cpp | 5 ++-- .../lowered/external_repacking_adjuster.cpp | 12 ++++---- .../lowered/fuse_load_store_and_convert.cpp | 2 +- .../x64/lowered/buffer_allocation.cpp | 4 +-- 22 files changed, 69 insertions(+), 75 deletions(-) diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index bca4b09fabdcbd..d0a168483bc5ce 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op { }; /** - * @interface LoadReshape + * @interface LoadReorder * @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak * shape propagation. 
We need it to keep correct shape propagation when Transpose is decomposed to * Load and Store. This is a temporary solution until tokenization of Reshape operation is supported. * @ingroup snippets */ -class LoadReshape : public Load { +class LoadReorder : public Load { public: - OPENVINO_OP("LoadReshape", "SnippetsOpset", Load); - LoadReshape(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); - LoadReshape() = default; + OPENVINO_OP("LoadReorder", "SnippetsOpset", Load); + LoadReorder(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); + LoadReorder() = default; void set_offset(size_t offset) { set_output_offset(offset, 0); } void set_count(size_t count) { set_output_count(count, 0); } diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index e9174283bf37dd..9dc416b3f7e38f 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -11,7 +11,7 @@ // SnippetS dialect OV_OP(Load, ov::snippets::op) -OV_OP(LoadReshape, ov::snippets::op) +OV_OP(LoadReorder, ov::snippets::op) OV_OP(LoopBegin, ov::snippets::op) OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) diff --git a/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp b/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp index b32056d4e32a57..16d4160f1aaeb2 100644 --- a/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp +++ b/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp @@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) { const auto& node = expr->get_node(); return ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node); } } // namespace diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index 
461fec8b1399c0..9bd1e4c7bc8706 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -41,19 +41,19 @@ std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) return std::make_shared(new_args.at(0), get_count(), get_offset()); } -LoadReshape::LoadReshape(const Output& x, const size_t count, const size_t offset, std::vector order) +LoadReorder::LoadReorder(const Output& x, const size_t count, const size_t offset, std::vector order) : Load(x, count, offset), m_order(std::move(order)) { const auto& in_shape = x.get_partial_shape(); const auto in_shape_size = in_shape.size(); - OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size"); + OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size"); OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 && - *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order"); + *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order"); const std::set unique_dims(order.begin(), order.end()); - OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements"); + OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements"); constructor_validate_and_infer_types(); } -void LoadReshape::validate_and_infer_types() { +void LoadReorder::validate_and_infer_types() { validate_memory_access_params(); const auto& old_shape = get_input_partial_shape(0); ov::PartialShape new_shape; @@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), new_shape); } -bool LoadReshape::visit_attributes(AttributeVisitor& visitor) { +bool LoadReorder::visit_attributes(AttributeVisitor& visitor) { MemoryAccess::visit_attributes(visitor); visitor.on_attribute("order", 
m_order); return true; } -std::shared_ptr LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(LoadReshape); +std::shared_ptr LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(LoadReorder); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), get_count(), get_offset(), m_order); + return std::make_shared(new_args.at(0), get_count(), get_offset(), m_order); } -LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { - const auto& loadReshape = ov::as_type_ptr(n); - OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer"); - m_order = loadReshape->m_order; +LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& loadReorder = ov::as_type_ptr(n); + OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer"); + m_order = loadReorder->m_order; } -IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector& input_shapes) { +IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector& input_shapes) { OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes"); return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success}; } diff --git a/src/common/snippets/src/op/reorder.cpp b/src/common/snippets/src/op/reorder.cpp index 95cd2375dcc2ce..43d8387a8cb2fb 100644 --- a/src/common/snippets/src/op/reorder.cpp +++ b/src/common/snippets/src/op/reorder.cpp @@ -31,7 +31,7 @@ void Reorder::custom_constructor_validate_and_infer_types(std::vector or void Reorder::validate_and_infer_types() { const auto& input_pshape = get_input_partial_shape(0); - const auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), 
"Incompatible shape and order sizes"); const auto output_pshape = utils::get_planar_pshape(get_input_partial_shape(0), order); diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index 5c29b493af5826..a433cd41377422 100644 --- a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -60,9 +60,9 @@ TransposeDecomposition::TransposeDecomposition() { const auto subtensor = std::vector{1}; const auto& layout = order->cast_vector(); - // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation + // todo: LoadReorder used here is essentially Load + an easy way to maintain correct shape propagation // fix this in future and develop a more consistent shape propagation approach. - auto load = std::make_shared(data_input, subtensor[0], 0, layout); + auto load = std::make_shared(data_input, subtensor[0], 0, layout); auto store = std::make_shared(load, subtensor[0]); PortDescriptorUtils::set_port_descriptor(load->input(0), subtensor, layout); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index c23c4e908f24bf..5527cebb63f24f 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -120,15 +120,18 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) const auto& shape_infer_seq = utils::get_first_child_shape_infer_expr_seq(param); ExpressionPtr mem_desc_expr = param; if (!shape_infer_seq.empty()) { - // If there is Reorder, we should take its desc because it affects on shape by target order - const auto& reordered_reshape_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(), - [](const ExpressionPtr& expr) { - return ov::is_type(expr->get_node()); - }); - if (reordered_reshape_it != shape_infer_seq.cend()) { - const auto& 
reshape = *reordered_reshape_it; - const auto& etype = reshape->get_node()->get_output_element_type(0); - update_io_parameters(reshape->get_input_port_descriptor(0), etype); + // [160048] Reorder, as any other ShapeInferOp, should just propagate input shape to output using target order + // without data movement. However, currently we have to save desc of input of the Reorder + // to support correct input data offsets calculations and MHAParallelWAOptimizer pass work. + // Please remove this code part when the mentioned ticket is completed. + const auto& reorder_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(), + [](const ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }); + if (reorder_it != shape_infer_seq.cend()) { + const auto& reorder = *reorder_it; + const auto& etype = reorder->get_node()->get_output_element_type(0); + update_io_parameters(reorder->get_input_port_descriptor(0), etype); continue; } diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index 62800ec1294a8b..0e3060501a87d5 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -64,7 +64,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry // Note that Result has no output PortConnectors, so the shape must be empty SHAPE_INFER_PREDEFINED(ov::op::v0::Result, EmptyShapeInfer), // - SHAPE_INFER_OP_SPECIFIC(op::LoadReshape), + SHAPE_INFER_OP_SPECIFIC(op::LoadReorder), SHAPE_INFER_OP_SPECIFIC(op::Reshape), SHAPE_INFER_OP_SPECIFIC(op::Reorder), SHAPE_INFER_OP_SPECIFIC(op::RankNormalization), diff --git a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp index ee76c5af7234d8..b9ff7bda6823ed 100644 --- 
a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp +++ b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp @@ -299,7 +299,7 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { * * Param0(32,8,1) * | - * LoadReshape with order (1,2,0) + * LoadReorder with order (1,2,0) * | * Store * | @@ -307,7 +307,7 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { */ { auto param = linear_ir->push_node(input_precision, input_shape_0); - auto load_reshape = linear_ir->push_node(param.second, 1, 0, layout); + auto load_reshape = linear_ir->push_node(param.second, 1, 0, layout); auto store = linear_ir->push_node(load_reshape.second, 1, 0); init_expr_descriptors(*load_reshape.first, {subtensor, subtensor}, {order, layout}); init_expr_descriptors(*store.first, {subtensor, subtensor}, {layout, layout}); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 43b3ea14cc148a..3ad41d707bb96b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 # include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" # include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 20ef3793f56766..425959c289b3a7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -35,7 +35,7 @@ class CPURuntimeConfig : 
public ov::snippets::RuntimeConfig { class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: - CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache = {}); + CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache); /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig diff --git a/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp index 545e29eaa6915a..61daaa859ef603 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp @@ -12,6 +12,7 @@ namespace intel_cpu { struct RepackedInputKernel { RepackedInputKernel() = default; virtual ~RepackedInputKernel() = default; + virtual void operator()(const void* args) const = 0; }; struct RepackedInput { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index c8d5ea5229fcd3..31daa32dfa144f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -180,7 +180,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::Reorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[snippets::op::LoadReshape::get_type_info_static()] = + jitters[snippets::op::LoadReorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index 30d7e13b9d7bb5..7aca5f6c6a696f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -170,9 +170,11 @@ status_t BrgemmCopyBKernel::create_kernel() { return code; } -void BrgemmCopyBKernel::operator()(const call_args* args) const { +void BrgemmCopyBKernel::operator()(const void* args) const { + const auto* call_args = reinterpret_cast(args); + OV_CPU_JIT_EMITTER_ASSERT(call_args, "Call arguments are nullptr!"); OV_CPU_JIT_EMITTER_ASSERT(ker_, "Kernel is nullptr"); - ker_(args); + ker_(call_args); } void BrgemmCopyBKernel::init_brgemm_copy_b_kernel( diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp index 16ed18e68a01bb..5ef740067f2035 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp @@ -153,7 +153,7 @@ struct BrgemmCopyBKernel : public RepackedInputKernel, public dnnl::impl::cpu::x dnnl::impl::status_t create_kernel() override; - void operator()(const call_args* args) const; + void operator()(const void* args) const override; private: void generate() override; diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index bdb5211009a22a..762568823d205f 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -170,7 +170,7 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::snippets::op::KernelStatic) \ OP_EXTENSION(ov::snippets::op::KernelDynamic) \ OP_EXTENSION(ov::snippets::op::Load) \ - OP_EXTENSION(ov::snippets::op::LoadReshape) \ + OP_EXTENSION(ov::snippets::op::LoadReorder) 
\ OP_EXTENSION(ov::snippets::op::LoopBegin) \ OP_EXTENSION(ov::snippets::op::LoopEnd) \ OP_EXTENSION(ov::snippets::op::Buffer) \ diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp index b55e5263708268..739ae56be3b4ff 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp @@ -3,11 +3,7 @@ // #include "nodes/executors/subgraph.hpp" -#if defined(OPENVINO_ARCH_ARM64) -# include "emitters/snippets/aarch64/cpu_generator.hpp" -#else -# include "emitters/snippets/x64/cpu_generator.hpp" -#endif + #include "common/primitive_hashing_utils.hpp" #include "openvino/core/parallel.hpp" @@ -102,9 +98,8 @@ void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr& domain) { init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain); } -void SubgraphBaseExecutor::parallel_for6d( - const std::function& initializer, - const std::function&, size_t)>& caller) { + +void SubgraphBaseExecutor::parallel_for6d(const initializer_functor& initializer, const call_functor& caller) { const auto& dom = m_parallel_exec_domain; parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { @@ -142,9 +137,7 @@ void SubgraphBaseExecutor::parallel_for6d( }); } -void SubgraphBaseExecutor::parallel_forNd( - const std::function& initializer, - const std::function&, size_t)>& caller) { +void SubgraphBaseExecutor::parallel_forNd(const initializer_functor& initializer, const call_functor& caller) { const auto& dom = m_parallel_exec_domain; parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp index a1762c3c72e22e..78cb56440203d2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp +++ 
b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp @@ -66,21 +66,17 @@ class SubgraphBaseExecutor { protected: virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; - virtual void parallel_for6d( - const std::function& initializer, - const std::function&, size_t)>& caller); - virtual void parallel_forNd( - const std::function& initializer, - const std::function&, size_t)>& caller); + using initializer_functor = std::function; + using call_functor = std::function&, size_t)>; + + virtual void parallel_for6d(const initializer_functor& initializer, const call_functor& caller); + virtual void parallel_forNd(const initializer_functor& initializer, const call_functor& caller); inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { if (m_buffer_scratchpad_size > 0) scratchpad_ptr = m_buffer_scratchpad->getDataAs() + ithr * m_buffer_scratchpad_size; } - using initializer_functor = std::function; - using call_functor = std::function&, size_t)>; - std::shared_ptr m_schedule; // Holds index of output used as in execution domain // it should be compatible with a schedule's work size diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index 711c33a085859e..b87a78c6b0cb40 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -41,14 +41,13 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { transformation_callback(copy_b_node)) return false; - // If there is non-empty and non-planar layout, we should insert reshape to support shape inference + // If there is non-planar layout, we should insert reshape to support shape inference if (!ov::snippets::utils::is_planar_layout(layout)) { const auto& subtensor = in_desc->get_subtensor(); const 
auto& reshape = std::make_shared(copy_b_node->input_value(0), layout); ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout); ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor); - ov::replace_node_update_name(copy_b_node, reshape); - return true; + return ov::replace_node_update_name(copy_b_node, reshape); } // If there is no layout, we can just remove BrgemmCopyB from the subgraph diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index faff40ec3a54be..add7c66d3d7ffc 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -103,12 +103,12 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin update_kernel(p.second, shape, layout, N, K, prc); } - const auto L2_cache_size = dnnl::utils::get_cache_size(2, true); - const auto fit_into_L2 = data_size < L2_cache_size; - // Heuristic: If external repacking data doesn't fit in the cache L2, + const auto cache_size = dnnl::utils::get_cache_size(1, true) + dnnl::utils::get_cache_size(2, true); + const auto fit_into_cache = data_size < cache_size; + // Heuristic: If external repacking data doesn't fit in the caches L1 and L2, // external repacking should be executed in seperate parallel section before kernel execution. - cpu_config->repacking_impl_type = - fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL : CPURuntimeConfig::RepackingImplType::SEPARATE; + cpu_config->repacking_impl_type = fit_into_cache ? 
CPURuntimeConfig::RepackingImplType::IN_PARALLEL + : CPURuntimeConfig::RepackingImplType::SEPARATE; const auto is_impl_parallel = cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL; @@ -131,7 +131,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin // Save original input offsets for input before repacking. // If the shape has not been changed, it means that we already created `RepackedInput` for this input // on previous pass call and now `cpu_config->io_data_offsets[i]` contains offsets not for original input - - // they were updated for blocked shapes/zeroed for previous initialization and we canonot use them as original + // they were updated for blocked shapes/zeroed for previous initialization and we cannot use them as original // offsets. const auto in_offsets = shape == cpu_config->latest_shapes[i] ? repacked_in.in_offsets() : cpu_config->io_data_offsets[i]; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp index 0f5a6472b741f4..0186e5b66030ca 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp @@ -22,7 +22,7 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_load_convert( const auto& load_output = input_connector->get_source(); const auto& load_expr = load_output.get_expr(); const auto load = ov::as_type_ptr(load_expr->get_node()); - if (!load || ov::is_type(load_expr->get_node()) || + if (!load || ov::is_type(load_expr->get_node()) || ov::is_type(load_expr->get_node())) return false; diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp 
b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index 9ace85b3038afa..7c425b0bca6781 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -149,7 +149,7 @@ class MHAFP32BufferAllocationTest : public BufferAllocationCPUTest { const auto parameter2 = std::make_shared(ov::element::f32, shapes[2]); const auto order = std::vector{0, 2, 3, 1}; - const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto relu0 = std::make_shared(store); const auto brgemm_cpu0 = std::make_shared(parameter0, relu0, BRGEMM_TYPE::STAND_ALONE); @@ -199,7 +199,7 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto parameter2 = std::make_shared(ov::element::bf16, shapes[2]); const auto order = std::vector{0, 2, 3, 1}; - const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto convert0 = std::make_shared(store, ov::element::f32); const auto relu0 = std::make_shared(convert0);