diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp
index 531d101cb23d14..7b13f5e3053b0b 100644
--- a/src/common/snippets/include/snippets/op/brgemm.hpp
+++ b/src/common/snippets/include/snippets/op/brgemm.hpp
@@ -26,6 +26,10 @@ class Brgemm : public MemoryAccess {
     bool transposed_a() const { return m_transposed_a; }
     bool transposed_b() const { return m_transposed_b; }
 
+    size_t get_offset_a() const { return get_input_port_descriptor(0).m_offset; }
+    size_t get_offset_b() const { return get_input_port_descriptor(1).m_offset; }
+    size_t get_offset_c() const { return get_output_port_descriptor(0).m_offset; }
+
     bool visit_attributes(AttributeVisitor& visitor) override;
     void validate_and_infer_types() override;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
diff --git a/src/common/snippets/include/snippets/op/broadcastload.hpp b/src/common/snippets/include/snippets/op/broadcastload.hpp
index 6268fdd736a722..31eac83757b506 100644
--- a/src/common/snippets/include/snippets/op/broadcastload.hpp
+++ b/src/common/snippets/include/snippets/op/broadcastload.hpp
@@ -24,6 +24,8 @@ class BroadcastLoad : public MemoryAccess {
     BroadcastLoad(const Output<Node>& x, ov::PartialShape output_shape, size_t offset = 0lu);
     BroadcastLoad() = default;
 
+    size_t get_offset() const { return get_input_port_descriptor(0).m_offset; }
+
     bool visit_attributes(AttributeVisitor& visitor) override;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
     void validate_and_infer_types() override;
diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp
index 5150cdc9327fc9..70da0c5dab9f22 100644
--- a/src/common/snippets/include/snippets/op/buffer.hpp
+++ b/src/common/snippets/include/snippets/op/buffer.hpp
@@ -12,14 +12,7 @@ namespace op {
 
 /**
  * @interface Buffer
- * @brief The operation is for intermediate data storage
- * TODO
- * - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank].
- *   It's needed to allocate needed memory size that depends on Tile rank, for example.
- *   Default value is -1 (full shape)
- * - m_static_shape - static shape that describes Buffer size in cases when Buffer doesn't have parent node.
- * - m_element_type - element type in cases when Buffer doesn't have parent node.
- * - m_single - True if Buffer doesn't have parent node else False
+ * @brief This is a base class for memory storage.
  * Notes:
  *       - All buffers in a graph share the same memory pointer. So if there are several buffers,
  *         each MemoryAccess op that works with a Buffer should have an offset from this common memory pointer
  * @ingroup snippets
  */
 class Buffer : public ngraph::op::Op {
@@ -39,8 +32,8 @@ class Buffer : public ngraph::op::Op {
 
 /**
  * @interface AllocationBuffer
- * @brief The operation is for allocation new empty memory
- * TODO
+ * @brief The operation allocates new empty memory. Its single input defines the allocation shape.
+ *        - m_element_type - element type of the allocated memory
  * @ingroup snippets
  */
 class AllocationBuffer : public Buffer {
@@ -62,8 +55,16 @@ class AllocationBuffer : public Buffer {
 
 /**
  * @interface IntermediateBuffer
- * @brief The operation is for intermediate data storage
- * TODO
+ * @brief The operation is for intermediate data storage.
+ *        If the Buffer has only one parent, it allocates the full amount of memory described by its input shape.
+ *        If the Buffer has a second parent as well, it allocates only the memory described by the values of the
+ *        second input, but keeps the first input's shape and element type for shape inference.
+ *        For example,
+ *        Parameter [5, 3, 128]    Constant [2] (with values {3, 128})
+ *                       \             /
+ *               Buffer with allocated memory of 3x128 size
+ *                             |
+ *                      Result [5, 3, 128]
  * @ingroup snippets
  */
 class IntermediateBuffer : public Buffer {
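For illustration, here is a minimal sketch of the Parameter/Constant/Buffer pattern described in the IntermediateBuffer comment above. The names and the Constant-based allocation shape are hypothetical (subgraph_lowered.cpp later in this patch passes the extra shape input to IntermediateBuffer in the same way):

    // Hypothetical usage sketch, not part of the patch: allocates a 3x128 scratch area
    // while keeping the full [5, 3, 128] shape and element type for shape inference.
    auto param = std::make_shared<ngraph::opset1::Parameter>(ov::element::f32, ov::Shape{5, 3, 128});
    auto alloc_shape = std::make_shared<ngraph::opset1::Constant>(ov::element::i32, ov::Shape{2},
                                                                  std::vector<int32_t>{3, 128});
    auto buffer = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(param->output(0), alloc_shape);
    auto result = std::make_shared<ngraph::opset1::Result>(buffer);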
diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp
index 2639918dd1cd7c..67b8c18505243c 100644
--- a/src/common/snippets/include/snippets/op/load.hpp
+++ b/src/common/snippets/include/snippets/op/load.hpp
@@ -25,6 +25,9 @@ class Load : public MemoryAccess {
     Load(const Output<Node>& x, const size_t count = 1lu, const size_t offset = 0lu);
     Load() = default;
 
+    size_t get_offset() const { return get_input_port_descriptor(0).m_offset; }
+    size_t get_count() const { return get_input_port_descriptor(0).m_count; }
+
     void validate_and_infer_types() override;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
 };
diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp
index 74245daf71d021..4c86e91d887da1 100644
--- a/src/common/snippets/include/snippets/op/store.hpp
+++ b/src/common/snippets/include/snippets/op/store.hpp
@@ -25,6 +25,9 @@ class Store : public MemoryAccess {
     Store(const Output<Node>& x, const size_t count = 1lu, const size_t offset = 0lu);
     Store() = default;
 
+    size_t get_offset() const { return get_output_port_descriptor(0).m_offset; }
+    size_t get_count() const { return get_output_port_descriptor(0).m_count; }
+
     void validate_and_infer_types() override;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
 };
diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp
index 059e0d74087419..9352884ca740f3 100644
--- a/src/common/snippets/src/op/memory_access.cpp
+++ b/src/common/snippets/src/op/memory_access.cpp
@@ -37,7 +37,7 @@ void MemoryAccess::set_input_port_descriptor(const PortDescriptor& desc, const size_t i) {
 }
 
 PortDescriptor MemoryAccess::get_input_port_descriptor(const size_t i) const {
-    // We cannot use the same approach as in ov::Node::get_input_descriptor because this method must be static
+    // We cannot use the same approach as in ov::Node::get_input_descriptor because this method must be const
     // to allow calling it from the const Derived::clone_with_new_inputs() method
     NGRAPH_CHECK(i < m_input_ports.size(), "Index of input port descriptor should be less than count of input ports");
     return m_input_ports[i];
@@ -60,7 +60,7 @@ void MemoryAccess::set_output_port_descriptor(const PortDescriptor& desc, const size_t i) {
 }
 
 PortDescriptor MemoryAccess::get_output_port_descriptor(const size_t i) const {
-    // We cannot use the same approach as in ov::Node::get_input_descriptor because this method must be static
+    // We cannot use the same approach as in ov::Node::get_input_descriptor because this method must be const
     // to allow calling it from the const Derived::clone_with_new_inputs() method
     NGRAPH_CHECK(i < m_output_ports.size(), "Index of output port descriptor should be less than count of output ports");
     return m_output_ports[i];
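The const-ness called out in these comments matters because ov::Node::clone_with_new_inputs() is itself a const member. As a plausible illustration (this patch does not show Load's clone implementation), a derived op would rebuild itself from its port descriptors like this, which is only possible if the getters are const:

    // Hypothetical sketch: clone_with_new_inputs() is const, so it can only call
    // const members such as get_count()/get_offset() and the descriptor getters they wrap.
    std::shared_ptr<Node> Load::clone_with_new_inputs(const OutputVector& new_args) const {
        check_new_args_count(this, new_args);
        return std::make_shared<Load>(new_args.at(0), get_count(), get_offset());
    }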
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index 2030a1d712cb41..1fc85f61ac96fb 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -448,8 +448,8 @@ void snippets::op::Subgraph::initialize_buffer_scratchpad_size() {
         // Propagate up to the Store: a Buffer can have only one Store
         {
-            if (buffer->get_input_size() > 0) {
-                auto parent = buffer->get_input_node_shared_ptr(0);
+            auto parent = buffer->get_input_node_shared_ptr(0);
+            if (!ov::is_type(parent)) {
                 auto idx = buffer->input(0).get_source_output().get_index();
                 while (ov::is_type(parent)) {
                     const auto source_output = parent->input_value(idx);
@@ -498,13 +498,6 @@ void snippets::op::Subgraph::initialize_buffer_scratchpad_size() {
     for (const auto& op : ops) {
         if (const auto buffer = ov::as_type_ptr(op)) {
             const auto buffer_size = buffer->get_byte_size();
-            if (ov::is_type(op)) {
-                if (op->get_input_size() == 2) {
-                    op->set_arguments({op->get_input_source_output(0)});
-                }
-            } else if (ov::is_type(op)) {
-                op->set_arguments(ov::OutputVector{});
-            }
             // We need to allocate memory at least for the first buffer
             if (m_buffer_scratchpad == 0) {
                 m_buffer_scratchpad += buffer_size;
diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp
index 0805df1c5e2f33..8f0e1c2c015e6b 100644
--- a/src/common/snippets/src/pass/assign_registers.cpp
+++ b/src/common/snippets/src/pass/assign_registers.cpp
@@ -9,6 +9,14 @@
 namespace {
 static constexpr size_t reg_count = 16lu;
+
+auto filter_ops(const std::shared_ptr& op) -> bool {
+    if (ov::is_type(op) &&
+        ov::is_type(op->get_output_target_inputs(0).begin()->get_node()))
+        return false;
+    return true;
+}
+
 }  // namespace
 
 bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
@@ -19,8 +27,12 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
     auto ops = f->get_ordered_ops();
     std::vector>> typed_ops;
-    for (const auto& op : ops)
-        typed_ops.emplace_back(std::make_pair(m_target_machine->get_op_reg_type(op), op));
+    for (const auto& op : ops) {
+        if (filter_ops(op)) {
+            typed_ops.emplace_back(std::make_pair(m_target_machine->get_op_reg_type(op), op));
+        }
+    }
+
     size_t counter_vec = 0;
     size_t counter_gpr = 0;
     std::map regs_vec, regs_gpr;
@@ -120,8 +132,12 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
     for (size_t i = 0; i < typed_ops.size(); i++) {
         const auto& t_op = typed_ops[i];
         std::vector used_tensors, defined_tensors;
-        for (const auto& in : t_op.second->inputs())
+        for (const auto& in : t_op.second->inputs()) {
+            if (ov::is_type(t_op.second) &&
+                ov::is_type(t_op.second->get_input_node_shared_ptr(in.get_index())))
+                continue;
             used_tensors.push_back(in.get_tensor_ptr());
+        }
         for (const auto& out : t_op.second->outputs())
             defined_tensors.push_back(out.get_tensor_ptr());
         switch (t_op.first) {
diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp
index c7afd25ce8af1d..f485a002276f27 100644
--- a/src/common/snippets/src/pass/insert_loops.cpp
+++ b/src/common/snippets/src/pass/insert_loops.cpp
@@ -222,9 +222,6 @@ bool InsertLoops::run_on_model(const std::shared_ptr<ov::Model> &model) {
     if (m_master_shape.is_dynamic())
         throw ngraph_error("InsertLoops doesn't support dynamic shapes yet");
 
-    ov::pass::Serialize("/home/a-sidorova/projects/mha_matmul/openvino/graphs/loops.xml",
-                        "/home/a-sidorova/projects/mha_matmul/openvino/graphs/loops.bin").run_on_model(model);
-
     const auto inner_work_amount = utils::get_inner_dim(m_master_shape).get_length();
     const auto outer_work_amount = m_loop_depth == 2 ? utils::get_outer_dim(m_master_shape).get_length() : 1;
@@ -285,9 +282,6 @@ bool InsertLoops::run_on_model(const std::shared_ptr<ov::Model> &model) {
         }
     }
 
-    ov::pass::Serialize("/home/a-sidorova/projects/mha_matmul/openvino/graphs/loops_after.xml",
-                        "/home/a-sidorova/projects/mha_matmul/openvino/graphs/loops_after.bin").run_on_model(model);
-
     return true;
 }
 
diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp
index 63d30e0a65223e..85f71778e36609 100644
--- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp
+++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp
@@ -50,7 +50,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa)
     jitters[ngraph::snippets::op::AllocationBuffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
     jitters[ngraph::snippets::op::IntermediateBuffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
     jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(VectorBufferEmitter);
-    // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported
+    jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(NopEmitter); // Constant is allowed only as a Buffer shape input, so no code is emitted
 
     jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
     jitters[ngraph::snippets::op::LoadReshape::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
index 60b99c23f98094..6d2d80dc36863b 100644
--- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
@@ -548,9 +548,8 @@ StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
         IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
 
     const auto store = ov::as_type_ptr<snippets::op::Store>(n);
-    const auto desc = store->get_output_port_descriptor(0);
-    count = desc.m_count;
-    byte_offset = desc.m_offset;
+    count = store->get_count();
+    byte_offset = store->get_offset();
     in_out_type_ = emitter_in_out_map::vec_to_gpr;
     store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count));
 }
@@ -590,9 +589,8 @@ LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
         IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
 
     const auto load = std::dynamic_pointer_cast<snippets::op::Load>(n);
-    const auto desc = load->get_input_port_descriptor(0);
-    count = desc.m_count;
-    byte_offset = desc.m_offset;
+    count = load->get_count();
+    byte_offset = load->get_offset();
     in_out_type_ = emitter_in_out_map::gpr_to_vec;
     load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));
 }
@@ -632,8 +630,7 @@ BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
         IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
 
     const auto broadcast_load = std::dynamic_pointer_cast<snippets::op::BroadcastLoad>(n);
-    const auto desc = broadcast_load->get_input_port_descriptor(0);
-    byte_offset = desc.m_offset;
+    byte_offset = broadcast_load->get_offset();
     in_out_type_ = emitter_in_out_map::gpr_to_vec;
 }
 
@@ -673,9 +670,8 @@ void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::vector &out) const {
 LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
     : MemoryEmitter(h, isa, n) {
     const auto load = ov::as_type_ptr<snippets::op::Load>(n);
-    const auto desc = load->get_input_port_descriptor(0);
-    count = desc.m_count;
-    byte_offset = desc.m_offset;
+    count = load->get_count();
+    byte_offset = load->get_offset();
     in_out_type_ = emitter_in_out_map::gpr_to_vec;
     load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));
 }
@@ -710,9 +706,8 @@ void LoadConvertEmitter::emit_data() const {
 
 StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
     : MemoryEmitter(h, isa, n) {
     const auto store = ov::as_type_ptr<snippets::op::Store>(n);
-    const auto desc = store->get_output_port_descriptor(0);
-    count = desc.m_count;
-    byte_offset = desc.m_offset;
+    count = store->get_count();
+    byte_offset = store->get_offset();
     in_out_type_ = emitter_in_out_map::vec_to_gpr;
 
     if (ov::is_type(n)) {
@@ -848,10 +843,11 @@ BrgemmEmitter::BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
         }
     }
 
-    load_offset_a = brgemm_node->get_input_port_descriptor(0).m_offset;
-    load_offset_b = brgemm_node->get_input_port_descriptor(1).m_offset;
-    load_offset_scratch = brgemm_node->get_input_port_descriptor(2).m_offset;
-    store_offset_c = brgemm_node->get_output_port_descriptor(0).m_offset;
+    load_offset_a = brgemm_node->get_offset_a();
+    load_offset_b = brgemm_node->get_offset_b();
+    store_offset_c = brgemm_node->get_offset_c();
+    if (with_scratch)
+        load_offset_scratch = brgemm_node->get_offset_scratch();
 }
 
 void BrgemmEmitter::initBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, bool use_amx) const {
@@ -938,8 +934,35 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, const brgemmCtx& ctx,
                                             Reg64 addr_A, Reg64 addr_B, Reg64 scratch, Reg64 addr_C,
                                             const size_t in0_kernel_offset, const size_t in1_kernel_offset,
                                             const size_t in2_kernel_offset, const size_t out0_kernel_offset) const {
-    if (ctx.is_with_amx)
-        amx_tile_configure(ctx.palette);
+    if (ctx.is_with_amx) {
+        size_t gpr_size = 8;
+        Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
+                                         h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
+        size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
+
+        h->sub(h->rsp, n_gprs_to_save * gpr_size);
+        for (size_t i = 0; i < n_gprs_to_save; ++i)
+            h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
+
+        // save function address in gpr to pass in call instruction
+        const auto& overload = static_cast<status_t(*)(const char*)>(amx_tile_configure);
+        h->mov(h->rbp, reinterpret_cast<uintptr_t>(overload));
+        h->mov(abi_param1, reinterpret_cast<uintptr_t>(ctx.palette));
+
+        // align stack on 16-byte as ABI requires
+        // note that RBX must not be changed by the callee
+        h->mov(h->rbx, h->rsp);
+        h->and_(h->rbx, 0xf);
+        h->sub(h->rsp, h->rbx);
+
+        h->call(h->rbp);
+
+        h->add(h->rsp, h->rbx);
+        // restore gpr registers
+        for (int i = n_gprs_to_save - 1; i >= 0; --i)
+            h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
+        h->add(h->rsp, n_gprs_to_save * gpr_size);
+    }
 
     size_t gpr_size = 8;
     Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
@@ -1073,10 +1096,10 @@ BrgemmCopyBEmitter::BrgemmCopyBEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
     brgemm_prc_in1 = brgemm_repack->get_input_element_type(0);
     brgemmVNNIFactor = 4 / brgemm_prc_in0.size();
     with_comp = brgemm_repack->is_with_comp();
-    in_offset = brgemm_repack->get_input_port_descriptor(0).m_offset;
-    out_offset = brgemm_repack->get_output_port_descriptor(0).m_offset;
+    in_offset = brgemm_repack->get_offset_in();
+    out_offset = brgemm_repack->get_offset_out();
     if (with_comp)
-        comp_offset = brgemm_repack->get_output_port_descriptor(1).m_offset;
+        comp_offset = brgemm_repack->get_offset_comp();
 
     auto layout = ngraph::snippets::utils::get_node_output_layout(brgemm_repack->get_input_node_shared_ptr(0));
     const auto& original_shape = brgemm_repack->get_input_shape(0);
@@ -1113,7 +1136,7 @@ BrgemmCopyBEmitter::BrgemmCopyBEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
     const auto dt_in0 = static_cast<dnnl_data_type_t>(DnnlExtensionUtils::IEPrecisionToDataType(InferenceEngine::details::convertPrecision(brgemm_prc_in0)));
     const auto dt_in1 = static_cast<dnnl_data_type_t>(DnnlExtensionUtils::IEPrecisionToDataType(InferenceEngine::details::convertPrecision(brgemm_prc_in1)));
-    init_brgemm_copy(kernel, leading_dimension, N_blk, N_tail, LDB, (K_tail == 0 ? K : K_tail), use_amx, dt_in0, dt_in1);
+    init_brgemm_copy(kernel, leading_dimension, N_blk, N_tail, LDB, K - K_tail, use_amx, dt_in0, dt_in1);
 }
 
 void BrgemmCopyBEmitter::init_brgemm_copy(std::unique_ptr& kernel,
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/snippets_transformations/brgemm_to_brgemm_cpu.cpp
index b9305748ebbe55..f8f5e2c2f600bc 100644
--- a/src/plugins/intel_cpu/src/snippets_transformations/brgemm_to_brgemm_cpu.cpp
+++ b/src/plugins/intel_cpu/src/snippets_transformations/brgemm_to_brgemm_cpu.cpp
@@ -31,7 +31,8 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() {
         OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::BrgemmToBrgemmCPU")
         auto& pm = m.get_pattern_value_map();
         const auto brgemm = ov::as_type_ptr<ngraph::snippets::op::Brgemm>(pm.at(m_brgemm).get_node_shared_ptr());
-        if (!brgemm)
+        const auto brgemm_plugin = ov::as_type_ptr<BrgemmCPU>(pm.at(m_brgemm).get_node_shared_ptr());
+        if (!brgemm || brgemm_plugin)
             return false;
 
         if (brgemm->get_input_partial_shape(0).is_dynamic() || brgemm->get_input_partial_shape(1).is_dynamic()) {
@@ -70,8 +71,8 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() {
         if (with_amx || with_comp) {
             std::shared_ptr scratch = nullptr;
             if (with_amx) {
-                const auto scratch_size = std::make_shared<ngraph::opset1::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int32_t>{4 * 1024});
-                scratch = std::make_shared<ngraph::snippets::op::AllocationBuffer>(scratch_size, ov::element::i32);
+                const auto scratch_size = std::make_shared<ngraph::opset1::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int32_t>{32 * 1024});
+                scratch = std::make_shared<ngraph::snippets::op::AllocationBuffer>(scratch_size, ov::element::u8);
             } else if (with_comp) {
                 scratch = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(brgemmRepackIn1->output(1));
             }
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_copy_b.cpp
index b300b7d47eda21..71360b08030676 100644
--- a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_copy_b.cpp
+++ b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_copy_b.cpp
@@ -47,7 +47,6 @@ void intel_cpu::BrgemmCopyB::validate_and_infer_types() {
     if (m_with_comp) {
         set_output_type(1, ov::element::f32, ov::PartialShape{ov::Dimension::dynamic()});
     }
-    return;
 }
 
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_copy_b.hpp
index 30b592552de096..abac0e5e7c25fc 100644
--- a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_copy_b.hpp
+++ b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_copy_b.hpp
@@ -21,6 +21,10 @@ class BrgemmCopyB : public ngraph::snippets::op::MemoryAccess {
                 const size_t offset_in = 0lu, const size_t offset_out0 = 0lu, const size_t offset_out1 = 0lu);
     BrgemmCopyB() = default;
 
+    size_t get_offset_in() const { return get_input_port_descriptor(0).m_offset; }
+    size_t get_offset_out() const { return get_output_port_descriptor(0).m_offset; }
+    size_t get_offset_comp() const { return get_output_port_descriptor(1).m_offset; }
+
     element::Type get_src_element_type() const { return m_src_type; }
     bool is_with_comp() const { return m_with_comp; }
 
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.cpp
index df8df656f694d6..49e16b67f2f5c7 100644
--- a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.cpp
+++ b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.cpp
@@ -7,7 +7,6 @@
 #include "ngraph/runtime/host_tensor.hpp"
 #include "openvino/core/rt_info.hpp"
 #include "snippets/utils.hpp"
-#include "matmul_shape_inference.hpp"
 
 #include "utils/general_utils.h"
 
@@ -85,8 +84,8 @@ void BrgemmCPU::validate_and_infer_types() {
             NGRAPH_CHECK(expected_shape == shape.get_shape() && expected_type == type,
                          "BRGEMM Scratch with compensations must have shape {rnd_up(N, N_blk)} and FP32 element type");
         } else {
-            NGRAPH_CHECK(ngraph::shape_size(shape.get_shape()) == 4 * 1024 && type == element::i32,
-                         "BRGEMM Scratch for space workplace must be static, have F32 element type and 1024 shape size");
+            NGRAPH_CHECK(ngraph::shape_size(shape.get_shape()) == 32 * 1024 && type == element::u8,
+                         "BRGEMM Scratch workspace must be static, have U8 element type and a shape size of 32 * 1024");
         }
     }
 }
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.hpp
index 183392d321a2dc..c17b034868a63b 100644
--- a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.hpp
+++ b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.hpp
@@ -30,6 +30,7 @@ class BrgemmCPU : public ngraph::snippets::op::Brgemm {
     void validate_and_infer_types() override;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
 
+    size_t get_offset_scratch() const { return get_input_port_descriptor(2).m_offset; }
     std::shared_ptr<BrgemmCopyB> get_brgemm_copy() const;
 
 private:
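Taken together with the BrgemmToBrgemmCPU change above, the AMX scratchpad contract is: the size travels through the graph as an i32 Constant, while the allocated memory itself is typed u8. A condensed sketch of both sides (types as in this patch; the opset1::Constant spelling is an assumption, since the hunk above lost its template arguments):

    // Producer side (BrgemmToBrgemmCPU): 32 KiB scratchpad, u8 memory, size carried as i32.
    const auto scratch_size = std::make_shared<ngraph::opset1::Constant>(
            ov::element::i32, ov::Shape{1}, std::vector<int32_t>{32 * 1024});
    const auto scratch = std::make_shared<ngraph::snippets::op::AllocationBuffer>(scratch_size, ov::element::u8);
    // Consumer side (BrgemmCPU::validate_and_infer_types) then verifies:
    //     ngraph::shape_size(scratch_shape) == 32 * 1024 && type == ov::element::u8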
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
index b4a2ed8b51c86e..ab2430d50c8ac6 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -4,6 +4,7 @@
 
 #include "snippets/matmul.hpp"
 #include "common_test_utils/test_constants.hpp"
+#include "ie_system_conf.h"
 
 namespace ov {
 namespace test {
@@ -18,18 +19,23 @@ std::vector<std::vector<ov::PartialShape>> input_shapes{
         {{1, 1, 37, 23}, {1, 2, 23, 33}},
         {{1, 16, 384, 64}, {1, 16, 64, 384}}
 };
-std::vector<std::vector<element::Type>> precisions = {
-        {element::f32, element::f32}
-};
-std::vector<std::vector<element::Type>> all_precisions = {
-        {element::f32, element::f32},
-        {element::i8, element::i8},
-        {element::u8, element::i8}
-};
+static inline std::vector<std::vector<element::Type>> precisions(bool only_fp32 = true) {
+    std::vector<std::vector<element::Type>> prc = {
+            {element::f32, element::f32},
+    };
+    if (!only_fp32) {
+        prc.emplace_back(std::vector<element::Type>{element::i8, element::i8});
+        prc.emplace_back(std::vector<element::Type>{element::u8, element::i8});
+        if (InferenceEngine::with_cpu_x86_bfloat16() || InferenceEngine::with_cpu_x86_avx512_core_amx_bf16()) {
+            prc.emplace_back(std::vector<element::Type>{element::bf16, element::bf16});
+        }
+    }
+    return prc;
+}
 INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul,
                          ::testing::Combine(
                                  ::testing::ValuesIn(input_shapes),
-                                 ::testing::ValuesIn(all_precisions),
+                                 ::testing::ValuesIn(precisions(false)),
                                  ::testing::Values(1), // MatMul
                                  ::testing::Values(1), // Tokenized MatMul
                                  ::testing::Values(CommonTestUtils::DEVICE_CPU)),
@@ -38,7 +44,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulFQ, MatMulFQ,
                          ::testing::Combine(
                                  ::testing::ValuesIn(input_shapes),
-                                 ::testing::ValuesIn(precisions),
+                                 ::testing::ValuesIn(precisions()),
                                  ::testing::Values(1), // MatMul
                                  ::testing::Values(1), // Tokenized MatMul
                                  ::testing::Values(CommonTestUtils::DEVICE_CPU)),
@@ -47,7 +53,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBias, MatMulBias,
                          ::testing::Combine(
                                  ::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 1, 43, 49}, {1, 1, 69, 49}}),
-                                 ::testing::ValuesIn(all_precisions),
+                                 ::testing::ValuesIn(precisions(false)),
                                  ::testing::Values(1), // Subgraph
                                  ::testing::Values(1), // Tokenized MatMul+Bias
                                  ::testing::Values(CommonTestUtils::DEVICE_CPU)),
@@ -56,7 +62,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ExplicitTransposeMatMul, ExplicitTransposeMatMul,
                          ::testing::Combine(
                                  ::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 49, 2, 43}}),
-                                 ::testing::ValuesIn(precisions),
+                                 ::testing::ValuesIn(precisions()),
                                  ::testing::Values(1), // Subgraph
                                  ::testing::Values(1), // Tokenized MatMul+Bias
                                  ::testing::Values(CommonTestUtils::DEVICE_CPU)),
@@ -65,7 +71,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulBias, ExplicitTransposeMatMulBias,
                          ::testing::Combine(
                                  ::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}}),
-                                 ::testing::ValuesIn(precisions),
+                                 ::testing::ValuesIn(precisions()),
                                  ::testing::Values(1), // Subgraph
                                  ::testing::Values(1), // Tokenized MatMul+Bias
                                  ::testing::Values(CommonTestUtils::DEVICE_CPU)),
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp
index 8d9b0945f512ad..98733dd68e8f05 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp
@@ -12,14 +12,16 @@ namespace snippets {
 
 namespace {
-static inline std::vector<std::vector<element::Type>> precisions() {
+static inline std::vector<std::vector<element::Type>> precisions(bool only_fp32 = true) {
     std::vector<std::vector<element::Type>> prc = {
             {element::f32, element::f32},
-            {element::i8, element::i8},
-            {element::u8, element::i8}
     };
-    if (InferenceEngine::with_cpu_x86_bfloat16() || InferenceEngine::with_cpu_x86_avx512_core_amx_bf16()) {
-        prc.emplace_back(std::vector<element::Type>{element::bf16, element::bf16});
+    if (!only_fp32) {
+        prc.emplace_back(std::vector<element::Type>{element::i8, element::i8});
+        prc.emplace_back(std::vector<element::Type>{element::u8, element::i8});
+        if (InferenceEngine::with_cpu_x86_bfloat16() || InferenceEngine::with_cpu_x86_avx512_core_amx_bf16()) {
+            prc.emplace_back(std::vector<element::Type>{element::bf16, element::bf16});
+        }
     }
     return prc;
 }
@@ -31,7 +33,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul,
                         ::testing::Combine(
                                 ::testing::ValuesIn(transpose_input_shapes),
                                 ::testing::Values(0), // Transpose on 0th Matmul input
-                                ::testing::ValuesIn(precisions()),
+                                ::testing::ValuesIn(precisions(false)),
                                 ::testing::Values(1), // MatMul
                                 ::testing::Values(1), // Tokenized MatMul + FusedTranspose
                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
@@ -57,7 +59,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul,
                         ::testing::Combine(
                                 ::testing::ValuesIn(transpose_input_shapes),
                                 ::testing::Values(1), // Transpose on 1st Matmul input
-                                ::testing::ValuesIn(precisions()),
+                                ::testing::ValuesIn(precisions(false)),
                                 ::testing::Values(1), // MatMul
                                 ::testing::Values(1), // Tokenized MatMul + FusedTranspose
                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
@@ -67,7 +69,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulFQ, TransposeMatMulFQ,
                         ::testing::Combine(
                                 ::testing::ValuesIn(transpose_input_shapes),
                                 ::testing::Values(1), // Transpose on 1st Matmul input
-                                ::testing::Values(std::vector<element::Type>{ov::element::f32}),
+                                ::testing::ValuesIn(precisions()),
                                 ::testing::Values(1), // MatMul
                                 ::testing::Values(1), // Tokenized MatMul + FusedTranspose
                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
@@ -83,7 +85,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul,
                         ::testing::Combine(
                                 ::testing::ValuesIn(transpose_input_shapes),
                                 ::testing::Values(2), // Transpose on Matmul output
-                                ::testing::Values(std::vector<element::Type>{ov::element::f32, ov::element::f32}),
+                                ::testing::ValuesIn(precisions()),
                                 ::testing::Values(1), // MatMul
                                 ::testing::Values(1), // Tokenized MatMul + FusedTranspose
                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
index c1cf7e2db01167..985a38e9750682 100644
--- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
@@ -194,7 +194,8 @@ std::shared_ptr<ov::Model> SoftmaxLoweredFunction::initLowered() const {
     const auto horizon_sum = std::make_shared(sum);
     horizon_sum->add_control_dependency(loop_sum_end);
 
-    const auto buffer_exp = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(loop_sum_end->output(0));
+    const auto size_exp = std::make_shared(ov::element::i32, ov::Shape{2});
+    const auto buffer_exp = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(loop_sum_end->output(0), size_exp);
 
     loop_sum_begin->add_control_dependency(vector_buffer_sum);
     loop_sum_begin->add_control_dependency(horizon_max);
@@ -303,7 +304,8 @@ std::shared_ptr<ov::Model> AddSoftmaxLoweredFunction::initLowered() const {
 
     /* =========================================== */
 
-    const auto buffer_add = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(loop_max_end->output(0));
+    const auto size_add = std::make_shared(ov::element::i32, ov::Shape{2});
+    const auto buffer_add = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(loop_max_end->output(0), size_add);
 
     /* === Sub + Exp + ReduceSum decomposition === */
 
@@ -331,7 +333,8 @@ std::shared_ptr<ov::Model> AddSoftmaxLoweredFunction::initLowered() const {
     const auto horizon_sum = std::make_shared(sum);
     horizon_sum->add_control_dependency(loop_sum_end);
 
-    const auto buffer_exp = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(loop_sum_end->output(0));
+    const auto size_exp = std::make_shared(ov::element::i32, ov::Shape{2});
+    const auto buffer_exp = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(loop_sum_end->output(0), size_exp);
 
     loop_sum_begin->add_control_dependency(vector_buffer_sum);
     loop_sum_begin->add_control_dependency(horizon_max);
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp
index d8e49abf573aae..ff6fb53173a374 100644
--- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp
@@ -16,7 +16,7 @@ std::shared_ptr<ov::Model> MatMulFunction::initOriginal() const {
     auto data0 = std::make_shared<op::v0::Parameter>(precisions[0], input_shapes[0]);
     auto data1 = std::make_shared<op::v0::Parameter>(precisions[1], input_shapes[1]);
     std::shared_ptr<Node> matmul;
-    if (precisions[1] == ov::element::i8) {
+    if (precisions[1] != ov::element::f32) {
        matmul = std::make_shared<op::TypeRelaxed<op::v0::MatMul>>(
                std::vector<element::Type>{element::f32, element::f32},
                std::vector<element::Type>{ element::f32 },
@@ -33,7 +33,7 @@ std::shared_ptr<ov::Model> MatMulFunction::initReference() const {
     auto indata0 = std::make_shared<op::v0::Parameter>(precisions[0], data0->get_output_partial_shape(0));
     auto indata1 = std::make_shared<op::v0::Parameter>(precisions[1], data1->get_output_partial_shape(0));
     std::shared_ptr<Node> matmul;
-    if (precisions[1] == ov::element::i8) {
+    if (precisions[1] != ov::element::f32) {
        matmul = std::make_shared<op::TypeRelaxed<op::v0::MatMul>>(
                std::vector<element::Type>{element::f32, element::f32},
                std::vector<element::Type>{ element::f32 },
@@ -79,7 +79,7 @@ std::shared_ptr<ov::Model> MatMulBiasFunction::initOriginal() const {
     auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
     auto data2 = std::make_shared<op::v0::Parameter>(precision, input_shapes[2]);
     std::shared_ptr<Node> matmul;
-    if (precisions[1] == ov::element::i8) {
+    if (precisions[1] != ov::element::f32) {
        matmul = std::make_shared<op::TypeRelaxed<op::v0::MatMul>>(
                std::vector<element::Type>{element::f32, element::f32},
                std::vector<element::Type>{ element::f32 },
@@ -99,7 +99,7 @@ std::shared_ptr<ov::Model> Transpose0213MatMulFunction::initOriginal() const {
     switch (transpose_position) {
         case 0: {
            auto transpose = std::make_shared<op::v1::Transpose>(data0, const_order);
-           if (precisions[1] == ov::element::i8) {
+           if (precisions[1] != ov::element::f32) {
               result = std::make_shared<op::TypeRelaxed<op::v0::MatMul>>(
                       std::vector<element::Type>{element::f32, element::f32},
                       std::vector<element::Type>{ element::f32 },
@@ -111,7 +111,7 @@ std::shared_ptr<ov::Model> Transpose0213MatMulFunction::initOriginal() const {
            break;
         } case 1: {
            auto transpose = std::make_shared<op::v1::Transpose>(data1, const_order);
-           if (precisions[1] == ov::element::i8) {
+           if (precisions[1] != ov::element::f32) {
               result = std::make_shared<op::TypeRelaxed<op::v0::MatMul>>(
                       std::vector<element::Type>{element::f32, element::f32},
                       std::vector<element::Type>{ element::f32 },
@@ -123,7 +123,7 @@ std::shared_ptr<ov::Model> Transpose0213MatMulFunction::initOriginal() const {
            break;
         } case 2: {
            std::shared_ptr<Node> matmul;
-           if (precisions[1] == ov::element::i8) {
+           if (precisions[1] != ov::element::f32) {
              matmul = std::make_shared<op::TypeRelaxed<op::v0::MatMul>>(
                      std::vector<element::Type>{element::f32, element::f32},
                      std::vector<element::Type>{ element::f32 },