diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp index 35cd143ba4a32e..5150cdc9327fc9 100644 --- a/src/common/snippets/include/snippets/op/buffer.hpp +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -13,6 +13,7 @@ namespace op { /** * @interface Buffer * @brief The operation is for intermediate data storage + * TODO * - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank]. * It's needed to allocate needed memory size that depends on Tile rank, for example. * Default value is -1 (full shape) @@ -29,24 +30,58 @@ class Buffer : public ngraph::op::Op { public: OPENVINO_OP("Buffer", "SnippetsOpset"); - Buffer(const Output& x, const int32_t allocation_rank = -1); - Buffer(const ov::Shape shape, const ov::element::Type element_type, int32_t allocation_rank = -1); + size_t get_byte_size() const; + virtual ov::PartialShape get_allocation_shape() const = 0; + +protected: Buffer() = default; +}; - int32_t get_allocation_rank() const { return m_allocation_rank; } - void set_allocation_rank(int32_t rank) { m_allocation_rank = rank; } +/** + * @interface AllocationBuffer + * @brief The operation is for allocation new empty memory + * TODO + * @ingroup snippets + */ +class AllocationBuffer : public Buffer { +public: + OPENVINO_OP("AllocationBuffer", "SnippetsOpset", Buffer); - size_t get_byte_size() const; + AllocationBuffer() = default; + AllocationBuffer(const ov::Output& shape, const ov::element::Type element_type); + + ov::PartialShape get_allocation_shape() const override; bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; -private: - int32_t m_allocation_rank = -1; - ov::Shape m_static_shape; +protected: ov::element::Type m_element_type; - bool m_is_single = false; +}; + +/** + * @interface IntermediateBuffer + * @brief The operation is for intermediate data storage + * TODO + * @ingroup snippets + */ +class IntermediateBuffer : public Buffer { +public: + OPENVINO_OP("IntermediateBuffer", "SnippetsOpset", Buffer); + + IntermediateBuffer() = default; + IntermediateBuffer(const ov::Output& x); + IntermediateBuffer(const ov::Output& x, const ov::Output& shape); + + ov::PartialShape get_allocation_shape() const override; + + bool visit_attributes(AttributeVisitor& visitor) override { return true; } + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + + static std::shared_ptr create_shape_constant(const ov::PartialShape& shape, size_t allocation_rank); + static std::shared_ptr create_shape_constant(const ov::PartialShape& shape); }; } // namespace op diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index b6d4f59919d84d..16c9a284eb3a91 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -35,6 +35,10 @@ void set_output_layout(const ov::Output& port, const std::vector& inline ov::Dimension get_inner_dim(const ov::PartialShape &shape) { return *(shape.rbegin()); } inline ov::Dimension get_outer_dim(const ov::PartialShape &shape) { return *(shape.rbegin() + 1); } +inline auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { + return allocation_rank < 0 ? allocation_rank + shape_rank + 1 : allocation_rank; +} + template constexpr bool one_of(T val, P item) { return val == item; } diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp index 1255c13427f147..2ddbeb74cd5a44 100644 --- a/src/common/snippets/src/op/buffer.cpp +++ b/src/common/snippets/src/op/buffer.cpp @@ -6,8 +6,8 @@ #include "snippets/op/buffer.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" -#include using namespace std; using namespace ngraph; @@ -16,62 +16,103 @@ auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t return allocation_rank < 0 ? allocation_rank + static_cast(shape_rank) : allocation_rank; } -snippets::op::Buffer::Buffer(const Output& x, const int32_t allocation_rank) - : Op({x}), m_allocation_rank(allocation_rank), m_is_single(false) { - constructor_validate_and_infer_types(); +size_t ngraph::snippets::op::Buffer::get_byte_size() const { + const auto pshape = get_allocation_shape(); + // TODO: Add support of dynamism + NGRAPH_CHECK(pshape.is_static(), "Buffer should have static shapes for memory allocation"); + const auto shape = pshape.get_shape(); + return ngraph::shape_size(shape) * get_element_type().size(); } -snippets::op::Buffer::Buffer(const ov::Shape shape, const ov::element::Type element_type, const int32_t allocation_rank) - : Op(), m_static_shape(shape), m_element_type(element_type), m_allocation_rank(allocation_rank), m_is_single(true) { +snippets::op::AllocationBuffer::AllocationBuffer(const Output& shape, const ov::element::Type element_type) + : Buffer(), m_element_type(element_type) { + set_arguments({shape}); constructor_validate_and_infer_types(); } -bool snippets::op::Buffer::visit_attributes(AttributeVisitor& visitor) { - INTERNAL_OP_SCOPE(Buffer_visit_attributes); - visitor.on_attribute("allocation_rank", m_allocation_rank); - if (m_is_single) { - visitor.on_attribute("shape", m_static_shape); - visitor.on_attribute("element_type", m_element_type); - } +bool snippets::op::AllocationBuffer::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(AllocationBuffer_visit_attributes); + visitor.on_attribute("element_type", m_element_type); return true; } -std::shared_ptr snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); +std::shared_ptr snippets::op::AllocationBuffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(AllocationBuffer_clone_with_new_inputs); check_new_args_count(this, new_args); - if (m_is_single) { - return std::make_shared(m_static_shape, m_element_type, m_allocation_rank); + return std::make_shared(new_args.at(0), m_element_type); +} + +void snippets::op::AllocationBuffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(AllocationBuffer_validate_and_infer_types); + set_output_type(0, m_element_type, get_allocation_shape()); +} + +ov::PartialShape ngraph::snippets::op::AllocationBuffer::get_allocation_shape() const { + ov::PartialShape shape = ov::PartialShape::dynamic(); + const auto shape_constant = ov::as_type_ptr(get_input_node_shared_ptr(0)); + if (shape_constant) { + NGRAPH_CHECK(shape_constant->get_element_type() == ov::element::i32, + "The AllocationBuffer expects Constant with shape of I32 element type"); + const auto dims = shape_constant->cast_vector(); + NGRAPH_CHECK(!dims.empty(), "The AllocationBuffer got invalid shape Constant"); + shape = ov::PartialShape(ov::Shape(std::vector(dims.begin(), dims.end()))); } + return shape; +} + +snippets::op::IntermediateBuffer::IntermediateBuffer(const ov::Output& x) : Buffer() { + set_arguments({x}); + constructor_validate_and_infer_types(); +} - return std::make_shared(new_args.at(0), m_allocation_rank); +snippets::op::IntermediateBuffer::IntermediateBuffer(const ov::Output& x, const ov::Output& shape) : Buffer() { + set_arguments({x, shape}); + constructor_validate_and_infer_types(); } -void snippets::op::Buffer::validate_and_infer_types() { - INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); - ov::PartialShape output_shape; - ov::element::Type output_type; - if (m_is_single) { - output_shape = m_static_shape; - output_type = m_element_type; - } else { - output_shape = get_input_partial_shape(0); - output_type = get_input_element_type(0); +std::shared_ptr snippets::op::IntermediateBuffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(IntermediateBuffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + if (new_args.size() == 2) { + return std::make_shared(new_args.at(0), new_args.at(1)); + } else if (new_args.size() == 1) { + return std::make_shared(new_args.at(0)); } - const auto shape_rank = output_shape.rank(); - if (shape_rank.is_static()) { - const auto normalized_rank = normalize_rank(m_allocation_rank, shape_rank.get_length()); - NGRAPH_CHECK(normalized_rank >= 0 && normalized_rank <= shape_rank.get_length(), - "Buffer has incorrect allocation rank: " + std::to_string(m_allocation_rank)); + throw ngraph_error("The IntermediateBuffer op got invalid input count"); +} + +void snippets::op::IntermediateBuffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(IntermediateBuffer_validate_and_infer_types); + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +ov::PartialShape ngraph::snippets::op::IntermediateBuffer::get_allocation_shape() const { + if (get_input_size() == 1) { + return get_input_partial_shape(0); } - set_output_type(0, output_type, output_shape); + const auto shape_constant = ov::as_type_ptr(get_input_node_shared_ptr(1)); + if (shape_constant) { + NGRAPH_CHECK(shape_constant->get_element_type() == ov::element::i32, + "The AllocationBuffer expects Constant with shape of I32 element type"); + const auto dims = shape_constant->cast_vector(); + NGRAPH_CHECK(!dims.empty(), "The AllocationBuffer got invalid shape Constant"); + return ov::PartialShape(ov::Shape(std::vector(dims.begin(), dims.end()))); + } + return ov::PartialShape::dynamic(); } -size_t ngraph::snippets::op::Buffer::get_byte_size() const { - const auto pshape = get_output_partial_shape(0); - NGRAPH_CHECK(pshape.is_static(), "Buffer should have static shapes for memory allocation"); - const auto shape = pshape.get_shape(); - const auto normalized_rank = normalize_rank(m_allocation_rank, shape.size()); - return ngraph::shape_size(shape.rbegin(), shape.rbegin() + normalized_rank + 1) * get_element_type().size(); +std::shared_ptr ngraph::snippets::op::IntermediateBuffer::create_shape_constant(const ov::PartialShape& shape, size_t allocation_rank) { + if (shape.rank().is_dynamic()) + return nullptr; + const auto normalize_rank = utils::normalize_rank(allocation_rank, shape.size()); + const auto offset = shape.size() - normalize_rank; + return create_shape_constant(ov::PartialShape(std::vector{shape.begin() + offset, shape.end()})); } + +std::shared_ptr ngraph::snippets::op::IntermediateBuffer::create_shape_constant(const ov::PartialShape& shape) { + if (shape.is_dynamic()) + return nullptr; + return std::make_shared(ov::element::i32, ov::Shape{shape.size()}, shape.get_shape()); +} \ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 5e30a6d597ea02..2030a1d712cb41 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -498,6 +498,13 @@ void snippets::op::Subgraph::initialize_buffer_scratchpad_size() { for (const auto& op : ops) { if (const auto buffer = ov::as_type_ptr(op)) { const auto buffer_size = buffer->get_byte_size(); + if (ov::is_type(op)) { + if (op->get_input_size() == 2) { + op->set_arguments({op->get_input_source_output(0)}); + } + } else if (ov::is_type(op)) { + op->set_arguments(ov::OutputVector{}); + } // We need to allocate memory for first buffer at least if (m_buffer_scratchpad == 0) { m_buffer_scratchpad += buffer_size; diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index 08c737c4523ae9..0805df1c5e2f33 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -38,10 +38,12 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr // here we use the fact that Result input & output tensors are identical by construction manually_assigned_gprs[op->output(0).get_tensor_ptr()] = static_cast(f->get_result_index(result) + num_parameters); - } else if (const auto& buffer = ov::as_type_ptr(op)) { + } else if (ov::is_type(op)) { // All buffers have one common data pointer - manually_assigned_gprs[op->input(0).get_tensor_ptr()] = - static_cast(num_results + num_parameters); + if (ov::is_type(op)) { + manually_assigned_gprs[op->input(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + } manually_assigned_gprs[op->output(0).get_tensor_ptr()] = static_cast(num_results + num_parameters); } else if (ov::is_type(op) || ov::is_type(op)) { diff --git a/src/common/snippets/src/pass/insert_buffer.cpp b/src/common/snippets/src/pass/insert_buffer.cpp index e3fdb0173efbb9..64b0a9ef29e77d 100644 --- a/src/common/snippets/src/pass/insert_buffer.cpp +++ b/src/common/snippets/src/pass/insert_buffer.cpp @@ -31,7 +31,9 @@ ngraph::snippets::pass::InsertBuffer::InsertBuffer(const int32_t allocation_rank if (!ov::is_type(input_node) && !ov::is_type(input_node) && !ov::is_type(input_node)) { - const auto buffer = std::make_shared(input_node, allocation_rank); + const auto constant_shape = op::IntermediateBuffer::create_shape_constant(input.get_partial_shape(), allocation_rank); + const auto buffer = constant_shape ? std::make_shared(input_node, constant_shape) : + std::make_shared(input_node); root->set_argument(input.get_index(), buffer); rewritten |= true; } @@ -67,7 +69,9 @@ ngraph::snippets::pass::InsertBuffer::InsertBuffer(const int32_t allocation_rank } } - const auto buffer = std::make_shared(output, allocation_rank); + const auto constant_shape = op::IntermediateBuffer::create_shape_constant(output.get_partial_shape(), allocation_rank); + const auto buffer = constant_shape ? std::make_shared(output, constant_shape) : + std::make_shared(output); for (const auto& consumer : output.get_target_inputs()) { const auto output_node = consumer.get_node()->shared_from_this(); if (output_node != buffer && diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp index f26e48ec71ad07..c7afd25ce8af1d 100644 --- a/src/common/snippets/src/pass/insert_loops.cpp +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -137,7 +137,10 @@ void insert_loops_explicitly(const ov::NodeVector& ops, const size_t vector_size // on LoopBegin to guarantee that the constants are executed inside the Loop. for (const auto& n : body) { if (auto c = std::dynamic_pointer_cast(n)) { - c->add_control_dependency(inner_loop_begin); + // Except Constant Shape for Buffers + if (!ov::is_type(n->get_output_target_inputs(0).begin()->get_node())) { + c->add_control_dependency(inner_loop_begin); + } } } @@ -155,6 +158,8 @@ void insert_loops_explicitly(const ov::NodeVector& ops, const size_t vector_size ov::is_type(op) || ov::is_type(op)) return true; + if (ov::is_type(op) && ov::is_type(op->get_output_target_inputs(0).begin()->get_node())) + return true; auto& rt = op->get_rt_info(); auto outside_rt = rt.find("outside_loop"); bool is_outside = false; diff --git a/src/common/snippets/src/pass/loop_fusion.cpp b/src/common/snippets/src/pass/loop_fusion.cpp index 587daa79121ea6..cabc179c50307e 100644 --- a/src/common/snippets/src/pass/loop_fusion.cpp +++ b/src/common/snippets/src/pass/loop_fusion.cpp @@ -55,7 +55,7 @@ auto can_be_merged(const std::shared_ptr& loop_en auto get_buffer_and_loop_end(const std::shared_ptr& loop_begin_down, std::shared_ptr& loop_end_up, - std::shared_ptr& buffer) -> bool { + std::shared_ptr& buffer) -> bool { size_t fusion_input_num = 0; for (const auto& parent : loop_begin_down->input_values()) { const auto parent_shared = parent.get_node_shared_ptr(); @@ -69,10 +69,9 @@ auto get_buffer_and_loop_end(const std::shared_ptr(parent_shared); - buffer = ov::as_type_ptr(parent_shared); + buffer = ov::as_type_ptr(parent_shared); if (buffer) { if (buffer->output(0).get_target_inputs().size() == 0 || - buffer->get_input_size() != 1 || buffer->get_input_source_output(0).get_target_inputs().size() != 1) return false; @@ -86,7 +85,7 @@ auto get_buffer_and_loop_end(const std::shared_ptr& loop_begin, - const std::shared_ptr& buffer, + const std::shared_ptr& buffer, std::vector& new_loop_inputs, std::vector& new_ptr_increments, std::vector& new_finalization_offsets) -> void { @@ -109,7 +108,7 @@ auto collect_loop_inputs(const std::shared_ptr& } auto collect_loop_outputs(const std::shared_ptr& loop_end, - const std::shared_ptr& buffer, + const std::shared_ptr& buffer, std::vector& new_loop_outputs, std::vector& new_ptr_increments, std::vector& new_finalization_offsets, @@ -162,7 +161,7 @@ bool ngraph::snippets::pass::LoopFusion::Merge(const std::shared_ptr loop_end_up = nullptr; - std::shared_ptr buffer = nullptr; + std::shared_ptr buffer = nullptr; // Initialize the corresponding upper LoopEnd and Buffer if (!get_buffer_and_loop_end(loop_begin_down, loop_end_up, buffer)) { return false; diff --git a/src/common/snippets/src/pass/reset_buffer.cpp b/src/common/snippets/src/pass/reset_buffer.cpp index bae2ac58ccdb15..55bc5aad88deae 100644 --- a/src/common/snippets/src/pass/reset_buffer.cpp +++ b/src/common/snippets/src/pass/reset_buffer.cpp @@ -79,10 +79,9 @@ ngraph::snippets::pass::ResetBufferState::ResetBufferState() { // If after Loop there is immediately Buffer, we should reset the Buffer ptr for the next calculations for (size_t i = 0; i < o_size; ++i) { - const auto result_shape = body_shapes[i_size + i].get_shape(); // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op const auto consumer = loop_end->output(i).get_target_inputs().begin()->get_node(); - if (ov::is_type(consumer)) { + if (const auto buffer = ov::as_type_ptr(consumer->shared_from_this())) { // To calculate finalization offset we should know index of nesting Loop auto loop_index = 0lu; auto loop = loop_end->input_value(i).get_node_shared_ptr(); @@ -93,7 +92,10 @@ ngraph::snippets::pass::ResetBufferState::ResetBufferState() { port_idx = source_output.get_index(); loop_index++; } - + const auto pshape = buffer->get_allocation_shape(); + NGRAPH_CHECK(pshape.is_static(), "Buffer must have static allocation shape to calculate finalization offsets"); + const auto result_shape = pshape.get_shape(); + NGRAPH_CHECK(loop_index < result_shape.size(), "Buffer has invalid Loop index and allocation shape rank"); const auto work_amount = std::accumulate(result_shape.rbegin(), result_shape.rbegin() + loop_index + 1, size_t(1), std::multiplies()); finalization_offsets[i_size + i] = calculate_required_finalization_offsets(work_amount, *(result_shape.rbegin() + loop_index)); diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp index 1a7330fb537641..0e3713ebdf6a79 100644 --- a/src/common/snippets/src/pass/softmax_decomposition.cpp +++ b/src/common/snippets/src/pass/softmax_decomposition.cpp @@ -125,7 +125,9 @@ ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t apply_increments_sum, finalization_offsets_sum); const auto horizon_sum = std::make_shared(sum); - const auto buffer_exp = std::make_shared(loop_sum_end->output(0), buffer_allocation_rank); + const auto constant_shape_exp = op::IntermediateBuffer::create_shape_constant(loop_sum_end->output(0).get_partial_shape(), buffer_allocation_rank); + const auto buffer_exp = constant_shape_exp ? std::make_shared(loop_sum_end->output(0), constant_shape_exp) : + std::make_shared(loop_sum_end->output(0)); /* =========================================== */ diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index a07fb4c0884dbe..c557a62c75e866 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -38,7 +38,8 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Brgemm::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::IntermediateBuffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::AllocationBuffer::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Fill::get_type_info_static()] = dummy_functor; } diff --git a/src/common/snippets/tests/src/pass/merge_loops.cpp b/src/common/snippets/tests/src/pass/merge_loops.cpp index be398f2107fd9a..1a51a0365b17d1 100644 --- a/src/common/snippets/tests/src/pass/merge_loops.cpp +++ b/src/common/snippets/tests/src/pass/merge_loops.cpp @@ -38,7 +38,7 @@ TEST(TransformationTests, UnaryEltwisesLoops) { OutputVector{inner_loop_end_up->output(0), outer_loop_begin_up->output(1)}, shape[shape.size() - 2], 1, std::vector{0, 0}, std::vector{0, 0}); - auto buffer = std::make_shared(outer_loop_end_up); + auto buffer = std::make_shared(outer_loop_end_up); auto outer_loop_begin_down = std::make_shared(OutputVector{buffer}); auto inner_loop_begin_down = std::make_shared(OutputVector{outer_loop_begin_down}); @@ -108,7 +108,7 @@ TEST(TransformationTests, BinaryEltwisesLoops) { OutputVector{inner_loop_end_up->output(0), outer_loop_begin_up->output(2)}, shape[shape.size() - 2], 1, std::vector{0, 0, 0}, std::vector{0, 0, 0}); - auto buffer = std::make_shared(outer_loop_end_up); + auto buffer = std::make_shared(outer_loop_end_up); auto data2 = std::make_shared(element::f32, shape); diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index 66fb1f56537bf3..63d30e0a65223e 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -47,7 +47,8 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // data movement jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter); - jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::AllocationBuffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::IntermediateBuffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(VectorBufferEmitter); // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index f1a0d68c8dbaaa..43d54e32ead243 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -135,15 +135,16 @@ std::map Extension::getOpSets() { ngraph::OpSet opset; #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); + NGRAPH_OP(AllocationBuffer, ngraph::snippets::op) NGRAPH_OP(Brgemm, ngraph::snippets::op) NGRAPH_OP(BroadcastLoad, ngraph::snippets::op) NGRAPH_OP(BroadcastMove, ngraph::snippets::op) - NGRAPH_OP(Buffer, ngraph::snippets::op) NGRAPH_OP(ConvertSaturation, ngraph::snippets::op) NGRAPH_OP(ConvertTruncation, ngraph::snippets::op) NGRAPH_OP(Fill, ngraph::snippets::op) NGRAPH_OP(HorizonMax, ngraph::snippets::op) NGRAPH_OP(HorizonSum, ngraph::snippets::op) + NGRAPH_OP(IntermediateBuffer, ngraph::snippets::op) NGRAPH_OP(Kernel, ngraph::snippets::op) NGRAPH_OP(Load, ngraph::snippets::op) NGRAPH_OP(LoadReshape, ngraph::snippets::op) diff --git a/src/plugins/intel_cpu/src/snippets_transformations/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/snippets_transformations/brgemm_to_brgemm_cpu.cpp index f7dc571fce82e8..b9305748ebbe55 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/brgemm_to_brgemm_cpu.cpp @@ -65,12 +65,17 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() { } else { const auto layoutIn1 = ngraph::snippets::utils::get_node_output_layout(brgemm->input_value(1).get_node_shared_ptr()); const auto brgemmRepackIn1 = std::make_shared(brgemm->input_value(1), element_type_a, with_comp, offset_b); - const auto buffer = std::make_shared(brgemmRepackIn1->output(0)); - const auto scratch = with_amx ? std::make_shared(ov::Shape{4 * 1024}, ov::element::i32) : - with_comp ? std::make_shared(brgemmRepackIn1->output(1)) : - nullptr; + const auto buffer = std::make_shared(brgemmRepackIn1->output(0)); if (with_amx || with_comp) { + std::shared_ptr scratch = nullptr; + if (with_amx) { + const auto scratch_size = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{4 * 1024}); + scratch = std::make_shared(scratch_size, ov::element::i32); + } else if (with_comp) { + scratch = std::make_shared(brgemmRepackIn1->output(1)); + } + brgemm_cpu = std::make_shared(brgemm->input_value(0), buffer, scratch, brgemm->transposed_a(), brgemm->transposed_b(), with_comp, offset_a, offset_b, offset_c); diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.cpp index 918a66f58c7bbe..df8df656f694d6 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/brgemm_cpu.cpp @@ -113,7 +113,7 @@ std::shared_ptr BrgemmCPU::clone_with_new_inputs(const OutputVector& new_a } std::shared_ptr BrgemmCPU::get_brgemm_copy() const { - if (const auto buffer = ov::as_type_ptr(get_input_node_shared_ptr(1))) { + if (const auto buffer = ov::as_type_ptr(get_input_node_shared_ptr(1))) { return ov::as_type_ptr(buffer->get_input_node_shared_ptr(0)); } return nullptr; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index 31e66c97534164..c1cf7e2db01167 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -194,7 +194,7 @@ std::shared_ptr SoftmaxLoweredFunction::initLowered() const { const auto horizon_sum = std::make_shared(sum); horizon_sum->add_control_dependency(loop_sum_end); - const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); loop_sum_begin->add_control_dependency(vector_buffer_sum); loop_sum_begin->add_control_dependency(horizon_max); @@ -303,7 +303,7 @@ std::shared_ptr AddSoftmaxLoweredFunction::initLowered() const { /* =========================================== */ - const auto buffer_add = std::make_shared(loop_max_end->output(0)); + const auto buffer_add = std::make_shared(loop_max_end->output(0)); /* === Sub + Exp + ReduceSum decomposition === */ @@ -331,7 +331,7 @@ std::shared_ptr AddSoftmaxLoweredFunction::initLowered() const { const auto horizon_sum = std::make_shared(sum); horizon_sum->add_control_dependency(loop_sum_end); - const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); loop_sum_begin->add_control_dependency(vector_buffer_sum); loop_sum_begin->add_control_dependency(horizon_max);